In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# NOTE(review): machine-specific absolute path. The relative paths used by the
# read/write cells in this notebook resolve against this directory.
%cd /Users/alex/PETsARD

/Users/alex/PETsARD


In [4]:
def _print_flagged(label: str, ok_line: str, columns: list, max_listed: int) -> None:
    """Print one report line for a list of flagged column names.

    Prints ``ok_line`` when ``columns`` is empty. Otherwise prints the names
    after ``>>> {label}:``; when there are more than ``max_listed`` names only
    the first ``max_listed`` are shown, followed by the total count.
    """
    if not columns:
        print(ok_line)
    elif len(columns) > max_listed:
        print(f'>>> {label}: {columns[:max_listed]} and more.',
              f'{len(columns)} in total.')
    else:
        print(f'>>> {label}: {columns}')


def check(
    data: pd.DataFrame,
    *,
    min_rows: int = 5000,
    dominant_ratio: float = 0.75,
    skew_threshold: float = 3,
    cardinality_threshold: int = 10,
    max_listed: int = 5,
) -> None:
    """Print a quick profile of ``data``: size, dtype mix, skewness, cardinality.

    Each check prints one line. Lines starting with ``>>>`` flag a finding;
    lines starting with four spaces report that the check passed.

    Args:
        data: Frame to inspect. It is not modified.
        min_rows: Frames with fewer rows are reported as too small.
        dominant_ratio: Share of columns at or above which a dtype family
            ('categorical' or 'numerical') is reported as dominant.
        skew_threshold: Columns whose absolute skewness is at or above this
            value are reported as extreme.
        cardinality_threshold: Categorical columns with at least this many
            distinct values are reported as high-cardinality.
        max_listed: Maximum number of column names printed per finding.

    Returns:
        None. The report is written to standard output.
    """
    n_rows, n_cols = data.shape

    # check data size

    print(f'Data shape: {data.shape}\n')

    if n_rows < min_rows:
        print(f'>>> Data Size: Too small ({n_rows} records)')
    else:
        print('    Data Size: OK')

    # check data type

    dtype_api = pd.api.types

    # Object, categorical and dedicated string dtypes form the categorical
    # family. The isinstance test replaces the deprecated is_categorical_dtype.
    is_categorical = [
        dtype_api.is_object_dtype(dtype)
        or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
        for dtype in data.dtypes
    ]

    # Every column is counted at most once. is_numeric_dtype is already true
    # for bool columns, so summing separate per-family ratios would count them
    # twice and could push the ratio past the dominance threshold (or above 1).
    is_numerical = [
        not categorical and (
            dtype_api.is_datetime64_any_dtype(dtype)
            or dtype_api.is_numeric_dtype(dtype)
            or dtype_api.is_bool_dtype(dtype)
        )
        for categorical, dtype in zip(is_categorical, data.dtypes)
    ]

    # A frame without columns has no dtype mix; report 0 rather than dividing
    # by zero.
    d_summary = {
        'categorical': sum(is_categorical) / n_cols if n_cols else 0.0,
        'numerical': sum(is_numerical) / n_cols if n_cols else 0.0,
    }

    dominant = [kind for kind, ratio in d_summary.items() if ratio >= dominant_ratio]
    for kind in dominant:
        print(f'>>> Dominant Dtype: {kind}')
    if not dominant:
        print('    Dominant Dtype: OK')

    # check skewness

    skewness = data.skew(numeric_only=True)
    # NaN skewness (e.g. constant or very short columns) compares False and is
    # therefore never flagged.
    extreme_skew = skewness[skewness.abs() >= skew_threshold].index.tolist()
    _print_flagged('Extreme Column', '    Extreme Values: OK',
                   extreme_skew, max_listed)

    # check cardinality

    # Walk the columns positionally and in frame order so the report is
    # deterministic and duplicate column labels do not break the lookup.
    high_car = [
        name
        for pos, (name, categorical) in enumerate(zip(data.columns, is_categorical))
        if categorical and data.iloc[:, pos].nunique() >= cardinality_threshold
    ]
    _print_flagged('High-cardinality Column', '    High-cardinality: OK',
                   high_car, max_listed)
            
    

    

# Start Checking

In [39]:
# Load the poker-hand training file into `df`. The relative path resolves
# against the current working directory.
df = pd.read_csv('poker-hand-training-true.data')

In [40]:
# Show the loaded frame as the cell result.
df

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,class
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9
...,...,...,...,...,...,...,...,...,...,...,...
25005,3,9,2,6,4,11,4,12,2,4,0
25006,4,1,4,10,3,13,3,4,1,10,1
25007,2,1,2,10,4,4,4,1,4,13,1
25008,2,12,4,3,1,10,1,12,4,9,1


In [44]:
# NOTE(review): the recorded output of this cell reports 1,025,010 rows, while
# the frame displayed after loading ends at index 25008, and the execution
# counts jump from 40 to 44. `df` was presumably changed in cells that are no
# longer in the notebook -- confirm with a fresh Restart & Run All.
check(df)

Data shape: (1025010, 11)

    Data Size: OK
>>> Dominant Dtype: numerical
    Extreme Values: OK
    High-cardinality: OK


In [45]:
# Write `df` to CSV without the index column. The relative path resolves
# against the current working directory.
# NOTE(review): presumably the 'Upload' directory must already exist -- confirm.
df.to_csv('Upload/poker_hand.csv', index=False)