In [1]:
%matplotlib inline

In [2]:
import pathlib

import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Helper functions

In [3]:
def summarize_cats(df):
    '''Create table summarizing categorical variables.

    Prints the shape of ``df`` and returns a summary with one row per column
    of ``df``, in the same order as ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarized.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:

        - 'Column Name'   : column label
        - 'dtypes'        : dtype of the column
        - 'Missing'       : number of null entries
        - 'Uniques'       : number of distinct non-null values
        - 'Unique Values' : ``str`` of every distinct value (nulls included),
                            joined with single spaces
        - 'Entropy'       : base-2 entropy of the non-null value shares,
                            rounded to 2 decimals
    '''

    print(f"Dataset Shape: {df.shape}")

    unique_values = []
    entropies = []

    # Walk the columns by position rather than by label: label-based lookup
    # breaks when the column axis has a name (reset_index() then does not
    # produce an 'index' column) or when labels are duplicated (df[name]
    # returns a DataFrame, which has no .unique()).
    for position in range(df.shape[1]):
        column = df.iloc[:, position]

        # List unique values
        unique_values.append(' '.join(str(v) for v in column.unique()))

        # Calculate entropy; value_counts drops nulls by default
        shares = column.value_counts(normalize=True)
        entropies.append(round(stats.entropy(shares, base=2), 2))

    return pd.DataFrame({
        'Column Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
        'Unique Values': unique_values,
        'Entropy': entropies,
    })

In [4]:
def plotting_cat_fet(df, cols, vis_row=5, vis_col=2):
    '''Placeholder: no plotting is implemented yet.

    The body is a bare ``pass``, so every argument is ignored and ``None`` is
    returned. Judging by the name and parameters, this is presumably meant to
    plot the categorical features ``cols`` of ``df`` on a
    ``vis_row`` x ``vis_col`` grid -- TODO confirm and implement.
    '''
    pass

# Load data

In [45]:
# Directory the input CSV files are read from (relative to the working directory).
DATA_PATH = pathlib.Path('../input/')

In [88]:
# Load train.csv and test.csv from DATA_PATH.
train_data = pd.read_csv(DATA_PATH.joinpath('train.csv'))
X_test = pd.read_csv(DATA_PATH.joinpath('test.csv'))

# Analysis

In [89]:
# Per-column overview of the full training frame: dtype, missing count,
# number of uniques, the unique values themselves and their entropy.
summarize_cats(train_data)

Dataset Shape: (600000, 25)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,id,int64,0,600000,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,19.19
1,bin_0,float64,17894,2,0.0 1.0 nan,0.44
2,bin_1,float64,18003,2,0.0 1.0 nan,0.69
3,bin_2,float64,17930,2,0.0 1.0 nan,0.85
4,bin_3,object,18014,2,F T nan,0.95
5,bin_4,object,18047,2,N Y nan,1.0
6,nom_0,object,18252,3,Red Blue Green nan,1.31
7,nom_1,object,18156,6,Trapezoid Star nan Circle Triangle Polygon Square,2.27
8,nom_2,object,18035,6,Hamster Axolotl Lion Dog Cat Snake nan,2.27
9,nom_3,object,18121,6,Russia nan Canada Finland Costa Rica China India,2.27


# Build features

In [90]:
# Column-name groups used to select related features together.
features_bin = [f'bin_{i}' for i in range(5)]       # bin_0 .. bin_4
features_cat = [f'nom_{i}' for i in range(5)]       # nom_0 .. nom_4
features_hex = [f'nom_{i}' for i in range(5, 10)]   # nom_5 .. nom_9
features_ord = [f'ord_{i}' for i in range(6)]       # ord_0 .. ord_5
features_cyc = 'day month'.split()                  # day, month

## bin 0 to 4

In [91]:
# Inspect the bin_0..bin_4 columns (dtype, missing count, unique values, entropy).
summarize_cats(train_data[features_bin])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,bin_0,float64,17894,2,0.0 1.0 nan,0.44
1,bin_1,float64,18003,2,0.0 1.0 nan,0.69
2,bin_2,float64,17930,2,0.0 1.0 nan,0.85
3,bin_3,object,18014,2,F T nan,0.95
4,bin_4,object,18047,2,N Y nan,1.0


In [92]:
# MAIN Generate dummies for bin 0 to 4
# dummy_na=True adds a separate "<col>_nan" indicator column for missing values.
# NOTE(review): without an explicit `columns=` argument get_dummies only expands
# object/category columns, so numeric columns in features_bin pass through
# unchanged (NaNs included) -- confirm this is intended.
dummies = pd.get_dummies(train_data[features_bin], dummy_na=True)
# Swap the original bin columns for the encoded ones (appended at the end).
train_data = pd.concat([train_data.drop(columns=features_bin), dummies], axis=1)

In [95]:
# Summarize every column whose name contains 'bin'.
summarize_cats(train_data.filter(like='bin'))

Dataset Shape: (600000, 9)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,bin_0,float64,17894,2,0.0 1.0 nan,0.44
1,bin_1,float64,18003,2,0.0 1.0 nan,0.69
2,bin_2,float64,17930,2,0.0 1.0 nan,0.85
3,bin_3_F,uint8,0,2,1 0,0.96
4,bin_3_T,uint8,0,2,0 1,0.94
5,bin_3_nan,uint8,0,2,0 1,0.19
6,bin_4_N,uint8,0,2,1 0,1.0
7,bin_4_Y,uint8,0,2,0 1,0.99
8,bin_4_nan,uint8,0,2,0 1,0.19


## nom 0 to nom 4

In [97]:
# Inspect the nom_0..nom_4 columns (dtype, missing count, unique values, entropy).
summarize_cats(train_data[features_cat])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_0,object,18252,3,Red Blue Green nan,1.31
1,nom_1,object,18156,6,Trapezoid Star nan Circle Triangle Polygon Square,2.27
2,nom_2,object,18035,6,Hamster Axolotl Lion Dog Cat Snake nan,2.27
3,nom_3,object,18121,6,Russia nan Canada Finland Costa Rica China India,2.27
4,nom_4,object,18035,4,Bassoon Theremin nan Oboe Piano,1.52


## ord 0 to 5

In [50]:
# MAIN
# Hand-written label -> rank tables for ord_1 and ord_2.
map_ord_1 = {
    'Novice': 1,
    'Contributor': 2,
    'Expert': 3,
    'Master': 4,
    'Grandmaster': 5,
}
map_ord_2 = {
    'Freezing': 1,
    'Cold': 2,
    'Warm': 3,
    'Hot': 4,
    'Boiling Hot': 5,
    'Lava Hot': 6,
}
# ord_3 / ord_4: rank (starting at 1) of each distinct non-null value in sorted order.
map_ord_3 = {
    level: rank
    for rank, level in enumerate(train_data['ord_3'].value_counts().sort_index().index, start=1)
}
map_ord_4 = {
    level: rank
    for rank, level in enumerate(train_data['ord_4'].value_counts().sort_index().index, start=1)
}

In [51]:
# MAIN
# Build the ord_5 encoding: the first two characters of each distinct ord_5
# value are upper-cased, looked up in map_ord_4, and the two ranks are multiplied.
temp_ord_5 = pd.DataFrame(train_data['ord_5'].value_counts().sort_index().keys(), columns=['ord_5'])
# Use .map rather than .replace for the lookup: .replace leaves characters that
# are missing from map_ord_4 as strings (so the arithmetic below would operate
# on strings) and relies on deprecated dtype downcasting; .map yields NaN for
# unknown characters and a numeric column otherwise.
temp_ord_5['First'] = temp_ord_5['ord_5'].astype(str).str[0].str.upper().map(map_ord_4)
temp_ord_5['Second'] = temp_ord_5['ord_5'].astype(str).str[1].str.upper().map(map_ord_4)
temp_ord_5['Add'] = temp_ord_5['First'] + temp_ord_5['Second']
temp_ord_5['Mul'] = temp_ord_5['First'] * temp_ord_5['Second']
# NOTE(review): the product is symmetric in the two characters, so distinct
# ord_5 values can share one code and the sorted order is not preserved --
# confirm this is the intended encoding.
map_ord_5 = dict(zip(temp_ord_5['ord_5'],
                     temp_ord_5['Mul']))

In [52]:
# MAIN
# Replace the ord_1..ord_5 labels with their numeric codes. Series.map turns any
# label that is absent from its table (including missing values) into NaN.
for ord_col, ord_map in (
    ('ord_1', map_ord_1),
    ('ord_2', map_ord_2),
    ('ord_3', map_ord_3),
    ('ord_4', map_ord_4),
    ('ord_5', map_ord_5),
):
    train_data[ord_col] = train_data[ord_col].map(ord_map)

In [55]:
# Inspect the ord_0..ord_5 columns (dtype, missing count, unique values, entropy).
summarize_cats(train_data[features_ord])

Dataset Shape: (600000, 6)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,ord_0,float64,18288,3,3.0 1.0 2.0 nan,1.57
1,ord_1,float64,18041,5,2.0 5.0 nan 1.0 3.0 4.0,2.27
2,ord_2,float64,18075,6,4.0 3.0 1.0 6.0 2.0 5.0 nan,2.52
3,ord_3,float64,17916,15,3.0 5.0 14.0 1.0 8.0 2.0 4.0 11.0 7.0 15.0 9.0...,3.64
4,ord_4,float64,17930,26,21.0 24.0 16.0 3.0 17.0 18.0 25.0 14.0 9.0 15....,4.41
5,ord_5,float64,17713,121,368.0 80.0 70.0 nan 390.0 23.0 126.0 304.0 312...,6.57


In [20]:
def prepare_data(X,test=False):
    '''Placeholder: no preparation logic is implemented yet.

    The body is a bare ``pass``, so ``X`` and ``test`` are ignored and ``None``
    is returned. Presumably ``test`` is meant to switch between train and test
    handling -- TODO confirm and implement.
    '''
    pass

In [21]:
# Split the training frame into features and target.
# The axis is given by keyword (`columns=`): the positional form
# drop('target', 1) was deprecated in pandas 1.3 and raises TypeError from 2.0.
X_train = train_data.drop(columns='target')
y_train = train_data['target']

In [22]:
# Fill missing values in every feature column (everything except 'id') with the
# most frequent value of that column in X_train; the same train-derived value is
# reused for X_test.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
missing_in_test = []
for c in X_train.drop(columns='id').columns:
    train_mode = X_train[c].mode()[0]
    X_train[c] = X_train[c].fillna(train_mode)
    if c in X_test.columns:
        X_test[c] = X_test[c].fillna(train_mode)
    else:
        # Columns that exist only in the training frame (e.g. dummy columns
        # created on train_data alone) would otherwise raise KeyError here.
        missing_in_test.append(c)

# Report the mismatch instead of hiding it, so the test frame can be brought in line.
if missing_in_test:
    print("Columns not present in X_test (not imputed there):", missing_in_test)

In [23]:
def encoder(df, encoder_models=None):
    '''Label-encode every column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; its columns are overwritten with the encoded values.
    encoder_models : dict, optional
        Mapping of column name -> fitted encoder. When falsy (``None`` or
        empty) a fresh ``LabelEncoder`` is fitted per column; otherwise the
        supplied encoders are applied and must contain every column of ``df``.

    Returns
    -------
    tuple
        ``(df, fitted_encoders)`` when fitting, ``(df, None)`` when applying
        existing encoders.
    '''
    if encoder_models:
        # Apply mode: reuse the encoders fitted earlier.
        for col in df.columns:
            print("Applying encoder model to", col)
            fitted = encoder_models[col]
            df[col] = fitted.transform(df[col].values)
        return df, None

    # Fit mode: one new encoder per column, returned so it can be reused later.
    fitted_by_column = {}
    for col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col].values)
        fitted_by_column[col] = label_encoder
    return df, fitted_by_column

## To do
- Treat missing values