In [1]:
%matplotlib inline

In [2]:
import pathlib

import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from scipy import stats, sparse
from sklearn.model_selection import StratifiedKFold
from kaggler.preprocessing import LabelEncoder, TargetEncoder, OneHotEncoder

Using TensorFlow backend.


# Helper functions

In [3]:
def summarize_cats(df):
    '''Create a table summarizing the categorical variables of ``df``.

    Prints the shape of ``df`` and returns a DataFrame with one row per
    column of ``df`` and the following fields:

    * ``Column Name``   -- the column label
    * ``dtypes``        -- the column dtype
    * ``Missing``       -- number of null entries
    * ``Uniques``       -- number of distinct non-null values
    * ``Unique Values`` -- the distinct values (nulls included), joined
                           with single spaces
    * ``Entropy``       -- base-2 entropy of the value shares returned by
                           ``value_counts(normalize=True)``, rounded to 2
                           decimals

    All six fields are always present, even when ``df`` has no columns.
    Columns are read by position, so duplicated column labels are
    summarized independently instead of raising.
    '''
    print(f"Dataset Shape: {df.shape}")

    fields = ['Column Name', 'dtypes', 'Missing', 'Uniques',
              'Unique Values', 'Entropy']

    rows = []
    for position, name in enumerate(df.columns):
        # positional access always yields a Series, even for repeated labels
        column = df.iloc[:, position]
        shares = column.value_counts(normalize=True)
        rows.append({
            'Column Name': name,
            'dtypes': column.dtype,
            'Missing': column.isnull().sum(),
            'Uniques': column.nunique(),
            'Unique Values': ' '.join(str(v) for v in column.unique()),
            'Entropy': round(stats.entropy(shares, base=2), 2),
        })

    # passing `columns` keeps the schema stable when `rows` is empty
    return pd.DataFrame(rows, columns=fields)

In [4]:
def plotting_cat_fet(df, cols, vis_row=5, vis_col=2):
    '''Placeholder, not implemented yet: ignores its arguments and returns None.'''
    return None

# Load data

In [5]:
# Directory the input CSV files are read from (relative path).
DATA_PATH = pathlib.Path('../input/')

In [6]:
# Load the training set from DATA_PATH.
train_data = pd.read_csv(DATA_PATH / 'train.csv')
# NOTE(review): X_test is never defined while the next line stays commented out,
# yet a later experimental cell calls X_test[c].fillna(...).
# X_test = pd.read_csv(DATA_PATH / 'test.csv')

# Analysis

In [7]:
# Per-column overview of the raw training frame (dtype, missing count, cardinality, entropy).
summarize_cats(train_data)

Dataset Shape: (600000, 25)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,id,int64,0,600000,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,19.19
1,bin_0,float64,17894,2,0.0 1.0 nan,0.44
2,bin_1,float64,18003,2,0.0 1.0 nan,0.69
3,bin_2,float64,17930,2,0.0 1.0 nan,0.85
4,bin_3,object,18014,2,F T nan,0.95
5,bin_4,object,18047,2,N Y nan,1.0
6,nom_0,object,18252,3,Red Blue Green nan,1.31
7,nom_1,object,18156,6,Trapezoid Star nan Circle Triangle Polygon Square,2.27
8,nom_2,object,18035,6,Hamster Axolotl Lion Dog Cat Snake nan,2.27
9,nom_3,object,18121,6,Russia nan Canada Finland Costa Rica China India,2.27


# Build features

In [8]:
# MAIN
# Column groups, one list per feature family; names follow '<prefix>_<n>'.
features_bin = [f'bin_{n}' for n in range(5)]
features_cat = [f'nom_{n}' for n in range(0, 5)]
features_hex = [f'nom_{n}' for n in range(5, 10)]
features_ord = [f'ord_{n}' for n in range(6)]
features_cyc = ['day', 'month']

## bin 0 to 4
* X missing treatment

In [9]:
# Inspect the raw binary features before encoding.
summarize_cats(train_data[features_bin])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,bin_0,float64,17894,2,0.0 1.0 nan,0.44
1,bin_1,float64,18003,2,0.0 1.0 nan,0.69
2,bin_2,float64,17930,2,0.0 1.0 nan,0.85
3,bin_3,object,18014,2,F T nan,0.95
4,bin_4,object,18047,2,N Y nan,1.0


In [10]:
# MAIN Generate dummies for bin 0 to 4

# bins 0, 1 and 2 are numeric; cast them to object so that
# get_dummies treats them as categorical and emits a *_nan indicator
bin_012 = ['bin_0', 'bin_1', 'bin_2']
train_data = train_data.astype({name: object for name in bin_012})

# swap the raw binary columns for their one-hot (+ missing) indicators
dummies = pd.get_dummies(train_data[features_bin], dummy_na=True)
train_data = pd.concat([train_data.drop(features_bin, axis=1), dummies], axis=1)

In [11]:
# Summarize every column whose name contains 'bin' (the indicator columns created above).
summarize_cats(train_data[[c for c in train_data.columns if 'bin' in c]])

Dataset Shape: (600000, 15)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,bin_0_0.0,uint8,0,2,1 0,0.53
1,bin_0_1.0,uint8,0,2,0 1,0.43
2,bin_0_nan,uint8,0,2,0 1,0.19
3,bin_1_0.0,uint8,0,2,1 0,0.74
4,bin_1_1.0,uint8,0,2,0 1,0.68
5,bin_1_nan,uint8,0,2,0 1,0.19
6,bin_2_0.0,uint8,0,2,1 0,0.88
7,bin_2_1.0,uint8,0,2,0 1,0.84
8,bin_2_nan,uint8,0,2,0 1,0.19
9,bin_3_F,uint8,0,2,1 0,0.96


In [12]:
# Show the three bin_3 indicator columns for the rows flagged as missing (bin_3_nan == 1).
train_data.loc[train_data['bin_3_nan'] == 1,['bin_3_F', 'bin_3_T', 'bin_3_nan']]

Unnamed: 0,bin_3_F,bin_3_T,bin_3_nan
22,0,0,1
25,0,0,1
107,0,0,1
120,0,0,1
182,0,0,1
...,...,...,...
599760,0,0,1
599793,0,0,1
599830,0,0,1
599932,0,0,1


## nom 0 to nom 4
* X missing treatment

In [13]:
# Inspect the low-cardinality nominal features (nom_0 .. nom_4) before encoding.
summarize_cats(train_data[features_cat])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_0,object,18252,3,Red Blue Green nan,1.31
1,nom_1,object,18156,6,Trapezoid Star nan Circle Triangle Polygon Square,2.27
2,nom_2,object,18035,6,Hamster Axolotl Lion Dog Cat Snake nan,2.27
3,nom_3,object,18121,6,Russia nan Canada Finland Costa Rica China India,2.27
4,nom_4,object,18035,4,Bassoon Theremin nan Oboe Piano,1.52


In [14]:
# MAIN
# le = LabelEncoder(min_obs=10)
# train_data.loc[:,features_cat] = le.fit_transform(train_data.loc[:,features_cat])

In [24]:
# EXPERIMENT
# train_data.loc[train_data['nom_0'].isna(), 'nom_0'] = 'missing'
# for c in features_cat:
#     train_data.loc[train_data[c].isna(), c] = 'missing'
#     display(train_data.groupby(c)['target'].mean())
#     display(train_data[c].value_counts())
#     print("---------------\n")

In [17]:
# EXPERIMENT
# N_FOLD = 5
# SEED = 20
# cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
# te = TargetEncoder(cv=cv)
# te.fit(train_data.loc[:,features_cat], train_data['target'])
# train_data.loc[:,features_cat] = te.transform(train_data.loc[:,features_cat])

In [19]:
# Re-summarize nom_0 .. nom_4.
# NOTE(review): the saved output shows float64 encoded values, but every
# encoding cell before this one is commented out, so the output looks stale.
# Re-run from a fresh kernel to confirm what this cell really displays.
summarize_cats(train_data[features_cat])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_0,float64,0,4,0.18306700880531296 0.1957630595254476 0.17980...,1.47
1,nom_1,float64,0,7,0.22546453679348666 0.13020073550666744 0.1828...,2.4
2,nom_2,float64,0,7,0.16801400333621477 0.20110410003550486 0.2098...,2.39
3,nom_3,float64,0,7,0.21953431626429265 0.18586060591550216 0.1598...,2.4
4,nom_4,float64,0,5,0.20737491255157087 0.17957297137978406 0.1855...,1.67


## nom 5 to 9
* X missing treatment

In [None]:
# Inspect the nom_5 .. nom_9 features before target encoding.
summarize_cats(train_data[features_hex])

In [None]:
# MAIN
# Target-encode nom_5 .. nom_9 on the training frame.
N_FOLD = 5
SEED = 20
cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
te = TargetEncoder(cv=cv)

# Use fit_transform rather than fit() followed by transform(): calling them
# separately encodes each training row with statistics that include its own
# target (leakage). Per the Kaggler documentation the `cv` splitter is only
# honoured by fit_transform, which encodes each fold from the other folds.
# NOTE(review): verify against the installed Kaggler version.
# A copy is passed because the encoder may write into the frame it receives.
train_data[features_hex] = te.fit_transform(train_data[features_hex].copy(),
                                            train_data['target'])

In [None]:
# Check nom_5 .. nom_9 after target encoding.
summarize_cats(train_data[features_hex])

## ord 0 to 5
* X missing treatment

In [None]:
# TESTING
# train_data = pd.read_csv(DATA_PATH / 'train.csv')

In [None]:
# Inspect the ordinal features before mapping them to integers.
summarize_cats(train_data[features_ord])

In [None]:
# MAIN
# Ordinal label -> integer rank lookups (ranks start at 1).
map_ord_0 = None # already a numeric column
map_ord_1 = {
    'Novice': 1,
    'Contributor': 2,
    'Expert': 3,
    'Master': 4,
    'Grandmaster': 5,
}
map_ord_2 = {
    'Freezing': 1,
    'Cold': 2,
    'Warm': 3,
    'Hot': 4,
    'Boiling Hot': 5,
    'Lava Hot': 6,
}

# ord_3 / ord_4: rank the observed (non-null) values in sorted order
levels_ord_3 = train_data['ord_3'].value_counts().sort_index().index
map_ord_3 = {level: rank for rank, level in enumerate(levels_ord_3, start=1)}
levels_ord_4 = train_data['ord_4'].value_counts().sort_index().index
map_ord_4 = {level: rank for rank, level in enumerate(levels_ord_4, start=1)}

In [None]:
# MAIN
# Build the ord_5 lookup: each observed ord_5 value is scored from the
# map_ord_4 ranks of its first and second characters (upper-cased).
temp_ord_5 = pd.DataFrame(train_data['ord_5'].value_counts().sort_index().keys(), columns=['ord_5'])
ord_5_text = temp_ord_5['ord_5'].astype(str)

# .map (instead of .replace) yields numeric columns directly and turns a
# character missing from map_ord_4 into NaN; .replace would leave the raw
# letter in place, and the product below would then repeat a string.
temp_ord_5['First'] = ord_5_text.str[0].str.upper().map(map_ord_4)
temp_ord_5['Second'] = ord_5_text.str[1].str.upper().map(map_ord_4)
temp_ord_5['Add'] = temp_ord_5['First']+temp_ord_5['Second']
temp_ord_5['Mul'] = temp_ord_5['First']*temp_ord_5['Second']

# NOTE(review): the product of the two ranks is not monotonic in the sorted
# order of ord_5 (a later first letter with an early second letter can score
# lower); confirm this is the intended encoding. 'Add' is computed but unused.
map_ord_5 = dict(zip(temp_ord_5['ord_5'],
                     temp_ord_5['Mul']))

In [None]:
# MAIN
# Apply the ordinal lookups, then fill missing values with the column median.
maps = [map_ord_0, map_ord_1, map_ord_2, map_ord_3, map_ord_4, map_ord_5]
for i, m in enumerate(maps):
    col = f'ord_{i}'
    # Only map columns that still hold raw labels. Re-running this cell on
    # already-encoded (numeric) columns would otherwise map every value to
    # NaN and wipe the feature. ord_0 has no map (m is None): it is numeric.
    if m is not None and not pd.api.types.is_numeric_dtype(train_data[col]):
        train_data[col] = train_data[col].map(m)
    train_data[col] = train_data[col].fillna(train_data[col].median())

In [None]:
# Check the ordinal features after mapping and median imputation.
summarize_cats(train_data[features_ord])

## cyclical features
* X missing treatment

In [None]:
# Inspect day and month before encoding.
summarize_cats(train_data[features_cyc])

In [None]:
# TESTING
# train_data = pd.read_csv(DATA_PATH / 'train.csv')

# MAIN
# cast day/month to object so get_dummies one-hot encodes them and adds *_nan flags
cyc_as_object = train_data[features_cyc].astype(object)
dummies_cyc = pd.get_dummies(cyc_as_object, dummy_na=True)

# replace the raw columns with their indicator columns
train_data = pd.concat([train_data.drop(features_cyc, axis=1), dummies_cyc], axis=1)

In [None]:
for f in features_cyc:
    # every column whose name contains the feature name
    matching = [c for c in train_data.columns if f in c]
    display(summarize_cats(train_data[matching]))

In [None]:
# `verbose` is a boolean switch for the full per-column listing; pass True
# explicitly instead of relying on the truthiness of 2.
train_data.info(verbose=True)

## To Do
- cyclical feature
- Treat missing values

## Experimental Code

In [None]:
import numpy as np
import sys

# Compare the memory footprint of a small matrix stored as CSR vs dense.
a = sparse.csr_matrix(np.arange(12).reshape((4,3)))
b = a.todense()
print(a.shape, type(a), b.shape, type(b))

# sys.getsizeof only measures the Python wrapper object of the sparse matrix,
# not the arrays behind it, so it cannot be used for this comparison.
# Sum the actual buffers instead: CSR keeps its payload in data/indices/indptr.
sparse_bytes = a.data.nbytes + a.indices.nbytes + a.indptr.nbytes
dense_bytes = b.nbytes
print(sparse_bytes, dense_bytes)

In [None]:
def prepare_data(X,test=False):
    '''Placeholder, not implemented yet: ignores its arguments and returns None.'''
    return None

In [None]:
# Split the engineered frame into features and target.
# `columns=` replaces the positional axis argument (drop('target', 1)),
# which pandas 2.0 no longer accepts.
X_train = train_data.drop(columns='target')
y_train = train_data['target']

In [None]:
# Fill remaining gaps in every feature except 'id' with the training-set mode,
# so train and test are imputed with the same value.
# `columns=` replaces the positional axis argument, removed in pandas 2.0.
# NOTE(review): X_test is not defined anywhere in the visible notebook (its
# read_csv line is commented out in the load cell), and it would need the same
# feature engineering as X_train for these column names to exist.
for c in X_train.drop(columns='id').columns:
    train_mode = X_train[c].mode()[0]
    X_train[c] = X_train[c].fillna(train_mode)
    X_test[c] = X_test[c].fillna(train_mode)

In [None]:
def encoder(df, encoder_models=None):
    '''Label-encode every column of ``df``, writing the result back into ``df``.

    Args:
        df: frame whose columns are encoded (modified in place).
        encoder_models: optional mapping of column name -> fitted encoder.
            When it is missing or empty, a new ``LabelEncoder`` is fitted
            per column.

    Returns:
        ``(df, models)`` when encoders were fitted here, where ``models``
        maps each column name to its fitted encoder; ``(df, None)`` when
        existing encoders were applied.

    NOTE(review): ``LabelEncoder`` is imported from ``kaggler.preprocessing``
    in this notebook; confirm that its ``fit_transform``/``transform`` accept
    the 1-D arrays passed here.
    '''
    # apply previously fitted encoders
    if encoder_models:
        for name in df.columns:
            print("Applying encoder model to", name)
            df[name] = encoder_models[name].transform(df[name].values)
        return df, None

    # nothing supplied: fit one encoder per column and hand them back
    fitted = {}
    for name in df.columns:
        model = LabelEncoder()
        df[name] = model.fit_transform(df[name].values)
        fitted[name] = model
    return df, fitted