In [1]:
%matplotlib inline

In [2]:
import pathlib

import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from scipy import stats, sparse
from sklearn.model_selection import StratifiedKFold
from kaggler.preprocessing import LabelEncoder, TargetEncoder, OneHotEncoder

Using TensorFlow backend.


# Funcs

In [3]:
def summarize_cats(df):
    '''Create table summarizing categorical variables'''

    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Column Name'] = summary['index']
    summary = summary[['Column Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values

    for name in summary['Column Name'].value_counts().index:
        
        # List unique values
        list_uniques = [str(v) for v in df[name].unique()]
        summary.loc[summary['Column Name'] == name, 'Unique Values'] = ' '.join(list_uniques)
        
        # Calculate entropy
        shares = df[name].value_counts(normalize=True)
        summary.loc[summary['Column Name'] == name, 'Entropy'] = round(stats.entropy(shares, base=2), 2)

    return summary

In [4]:
def plotting_cat_fet(df, cols, vis_row=5, vis_col=2):
    pass

# Load data

In [5]:
DATA_PATH = pathlib.Path('../input/')

In [6]:
train_data = pd.read_csv(DATA_PATH / 'train.csv')
# X_test = pd.read_csv(DATA_PATH / 'test.csv')

# analysis

In [7]:
summarize_cats(train_data)

Dataset Shape: (600000, 25)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,id,int64,0,600000,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,19.19
1,bin_0,float64,17894,2,0.0 1.0 nan,0.44
2,bin_1,float64,18003,2,0.0 1.0 nan,0.69
3,bin_2,float64,17930,2,0.0 1.0 nan,0.85
4,bin_3,object,18014,2,F T nan,0.95
5,bin_4,object,18047,2,N Y nan,1.0
6,nom_0,object,18252,3,Red Blue Green nan,1.31
7,nom_1,object,18156,6,Trapezoid Star nan Circle Triangle Polygon Square,2.27
8,nom_2,object,18035,6,Hamster Axolotl Lion Dog Cat Snake nan,2.27
9,nom_3,object,18121,6,Russia nan Canada Finland Costa Rica China India,2.27


# build features

In [8]:
# MAIN
features_bin = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
features_cat = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
features_hex = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
features_ord = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
features_cyc = ['day', 'month']

## bin 0 to 4
* X missing treatment

In [9]:
summarize_cats(train_data[features_bin])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,bin_0,float64,17894,2,0.0 1.0 nan,0.44
1,bin_1,float64,18003,2,0.0 1.0 nan,0.69
2,bin_2,float64,17930,2,0.0 1.0 nan,0.85
3,bin_3,object,18014,2,F T nan,0.95
4,bin_4,object,18047,2,N Y nan,1.0


In [10]:
# MAIN Generate dummies for bin 0 to 4

# convert bins 0, 1, 2 to object so that
# get_dummies recognizes them and creates missing indicators
bin_012 = ['bin_0', 'bin_1', 'bin_2']
train_data[bin_012] = train_data[bin_012].astype(object)

dummies = pd.get_dummies(train_data[features_bin], dummy_na=True)
train_data = train_data.drop(features_bin, axis=1)
train_data = pd.concat([train_data, dummies], axis=1)

In [11]:
summarize_cats(train_data[[c for c in train_data.columns if 'bin' in c]])

Dataset Shape: (600000, 15)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,bin_0_0.0,uint8,0,2,1 0,0.53
1,bin_0_1.0,uint8,0,2,0 1,0.43
2,bin_0_nan,uint8,0,2,0 1,0.19
3,bin_1_0.0,uint8,0,2,1 0,0.74
4,bin_1_1.0,uint8,0,2,0 1,0.68
5,bin_1_nan,uint8,0,2,0 1,0.19
6,bin_2_0.0,uint8,0,2,1 0,0.88
7,bin_2_1.0,uint8,0,2,0 1,0.84
8,bin_2_nan,uint8,0,2,0 1,0.19
9,bin_3_F,uint8,0,2,1 0,0.96


In [12]:
train_data.loc[train_data['bin_3_nan'] == 1,['bin_3_F', 'bin_3_T', 'bin_3_nan']]

Unnamed: 0,bin_3_F,bin_3_T,bin_3_nan
22,0,0,1
25,0,0,1
107,0,0,1
120,0,0,1
182,0,0,1
...,...,...,...
599760,0,0,1
599793,0,0,1
599830,0,0,1
599932,0,0,1


## nom 0 to nom 4
* X missing treatment

In [13]:
summarize_cats(train_data[features_cat])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_0,object,18252,3,Red Blue Green nan,1.31
1,nom_1,object,18156,6,Trapezoid Star nan Circle Triangle Polygon Square,2.27
2,nom_2,object,18035,6,Hamster Axolotl Lion Dog Cat Snake nan,2.27
3,nom_3,object,18121,6,Russia nan Canada Finland Costa Rica China India,2.27
4,nom_4,object,18035,4,Bassoon Theremin nan Oboe Piano,1.52


In [14]:
# MAIN
le = LabelEncoder(min_obs=10)
train_data.loc[:,features_cat] = le.fit_transform(train_data.loc[:,features_cat])

In [15]:
summarize_cats(train_data[features_cat])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_0,int64,0,4,0 1 2 3,1.47
1,nom_1,int64,0,7,2 6 5 3 0 1 4,2.4
2,nom_2,int64,0,7,0 1 2 3 4 6 5,2.39
3,nom_3,int64,0,7,2 5 4 3 1 6 0,2.4
4,nom_4,int64,0,5,1 0 4 2 3,1.67


In [16]:
le.label_encoders

[{'Red': 0, 'Blue': 1, 'Green': 2, 7535805: 3},
 {'Triangle': 0,
  'Polygon': 1,
  'Trapezoid': 2,
  'Circle': 3,
  'Square': 4,
  7535805: 5,
  'Star': 6},
 {'Hamster': 0,
  'Axolotl': 1,
  'Lion': 2,
  'Dog': 3,
  'Cat': 4,
  7535805: 5,
  'Snake': 6},
 {'India': 0,
  'Costa Rica': 1,
  'Russia': 2,
  'Finland': 3,
  'Canada': 4,
  7535805: 5,
  'China': 6},
 {'Theremin': 0, 'Bassoon': 1, 'Oboe': 2, 'Piano': 3, 7535805: 4}]

## nom 5 to 9
* X missing treatment

In [17]:
summarize_cats(train_data[features_hex])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_5,object,17778,1220,de4c57ee2 2bb3c3e5c b574c9841 673bdf1f6 777d1a...,9.98
1,nom_6,object,18131,1519,a64bc7ddf 3a3a936e8 708248125 23edb8da3 3a7975...,10.3
2,nom_7,object,18003,222,598080a91 1dddb8473 5ddc9a726 3a33ef960 bc9cc2...,7.54
3,nom_8,object,17755,222,0256c7a4b 52ead350c 745b909d1 bdaa56dd1 nan 69...,7.54
4,nom_9,object,18073,2218,02e7c8990 f37df64af nan f9d456e57 c5361037c 05...,10.84


In [18]:
# MAIN
N_FOLD = 5
SEED = 20
cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
te = TargetEncoder(cv=cv)
te.fit(train_data.loc[:,features_hex], train_data['target'])
train_data.loc[:,features_hex] = te.transform(train_data.loc[:,features_hex])

In [19]:
summarize_cats(train_data[features_hex])

Dataset Shape: (600000, 5)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,nom_5,float64,0,1220,0.20974255517316634 0.1937704557331611 0.18127...,9.88
1,nom_6,float64,0,1519,0.1602769876171938 0.20070669211110728 0.16375...,10.19
2,nom_7,float64,0,223,0.10328949654065969 0.12901951622851243 0.2627...,7.51
3,nom_8,float64,0,223,0.12853258688478175 0.19731065709951645 0.1725...,7.51
4,nom_9,float64,0,2214,0.164291873647434 0.15279426897227605 0.187850...,10.7


## ord 0 to 5
* X missing treatment

In [20]:
# TESTING
# train_data = pd.read_csv(DATA_PATH / 'train.csv')

In [21]:
summarize_cats(train_data[features_ord])

Dataset Shape: (600000, 6)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,ord_0,float64,18288,3,3.0 1.0 2.0 nan,1.57
1,ord_1,object,18041,5,Contributor Grandmaster nan Novice Expert Master,2.27
2,ord_2,object,18075,6,Hot Warm Freezing Lava Hot Cold Boiling Hot nan,2.52
3,ord_3,object,17916,15,c e n a h b d k g o i m nan f l j,3.64
4,ord_4,object,17930,26,U X P C Q R Y N I O M E V K G B H nan T W A F ...,4.41
5,ord_5,object,17713,190,Pw pE eN nan OZ wa rg PS mX hG xF RV Nh dp SS ...,7.27


In [22]:
# MAIN
map_ord_0 = None # already a numeric column
map_ord_1 = {'Novice':1, 'Contributor':2, 'Expert':3, 'Master':4, 'Grandmaster':5}
map_ord_2 = {'Freezing': 1, 'Cold':2, 'Warm':3, 'Hot':4, 'Boiling Hot': 5, 'Lava Hot':6}
map_ord_3 = dict(zip(train_data['ord_3'].value_counts().sort_index().keys(),
                     range(1, len(train_data['ord_3'].value_counts())+1)))
map_ord_4 = dict(zip(train_data['ord_4'].value_counts().sort_index().keys(),
                     range(1, len(train_data['ord_4'].value_counts())+1)))

In [23]:
# MAIN
temp_ord_5 = pd.DataFrame(train_data['ord_5'].value_counts().sort_index().keys(), columns=['ord_5'])
temp_ord_5['First'] = temp_ord_5['ord_5'].astype(str).str[0].str.upper()
temp_ord_5['Second'] = temp_ord_5['ord_5'].astype(str).str[1].str.upper()
temp_ord_5['First'] = temp_ord_5['First'].replace(map_ord_4)
temp_ord_5['Second'] = temp_ord_5['Second'].replace(map_ord_4)
temp_ord_5['Add'] = temp_ord_5['First']+temp_ord_5['Second']
temp_ord_5['Mul'] = temp_ord_5['First']*temp_ord_5['Second']
map_ord_5 = dict(zip(temp_ord_5['ord_5'],
                     temp_ord_5['Mul']))

In [24]:
# MAIN
maps = [map_ord_0, map_ord_1, map_ord_2, map_ord_3, map_ord_4, map_ord_5]
for i,m in zip(range(0,6), maps):
    if i != 0:
        train_data[f'ord_{i}'] = train_data[f'ord_{i}'].map(m)    
    train_data[f'ord_{i}'] = (train_data[f'ord_{i}'].
                              fillna(train_data[f'ord_{i}'].median()))

In [25]:
summarize_cats(train_data[features_ord])

Dataset Shape: (600000, 6)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,ord_0,float64,0,3,3.0 1.0 2.0,1.58
1,ord_1,float64,0,5,2.0 5.0 3.0 1.0 4.0,2.26
2,ord_2,float64,0,6,4.0 3.0 1.0 6.0 2.0 5.0,2.51
3,ord_3,float64,0,15,3.0 5.0 14.0 1.0 8.0 2.0 4.0 11.0 7.0 15.0 9.0...,3.62
4,ord_4,float64,0,26,21.0 24.0 16.0 3.0 17.0 18.0 25.0 14.0 9.0 15....,4.39
5,ord_5,float64,0,121,368.0 80.0 70.0 156.0 390.0 23.0 126.0 304.0 3...,6.56


## cyclical features
* X missing treatment

In [26]:
summarize_cats(train_data[features_cyc])

Dataset Shape: (600000, 2)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,day,float64,17952,7,6.0 7.0 5.0 3.0 1.0 2.0 nan 4.0,2.7
1,month,float64,17988,12,3.0 7.0 9.0 12.0 4.0 6.0 1.0 8.0 5.0 11.0 nan ...,3.37


In [27]:
# TESTING
# train_data = pd.read_csv(DATA_PATH / 'train.csv')

# MAIN
train_data[features_cyc] = train_data[features_cyc].astype(object)
dummies_cyc= pd.get_dummies(train_data[features_cyc], dummy_na=True)
train_data = train_data.drop(features_cyc, axis=1)
train_data = pd.concat([train_data, dummies_cyc], axis=1)

In [28]:
for f in features_cyc:
    display(summarize_cats(train_data[[c for c in train_data.columns if f in c]]))

Dataset Shape: (600000, 8)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,day_1.0,uint8,0,2,0 1,0.59
1,day_2.0,uint8,0,2,0 1,0.5
2,day_3.0,uint8,0,2,0 1,0.7
3,day_4.0,uint8,0,2,0 1,0.24
4,day_5.0,uint8,0,2,0 1,0.69
5,day_6.0,uint8,0,2,1 0,0.64
6,day_7.0,uint8,0,2,0 1,0.59
7,day_nan,uint8,0,2,0 1,0.19


Dataset Shape: (600000, 13)


Unnamed: 0,Column Name,dtypes,Missing,Uniques,Unique Values,Entropy
0,month_1.0,uint8,0,2,0 1,0.43
1,month_2.0,uint8,0,2,0 1,0.36
2,month_3.0,uint8,0,2,1 0,0.52
3,month_4.0,uint8,0,2,0 1,0.17
4,month_5.0,uint8,0,2,0 1,0.51
5,month_6.0,uint8,0,2,0 1,0.47
6,month_7.0,uint8,0,2,0 1,0.43
7,month_8.0,uint8,0,2,0 1,0.56
8,month_9.0,uint8,0,2,0 1,0.22
9,month_10.0,uint8,0,2,0 1,0.03


In [35]:
train_data.info(verbose=2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 54 columns):
id            600000 non-null int64
nom_0         600000 non-null int64
nom_1         600000 non-null int64
nom_2         600000 non-null int64
nom_3         600000 non-null int64
nom_4         600000 non-null int64
nom_5         600000 non-null float64
nom_6         600000 non-null float64
nom_7         600000 non-null float64
nom_8         600000 non-null float64
nom_9         600000 non-null float64
ord_0         600000 non-null float64
ord_1         600000 non-null float64
ord_2         600000 non-null float64
ord_3         600000 non-null float64
ord_4         600000 non-null float64
ord_5         600000 non-null float64
target        600000 non-null int64
bin_0_0.0     600000 non-null uint8
bin_0_1.0     600000 non-null uint8
bin_0_nan     600000 non-null uint8
bin_1_0.0     600000 non-null uint8
bin_1_1.0     600000 non-null uint8
bin_1_nan     600000 non-null uint8
bin

## To Do
- cyclical feature
- Treat missing values

## Experimental Code

In [29]:
import numpy as np
import sys

a = sparse.csr_matrix(np.arange(12).reshape((4,3)))
b = a.todense()
print(a.shape, type(a), b.shape, type(b))
print(sys.getsizeof(a), sys.getsizeof(b))

(4, 3) <class 'scipy.sparse.csr.csr_matrix'> (4, 3) <class 'numpy.matrix'>
64 144


In [30]:
def prepare_data(X,test=False):
    pass

In [31]:
X_train = train_data.drop('target', 1)
y_train = train_data.target

In [32]:
for c in X_train.drop('id', 1).columns:
    train_mode = X_train[c].mode()[0]
    X_train[c] = X_train[c].fillna(train_mode)
    X_test[c] = X_test[c].fillna(train_mode)

NameError: name 'X_test' is not defined

In [None]:
def encoder(df, encoder_models=None):
    if not encoder_models:
        encoder_models = {}
        for c in df.columns:
            le = LabelEncoder()
            df[c] = le.fit_transform(df[c].values)
            encoder_models[c] = le
        return df, encoder_models
    else:
        for c in df.columns:
            print("Applying encoder model to", c)
            df[c] = encoder_models[c].transform(df[c].values)
        return df, None