- https://github.com/felipeeeantunes/udacity_live/blob/master/porto_seguro.ipynb
- https://www.kaggle.com/c/santander-customer-satisfaction/

In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [21]:
df = pd.read_csv('train.csv')

In [22]:
df.dtypes.value_counts()

int64      260
float64    111
dtype: int64

In [23]:
for col in df.columns:
    print('[{0}][{1}] max:{2} --- min:{3}'.format(col, str(df[col].dtype), df[col].min(),df[col].max()))

[ID][int64] max:1 --- min:151838
[var3][int64] max:-999999 --- min:238
[var15][int64] max:5 --- min:105
[imp_ent_var16_ult1][float64] max:0.0 --- min:210000.0
[imp_op_var39_comer_ult1][float64] max:0.0 --- min:12888.03
[imp_op_var39_comer_ult3][float64] max:0.0 --- min:21024.81
[imp_op_var40_comer_ult1][float64] max:0.0 --- min:8237.82
[imp_op_var40_comer_ult3][float64] max:0.0 --- min:11073.57
[imp_op_var40_efect_ult1][float64] max:0.0 --- min:6600.0
[imp_op_var40_efect_ult3][float64] max:0.0 --- min:6600.0
[imp_op_var40_ult1][float64] max:0.0 --- min:8237.82
[imp_op_var41_comer_ult1][float64] max:0.0 --- min:12888.03
[imp_op_var41_comer_ult3][float64] max:0.0 --- min:16566.81
[imp_op_var41_efect_ult1][float64] max:0.0 --- min:45990.0
[imp_op_var41_efect_ult3][float64] max:0.0 --- min:131100.0
[imp_op_var41_ult1][float64] max:0.0 --- min:47598.09
[imp_op_var39_efect_ult1][float64] max:0.0 --- min:45990.0
[imp_op_var39_efect_ult3][float64] max:0.0 --- min:131100.0
[imp_op_var39_ult1][f

[num_aport_var17_hace3][int64] max:0 --- min:12
[num_aport_var17_ult1][int64] max:0 --- min:21
[num_aport_var33_hace3][int64] max:0 --- min:12
[num_aport_var33_ult1][int64] max:0 --- min:6
[num_var7_emit_ult1][int64] max:0 --- min:3
[num_var7_recib_ult1][int64] max:0 --- min:24
[num_compra_var44_hace3][int64] max:0 --- min:9
[num_compra_var44_ult1][int64] max:0 --- min:39
[num_ent_var16_ult1][int64] max:0 --- min:60
[num_var22_hace2][int64] max:0 --- min:123
[num_var22_hace3][int64] max:0 --- min:108
[num_var22_ult1][int64] max:0 --- min:96
[num_var22_ult3][int64] max:0 --- min:234
[num_med_var22_ult3][int64] max:0 --- min:78
[num_med_var45_ult3][int64] max:0 --- min:267
[num_meses_var5_ult3][int64] max:0 --- min:3
[num_meses_var8_ult3][int64] max:0 --- min:3
[num_meses_var12_ult3][int64] max:0 --- min:3
[num_meses_var13_corto_ult3][int64] max:0 --- min:3
[num_meses_var13_largo_ult3][int64] max:0 --- min:3
[num_meses_var13_medio_ult3][int64] max:0 --- min:2
[num_meses_var17_ult3][int64

In [24]:
def get_meta(train):
    data=[]
    for col in train.columns:
        # role
        if col=='TARGET':
            role='target'
        elif col=='ID':
            role='id'
        else:
            role='input'
            
        # level
        if 'ind_var' in col or col == 'TARGET':
            level='binary'
        elif col=='ID':
            level='nominal'
        elif train[col].dtype == np.float64:
            level = 'interval'
        elif train[col].dtype == np.int64:
            level = 'ordinal'  
    
        # keep
        keep = True
        if col == 'ID':
            keep = False
            
        # data type 
        dtype = train[col].dtype
        
        col_dict={
            'varname': col,
            'role'   :role,
            'level'  :level,
            'keep'   :keep,
            'dtype'  :dtype
        }
        data.append(col_dict)
    meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
    meta.set_index('varname', inplace=True)
    
    return meta

In [25]:
meta_data = get_meta(df)
meta_data

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ID,id,nominal,False,int64
var3,input,ordinal,True,int64
var15,input,ordinal,True,int64
imp_ent_var16_ult1,input,interval,True,float64
imp_op_var39_comer_ult1,input,interval,True,float64
imp_op_var39_comer_ult3,input,interval,True,float64
imp_op_var40_comer_ult1,input,interval,True,float64
imp_op_var40_comer_ult3,input,interval,True,float64
imp_op_var40_efect_ult1,input,interval,True,float64
imp_op_var40_efect_ult3,input,interval,True,float64


In [26]:
meta_data.groupby(['role','level']).agg({'dtype': lambda x: x.count()})

Unnamed: 0_level_0,Unnamed: 1_level_0,dtype
role,level,Unnamed: 2_level_1
id,nominal,1
input,binary,75
input,interval,111
input,ordinal,183
target,binary,1


In [27]:
is_null = df.isnull().sum()
is_null[is_null>0]

Series([], dtype: int64)

#### Removing columns

In [28]:
df.drop('ID', axis=1, inplace=True)

#### Feature importances

In [29]:
rf = RandomForestClassifier(n_estimators=100, max_depth=8, min_samples_leaf=4, max_features=0.2, n_jobs=-1, random_state=99)
rf.fit(df.drop('TARGET', axis=1), df.TARGET)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=0.2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=99, verbose=0, warm_start=False)

In [30]:
def get_feature_importances(columns, importances, qtd=30):
    imp_dict = dict(zip(columns, importances))
    
    return pd.DataFrame.from_dict(imp_dict, orient='index', columns=['importance']).sort_values('importance', ascending=False)[:qtd]

In [31]:
get_feature_importances(df.drop('TARGET', axis=1).columns.values, rf.feature_importances_)

Unnamed: 0,importance
var15,0.250617
var38,0.091127
saldo_var30,0.072741
saldo_var42,0.040874
num_var4,0.03309
ind_var30,0.029859
num_var30,0.027894
num_meses_var5_ult3,0.024544
saldo_medio_var5_hace2,0.021399
saldo_medio_var5_ult1,0.020972


#### Base models

All features

In [32]:
def cross_val_model(x, y, model):
    x = np.array(x)
    y = np.array(y)
    
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=99) # return index of x,y
    
    for i, (t1_index, t2_index) in enumerate(skf.split(x, y)):
        xt1, xt2 = x[t1_index], x[t2_index]
        yt1, yt2 = y[t1_index], y[t2_index]
        
        print ("Fit %s fold %d" % (str(model).split('(')[0], i))
        model.fit(xt1, yt1)
        cross_score = cross_val_score(model, xt2, yt2, cv=3, scoring='roc_auc')
        print("    cross_score: %.5f" % cross_score.mean())

In [33]:
x = df.drop('TARGET', axis=1)
y = df.TARGET

#### Logistic regression

In [34]:
lr_model = LogisticRegression(random_state=99, class_weight='balanced')

cross_val_model(x,y,lr_model)

Fit LogisticRegression fold 0
    cross_score: 0.66193
Fit LogisticRegression fold 1
    cross_score: 0.69420
Fit LogisticRegression fold 2
    cross_score: 0.68031


#### Random forest

In [35]:
#RandomForest params
rf_params = {}
rf_params['n_estimators'] = 200
rf_params['max_depth'] = 6
rf_params['min_samples_split'] = 70
rf_params['min_samples_leaf'] = 30

rf_model = RandomForestClassifier(**rf_params)

In [36]:
cross_val_model(x, y, rf_model)

Fit RandomForestClassifier fold 0
    cross_score: 0.80147
Fit RandomForestClassifier fold 1
    cross_score: 0.80554
Fit RandomForestClassifier fold 2
    cross_score: 0.78530


#### Selecting features

In [18]:
"""
Testing with k best features
"""
for n_feat in [5, 10, 15, 20, 25, 30]:  
    lst_feat = get_feature_importances(df.drop('TARGET', axis=1).columns.values, rf.feature_importances_, n_feat).index.values

    x = df[lst_feat]
    y = df.TARGET
    
    print('----- '+str(n_feat)+' best features ----\n')
    cross_val_model(x, y, rf_model)
    print('\n')

----- 5 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.82052
Fit RandomForestClassifier fold 1
    cross_score: 0.83184
Fit RandomForestClassifier fold 2
    cross_score: 0.81435


----- 10 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81833
Fit RandomForestClassifier fold 1
    cross_score: 0.83100
Fit RandomForestClassifier fold 2
    cross_score: 0.81454


----- 15 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81577
Fit RandomForestClassifier fold 1
    cross_score: 0.82474
Fit RandomForestClassifier fold 2
    cross_score: 0.80833


----- 20 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81817
Fit RandomForestClassifier fold 1
    cross_score: 0.82551
Fit RandomForestClassifier fold 2
    cross_score: 0.80783


----- 25 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81926
Fit RandomForestClassifier fold 1
    cross_score: 0.82637
Fit RandomForestClassi

#### All features One hot encoding

In [37]:
def one_hot_encoder(df, one_hot, limit):
    df_ = df.copy()
    for c in one_hot:
        if len(one_hot[c])>2 and len(one_hot[c]) < limit:
            for val in one_hot[c]:
                df_[c+'_oh_' + str(val)] = (df_[c].values == val).astype(np.int)
    return df_

In [38]:
one_hot = {c: list(df[c].unique()) for c in df.columns if c not in ['TARGET']}

In [39]:
df_ohe = one_hot_encoder(df, one_hot, 7)

In [40]:
x = df_ohe.drop('TARGET', axis=1)
y = df_ohe.TARGET
cross_val_model(x, y, rf_model)

Fit RandomForestClassifier fold 0
    cross_score: 0.79598
Fit RandomForestClassifier fold 1
    cross_score: 0.80130
Fit RandomForestClassifier fold 2
    cross_score: 0.78011


#### Select best with ohe

In [42]:
"""
Testing with k best features
"""
for n_feat in [5, 10, 15, 20, 25, 30]:  
    lst_feat = get_feature_importances(df.drop('TARGET', axis=1).columns.values, rf.feature_importances_, n_feat).index.values
    
    df_ = df[lst_feat]
    one_hot = {c: list(df_[c].unique()) for c in df_.columns if c not in ['TARGET']}
    df_ohe = one_hot_encoder(df_, one_hot, 7)
    
    x = df_ohe
    y = df.TARGET

    print('----- '+str(n_feat)+' best features ----\n')
    cross_val_model(x, y, rf_model)
    print('\n')

----- 5 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.82067
Fit RandomForestClassifier fold 1
    cross_score: 0.83187
Fit RandomForestClassifier fold 2
    cross_score: 0.81381


----- 10 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81229
Fit RandomForestClassifier fold 1
    cross_score: 0.82308
Fit RandomForestClassifier fold 2
    cross_score: 0.80491


----- 15 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81601
Fit RandomForestClassifier fold 1
    cross_score: 0.82489
Fit RandomForestClassifier fold 2
    cross_score: 0.80824


----- 20 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81438
Fit RandomForestClassifier fold 1
    cross_score: 0.82511
Fit RandomForestClassifier fold 2
    cross_score: 0.80576


----- 25 best features ----

Fit RandomForestClassifier fold 0
    cross_score: 0.81374
Fit RandomForestClassifier fold 1
    cross_score: 0.81951
Fit RandomForestClassi