# Read data

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
data=pd.read_csv("santander-train.csv")
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0.0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0.0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0.0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0.0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0.0


# Fill missing values by mode

In [3]:
# Impute every column that contains NaNs with its most frequent value.
# The NaN counts are taken once, before any column is modified.
missing_counts = data.isna().sum()
for col in missing_counts[missing_counts > 0].index:
    # mode() may return several values on a tie; [0] picks the first of them
    data[col] = data[col].fillna(data[col].mode()[0])
    



In [4]:
data.isna().sum()

ID                               0
var3                             0
var15                            0
imp_ent_var16_ult1               0
imp_op_var39_comer_ult1          0
imp_op_var39_comer_ult3          0
imp_op_var40_comer_ult1          0
imp_op_var40_comer_ult3          0
imp_op_var40_efect_ult1          0
imp_op_var40_efect_ult3          0
imp_op_var40_ult1                0
imp_op_var41_comer_ult1          0
imp_op_var41_comer_ult3          0
imp_op_var41_efect_ult1          0
imp_op_var41_efect_ult3          0
imp_op_var41_ult1                0
imp_op_var39_efect_ult1          0
imp_op_var39_efect_ult3          0
imp_op_var39_ult1                0
imp_sal_var16_ult1               0
ind_var1_0                       0
ind_var1                         0
ind_var2_0                       0
ind_var2                         0
ind_var5_0                       0
ind_var5                         0
ind_var6_0                       0
ind_var6                         0
ind_var8_0          

# Remove Constant Features

In [5]:
from sklearn.feature_selection import VarianceThreshold
constant_filter = VarianceThreshold(threshold=0)


In [6]:
# Separate the label column from the predictor columns.
# NOTE(review): `ID` stays in `x` as a predictor; it looks like a row identifier -- confirm it should be kept.
Y = data['TARGET']
x = data.drop(columns=['TARGET'])
# Fit the zero-variance filter on the predictors; the fitted estimator is the cell's output.
constant_filter.fit(x)

VarianceThreshold(threshold=0)

In [7]:
# Count the features the filter keeps: get_support() is True for the non-constant columns
constant_filter.get_support().sum()

279

In [8]:
constant_filter.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [9]:
# Boolean support mask -> names of the retained columns -> frame restricted to them.
kept_mask = constant_filter.get_support()
constant_filter_features = x.columns[kept_mask]
constant_filter_data = x.loc[:, constant_filter_features]

In [10]:
x.shape,constant_filter_data.shape

((3948, 370), (3948, 279))

In [11]:
constant_filter_data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [12]:
from sklearn.model_selection import cross_val_score
def run_randomForest(x,Y):
    """Print the mean cross-validated accuracy of a random forest on ``(x, Y)``.

    A ``RandomForestClassifier`` (``n_estimators=100``, ``random_state=0``,
    ``n_jobs=-1``) is scored with ``cross_val_score`` using
    ``scoring='accuracy'`` and ``cv=10``; the mean of the fold scores is printed.

    Parameters
    ----------
    x : feature matrix, passed unchanged to ``cross_val_score``.
    Y : target labels, passed unchanged to ``cross_val_score``.

    Returns
    -------
    None -- the score is only printed, never returned.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    fold_scores = cross_val_score(forest, x, Y, scoring='accuracy', cv=10)
    mean_accuracy = fold_scores.mean()
    print('Accuracy: ', mean_accuracy)

In [13]:
%%time
run_randomForest(constant_filter_data, Y)

Accuracy:  0.9508519624050049
Wall time: 5.58 s


# Remove quasi-constant features

In [14]:
# Second variance pass with a threshold of 0.01, fitted on the output of the
# constant-feature filter (fit() returns the estimator itself).
quasi_filter = VarianceThreshold(threshold=0.01).fit(constant_filter_data)
# Number of columns that pass this threshold
quasi_filter.get_support().sum()

238

In [15]:
# Names of the columns that pass the quasi-constant filter.
quasi_filter_features = constant_filter_data.columns[quasi_filter.get_support()]
# Select from the same frame the filter was fitted on (constant_filter_data), not
# from the unfiltered `x`, so the mask and the frame it is applied to stay in sync.
quasi_filter_data = constant_filter_data[quasi_filter_features]


In [16]:
quasi_filter_data.shape

(3948, 238)

In [17]:
quasi_filter_data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [18]:
%%time
run_randomForest(quasi_filter_data, Y)

Accuracy:  0.9389236709444896
Wall time: 5.7 s


# Remove Duplicate Features

In [19]:
quasi_filter_data_t=pd.DataFrame(quasi_filter_data.T)

In [20]:
quasi_filter_data_t.shape

(238, 3948)

In [21]:
quasi_filter_data_t.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947
ID,1.0,3.0,4.0,8.0,10.0,13.0,14.0,18.0,20.0,23.0,...,7889.0,7891.0,7895.0,7896.0,7900.0,7901.0,7902.0,7904.0,7908.0,7910.0
var3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
var15,23.0,34.0,23.0,37.0,39.0,23.0,27.0,26.0,45.0,25.0,...,71.0,31.0,31.0,23.0,23.0,52.0,26.0,23.0,25.0,31.0
imp_ent_var16_ult1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,240.0,0.0,0.0,0.0,0.0
imp_op_var39_comer_ult1,0.0,0.0,0.0,195.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
quasi_filter_data_t.duplicated().sum()

22

In [23]:
# One bool per feature row: True when the row is not a repeat of an earlier row.
unique_features = (~quasi_filter_data_t.duplicated()).tolist()

In [24]:
unique_data=quasi_filter_data_t[unique_features].T

In [25]:
unique_data.shape

(3948, 216)

In [26]:
unique_data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult1,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,1.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,4.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8.0,2.0,37.0,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [27]:
%%time
run_randomForest(unique_data, Y)

Accuracy:  0.9531362263643958
Wall time: 5.93 s


# Check multicollinearity

In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [29]:
# One variance inflation factor per column of unique_data, computed with
# statsmodels' variance_inflation_factor on the raw value matrix.
feature_matrix = unique_data.values
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(feature_matrix.shape[1])
    ],
    # keep the column names next to the scores so the table is easy to explore
    "Features": unique_data.columns,
})

  vif = 1. / (1. - r_squared_i)


In [30]:
vif.sort_values(by="VIF",ascending=False)

Unnamed: 0,VIF,Features
108,inf,saldo_var30
104,inf,saldo_var20
106,inf,saldo_var26
107,inf,saldo_var25
213,inf,saldo_medio_var44_ult1
109,inf,saldo_var31
110,inf,saldo_var32
111,inf,saldo_var33
212,inf,saldo_medio_var44_hace3
113,inf,saldo_var42


In [31]:
# Keep only the rows whose VIF exceeds 100.
# NOTE(review): the filtered table is not referenced again in the visible part of the notebook.
vif = vif.loc[vif["VIF"] > 100]

# Remove highly correlated features

In [32]:
def get_correlation(data, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``data`` is computed with
    ``DataFrame.corr()``. A column is reported when the absolute correlation
    between it and at least one column positioned before it in the matrix is
    strictly greater than ``threshold``. The first column is therefore never
    reported, and of two correlated columns only the later one is.

    Parameters
    ----------
    data : pandas.DataFrame whose columns are compared.
    threshold : number; absolute correlations above it count as "correlated".

    Returns
    -------
    set of column names.
    """
    strength = data.corr().abs()
    names = strength.columns
    # Only the part of each row left of the diagonal is inspected, so every
    # unordered pair is checked exactly once and self-correlation is skipped.
    return {
        names[row]
        for row in range(len(names))
        if any(strength.iloc[row, earlier] > threshold for earlier in range(row))
    }

In [33]:
corr_features = get_correlation(unique_data, 0.85)
corr_features

{'delta_imp_venta_var44_1y3',
 'delta_num_compra_var44_1y3',
 'imp_aport_var13_ult1',
 'imp_aport_var17_ult1',
 'imp_aport_var33_hace3',
 'imp_op_var39_comer_ult3',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_efect_ult3',
 'imp_op_var39_ult1',
 'imp_op_var40_comer_ult3',
 'imp_op_var40_ult1',
 'imp_op_var41_comer_ult1',
 'imp_op_var41_comer_ult3',
 'imp_reemb_var13_ult1',
 'imp_trasp_var17_in_ult1',
 'ind_var10_ult1',
 'ind_var10cte_ult1',
 'ind_var13',
 'ind_var13_corto',
 'ind_var13_corto_0',
 'ind_var24',
 'ind_var24_0',
 'ind_var25_0',
 'ind_var26_0',
 'ind_var26_cte',
 'ind_var30',
 'ind_var37_0',
 'ind_var41_0',
 'ind_var8',
 'ind_var9_cte_ult1',
 'ind_var9_ult1',
 'num_aport_var13_ult1',
 'num_aport_var17_ult1',
 'num_aport_var33_hace3',
 'num_compra_var44_hace3',
 'num_compra_var44_ult1',
 'num_med_var22_ult3',
 'num_meses_var12_ult3',
 'num_meses_var13_corto_ult3',
 'num_meses_var5_ult3',
 'num_meses_var8_ult3',
 'num_op_var39_comer_ult1',
 'num_op_var39_comer_ult3',
 'num_op_v

In [34]:
len(corr_features)

120

In [35]:
corrdata = unique_data.corr().abs().stack() 

In [36]:
corrdata

ID     ID                               1.000000
       var3                             0.002021
       var15                            0.004083
       imp_ent_var16_ult1               0.034911
       imp_op_var39_comer_ult1          0.007945
       imp_op_var39_comer_ult3          0.009596
       imp_op_var40_comer_ult1          0.011549
       imp_op_var40_comer_ult3          0.010328
       imp_op_var40_efect_ult1          0.009122
       imp_op_var40_efect_ult3          0.024764
       imp_op_var40_ult1                0.005814
       imp_op_var41_comer_ult1          0.002267
       imp_op_var41_comer_ult3          0.005287
       imp_op_var41_efect_ult1          0.013718
       imp_op_var41_efect_ult3          0.016647
       imp_op_var41_ult1                0.010037
       imp_op_var39_efect_ult1          0.013279
       imp_op_var39_efect_ult3          0.015493
       imp_op_var39_ult1                0.011405
       imp_sal_var16_ult1               0.019168
       ind_var1_0   

In [37]:
corrdata = corrdata.sort_values(ascending=False)

In [38]:
# Keep strongly correlated pairs (|r| > 0.85) between two *different* features.
# Self-pairs are removed by comparing the two index levels instead of testing the
# value against 1: a value test also throws away distinct features whose
# correlation is exactly 1.0, which are the most redundant pairs of all.
first_feature = corrdata.index.get_level_values(0)
second_feature = corrdata.index.get_level_values(1)
corrdata = corrdata[(corrdata > 0.85) & (first_feature != second_feature)]

In [39]:
# Turn the pair-indexed Series into a three-column table with one row per
# (features1, features2, corr_value) triple and a fresh integer index.
corrdata = (
    corrdata
    .rename_axis(['features1', 'features2'])
    .reset_index(name='corr_value')
)


In [40]:
# Walk the features in order of appearance in `features1`. A feature that was not
# yet absorbed into a group seeds a new one made of all rows where it is `features1`.
grouped_feature_list = []
correlated_groups_list = []
for feature in corrdata['features1'].unique():
    if feature in grouped_feature_list:
        # already a member of an earlier group
        continue
    correlated_block = corrdata.loc[corrdata['features1'] == feature]
    # mark the partners first, then the seed itself, as grouped
    grouped_feature_list.extend(correlated_block['features2'].unique())
    grouped_feature_list.append(feature)
    correlated_groups_list.append(correlated_block)

In [41]:
len(correlated_groups_list)

56

In [42]:
for group in correlated_groups_list:
    print(group)

                    features1                   features2  corr_value
0  delta_imp_compra_var44_1y3  delta_num_compra_var44_1y3         1.0
     features1    features2  corr_value
2  num_var14_0  ind_var14_0         1.0
           features1                   features2  corr_value
4    ind_var13_corto             num_var13_corto    1.000000
74   ind_var13_corto           ind_var13_corto_0    0.990305
76   ind_var13_corto           num_var13_corto_0    0.990305
175  ind_var13_corto  num_meses_var13_corto_ult3    0.956835
380  ind_var13_corto                   ind_var13    0.884464
404  ind_var13_corto                 ind_var13_0    0.877585
441  ind_var13_corto                   num_var13    0.862211
                features1   features2  corr_value
8  saldo_medio_var29_ult1  saldo_var6         1.0
     features1             features2  corr_value
10   ind_var24             num_var24    1.000000
190  ind_var24           ind_var24_0    0.947012
194  ind_var24           num_var24_0    0.947

In [43]:
from sklearn.ensemble import RandomForestClassifier

# For every correlated group, fit a random forest on the group's members only and
# keep the row (feature name + importance) of the member ranked highest.
important_features = []
for group in correlated_groups_list:
    features = [*group['features1'].unique(), *group['features2'].unique()]
    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf.fit(unique_data[features], Y)
    # feature_importances_ is aligned with the column order passed to fit()
    importance = pd.DataFrame({'features': features, 'importance': rf.feature_importances_})
    importance = importance.sort_values(by='importance', ascending=False)
    feat = importance.iloc[0]
    important_features.append(feat)

In [44]:
important_features

[features      delta_imp_compra_var44_1y3
 importance                      0.518266
 Name: 0, dtype: object, features      ind_var14_0
 importance           0.51
 Name: 1, dtype: object, features      num_meses_var13_corto_ult3
 importance                      0.271997
 Name: 4, dtype: object, features      saldo_var6
 importance          0.33
 Name: 1, dtype: object, features      num_meses_var12_ult3
 importance                0.372923
 Name: 6, dtype: object, features      num_meses_var8_ult3
 importance               0.680636
 Name: 4, dtype: object, features      ind_var1_0
 importance          0.51
 Name: 1, dtype: object, features      num_aport_var33_hace3
 importance                     0.15
 Name: 6, dtype: object, features      num_op_var41_hace3
 importance              0.513734
 Name: 0, dtype: object, features      num_op_var41_efect_ult3
 importance                   0.301669
 Name: 2, dtype: object, features      imp_op_var41_efect_ult3
 importance                    0.

In [45]:
important_features = pd.DataFrame(important_features)
important_features

Unnamed: 0,features,importance
0,delta_imp_compra_var44_1y3,0.518266
1,ind_var14_0,0.51
4,num_meses_var13_corto_ult3,0.271997
1,saldo_var6,0.33
6,num_meses_var12_ult3,0.372923
4,num_meses_var8_ult3,0.680636
1,ind_var1_0,0.51
6,num_aport_var33_hace3,0.15
0,num_op_var41_hace3,0.513734
2,num_op_var41_efect_ult3,0.301669


In [46]:
len(important_features)

56

In [47]:
# The winner of each correlated group is kept; every other column flagged by
# get_correlation is scheduled for removal.
features_to_consider = set(important_features['features'])
features_to_discard = {name for name in corr_features if name not in features_to_consider}

In [48]:
len(features_to_discard)

88

In [49]:
# Remove the discarded columns; the remaining columns keep their original order.
x_grouped_uncorr_data = unique_data.drop(columns=list(features_to_discard))

In [50]:
x_grouped_uncorr_data.shape,unique_data.shape

((3948, 128), (3948, 216))

In [51]:
x_grouped_uncorr_data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var8_hace3,saldo_medio_var12_hace2,saldo_medio_var12_hace3,saldo_medio_var13_corto_hace3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var13_largo_ult1,saldo_medio_var17_hace2,saldo_medio_var44_hace2,var38
0,1.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,122.22,0.0,0.0,0.0,0.0,0.0,49278.03
2,4.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8.0,2.0,37.0,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [52]:
%%time
run_randomForest(x_grouped_uncorr_data, Y)

Accuracy:  0.9340994115153982
Wall time: 5.69 s


# ANOVA F-test between numerical features and the categorical target

In [53]:
Y.unique()

array([0., 1.])

In [54]:
from sklearn.feature_selection import f_classif, f_regression
sel = f_classif(x_grouped_uncorr_data, Y)

In [55]:
# sel[1] holds the p-values returned by f_classif; label them with the feature
# names and order them from most to least significant.
p_values = pd.Series(sel[1], index=x_grouped_uncorr_data.columns).sort_values(ascending=True)

In [56]:
p_values

num_meses_var5_ult3             7.124905e-18
ind_var5                        1.892021e-15
num_var42                       1.353661e-14
var15                           1.453329e-14
var36                           6.222870e-08
num_var4                        4.269617e-05
num_var35                       6.293680e-05
imp_op_var39_efect_ult1         1.333748e-04
imp_op_var41_efect_ult1         2.545181e-04
imp_op_var40_efect_ult1         4.008155e-04
imp_op_var41_ult1               1.488313e-03
num_meses_var8_ult3             1.695983e-03
imp_op_var41_efect_ult3         1.732780e-03
imp_op_var39_ult1               1.781858e-03
ind_var12_0                     4.346234e-03
saldo_var30                     4.475803e-03
num_var30_0                     4.484988e-03
imp_op_var40_efect_ult3         4.839156e-03
num_var7_recib_ult1             5.505521e-03
ind_var13_0                     5.639554e-03
num_var25_0                     8.233199e-03
num_var1                        9.963332e-03
num_meses_

In [57]:
# Keep the features whose p-value is below 0.05 and restrict the data to them,
# in order of increasing p-value.
p_values = p_values.loc[p_values < 0.05]
x_data_p = x_grouped_uncorr_data.loc[:, p_values.index]

In [58]:
x_data_p.shape,x_grouped_uncorr_data.shape

((3948, 30), (3948, 128))

In [59]:
%%time
run_randomForest(x_data_p, Y)

Accuracy:  0.9521274201891693
Wall time: 5.03 s
