### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2, SelectKBest
from scipy.stats.mstats import hmean

### Importing Data

In [3]:
# Read the cleaned engineered-feature table from disk and show it.
df_fe = pd.read_csv(
    filepath_or_buffer='dont-overfit-ii/processed_data/engineered_features_clean.csv'
)
df_fe

Unnamed: 0,id,target,0,1,2,3,4,5,6,7,...,exp_290,exp_291,exp_292,exp_293,exp_294,exp_295,exp_296,exp_297,exp_298,exp_299
0,0.0,1.0,-0.098,2.165,0.681,-0.614,1.309,-0.455,-0.236,0.276,...,2.379761,3.845871,1.655329,0.522568,1.958150,0.122824,2.860510,0.661001,2.823564,0.344728
1,1.0,0.0,1.081,-0.973,-0.383,0.326,-0.428,0.317,1.172,0.352,...,0.847894,0.183599,0.284506,3.892299,0.445749,0.197109,0.632547,0.333204,0.392193,2.645870
2,2.0,1.0,-0.523,-0.089,-0.348,0.148,-0.022,0.404,-0.023,-0.172,...,1.013085,1.300827,0.294640,2.066797,4.237612,0.311923,0.213525,1.004008,2.225541,0.297899
3,3.0,1.0,0.067,-0.021,0.392,-1.637,-0.446,-0.725,-1.035,0.834,...,0.667644,1.896481,0.551563,0.380602,2.459603,1.595201,0.570068,0.775692,0.586842,1.268709
4,4.0,1.0,2.347,-0.831,0.511,-0.021,1.225,1.594,0.585,1.509,...,2.454689,1.143393,11.189770,0.369354,0.365679,3.966960,3.476409,4.384169,1.534186,1.287883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,245.0,0.0,-1.199,0.466,-0.908,2.771,1.631,0.931,0.182,-0.652,...,2.062667,1.193631,0.961751,2.136139,1.585659,0.784272,1.690459,1.324454,0.774916,0.321101
226,246.0,0.0,0.237,0.233,-0.380,-1.748,0.839,-0.721,-0.114,0.005,...,2.356082,1.158354,1.823942,0.810584,0.463940,2.729177,0.375687,1.007025,1.118513,0.572353
227,247.0,0.0,1.411,-1.465,0.119,0.583,1.634,-0.207,1.173,1.622,...,0.607137,0.634448,2.136139,1.248571,1.110711,0.483357,1.585659,2.138276,1.182937,0.487239
228,248.0,1.0,0.620,1.040,0.184,-0.570,-0.087,-0.748,-1.559,-0.553,...,1.745428,0.224473,2.656475,2.415726,0.220469,1.612845,0.402524,0.447088,7.606476,0.655079


### Finding Correlation (Using Pearson Correlation)

In [4]:
# Pearson correlation of every engineered feature with `target`, computed in
# column batches so only one slice of the (very wide) frame is processed at a time.
#
# Why this form:
# * `corrwith(target)` correlates each column of the batch against the target
#   directly; `DataFrame.corr()` on the batch would allocate the full
#   (batch_size + 1) x (batch_size + 1) pairwise matrix just to read one column.
# * The per-batch results are collected in a list and concatenated once;
#   `Series.append` was removed in pandas 2.0.
batch_size = 20000
n_cols = len(df_fe.columns)
target = df_fe['target']
print('Total Size: ', n_cols)

batch_corrs = []
# Column positions 0 and 1 are skipped, matching the `df_fe.columns[2:]` index
# used when the importance table is built (they hold `id` and `target` in the
# displayed frame).
for i in range(2, n_cols, batch_size):
    stop = min(i + batch_size, n_cols)  # clamp so the last batch reports its real end
    print('Current Batch: ' + str(i) + ':' + str(stop))
    batch_corrs.append(df_fe.iloc[:, i:stop].corrwith(target))

# One float64 Series indexed by feature name, in the original column order.
corr = pd.concat(batch_corrs) if batch_corrs else pd.Series(dtype='float64')

Total Size:  270902
Current Batch: 2:20002
Current Batch: 20002:40002
Current Batch: 40002:60002
Current Batch: 60002:80002
Current Batch: 80002:100002
Current Batch: 100002:120002
Current Batch: 120002:140002
Current Batch: 140002:160002
Current Batch: 160002:180002
Current Batch: 180002:200002
Current Batch: 200002:220002
Current Batch: 220002:240002
Current Batch: 240002:260002
Current Batch: 260002:280002


### Finding Chi-square value

In [5]:
# chi2 only accepts non-negative values and raises ValueError when the input
# contains NaN, infinity or a value too large for dtype('float64'), so every
# column is min-max rescaled before scoring.
# Recipe: https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame
normalized_df_fe = (
    df_fe.sub(df_fe.min())
         .div(df_fe.max().sub(df_fe.min()))
)
normalized_df_fe

Unnamed: 0,id,target,0,1,2,3,4,5,6,7,...,exp_290,exp_291,exp_292,exp_293,exp_294,exp_295,exp_296,exp_297,exp_298,exp_299
0,0.000000,1.0,0.454564,0.952523,0.503753,0.340156,0.708798,0.423909,0.496197,0.529993,...,0.180309,0.229567,0.103625,0.032844,0.226532,0.003348,0.109965,0.043524,0.081218,0.019727
1,0.004016,0.0,0.695866,0.365981,0.308988,0.523392,0.391074,0.560837,0.751177,0.543892,...,0.061212,0.005906,0.014388,0.285730,0.048164,0.006599,0.022112,0.019694,0.009848,0.164050
2,0.008032,1.0,0.367581,0.531215,0.315394,0.488694,0.465337,0.576268,0.534770,0.448061,...,0.074055,0.074137,0.015048,0.148733,0.495365,0.011623,0.005589,0.068458,0.063664,0.016790
3,0.012048,1.0,0.488334,0.543925,0.450851,0.140741,0.387781,0.376020,0.351503,0.632041,...,0.047198,0.110514,0.031773,0.022190,0.285672,0.067783,0.019648,0.051861,0.015562,0.077677
4,0.016064,1.0,0.954973,0.392523,0.472634,0.455750,0.693433,0.787336,0.644875,0.755486,...,0.186134,0.064522,0.724289,0.021345,0.038720,0.171577,0.134252,0.314179,0.043370,0.078879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,0.983936,0.0,0.229226,0.634953,0.212887,1.000000,0.767697,0.669741,0.571894,0.360278,...,0.155656,0.067590,0.058475,0.153937,0.182601,0.032294,0.063828,0.091753,0.021082,0.018245
226,0.987952,0.0,0.523127,0.591402,0.309537,0.119103,0.622828,0.376729,0.518290,0.480432,...,0.178468,0.065436,0.114601,0.054458,0.050309,0.117409,0.011983,0.068678,0.031168,0.034003
227,0.991968,0.0,0.763406,0.274019,0.400879,0.573489,0.768246,0.467896,0.751358,0.776152,...,0.042494,0.033440,0.134924,0.087328,0.126587,0.019126,0.059695,0.150914,0.033059,0.028665
228,0.995984,1.0,0.601515,0.742243,0.412777,0.348733,0.453448,0.371940,0.256610,0.378383,...,0.130992,0.008402,0.168796,0.174919,0.021595,0.068555,0.013042,0.027973,0.221614,0.039191


In [6]:
# Chi-squared statistic and p-value of each normalised feature column against
# the target; `id` and `target` themselves are excluded from the feature matrix.
score, p_val = chi2(
    X=normalized_df_fe.drop(columns=['id', 'target']),
    y=normalized_df_fe['target'],
)

In [8]:
# Gather the per-feature correlation, chi2 statistic and chi2 p-value into one
# table. Values are matched by position and labelled with the feature column
# names of df_fe (everything from column position 2 onwards).
feature_imp = pd.DataFrame(
    {
        'correlation': list(corr),
        'chi2_score': list(score),
        'chi2_p_val': list(p_val),
    },
    index=df_fe.columns[2:],
)
feature_imp

Unnamed: 0,correlation,chi2_score,chi2_p_val
0,0.118118,0.288789,0.590998
1,-0.062362,0.057894,0.809856
2,-0.016348,0.005082,0.943168
3,0.015924,0.004963,0.943835
4,-0.133754,0.295107,0.586967
...,...,...,...
exp_295,-0.153373,0.763597,0.382205
exp_296,-0.097177,0.302711,0.582188
exp_297,0.083976,0.221219,0.638113
exp_298,-0.020659,0.015240,0.901749


### Finding harmonic mean of correlation and the chi2 statistics

In [10]:
# Combine the two signals into one score: the harmonic mean of the min-max
# normalised |correlation| and the min-max normalised chi2 statistic.
feature_imp['abs_correlation'] = feature_imp['correlation'].abs()
feature_imp_nor = (feature_imp - feature_imp.min()) / (feature_imp.max() - feature_imp.min())

# Vectorised two-term harmonic mean, 2ab / (a + b), instead of a row-wise
# `.apply(hmean, axis=1)`, which makes one Python-level call per feature row.
# Min-max scaling maps each column's minimum to exactly 0, so zero inputs do
# occur; the closed form gives them a harmonic mean of 0 explicitly.
# NOTE(review): scipy's hmean appears to treat zeros differently across
# versions (error / masked / 0) - confirm that 0 is the intended result here.
corr_nor = feature_imp_nor['abs_correlation'].to_numpy()
chi2_nor = feature_imp_nor['chi2_score'].to_numpy()
total_nor = corr_nor + chi2_nor
with np.errstate(divide='ignore', invalid='ignore'):
    # `total_nor == 0` only when both terms are 0; NaN inputs stay NaN.
    feature_imp['harmonic_mean'] = np.where(
        total_nor == 0, 0.0, 2.0 * corr_nor * chi2_nor / total_nor
    )
feature_imp

Unnamed: 0,correlation,chi2_score,chi2_p_val,abs_correlation,harmonic_mean
0,0.118118,0.288789,0.590998,0.118118,0.061797
1,-0.062362,0.057894,0.809856,0.062362,0.013383
2,-0.016348,0.005082,0.943168,0.016348,0.001215
3,0.015924,0.004963,0.943835,0.015924,0.001186
4,-0.133754,0.295107,0.586967,0.133754,0.063896
...,...,...,...,...,...
exp_295,-0.153373,0.763597,0.382205,0.153373,0.145345
exp_296,-0.097177,0.302711,0.582188,0.097177,0.062715
exp_297,0.083976,0.221219,0.638113,0.083976,0.046902
exp_298,-0.020659,0.015240,0.901749,0.020659,0.003559


### Feature importance

In [11]:
# 300 features with the largest absolute correlation, strongest first.
feature_imp['abs_correlation'].sort_values(ascending=False).head(300)

33+65     0.458179
33-217    0.413384
217-33    0.413384
33-133    0.404864
133-33    0.404864
            ...   
90-65     0.295964
229-33    0.295892
33-229    0.295892
245-33    0.295709
33-245    0.295709
Name: abs_correlation, Length: 300, dtype: float64

In [12]:
# 300 features with the largest chi2 statistic, strongest first.
feature_imp['chi2_score'].sort_values(ascending=False).head(300)

sin_33    8.226232
exp_33    4.731209
sin_65    3.788271
90-33     3.388382
33-90     3.321150
            ...   
3+33      1.389016
33+178    1.388505
65+164    1.387771
220-33    1.386185
33-169    1.384738
Name: chi2_score, Length: 300, dtype: float64

In [13]:
# 300 features with the largest combined (harmonic-mean) score, strongest first.
feature_imp['harmonic_mean'].sort_values(ascending=False).head(300)

sin_33    0.908763
exp_33    0.637306
133-33    0.549151
33-73     0.537101
90-33     0.536220
            ...   
143-65    0.267320
94-33     0.266837
33-98     0.266808
65-91     0.266139
65-252    0.266107
Name: harmonic_mean, Length: 300, dtype: float64

### Saving feature importance

In [14]:
# Write the feature-importance table to disk; the index (feature names) is included.
feature_imp.to_csv(
    path_or_buf='dont-overfit-ii/processed_data/feature_importance.csv'
)