#  Putting all the Basic Filter Methods together

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the Santander dataset (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='santander.csv')

# (rows, columns) of the raw table
data.shape

(76020, 371)

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


##  Train/Test Split 

In [4]:
# Hold out 30% of the rows for testing; every column except TARGET is
# treated as a feature. random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((53214, 370), (22806, 370))

#  Remove Constant features

In [5]:
# A feature counts as constant when its standard deviation on the
# training set is exactly zero. The list is derived from X_train only
# and then applied to both splits so they keep identical columns.
constant_features = []
for feat in X_train.columns:
    if X_train[feat].std() == 0:
        constant_features.append(feat)

X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

X_train.shape, X_test.shape

((53214, 332), (22806, 332))

#  Remove Quasi-Constant Features

In [6]:
# Selector used to filter out quasi-constant features; variance cut-off is 0.01.
sel = VarianceThreshold(threshold=0.01)

In [9]:
# Fit the selector on the training split only.
sel.fit(X=X_train)

VarianceThreshold(threshold=0.01)

In [10]:
# Number of features that pass the variance threshold (i.e. not quasi-constant)
sel.get_support().sum()

268

In [11]:
# Remember the names of the selected columns now, while X_train is still
# a DataFrame, so they can be re-attached after the transform.
features_to_keep = X_train.columns[sel.get_support(indices=True)]

In [12]:
# Apply the fitted selector to both splits.
X_train, X_test = (sel.transform(split) for split in (X_train, X_test))
X_train.shape, X_test.shape

((53214, 268), (22806, 268))

In [16]:
# sklearn returns numpy arrays, which carry neither the column names nor
# the row index. Rebuild the DataFrames with the selected column names and
# with the row labels of the matching target series: the rows are still in
# the same order as y_train / y_test, and re-using their index keeps X and y
# aligned by label (not only by position) for any later pandas operation.
# The constructor raises if the number of rows and labels ever disagree.
X_train = pd.DataFrame(X_train, columns=features_to_keep, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=features_to_keep, index=y_test.index)

#  Removing Duplicate Features

In [19]:
# Check for duplicate features: columns whose values are identical to those
# of an earlier column. Only the later twin(s) are recorded, so the first
# occurrence of every distinct column is kept.

duplicate_feat = []      # later twins, in discovery order, each listed once
flagged = set()          # same labels as duplicate_feat, for fast lookups
all_columns = list(X_train.columns)

for i, col_1 in enumerate(all_columns):
    # lightweight progress indicator (the pairwise scan is slow)
    if i % 10 == 0:
        print(i)

    # A column already known to duplicate an earlier one needs no scan of
    # its own: every column equal to it was flagged together with it.
    # Skipping it also prevents the same label being appended twice when
    # three or more columns are identical.
    if col_1 in flagged:
        continue

    reference = X_train[col_1]

    # check if any remaining columns are equal to the ith column
    for col_2 in all_columns[i + 1:]:
        if col_2 not in flagged and reference.equals(X_train[col_2]):
            flagged.add(col_2)
            duplicate_feat.append(col_2)

len(duplicate_feat)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260


16

In [20]:
# Remove the duplicated columns from both splits.
X_train = X_train.drop(columns=duplicate_feat)
X_test = X_test.drop(columns=duplicate_feat)
X_train.shape, X_test.shape

((53214, 252), (22806, 252))

###  The feature set shrank from 370 columns (371 including `TARGET`) to 252 with these simple steps