**Feature Selection - Dropping Constant Features Using a Variance Threshold**

In this step I'll remove constant features, i.e. columns that hold the same value in every row, because they carry no information that helps solve the problem statement.

In [198]:
# import pandas to create DataFrame
import pandas as pd

# Toy frame for the demo: attr4 and attr5 hold a single repeated value, the other columns vary
data = pd.DataFrame(dict(
    attr1=[1, 2, 3, 4, 5, 4, 5, 6, 7],
    attr2=[2, 4, 5, 6, 7, 8, 3, 2, 6],
    attr3=[3, 2, 1, 4, 6, 7, 4, 3, 2],
    attr4=[0, 0, 0, 0, 0, 0, 0, 0, 0],
    attr5=[1, 1, 1, 1, 1, 1, 1, 1, 1],
    attr6=[1, 2, 1, 2, 1, 2, 1, 2, 1],
))

In [199]:
# Preview the first five rows of the toy frame
data.head()

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,attr6
0,1,2,3,0,1,1
1,2,4,2,0,1,2
2,3,5,1,0,1,1
3,4,6,4,0,1,2
4,5,7,6,0,1,1


**Variance Threshold**

Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the desired output (y), and can thus be used for unsupervised learning

In [200]:
## threshold=0 flags zero-variance (constant) features for removal.
## fit() only learns from `data` here; the flagged columns are dropped manually further down.
from sklearn.feature_selection import VarianceThreshold
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(data)

VarianceThreshold(threshold=0)

In [201]:
# Boolean mask over the columns of `data`: True = feature kept, False = flagged for removal
var_thres.get_support()

array([ True,  True,  True, False, False,  True])

In [202]:
# Index the column labels with the mask to list the retained (non-constant) columns
data.columns[var_thres.get_support()]

Index(['attr1', 'attr2', 'attr3', 'attr6'], dtype='object')

In [203]:
# get_support() is True for retained columns, so the inverted mask selects the
# constant ones directly, instead of recomputing the mask for every column.
constant_columns=data.columns[~var_thres.get_support()].tolist()
print(len(constant_columns))

2


In [204]:
# Print the name of each column flagged as constant, one per line
for feature in constant_columns:
  print(feature)

attr4
attr5


In [205]:
# Reassign instead of using inplace=True, and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError once the columns were dropped.
data=data.drop(columns=constant_columns,errors='ignore')
data.head()

Unnamed: 0,attr1,attr2,attr3,attr6
0,1,2,3,1
1,2,4,2,2
2,3,5,1,1
3,4,6,4,2
4,5,7,6,1


**Apply on the Bigger Dataset**
[Santander Customer Satisfaction](https://kaggle.com/c/santander-customer-satisfaction/data?select=train.csv)

In [206]:
# Mount Google Drive (Colab only) so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): the CSV path is relative, so it resolves against the current working
# directory (presumably /content in Colab) — confirm before running elsewhere.
# nrows=10000 loads only the first 10,000 rows of the file.
df = pd.read_csv('drive/My Drive/Dataset/Santander customoer satisfaction.csv',nrows=10000)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [207]:
# (rows, columns) of the loaded sample
df.shape

(10000, 371)

In [208]:
# Preview the first five rows of the loaded sample
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [209]:
# Separate the label from the predictors: y is the TARGET column, X is every other column
y=df['TARGET']
X=df.drop(columns=['TARGET'])

In [210]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle so the split is repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3,random_state=0)


In [211]:
# (rows, columns) of the training split
X_train.shape

(7000, 370)

In [212]:
# (rows, columns) of the test split
X_test.shape

(3000, 370)

**Variance Threshold on Training Data**

In [213]:
# A threshold of 0.05 flags low-variance (quasi-constant) features, not only strictly constant ones.
# The selector is fitted on X_train only; X_test is not used here.
# NOTE(review): X_train still contains the ID column — presumably an identifier rather than a feature; confirm.
var_thres=VarianceThreshold(threshold=0.05)
var_thres.fit(X_train)

VarianceThreshold(threshold=0.05)

In [214]:
# Boolean mask over X_train's columns: True = feature kept at the 0.05 threshold
var_thres.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False, False,  True, False,
       False, False, False,  True, False,  True, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True, False, False,  True,
       False, False, False, False, False, False,  True, False,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True, False, False, False, False, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,

In [215]:
## Count the features that pass the variance threshold (True entries in the mask)
var_thres.get_support().sum()

204

In [216]:
## Same count, obtained from the labels of the retained columns
len(X_train.columns[var_thres.get_support()])

204

In [217]:
# get_support() is True for retained columns, so the inverted mask selects the
# low-variance ones directly, instead of recomputing the mask for each of the columns.
constant_columns=X_train.columns[~var_thres.get_support()].tolist()
print(len(constant_columns))

166


In [218]:
# Print the name of each column flagged by the 0.05 variance threshold, one per line
for column in constant_columns:
  print(column)

ind_var1_0
ind_var1
ind_var2_0
ind_var2
ind_var5_0
ind_var6_0
ind_var6
ind_var8_0
ind_var8
ind_var12
ind_var13_corto_0
ind_var13_corto
ind_var13_largo_0
ind_var13_largo
ind_var13_medio_0
ind_var13_medio
ind_var14_0
ind_var14
ind_var17_0
ind_var17
ind_var18_0
ind_var18
ind_var19
ind_var20_0
ind_var20
ind_var24_0
ind_var24
ind_var25_cte
ind_var26_0
ind_var26_cte
ind_var26
ind_var25_0
ind_var25
ind_var27_0
ind_var28_0
ind_var28
ind_var27
ind_var29_0
ind_var29
ind_var30_0
ind_var31_0
ind_var31
ind_var32_cte
ind_var32_0
ind_var32
ind_var33_0
ind_var33
ind_var34_0
ind_var34
ind_var40_0
ind_var40
ind_var41
ind_var39
ind_var44_0
ind_var44
ind_var46_0
ind_var46
num_var1
num_var6_0
num_var6
num_var13_medio_0
num_var13_medio
num_var17
num_var18_0
num_var18
num_var20_0
num_var20
num_op_var40_hace3
num_var27_0
num_var28_0
num_var28
num_var27
num_var29_0
num_var29
num_var32_0
num_var32
num_var33_0
num_var33
num_var34_0
num_var34
num_var40
num_var41
num_var39
num_var44_0
num_var44
num_var46_0
num_var

In [219]:
# drop() returns a new frame, so the result is stored instead of only being displayed.
# The same columns are removed from X_test so both splits keep identical features.
# X_train and X_test themselves are left unchanged.
X_train_reduced=X_train.drop(columns=constant_columns)
X_test_reduced=X_test.drop(columns=constant_columns)
X_train_reduced

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
7681,15431,2,42,840.0,4477.02,4989.54,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37491.21
9031,18181,2,31,0.0,52.32,52.32,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106685.94
3691,7411,2,51,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66144.66
202,407,2,36,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92121.36
5625,11280,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74650.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,18564,2,33,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117547.89
4859,9723,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71050.83
3264,6557,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,141069.33
9845,19796,2,38,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86412.15
