##### Dataset

In [1]:
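# Datasets: satander.csv, paribas.csv
# NOTE: the first file is read below as 'datasets/satandar.csv' (different spelling) - use whichever name your local copy has.
# Location: https://mitu.co.in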

##### Import the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

##### Load the dataset

In [3]:
# Read only the first 20,000 rows of the CSV (nrows=20000).
df = pd.read_csv('datasets/satandar.csv', nrows=20000 )

In [4]:
df.shape

(20000, 371)

In [5]:
df.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

In [6]:
# Separate the label from the predictors: y is the 'TARGET' column,
# x is every remaining column of df.
y = df['TARGET']
x = df.drop(columns=['TARGET'])

In [7]:
x.shape

(20000, 370)

##### Constant Feature Elimination

In [8]:
# Toy example: nine identical values, so the variance is exactly zero.
num = np.full(9, 11)

In [9]:
num.var()

0.0

In [10]:
# Same toy array with the last value changed to 12: the variance is no longer zero.
num = np.array([11] * 8 + [12])
num.var()

0.09876543209876544

In [11]:
x.var()

ID                         1.341291e+08
var3                       1.896491e+09
var15                      1.667130e+02
imp_ent_var16_ult1         4.207879e+06
imp_op_var39_comer_ult1    1.192042e+05
                               ...     
saldo_medio_var44_hace2    1.141930e+07
saldo_medio_var44_hace3    2.471069e+04
saldo_medio_var44_ult1     3.411952e+07
saldo_medio_var44_ult3     1.254457e+07
var38                      2.659698e+10
Length: 370, dtype: float64

In [12]:
# Count the columns of x whose variance is exactly zero.
int((x.var() == 0).sum())

64

In [13]:
from sklearn.feature_selection import VarianceThreshold

In [14]:
# Variance-based feature selector with the threshold set to 0.
# NOTE(review): per the scikit-learn docs this should drop only features whose
# variance does not exceed the threshold (i.e. constant columns) - confirm.
vt = VarianceThreshold(threshold=0)

In [15]:
# Fit the selector on the feature frame x (no labels are passed).
vt.fit(x)

In [16]:
vt.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [17]:
# Keep only the columns that the fitted selector's boolean support mask marks True.
x_new = x.loc[:, vt.get_support()]

In [18]:
x_new.shape

(20000, 306)

##### Remove Duplicated Features

In [19]:
x.shape

(20000, 370)

In [20]:
x_t = x.T

In [21]:
x_t.shape

(370, 20000)

In [22]:
# Flag rows of the transposed frame (one row per original column) that repeat
# an earlier row, then show how many were flagged.
dup = x_t.duplicated()
int(dup.sum())

91

In [23]:
# Drop repeated rows of the transposed frame (i.e. duplicated feature columns)
# and transpose back to the samples-by-features layout.
# The non-mutating form leaves `x_t` untouched, so the duplicate count computed
# from it stays reproducible when earlier cells are re-run.
x = x_t.drop_duplicates().T

In [24]:
x.shape

(20000, 279)

##### Correlated Feature Removal

In [25]:
# Load the first 20,000 rows of the Paribas CSV; this rebinds `df`, replacing the earlier frame.
df = pd.read_csv('datasets/paribas.csv', nrows=20000)

In [26]:
df.shape

(20000, 133)

In [27]:
df.columns

Index(['ID', 'target', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8',
       ...
       'v122', 'v123', 'v124', 'v125', 'v126', 'v127', 'v128', 'v129', 'v130',
       'v131'],
      dtype='object', length=133)

In [28]:
# Split the frame into the label (y = 'target' column) and the predictors
# (x = all other columns).
y = df['target']
x = df.drop(columns=['target'])

In [29]:
x.shape

(20000, 132)

In [30]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 132 entries, ID to v131
dtypes: float64(108), int64(5), object(19)
memory usage: 20.1+ MB


In [31]:
# dtype names used to pick out the numeric columns.
cols = ['float64', 'int64']

In [32]:
# Names of the columns of x whose dtype is listed in `cols`, as a plain list.
numerical_columns = x.select_dtypes(include=cols).columns.tolist()

In [33]:
len(numerical_columns)

113

In [34]:
# Restrict x to the numeric columns selected above.
x_new = x.loc[:, numerical_columns]

In [35]:
x_new.shape

(20000, 113)

In [52]:
x_new.corr()

Unnamed: 0,ID,v1,v2,v4,v5,v6,v7,v8,v9,v10,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
ID,1.000000,0.004817,0.001134,-0.003499,0.002015,-0.010675,-0.002102,-0.002202,0.000506,0.000844,...,0.017562,-0.001780,0.007956,0.001076,-0.003765,0.002461,0.001904,-0.004063,0.010092,-0.003948
v1,0.004817,1.000000,-0.205826,-0.145037,-0.049337,-0.020251,0.089950,0.160309,-0.035394,-0.006785,...,0.340779,-0.137999,0.292999,0.164169,0.025936,-0.018762,-0.010806,-0.015788,0.253119,0.709122
v2,0.001134,-0.205826,1.000000,0.542216,0.195646,0.025525,0.351704,-0.367639,-0.012757,0.055303,...,-0.571877,0.169172,-0.592534,-0.261420,-0.111644,0.011387,0.259107,0.074142,-0.576703,0.183870
v4,-0.003499,-0.145037,0.542216,1.000000,0.251530,0.387779,0.483067,-0.179696,-0.191921,0.085467,...,-0.618385,0.058058,-0.711291,-0.063270,-0.130919,0.299857,0.487635,0.109600,-0.702047,0.304914
v5,0.002015,-0.049337,0.195646,0.251530,1.000000,0.238449,0.045411,0.003512,-0.168616,0.045830,...,-0.341398,0.012811,-0.217315,0.080085,-0.102563,-0.224641,0.600132,0.045623,-0.240718,0.153459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
v127,0.002461,-0.018762,0.011387,0.299857,-0.224641,0.169701,0.247932,-0.015149,0.123628,0.002826,...,0.017564,0.094302,-0.162525,-0.050666,-0.079944,1.000000,-0.161000,-0.001564,-0.244848,0.054744
v128,0.001904,-0.010806,0.259107,0.487635,0.600132,0.240302,0.088061,0.019486,-0.268033,0.054684,...,-0.372598,-0.043031,-0.256838,0.279839,-0.154822,-0.161000,1.000000,0.059893,-0.287794,0.243623
v129,-0.004063,-0.015788,0.074142,0.109600,0.045623,0.050278,0.041350,-0.019332,-0.035212,0.505573,...,-0.090282,0.005121,-0.101049,-0.009529,-0.024423,-0.001564,0.059893,1.000000,-0.082056,0.045038
v130,0.010092,0.253119,-0.576703,-0.702047,-0.240718,-0.304129,-0.625145,0.323090,0.084667,-0.072778,...,0.809328,-0.190182,0.727563,0.227784,0.080568,-0.244848,-0.287794,-0.082056,1.000000,-0.264040


In [53]:
# Pairwise correlation matrix of the columns in x_new.
correlation_matrix = x_new.corr()
# Starts empty; the loop in the next cell adds column names to it.
correlated_features = set()

In [54]:
# Mark every column that has an absolute correlation above 0.8 with at least
# one column positioned before it. Only the strict lower triangle (j < i) is
# inspected, so the first column of each correlated pair is never added.
strong_below_diagonal = np.tril(correlation_matrix.abs().to_numpy() > 0.8, k=-1)
correlated_features.update(
    correlation_matrix.columns[strong_below_diagonal.any(axis=1)]
)

In [55]:
len(correlated_features)

55

In [56]:
df.shape

(20000, 133)

In [57]:
# Drop the columns collected in `correlated_features` from the Paribas frame.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError for
# labels that were already removed. sorted() gives a deterministic label list.
df = df.drop(columns=sorted(correlated_features), errors='ignore')

In [58]:
df.shape

(20000, 78)

In [59]:
correlated_features

{'v100',
 'v101',
 'v103',
 'v104',
 'v105',
 'v106',
 'v108',
 'v109',
 'v111',
 'v114',
 'v115',
 'v116',
 'v118',
 'v12',
 'v121',
 'v122',
 'v123',
 'v124',
 'v126',
 'v128',
 'v130',
 'v21',
 'v25',
 'v32',
 'v37',
 'v40',
 'v41',
 'v43',
 'v44',
 'v46',
 'v48',
 'v49',
 'v53',
 'v54',
 'v55',
 'v60',
 'v63',
 'v64',
 'v65',
 'v67',
 'v68',
 'v73',
 'v76',
 'v77',
 'v78',
 'v81',
 'v83',
 'v84',
 'v86',
 'v87',
 'v89',
 'v93',
 'v95',
 'v96',
 'v98'}

##### χ² Method

In [61]:
# Dataset: wisc_bc_data.csv
# Location: https://mitu.co.in/dataset

In [63]:
# Load the whole CSV (no nrows limit here); `df` is rebound again.
df = pd.read_csv('datasets/wisc_bc_data.csv')

In [64]:
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [65]:
df.shape

(569, 32)

In [66]:
# y is the 'diagnosis' label column; x is df without the 'id' and 'diagnosis' columns.
y = df['diagnosis']
x = df.drop(columns=['id', 'diagnosis'])

In [67]:
x.shape

(569, 30)

In [68]:
from sklearn.feature_selection import SelectKBest, chi2

In [69]:
# Selector configured with k=5 and chi2 as the scoring function.
skf = SelectKBest(k=5, score_func=chi2)

In [70]:
# Fit the chi2-based selector on the features x and the labels y.
skf.fit(x, y)

In [71]:
skf.get_support()

array([False, False,  True,  True, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False, False])

In [72]:
# Keep only the columns marked True in the fitted selector's support mask,
# then display the reduced frame.
x_new = x.loc[:, skf.get_support()]
x_new

Unnamed: 0,perimeter_mean,area_mean,area_se,perimeter_worst,area_worst
0,122.80,1001.0,153.40,184.60,2019.0
1,132.90,1326.0,74.08,158.80,1956.0
2,130.00,1203.0,94.03,152.50,1709.0
3,77.58,386.1,27.23,98.87,567.7
4,135.10,1297.0,94.44,152.20,1575.0
...,...,...,...,...,...
564,142.00,1479.0,158.70,166.10,2027.0
565,131.20,1261.0,99.04,155.00,1731.0
566,108.30,858.1,48.55,126.70,1124.0
567,140.10,1265.0,86.22,184.60,1821.0
