# 1. Introduction

The document will walk you through the codes that I used to select variables using python. We use different techniques to select variables and then finally each technique votes whether they have selected the variable. Finally, the vote is counted and the variables with higher votes are used in the modeling process.

# 2. Load Dataset

In [2]:
#import packages
import os
import pandas as pd
import numpy as np

Data = pd.read_csv('count.csv')
df = Data.drop('id', axis = 1)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 33 columns):
yC     150 non-null int64
x1     150 non-null float64
x2     150 non-null float64
x3     150 non-null int64
x4     150 non-null int64
x5     150 non-null int64
x6     150 non-null int64
x7     150 non-null int64
x8     150 non-null float64
x9     150 non-null int64
x10    150 non-null float64
x11    150 non-null float64
x12    150 non-null float64
x13    150 non-null int64
x14    150 non-null int64
x15    150 non-null int64
x16    150 non-null int64
x17    150 non-null int64
x18    150 non-null int64
x19    150 non-null int64
x20    150 non-null int64
x21    150 non-null float64
x22    150 non-null float64
x23    150 non-null float64
x24    150 non-null float64
x25    150 non-null int64
x26    150 non-null int64
x27    150 non-null float64
x28    150 non-null int64
x29    150 non-null int64
x30    150 non-null int64
x31    150 non-null float64
x32    150 non-null float64
dtypes: flo

In [4]:
df.head()

Unnamed: 0,yC,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32
0,1,49.856,118.8,1,0,0,0,0,16.32096,0,...,0.9709,0.0003,6,6,24.7138,0,0,7,0.056546,0.11286
1,0,71.836,34.8,0,0,0,0,1,15.00535,44,...,0.42,0.0,8,4,38.0658,0,0,7,0.65526,0.886908
2,0,66.725,41.8,0,1,0,0,0,17.07023,27,...,0.965,0.0,6,6,48.3807,0,1,4,0.067855,0.133944
3,0,41.696,132.0,0,0,1,0,0,16.3146,0,...,0.0091,0.0001,6,4,59.841,0,0,7,0.13323,0.260102
4,2,71.351,24.2,1,0,0,0,0,11.1079,31,...,0.0051,0.0,5,7,175.925,0,0,3,0.318584,0.627729


In [5]:
df['target'] = df['yC']
df = df.drop('yC',axis=1) #drop the y column

In [6]:
df['target'].value_counts()

0    50
1    43
2    35
3    12
4     6
5     2
7     1
6     1
Name: target, dtype: int64

I am going to use a quick way to encode the class variables (Label Encoder). Since some of the variables in the train dataframe are categories

In [16]:
features = df[df.columns.difference(['target'])]
labels = df['target']

# 3. Variable Selection

## 3.1 Weight of Evidence and Information Value

In [19]:
# import packages
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv) 

In [20]:
final_iv, IV = data_vars(df,df.target)



In [21]:
IV = IV.rename(columns={'VAR_NAME':'index'})

In [22]:
IV.sort_values(['IV'],ascending=0)

Unnamed: 0,index,IV
22,x3,2.060888
18,x26,0.635131
15,x23,0.382837
17,x25,0.32832
31,x9,0.312026
4,x13,0.268411
6,x15,0.247984
1,x10,0.204686
9,x18,0.170405
13,x21,0.121221


## 3.2 Variable Importance

In [23]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

clf.fit(features,labels)

preds = clf.predict(features)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(preds,labels)
print(accuracy)

0.986666666667


In [24]:
from pandas import DataFrame
VI = DataFrame(clf.feature_importances_, columns = ["RF"], index=features.columns)

In [25]:
VI = VI.reset_index()

In [26]:
VI.sort_values(['RF'],ascending=0)

Unnamed: 0,index,RF
15,x23,0.081563
19,x27,0.080043
0,x1,0.074873
1,x10,0.07016
13,x21,0.065589
30,x8,0.063456
11,x2,0.063275
3,x12,0.058019
2,x11,0.047907
24,x31,0.04602


## 3.3 Recursive Feature Elimination

In [27]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
rfe = RFE(model, 20)
fit = rfe.fit(features, labels)

In [28]:
from pandas import DataFrame
Selected = DataFrame(rfe.support_, columns = ["RFE"], index=features.columns)
Selected = Selected.reset_index()

In [29]:
Selected[Selected['RFE'] == True]

Unnamed: 0,index,RFE
2,x11,True
5,x14,True
6,x15,True
7,x16,True
8,x17,True
9,x18,True
10,x19,True
12,x20,True
14,x22,True
16,x24,True


## 3.4 Variable Importance - Extratrees Classifier

In [30]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(features, labels)

print(model.feature_importances_)

[ 0.04963317  0.04613408  0.04043371  0.04255882  0.01781245  0.03401477
  0.01749815  0.01430951  0.01533713  0.01465102  0.01548553  0.04149701
  0.00098566  0.04714844  0.03910468  0.04529568  0.04099081  0.05072196
  0.02488478  0.04902632  0.02221463  0.00311884  0.02372692  0.05027219
  0.0486535   0.0579333   0.01601703  0.00847218  0.01274786  0.01545185
  0.04301143  0.05085657]


In [31]:
from pandas import DataFrame
FI = DataFrame(model.feature_importances_, columns = ["Extratrees"], index=features.columns)

In [32]:
FI = FI.reset_index()

In [33]:
FI.sort_values(['Extratrees'],ascending=0)

Unnamed: 0,index,Extratrees
25,x32,0.057933
31,x9,0.050857
17,x25,0.050722
23,x30,0.050272
0,x1,0.049633
19,x27,0.049026
24,x31,0.048653
13,x21,0.047148
1,x10,0.046134
15,x23,0.045296


## 3.5 Chi Square

In [34]:
df1 = df.copy()

from sklearn import preprocessing
from collections import defaultdict
d = defaultdict(preprocessing.LabelEncoder)
# Encoding the categorical variable
fit = df1.apply(lambda x: d[x.name].fit_transform(x))

#Convert the categorical columns based on encoding
for i in list(d.keys()):
    df1[i] = d[i].transform(df1[i])

In [35]:
features1 = df1[df1.columns.difference(['target'])]
labels1 = df1['target']

In [36]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

model = SelectKBest(score_func=chi2, k=5)
fit = model.fit(features1, labels1)

In [37]:
import numpy
numpy.set_printoptions(suppress=True)
print(fit.scores_)

[ 239.99034132  149.29474593  184.18508046  193.15828987    6.41167537
    6.28704319    6.25479584    8.04507198    4.24846257    9.20664452
   10.9233804   203.79700181    4.17303433  156.39647178  166.1535392
  346.70282605   58.58958543   13.46124962    8.23402661  158.72329238
   11.37284053    3.75747508   19.84400148   11.6267037    58.55628353
   72.42258378    8.12639686    2.90891473   14.95773348    9.4358804
  240.37358826   29.50485096]


In [38]:
from pandas import DataFrame
pd.options.display.float_format = '{:.2f}'.format
chi_sq = DataFrame(fit.scores_, columns = ["Chi_Square"], index=features1.columns)

In [39]:
chi_sq = chi_sq.reset_index()

In [40]:
chi_sq.sort_values('Chi_Square',ascending=0)

Unnamed: 0,index,Chi_Square
15,x23,346.7
30,x8,240.37
0,x1,239.99
11,x2,203.8
3,x12,193.16
2,x11,184.19
14,x22,166.15
19,x27,158.72
13,x21,156.4
1,x10,149.29


## 3.6 L1 feature selection

In [41]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [42]:
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(features, labels)
model = SelectFromModel(lsvc,prefit=True)

In [43]:
from pandas import DataFrame
l1 = DataFrame(model.get_support(), columns = ["L1"], index=features.columns)

In [44]:
l1 = l1.reset_index()

In [45]:
l1[l1['L1'] == True]

Unnamed: 0,index,L1
0,x1,True
1,x10,True
3,x12,True
11,x2,True
19,x27,True
31,x9,True


## 3.7 Combine all together

In [46]:
from functools import reduce
dfs = [IV, VI, Selected, FI, chi_sq, l1]
final_results = reduce(lambda left,right: pd.merge(left,right,on='index'), dfs)

In [47]:
final_results.head()

Unnamed: 0,index,IV,RF,RFE,Extratrees,Chi_Square,L1
0,x1,0.07,0.07,False,0.05,239.99,True
1,x10,0.2,0.07,False,0.05,149.29,True
2,x11,0.03,0.05,True,0.04,184.19,False
3,x12,0.0,0.06,False,0.04,193.16,True
4,x13,0.27,0.01,False,0.02,6.41,False


In [48]:
final_results.columns

Index(['index', 'IV', 'RF', 'RFE', 'Extratrees', 'Chi_Square', 'L1'], dtype='object')

## 3.8 Variable Score

In [49]:
columns = ['IV', 'RF', 'Extratrees', 'Chi_Square']

score_table = pd.DataFrame({},[])
score_table['index'] = final_results['index']

for i in columns:
    score_table[i] = final_results['index'].isin(list(final_results.nlargest(5,i)['index'])).astype(int)
    
score_table['RFE'] = final_results['RFE'].astype(int)
score_table['L1'] = final_results['L1'].astype(int)

In [50]:
score_table['final_score'] = score_table.sum(axis=1)

In [51]:
score_table.sort_values('final_score',ascending=0)

Unnamed: 0,index,IV,RF,Extratrees,Chi_Square,RFE,L1,final_score
0,x1,0,1,1,1,0,1,4
15,x23,1,1,0,1,0,0,3
17,x25,1,0,1,0,1,0,3
31,x9,1,0,1,0,0,1,3
18,x26,1,0,0,0,1,0,2
19,x27,0,1,0,0,0,1,2
11,x2,0,0,0,1,0,1,2
25,x32,0,0,1,0,1,0,2
22,x3,1,0,0,0,1,0,2
3,x12,0,0,0,1,0,1,2


## 3.9 Multicollinearity - VIF  (Addon)

In [52]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [53]:
def calculate_vif(features):
    vif = pd.DataFrame()
    vif["Features"] = features.columns
    vif["VIF"] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]    
    return(vif)

In [54]:
vif = calculate_vif(features)
while vif['VIF'][vif['VIF'] > 10].any():
    remove = vif.sort_values('VIF',ascending=0)['Features'][:1]
    features.drop(remove,axis=1,inplace=True)
    vif = calculate_vif(features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [55]:
vif

Unnamed: 0,Features,VIF
0,x10,1.88
1,x12,2.86
2,x13,2.76
3,x14,1.72
4,x15,2.92
5,x16,4.26
6,x17,6.83
7,x18,2.62
8,x19,2.97
9,x2,8.17
