# Feature selection with VIF: Variance Inflation Factor

In [1]:
import pandas as pd

In [2]:
# read the data
df = pd.read_csv('../../subject/data04/Train_knight.csv')
df.sample(5)

Unnamed: 0,Sensitivity,Hability,Strength,Power,Agility,Dexterity,Awareness,Prescience,Reactivity,Midi-chlorien,...,Evade,Stims,Sprint,Combo,Delay,Attunement,Empowered,Burst,Grasping,knight
215,19.1,26.29,129.1,1132.0,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,...,32.72,141.3,1298.0,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203,Jedi
271,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,...,22.0,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949,Sith
316,14.95,17.57,96.85,678.1,0.1167,0.1305,0.1539,0.08624,0.1957,0.06216,...,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147,Jedi
79,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,...,27.37,127.1,1095.0,0.1878,0.448,0.4704,0.2027,0.3585,0.1065,Jedi
153,19.73,19.82,130.7,1206.0,0.1062,0.1849,0.2417,0.0974,0.1733,0.06697,...,25.59,159.8,1933.0,0.171,0.5955,0.8489,0.2507,0.2749,0.1297,Jedi


In [3]:
# change 'knight' column to numeric values
df['knight'] = df['knight'].map({'Jedi': 1, 'Sith': 0})
df['knight']

0      0
1      1
2      0
3      0
4      0
      ..
393    0
394    0
395    0
396    0
397    1
Name: knight, Length: 398, dtype: int64

## VIF: Variance Inflation Factor

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
X = df.drop(columns=['knight'])
y = df['knight']

vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif_data['Tolerance'] = 1 / vif_data['VIF']
print(vif_data)

          feature           VIF  Tolerance
0     Sensitivity  76144.952570   0.000013
1        Hability    248.082778   0.004031
2        Strength  72772.222982   0.000014
3           Power   1509.801873   0.000662
4         Agility    392.206046   0.002550
5       Dexterity    232.375929   0.004303
6       Awareness    158.419461   0.006312
7      Prescience    145.898720   0.006854
8      Reactivity    185.452563   0.005392
9   Midi-chlorien    696.027648   0.001437
10          Slash    243.628949   0.004105
11           Push     23.629085   0.042321
12           Pull    212.770539   0.004700
13     Lightsaber     78.963720   0.012664
14       Survival     24.417911   0.040954
15        Repulse     48.538369   0.020602
16     Friendship     38.347628   0.026077
17       Blocking     68.174728   0.014668
18     Deflection     38.578276   0.025921
19           Mass     30.947245   0.032313
20       Recovery  10145.493187   0.000099
21          Evade    338.431648   0.002955
22         

## StandardScaler

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
scaler = StandardScaler()
X_t = scaler.fit_transform(X)
vif_data_std = pd.DataFrame()
vif_data_std['feature'] = X.columns
vif_data_std['VIF'] = [variance_inflation_factor(X_t, i) for i in range(X_t.shape[1])]
vif_data_std['Tolerance'] = 1 / vif_data_std['VIF']
print(vif_data_std)

          feature          VIF  Tolerance
0     Sensitivity  4367.233204   0.000229
1        Hability    11.188825   0.089375
2        Strength  4489.653450   0.000223
3           Power   411.965245   0.002427
4         Agility     7.645518   0.130796
5       Dexterity    54.126657   0.018475
6       Awareness    69.852627   0.014316
7      Prescience    54.712644   0.018277
8      Reactivity     4.074260   0.245443
9   Midi-chlorien    14.650093   0.068259
10          Slash    80.754427   0.012383
11           Push     4.065017   0.246001
12           Pull    73.911907   0.013530
13     Lightsaber    47.550260   0.021030
14       Survival     3.630928   0.275412
15        Repulse    15.259597   0.065533
16     Friendship    19.040267   0.052520
17       Blocking    14.581072   0.068582
18     Deflection     4.550873   0.219738
19           Mass    10.477189   0.095445
20       Recovery   782.388684   0.001278
21          Evade    17.630598   0.056720
22          Stims   346.022133   0

In [8]:
vif_data_std.sort_values(by='VIF', ascending=False)

Unnamed: 0,feature,VIF,Tolerance
2,Strength,4489.65345,0.000223
0,Sensitivity,4367.233204,0.000229
20,Recovery,782.388684,0.001278
3,Power,411.965245,0.002427
23,Sprint,359.129874,0.002785
22,Stims,346.022133,0.00289
10,Slash,80.754427,0.012383
12,Pull,73.911907,0.01353
6,Awareness,69.852627,0.014316
7,Prescience,54.712644,0.018277


## Keep only the features so that the VIF goes under 5, and display the features

In [9]:
vif_data_std[vif_data_std['VIF'] < 5]

Unnamed: 0,feature,VIF,Tolerance
8,Reactivity,4.07426,0.245443
11,Push,4.065017,0.246001
14,Survival,3.630928,0.275412
18,Deflection,4.550873,0.219738
