In [5]:
import pandas as pd 
import numpy as np 

df = pd.read_csv("processed_data/numerical.csv", index_col="EmployeeID")
X = df[df.columns.difference(['Attrition'])]
y = df['Attrition']

# 7.1 Low variance elimination
Delete the features with the low variance. LOW VARIANCE == LESS INFORMATION ... May not be true in some case but ok. 

Pros:
- Easy to implement.
- Fast computation. 
- We don't need a predicted variable so it can be used with the supervised learning.

Cons: 
- Estimating the variance value may be hard. 
- Hard to measure the effect of the variance to the final results.
- Works welll only if the data has the same scale

In [19]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
variance = VarianceThreshold(threshold=.9)

X_scaled = scaler.fit_transform(X)
print(X_scaled.shape)
X_selected = variance.fit_transform(X_scaled)

(4382, 14)


ValueError: No feature in X meets the variance threshold 0.90000

In [25]:
pd.DataFrame(X_scaled,columns=list(X.columns)).var()

Age                        0.047330
Attrition_num              0.135032
DistanceFromHome           0.083798
Education                  0.065629
JobLevel                   0.076468
MonthlyIncome              0.061627
NumCompaniesWorked         0.077027
PercentSalaryHike          0.068457
StockOptionLevel           0.080731
TotalWorkingYears          0.037886
TrainingTimesLastYear      0.046182
YearsAtCompany             0.023481
YearsSinceLastPromotion    0.046225
YearsWithCurrManager       0.044092
dtype: float64

# 7.2 Univariate feature selection
Selecting top k features based on the univariate statistical tests.

Pros:
- Easy to implement.
- Fast computation. 

Cons: 
- Works with the target variable. 
- Hard to measure the effect of the variance to the final results.
- Works welll only if the data has the same scale.
- How to choose the right test ? (need some mathematical knowledge)
- How to choose the k ? 

Lets see example with the chi squared test 

In [31]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

scaler = MinMaxScaler()
feature_selection = SelectKBest(chi2, k=4)

X_scaled = scaler.fit_transform(X)
print(X_scaled.shape)
X_selected = feature_selection.fit_transform(X_scaled, y)
X_selected.shape

(4382, 14)


(4382, 4)

# Recursive feature elimination (RFE)
This approach uses an external estimator to measure the feature importance:
- Starts with all features and looks at the prediction efficiency and the feature importances
- Eliminates the feature with lowest importance
- Repeats recursively until the result is improving 

Pros:
- Very effective. 
- Good results. 

Cons:
- Can be very slow (triaining the classifier for each run)
- Can be run only for the predictors which provide the feature importances (not that many as we want)

In [40]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

print(X.shape)
scaler = MinMaxScaler()
feature_selection = RFE(LogisticRegression())

X_scaled = scaler.fit_transform(X)
feature_selection = feature_selection.fit(X_scaled, y)

print(feature_selection.support_)
print(feature_selection.ranking_)

X_selected = feature_selection.transform(X_scaled)
X_selected.shape

(4382, 14)
[ True  True False False False False  True False False  True  True False
  True  True]
[1 1 8 6 5 3 1 4 7 1 1 2 1 1]


(4382, 7)