# Feature Selection

Feature engineering is one of the most important task in any machine learning project. Feature selection is one of the subtask that is carried out to select the features which best represent the target variable. There are various methods for feature selection that will be using the project including feature selection using correlation, select k best using chi square , select from models such as logistic regression, random forest etc.
Feature selection can be:
1. Filter-based
2. Wrapper-based
3. Embedded methods

In [1]:
# Load the packages
import warnings
warnings.filterwarnings("ignore")
import json
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, RFE, SequentialFeatureSelector, SelectFromModel
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the data
df = pd.read_csv('./../../../data/train/train.csv')

Let us assume that out of 53 colums that are available we want around 30 columns.

In [3]:
# Declare the number of features required
n_feat = 30

In [4]:
# Separate out the data into features and target variable
y = df['Attrition']
X = df.drop('Attrition', axis=1)

# Filter-based methods

### High Correlation Filter

#### Correlation

Starting with our first method first-based method which is dependent on correlation, we eliminate columns whose correlation coefficients is greater than 0.75.

In [5]:
# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.75
to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]

print(f"Columns to be dropped: {to_drop}")

Columns to be dropped: ['TotalWorkingYears', 'YearsInCurrentRole', 'YearsWithCurrManager', 'JobLevel', 'PerformanceRating', 'Department Sales', 'Gender Male', 'JobRole Human Resources']


In [6]:
# Create a resulting dataframe for the output
columns = df.columns
result = [True if column not in to_drop else False for column in columns]
correlation = pd.DataFrame(columns=['Correlation'], index=columns)
correlation['Correlation'] = result

### Univariate Selection Methods

#### SelectKBest using f_classif

In [7]:
# Declare the transformer
f_class_selector = SelectKBest(k=n_feat)
f_class_selector.fit(X, y)

SelectKBest(k=30)

In [8]:
# Get the features from the support
f_class_support = f_class_selector.get_support()
f_class_features = X.loc[:, f_class_support].columns.tolist()
print(f"Feature select with f classif: {f_class_features}")

Feature select with f classif: ['Age', 'DailyRate', 'DistanceFromHome', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'OverTime', 'StockOptionLevel', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'EducationField Life Sciences', 'EducationField Medical', 'EducationField Other', 'Gender Female', 'JobRole Healthcare Representative', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Sales Representative', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']


In [9]:
# Store the result in dataframe
columns = df.columns
result = [True if column in f_class_features else False for column in columns]
f_class_df = pd.DataFrame(columns=['F classif'], index=columns)
f_class_df['F classif'] = result

#### SelectKBest using chi square

In [10]:
# Declare the transformer
chi2 = SelectKBest(chi2, k=n_feat)
chi2.fit(X,y)

SelectKBest(k=30, score_func=<function chi2 at 0x1238bca60>)

In [11]:
# Get the features from the support
chi2_support = chi2.get_support()
chi2_features = X.loc[:, chi2_support].columns.tolist()
print(f"Features selected with chi2: {chi2_features}")

Features selected with chi2: ['Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'OverTime', 'StockOptionLevel', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'EducationField Life Sciences', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Sales Representative', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']


In [12]:
# Store the result in dataframe
columns = df.columns
result = [True if column in chi2_features else False for column in columns]
chi2_df = pd.DataFrame(columns=['Chi2'], index=columns)
chi2_df['Chi2'] = result

#### SelectKBest using mutual information

In [13]:
# Declare the transformer
mutual_info = SelectKBest(mutual_info_classif, k=n_feat)
mutual_info.fit(X, y)

SelectKBest(k=30, score_func=<function mutual_info_classif at 0x123f31700>)

In [14]:
# Get the features from the support
mutual_info_support = mutual_info.get_support()
mutual_info_features = X.loc[:, mutual_info_support].columns.tolist()

In [15]:
# Store the result in dataframe
columns = df.columns
result = [True if column in mutual_info_features else False for column in columns]
mi_df = pd.DataFrame(columns=['Mutual Informtion'], index=columns)
mi_df['Mutual Informtion'] = result

## Wrapper Methods

### Recursive Feature Elimination

In [16]:
# Declare the transformer
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=n_feat, step=3)
rfe_selector.fit(X, y)

RFE(estimator=LogisticRegression(), n_features_to_select=30, step=3)

In [17]:
# Get the features from support
rfe_support = rfe_selector.get_support()
rfe_features = X.loc[:, rfe_support].columns.tolist()
print(f"Feature selected with RFE for Logistic Regression: {rfe_features}")

Feature selected with RFE for Logistic Regression: ['Age', 'NumCompaniesWorked', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Frequently', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'Department Sales', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'Gender Male', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']


In [18]:
columns = df.columns
result = [True if column in rfe_features else False for column in columns]
rfe_lr_df = pd.DataFrame(columns=['RFE LR'], index=columns)
rfe_lr_df['RFE LR'] = result

### Sequential Feature Selector

In [19]:
# Declare the transformer
sfs_selector = SequentialFeatureSelector(estimator=LogisticRegression(max_iter=1000), n_features_to_select=n_feat, direction='backward', scoring='f1', cv=5, n_jobs=-1)
sfs_selector.fit(X, y)

SequentialFeatureSelector(direction='backward',
                          estimator=LogisticRegression(max_iter=1000),
                          n_features_to_select=30, n_jobs=-1, scoring='f1')

In [20]:
# Get the features from the ssupport
sfs_support = sfs_selector.get_support()
sfs_features = X.loc[:, sfs_support].columns.tolist()
print(f"Features selected with Sequential Feature Selector: {sfs_features}")

Features selected with Sequential Feature Selector: ['Age', 'DistanceFromHome', 'MonthlyRate', 'NumCompaniesWorked', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Rarely', 'Department Research & Development', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'JobRole Sales Executive', 'MaritalStatus Divorced', 'MaritalStatus Married']


In [21]:
columns = df.columns
result = [True if column in sfs_features else False for column in columns]
sfs_df = pd.DataFrame(columns=['SFS LR'], index=columns)
sfs_df['SFS LR'] = result

## Embedded Methods

### LASSO Regression

In [22]:
# Declare the transformer
lasso_selector = SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1', solver='saga'), max_features=n_feat)
lasso_selector.fit(X, y)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1', solver='saga'),
                max_features=30)

In [23]:
# Get the features from the support
lasso_support = lasso_selector.get_support()
lasso_features = X.loc[:, lasso_support].columns.tolist()
print(f"Features selected from Lasso Model: {lasso_features}")

Features selected from Lasso Model: ['Age', 'DistanceFromHome', 'NumCompaniesWorked', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Frequently', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'Gender Male', 'JobRole Healthcare Representative', 'JobRole Laboratory Technician', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'JobRole Sales Executive', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']


In [24]:
columns = df.columns
result = [True if column in lasso_features else False for column in columns]
lasso_df = pd.DataFrame(columns=['Lasso'], index=columns)
lasso_df['Lasso'] = result

No features are selected by Lasso regression.

### Random Forest

In [25]:
# Declare the transformer
rf_selector = SelectFromModel(estimator=RandomForestClassifier(), max_features=n_feat)
rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(), max_features=30)

In [26]:
# Get the features
rf_support = rfe_selector.get_support()
rf_features = X.loc[:, rfe_support].columns.tolist()
print(f"Features selected from Random Forest Classifier: {rfe_features}")

Features selected from Random Forest Classifier: ['Age', 'NumCompaniesWorked', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'OverTime', 'BusinessTravel Non-Travel', 'BusinessTravel Travel_Frequently', 'BusinessTravel Travel_Rarely', 'Department Human Resources', 'Department Research & Development', 'Department Sales', 'EducationField Human Resources', 'EducationField Life Sciences', 'EducationField Marketing', 'EducationField Medical', 'EducationField Other', 'EducationField Technical Degree', 'Gender Female', 'Gender Male', 'JobRole Healthcare Representative', 'JobRole Human Resources', 'JobRole Manager', 'JobRole Manufacturing Director', 'JobRole Research Director', 'JobRole Research Scientist', 'MaritalStatus Divorced', 'MaritalStatus Married', 'MaritalStatus Single']


In [27]:
columns = df.columns
result = [True if column in rfe_features else False for column in columns]
rf_df = pd.DataFrame(columns=['RF'], index=columns)
rf_df['RF'] = result

## Combined Result

In [28]:
# Combine all the result dataframes
res_df = pd.concat([correlation, f_class_df, chi2_df, mi_df, rfe_lr_df, sfs_df, lasso_df, rf_df], axis=1)

In [29]:
# Calculate the total votes for each features based on the different methods
res_df['Total'] = np.sum(res_df, axis=1)

In [30]:
# Sort the resultant dataframe based on votes
res_df.sort_values('Total', inplace=True, ascending=False)

In [31]:
res_df

Unnamed: 0,Correlation,F classif,Chi2,Mutual Informtion,RFE LR,SFS LR,Lasso,RF,Total
Age,True,True,True,True,True,True,True,True,8
MaritalStatus Married,True,True,True,True,True,True,True,True,8
MaritalStatus Divorced,True,True,True,True,True,True,True,True,8
BusinessTravel Non-Travel,True,True,True,True,True,True,True,True,8
EducationField Medical,True,True,True,True,True,True,True,True,8
OverTime,True,True,True,True,True,True,True,True,8
JobRole Manufacturing Director,True,True,True,True,True,True,True,True,8
YearsWithCurrManager,False,True,True,True,True,True,True,True,7
Department Research & Development,True,True,True,True,True,True,False,True,7
BusinessTravel Travel_Rarely,True,True,True,False,True,True,True,True,7


In [32]:
# Store the result
res_df.to_csv('./../../../data/feature_ranking.csv')