# Loading Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Reading the data

In [2]:
turnover = pd.read_csv('turnover.csv')
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
sales = pd.get_dummies(turnover['sales'], drop_first = True, dtype = int)
sales.head()

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [4]:
salary = pd.get_dummies(turnover['salary'], drop_first = True, dtype = int)
salary.head()

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [5]:
turnover = turnover.drop(columns = ['sales', 'salary'], axis = 1)
turnover = pd.concat([turnover, sales, salary], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


# Defining X & Y 

In [6]:
X = turnover.drop(columns=['left'], axis=1)
Y = turnover['left']

skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

# Random Forest

In [7]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

RF_numb_features = RFECV(estimator=RandomForestClassifier(n_estimators=100, 
                                                          max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md1 = Pipeline([('transformations', all_transformations),
                ('RF-numb-features', RF_numb_features)])

md1

In [8]:
%%time
md1_num_features = md1.fit(X, Y)
print(f"The suggested number of features of RF is {sum(md1_num_features['RF-numb-features'].support_)}")

The suggested number of features of RF is 10
CPU times: user 20.6 s, sys: 109 ms, total: 20.7 s
Wall time: 20.7 s


In [10]:
RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, 
                                                   max_depth=3),
                 n_features_to_select=11)

X_trans = all_transformations.fit_transform(X)
X_trans.head()

Unnamed: 0,step-1__power-tran__time_spend_company,step-2__0-1-tran__number_project,step-2__0-1-tran__average_montly_hours,step-3__interaction_1__satisfaction_level,step-3__interaction_1__time_spend_company,step-3__interaction_1__satisfaction_level time_spend_company,step-4__interaction_2__last_evaluation,step-4__interaction_2__promotion_last_5years,step-4__interaction_2__last_evaluation promotion_last_5years,step-5__remainder__Work_accident,...,step-5__remainder__accounting,step-5__remainder__hr,step-5__remainder__management,step-5__remainder__marketing,step-5__remainder__product_mng,step-5__remainder__sales,step-5__remainder__support,step-5__remainder__technical,step-5__remainder__low,step-5__remainder__medium
0,-0.143951,0.0,0.285047,0.38,3.0,1.14,0.53,0.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
1,1.575753,0.6,0.775701,0.8,6.0,4.8,0.86,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.662038,1.0,0.82243,0.11,4.0,0.44,0.88,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,1
3,1.196143,0.6,0.593458,0.72,5.0,3.6,0.87,0.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
4,-0.143951,0.0,0.294393,0.37,3.0,1.11,0.52,0.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0


In [11]:
RF_to_select = RF_selector.fit(X_trans, Y)
X_to_select = X_trans.columns[RF_to_select.support_].tolist()
X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-4__interaction_2__last_evaluation promotion_last_5years',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [13]:
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

RF_tuning = GridSearchCV(estimator=RandomForestClassifier(),
                        param_grid=RF_param_grid,
                        cv=skf,
                        scoring='roc_auc',
                        n_jobs=-1).fit(X_trans[X_to_select], Y)

# Reporting the best model
print(RF_tuning.best_estimator_)

# Reporting the best score
print(RF_tuning.best_score_)

RandomForestClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=10,
                       n_estimators=300)
0.9891364971999088


# Extra Trees

In [15]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

ET_numb_features = RFECV(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                        max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md2 = Pipeline([('transformations', all_transformations),
                ('ET-numb-features', ET_numb_features)])

md2

In [16]:
%%time
md2_num_features = md2.fit(X, Y)
print(f"The suggested number of features of ET is {sum(md2_num_features['ET-numb-features'].support_)}")

The suggested number of features of ET is 16
CPU times: user 12 s, sys: 98.7 ms, total: 12.1 s
Wall time: 12.2 s


In [17]:
ET_selector = RFE(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                 max_depth=3),
                 n_features_to_select=16)

ET_to_select = ET_selector.fit(X_trans, Y)
X_to_select = X_trans.columns[RF_to_select.support_].tolist()
X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-4__interaction_2__last_evaluation promotion_last_5years',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [18]:
ET_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

ET_tuning = GridSearchCV(estimator=ExtraTreesClassifier(),
                        param_grid=ET_param_grid,
                        cv=skf,
                        scoring='roc_auc',
                        n_jobs=-1).fit(X_trans[X_to_select], Y)

# Reporting the best model
print(ET_tuning.best_estimator_)

# Reporting the best score
print(ET_tuning.best_score_)

ExtraTreesClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=10,
                     n_estimators=500)
0.9822086359065777


In [17]:
# Based on my results, I would use the Random Forest model to predict left.