# Loading Libraries

In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Reading the data

In [5]:
# Load the raw employee-turnover data from the working directory and
# preview the first five rows.
turnover = pd.read_csv(
    'turnover.csv',
)
turnover.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [6]:
# Class balance of the target: normalize=True reports fractions, not counts.
turnover.loc[:, 'left'].value_counts(normalize=True)

0    0.761917
1    0.238083
Name: left, dtype: float64

In [7]:
# One-hot encode the `sales` column as 0/1 integer indicators.
# drop_first=True omits the first category level, which then acts as the
# reference level for the remaining indicator columns.
sales = pd.get_dummies(
    data=turnover['sales'],
    dtype=int,
    drop_first=True,
)
sales.head(n=5)

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [8]:
# One-hot encode the `salary` column as 0/1 integer indicators.
# drop_first=True omits the first category level, which then acts as the
# reference level for the remaining indicator columns.
salary = pd.get_dummies(
    data=turnover['salary'],
    dtype=int,
    drop_first=True,
)
salary.head(n=5)

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [9]:
# Replace the two raw categorical columns with their dummy-encoded versions.
# NOTE: this cell reassigns `turnover`, so it is not safe to re-run on its
# own -- reload the data first, because the raw 'salary' column is gone
# after the first execution.
turnover = pd.concat(
    [turnover.drop(columns=['sales', 'salary']), sales, salary],
    axis='columns',
)
turnover.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


# Defining X & Y 

In [10]:
# Target vector: the `left` indicator; predictors: every remaining column.
Y = turnover['left']
X = turnover.drop(columns='left')

# Cross-validation splitter shared by all models below:
# stratified 10-fold, a single repeat, with a fixed seed for the shuffling.
skf = RepeatedStratifiedKFold(n_repeats=1, n_splits=10, random_state=42)

In [11]:
# Sanity check: the predictor matrix must no longer contain the target `left`.
X.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,0,0,0,0,0,1,0,0,1,0


# Random Forest

In [13]:
# Feature engineering, assembled as five parallel ColumnTransformers whose
# outputs are concatenated side by side by a FeatureUnion.

# Power-transform the tenure column.
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

# Rescale the project count and monthly hours to the [0, 1] range.
step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

# With interaction_only=True and no bias term, two input columns yield the
# two original columns plus their pairwise product.
step_3 = ColumnTransformer([('interaction_1',
                             PolynomialFeatures(interaction_only=True, include_bias=False),
                             ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2',
                             PolynomialFeatures(interaction_only=True, include_bias=False),
                             ['last_evaluation', 'promotion_last_5years'])])

# Drop the columns already handled by steps 1-4 and pass every other
# column (e.g. the dummy variables) through unchanged.
step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2),
                                    ('step-3', step_3),
                                    ('step-4', step_4),
                                    ('step-5', step_5)])

# Recursive feature elimination: remove one feature per iteration until 15
# remain, ranked by the importances of a shallow random forest.
# random_state is fixed so that the selected features -- and therefore the
# cross-validated score -- are reproducible from run to run.
RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100,
                                                   max_depth=3,
                                                   random_state=42),
                  n_features_to_select=15,
                  step=1
                 )

# Full model: engineered features -> RFE selection -> random forest.
md1 = Pipeline([('transformations', all_transformations),
                ('RF-selector', RF_selector),
                ('RF', RandomForestClassifier(n_estimators=100,
                                              max_depth=3,
                                              random_state=42))])

md1

In [14]:
%%time
RF_cv = cross_val_score(md1, X, Y, cv=skf, scoring='roc_auc', n_jobs=-1)

print(f"The average 5-folds (repeated 5 times) ROC-AUC score of the RF model is {RF_cv.mean()}")

The average 5-folds (repeated 5 times) ROC-AUC score of the RF model is 0.9685395021462962
CPU times: user 57.9 ms, sys: 127 ms, total: 185 ms
Wall time: 8.58 s


# Gradient Boosting

In [15]:
# Recursive feature elimination: remove one feature per iteration until 15
# remain, ranked by the importances of a gradient-boosting model.
# random_state is fixed so that feature selection and the cross-validated
# score are reproducible from run to run.
GB_selector = RFE(estimator=GradientBoostingClassifier(n_estimators=100,
                                                       max_depth=3,
                                                       learning_rate=0.1,
                                                       random_state=42),
                  n_features_to_select=15,
                  step=1
                 )


# Full model: the same engineered features as the random-forest pipeline
# (`all_transformations`) -> RFE selection -> gradient boosting.
md2 = Pipeline([('transformations', all_transformations),
                ('GB-selector', GB_selector),
                ('GB', GradientBoostingClassifier(n_estimators=100,
                                                  max_depth=3,
                                                  learning_rate=0.1,
                                                  random_state=42))])

md2

In [16]:
%%time
GB_cv = cross_val_score(md2, X, Y, cv=skf, scoring='roc_auc', n_jobs=-1)

print(f"The average 5-folds (repeated 5 times) ROC-AUC score of the GB model is {GB_cv.mean()}")

The average 5-folds (repeated 5 times) ROC-AUC score of the GB model is 0.9885791582997943
CPU times: user 45.5 ms, sys: 26.6 ms, total: 72 ms
Wall time: 21.2 s


In [17]:
# Based on the cross-validated ROC-AUC scores above (Gradient Boosting ~0.989 vs. Random Forest ~0.969),
# I would use the Gradient Boosting model to predict `left`.