# Loading Libraries

In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

# Reading the data

In [2]:
turnover = pd.read_csv('turnover.csv')
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
turnover['left'].value_counts(normalize = True)

0    0.761917
1    0.238083
Name: left, dtype: float64

In [4]:
sales = pd.get_dummies(turnover['sales'], drop_first = True, dtype = int)
sales.head()

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [5]:
salary = pd.get_dummies(turnover['salary'], drop_first = True, dtype = int)
salary.head()

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [6]:
turnover = turnover.drop(columns = ['sales', 'salary'], axis = 1)
turnover = pd.concat([turnover, sales, salary], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


# Defining X & Y 

In [7]:
X = turnover.drop(columns = ['left'], axis = 1)
Y = turnover['left']

skf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 5, random_state = 42)

# Random Forest

In [9]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])], 
                           remainder = 'passthrough')

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])], 
                           remainder = 'passthrough')

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True), ['satisfaction_level', 'time_spend_company'])],
                           remainder = 'passthrough')

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True), ['last_evaluation', 'promotion_last_5years'])],
                           remainder = 'passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4)])

md1 = Pipeline([('transformations', all_transformations),
                ('RF', RandomForestClassifier(n_estimators = 100, 
                                              max_depth = 3))])

md1

In [10]:
%%time
RF_cv = cross_val_score(md1, X, Y, cv = skf, scoring = 'roc_auc', n_jobs = -1)

print(f"The average 5-folds (repeated 5 times) ROC-AUC score of the RF model is {RF_cv.mean()}")

The average 5-folds (repeated 5 times) ROC-AUC score of the RF model is 0.9745293035361954
CPU times: user 106 ms, sys: 142 ms, total: 248 ms
Wall time: 6.09 s


# MLPClassifier

In [12]:
md2 = Pipeline([('transformations', all_transformations),
                ('MLP', MLPClassifier(hidden_layer_sizes = (8, 16, 16, 8),
                                      activation = 'relu',
                                      learning_rate = 'constant', 
                                      max_iter = 1000))])

md2

In [13]:
%%time
MLP_cv = cross_val_score(md2, X, Y, cv = skf, scoring = 'roc_auc', n_jobs = -1)

print(f"The average 5-folds (repeated 5 times) ROC-AUC score of the MLP model is {MLP_cv.mean()}")

The average 5-folds (repeated 5 times) ROC-AUC score of the MLP model is 0.9494851272833184
CPU times: user 106 ms, sys: 67.5 ms, total: 173 ms
Wall time: 23 s


# VotingClassifier

In [15]:
md3 = VotingClassifier(estimators = [('RF', md1),
                                     ('MLP', md2)],
                       weights = [0.6, 0.4], 
                       voting = 'soft', 
                       n_jobs = -1)

md3

In [16]:
%%time
Voting_cv = cross_val_score(md3, X, Y, cv = skf, scoring = 'roc_auc', n_jobs = -1)

print(f"The average 5-folds (repeated 5 times) ROC-AUC score of the VotingClassifier model is {Voting_cv.mean()}")

The average 5-folds (repeated 5 times) ROC-AUC score of the VotingClassifier model is 0.976102966949504
CPU times: user 114 ms, sys: 69.7 ms, total: 184 ms
Wall time: 23.5 s


In [17]:
# Based on my results, I would use the VotingClassifier model to predict left.