# Loading Libraries

In [27]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier, GradientBoostingClassifier

import optuna

# Reading the data

In [2]:
turnover = pd.read_csv('turnover.csv')
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
sales = pd.get_dummies(turnover['sales'], drop_first = True, dtype = int)
sales.head()

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [4]:
salary = pd.get_dummies(turnover['salary'], drop_first = True, dtype = int)
salary.head()

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [5]:
turnover = turnover.drop(columns = ['sales', 'salary'], axis = 1)
turnover = pd.concat([turnover, sales, salary], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


# Defining X & Y 

In [6]:
X = turnover.drop(columns=['left'], axis=1)
Y = turnover['left']

skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
X = X_train
Y = Y_train

# Random Forest

In [7]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

RF_numb_features = RFECV(estimator=RandomForestClassifier(n_estimators=100, 
                                                          max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md1 = Pipeline([('transformations', all_transformations),
                ('RF-numb-features', RF_numb_features)])

md1

In [8]:
%%time
md1_num_features = md1.fit(X, Y)
print(f"The suggested number of features of RF is {sum(md1_num_features['RF-numb-features'].support_)}")

The suggested number of features of RF is 10
CPU times: user 18.6 s, sys: 83 ms, total: 18.6 s
Wall time: 18.7 s


In [11]:
%%time
RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, 
                                                   max_depth=3),
                 n_features_to_select=10)

X_trans = all_transformations.fit_transform(X)
X_test = all_transformations.fit_transform(X_test)

RF_to_select = RF_selector.fit(X_trans, Y)
RF_X_to_select = X_trans.columns[RF_to_select.support_].tolist()
RF_X_to_select

CPU times: user 3.33 s, sys: 30 ms, total: 3.36 s
Wall time: 3.39 s


['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [12]:
class RF_Objective:

    def __init__(self, X, Y, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.X = X
        self.Y = Y
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 500),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10) 
                     )
        
        skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=self.seed)

        cv_run = cross_val_score(RandomForestClassifier(**params),
                                 self.X, 
                                 self.Y, 
                                 scoring = 'roc_auc', 
                                 cv = skf,
                                 n_jobs = -1)

        return cv_run.mean()
        
SEED=42
N_TRIALS=50

# Execute an optimization
RF_study = optuna.create_study(direction='maximize')
RF_study.optimize(RF_Objective(X_trans[RF_X_to_select], Y, SEED), n_trials=N_TRIALS, n_jobs=-1)

[32m[I 2024-03-20 12:31:52,036][0m A new study created in memory with name: no-name-e89f5c3e-49b2-4725-810d-6ea5c60d54f0[0m
[32m[I 2024-03-20 12:32:09,079][0m Trial 6 finished with value: 0.9782128918968503 and parameters: {'n_estimators': 161, 'min_samples_split': 17, 'min_samples_leaf': 14, 'max_depth': 4}. Best is trial 6 with value: 0.9782128918968503.[0m
[32m[I 2024-03-20 12:32:09,970][0m Trial 0 finished with value: 0.9874209233532067 and parameters: {'n_estimators': 219, 'min_samples_split': 22, 'min_samples_leaf': 10, 'max_depth': 7}. Best is trial 0 with value: 0.9874209233532067.[0m
[32m[I 2024-03-20 12:32:10,786][0m Trial 4 finished with value: 0.9785784645079942 and parameters: {'n_estimators': 454, 'min_samples_split': 19, 'min_samples_leaf': 18, 'max_depth': 4}. Best is trial 0 with value: 0.9874209233532067.[0m
[32m[I 2024-03-20 12:32:11,056][0m Trial 5 finished with value: 0.9783879204152622 and parameters: {'n_estimators': 358, 'min_samples_split': 16, 'm

In [13]:
RF_study.best_params

{'n_estimators': 386,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_depth': 10}

In [14]:
RF_study.best_trial.value

0.9920338048126984

In [15]:
%%time
RF_md = Pipeline([('RF-features', ColumnTransformer([('pass', 'passthrough', RF_X_to_select)])), 
                  ('RF', RandomForestClassifier(**RF_study.best_params))])

RF_fit = RF_md.fit(X_trans, Y)
RF_pred = RF_fit.predict_proba(X_test)[:, 1]

print(f"The ROC-AUC score of the RandomForestClassifier model on X_test is {roc_auc_score(Y_test, RF_pred)}")

The ROC-AUC score of the RandomForestClassifier model on X_test is 0.9913607612773894
CPU times: user 2.29 s, sys: 26.7 ms, total: 2.31 s
Wall time: 2.33 s


# Extra Trees

In [16]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

ET_numb_features = RFECV(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                        max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md2 = Pipeline([('transformations', all_transformations),
                ('ET-numb-features', ET_numb_features)])

md2

In [17]:
%%time
md2_num_features = md2.fit(X, Y)
print(f"The suggested number of features of ET is {sum(md2_num_features['ET-numb-features'].support_)}")

The suggested number of features of ET is 13
CPU times: user 11.4 s, sys: 93.9 ms, total: 11.5 s
Wall time: 11.6 s


In [18]:
ET_selector = RFE(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                 max_depth=3),
                 n_features_to_select=13)

ET_to_select = ET_selector.fit(X_trans, Y)
ET_X_to_select = X_trans.columns[ET_to_select.support_].tolist()
ET_X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-4__interaction_2__promotion_last_5years',
 'step-4__interaction_2__last_evaluation promotion_last_5years',
 'step-5__remainder__Work_accident',
 'step-5__remainder__hr',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [19]:
class ET_Objective:

    def __init__(self, X, Y, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.X = X
        self.Y = Y
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 500),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10) 
                     )
        
        skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=self.seed)

        cv_run = cross_val_score(ExtraTreesClassifier(**params),
                                 self.X, 
                                 self.Y, 
                                 scoring = 'roc_auc', 
                                 cv = skf,
                                 n_jobs = -1)

        return cv_run.mean()
        
SEED=42
N_TRIALS=50

# Execute an optimization
ET_study = optuna.create_study(direction='maximize')
ET_study.optimize(RF_Objective(X_trans[RF_X_to_select], Y, SEED), n_trials=N_TRIALS, n_jobs=-1)

[32m[I 2024-03-20 12:38:39,647][0m A new study created in memory with name: no-name-72582434-fdec-46ec-868b-2e49aa1a85d7[0m
[32m[I 2024-03-20 12:38:42,100][0m Trial 0 finished with value: 0.9885126487274685 and parameters: {'n_estimators': 141, 'min_samples_split': 30, 'min_samples_leaf': 20, 'max_depth': 10}. Best is trial 0 with value: 0.9885126487274685.[0m
[32m[I 2024-03-20 12:38:45,609][0m Trial 3 finished with value: 0.9744052934797652 and parameters: {'n_estimators': 431, 'min_samples_split': 17, 'min_samples_leaf': 18, 'max_depth': 3}. Best is trial 0 with value: 0.9885126487274685.[0m
[32m[I 2024-03-20 12:38:48,827][0m Trial 4 finished with value: 0.9740042017378041 and parameters: {'n_estimators': 104, 'min_samples_split': 18, 'min_samples_leaf': 16, 'max_depth': 3}. Best is trial 0 with value: 0.9885126487274685.[0m
[32m[I 2024-03-20 12:38:53,053][0m Trial 2 finished with value: 0.942348890692883 and parameters: {'n_estimators': 240, 'min_samples_split': 9, 'mi

In [20]:
ET_study.best_params

{'n_estimators': 376,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_depth': 10}

In [21]:
ET_study.best_trial.value

0.9921121757836252

In [22]:
%%time
ET_md = Pipeline([('ET-features', ColumnTransformer([('pass', 'passthrough', ET_X_to_select)])), 
                  ('ET', ExtraTreesClassifier(**ET_study.best_params))])

ET_fit = ET_md.fit(X_trans, Y)
ET_pred = ET_fit.predict_proba(X_test)[:, 1]

print(f"The ROC-AUC score of the ExtraTreesClassifier model on X_test is {roc_auc_score(Y_test, ET_pred)}")

The ROC-AUC score of the ExtraTreesClassifier model on X_test is 0.9870659549909202
CPU times: user 1.18 s, sys: 25.8 ms, total: 1.2 s
Wall time: 1.22 s


# VotingClassifier

In [23]:
voting_md = VotingClassifier(estimators=[('RF', RF_md),
                                         ('ET', ET_md)],
                             voting='soft', 
                             weights=[0.8, 0.2], 
                             n_jobs=-1)
voting_md

In [25]:
%%time
vot_cv = cross_val_score(voting_md, X_trans, Y, 
                         scoring='roc_auc', 
                         cv=skf,
                         n_jobs=-1)

print(f"The 10-fold ROC-AUC score of the VotingClassifier is {vot_cv.mean()}")

The 10-fold ROC-AUC score of the VotingClassifier is 0.9914525752260991
CPU times: user 44.2 ms, sys: 36.7 ms, total: 81 ms
Wall time: 8.14 s


In [None]:
%%time
voting_fit = voting_md.fit(X_trans, Y)

voting_pred = voting_fit.predict_proba(X_test)[:, 1]
print(f"The ROC-AUC score of the VotingClassifier on X_test is {roc_auc_score(Y_test, voting_pred)}")

# StackingClassifier

In [28]:
stacking_md = StackingClassifier(estimators=[('RF', RF_md),
                                             ('ET', ET_md)],
                                 final_estimator=GradientBoostingClassifier(n_estimators=300,
                                                                            max_depth=5,
                                                                            learning_rate=0.1),
                                 cv=5, 
                                 stack_method='predict_proba',
                                 n_jobs=-1)
stacking_md

In [29]:
%%time
stack_cv = cross_val_score(stacking_md, X_trans, Y, 
                           scoring='roc_auc', 
                           cv=skf,
                           n_jobs=-1)

print(f"The 10-fold ROC-AUC score of the StackingClassifier is {stack_cv.mean()}")

The 10-fold ROC-AUC score of the StackingClassifier is 0.9890708975159427
CPU times: user 48.2 ms, sys: 111 ms, total: 159 ms
Wall time: 54.5 s


In [30]:
%%time
stacking_fit = stacking_md.fit(X_trans, Y)

stacking_pred = stacking_fit.predict_proba(X_test)[:, 1]
print(f"The ROC-AUC score of the StackingClassifier on X_test is {roc_auc_score(Y_test, stacking_pred)}")

The ROC-AUC score of the StackingClassifier on X_test is 0.9904518675361658
CPU times: user 4.44 s, sys: 72.5 ms, total: 4.51 s
Wall time: 11.3 s


In [None]:
# Based on my results, I would use RandomForestClassifier to predict left.