# Loading Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier

import optuna

# Reading the data

In [2]:
turnover = pd.read_csv('turnover.csv')
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
sales = pd.get_dummies(turnover['sales'], drop_first = True, dtype = int)
sales.head()

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [4]:
salary = pd.get_dummies(turnover['salary'], drop_first = True, dtype = int)
salary.head()

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [5]:
turnover = turnover.drop(columns = ['sales', 'salary'], axis = 1)
turnover = pd.concat([turnover, sales, salary], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


# Defining X & Y 

In [6]:
X = turnover.drop(columns=['left'], axis=1)
Y = turnover['left']

# skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

# Random Forest

In [7]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

RF_numb_features = RFECV(estimator=RandomForestClassifier(n_estimators=100, 
                                                          max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md1 = Pipeline([('transformations', all_transformations),
                ('RF-numb-features', RF_numb_features)])

md1

In [8]:
%%time
md1_num_features = md1.fit(X, Y)
print(f"The suggested number of features of RF is {sum(md1_num_features['RF-numb-features'].support_)}")

The suggested number of features of RF is 11
CPU times: user 20.2 s, sys: 87.2 ms, total: 20.2 s
Wall time: 20.3 s


In [9]:
%%time
RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, 
                                                   max_depth=3),
                 n_features_to_select=11)

X_trans = all_transformations.fit_transform(X)

RF_to_select = RF_selector.fit(X_trans, Y)
RF_X_to_select = X_trans.columns[RF_to_select.support_].tolist()
RF_X_to_select

CPU times: user 3.32 s, sys: 28.6 ms, total: 3.35 s
Wall time: 3.37 s


['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-5__remainder__Work_accident',
 'step-5__remainder__RandD',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [10]:
class RF_Objective:

    def __init__(self, X, Y, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.X = X
        self.Y = Y
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 500),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10) 
                     )
        
        skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=self.seed)

        cv_run = cross_val_score(RandomForestClassifier(**params),
                                 self.X, 
                                 self.Y, 
                                 scoring = 'roc_auc', 
                                 cv = skf,
                                 n_jobs = -1)

        return cv_run.mean()
        
SEED=42
N_TRIALS=50

# Execute an optimization
RF_study = optuna.create_study(direction='maximize')
RF_study.optimize(RF_Objective(X_trans[RF_X_to_select], Y, SEED), n_trials=N_TRIALS, n_jobs=-1)

[32m[I 2024-03-18 12:30:24,075][0m A new study created in memory with name: no-name-7b348c2f-cc66-4586-b08b-b31bc6445a35[0m
[32m[I 2024-03-18 12:30:43,379][0m Trial 6 finished with value: 0.9904180103323871 and parameters: {'n_estimators': 435, 'min_samples_split': 28, 'min_samples_leaf': 12, 'max_depth': 9}. Best is trial 6 with value: 0.9904180103323871.[0m
[32m[I 2024-03-18 12:30:45,213][0m Trial 1 finished with value: 0.9409647424973352 and parameters: {'n_estimators': 445, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_depth': 2}. Best is trial 6 with value: 0.9904180103323871.[0m
[32m[I 2024-03-18 12:30:48,496][0m Trial 0 finished with value: 0.9888495230249609 and parameters: {'n_estimators': 184, 'min_samples_split': 9, 'min_samples_leaf': 21, 'max_depth': 9}. Best is trial 6 with value: 0.9904180103323871.[0m
[32m[I 2024-03-18 12:30:50,634][0m Trial 4 finished with value: 0.9893303571674561 and parameters: {'n_estimators': 125, 'min_samples_split': 18, 'min_

In [11]:
RF_study.best_params

{'n_estimators': 344,
 'min_samples_split': 19,
 'min_samples_leaf': 6,
 'max_depth': 10}

In [12]:
RF_study.best_trial.value

0.9922689359343926

# Extra Trees

In [13]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

ET_numb_features = RFECV(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                        max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md2 = Pipeline([('transformations', all_transformations),
                ('ET-numb-features', ET_numb_features)])

md2

In [14]:
%%time
md2_num_features = md2.fit(X, Y)
print(f"The suggested number of features of ET is {sum(md2_num_features['ET-numb-features'].support_)}")

The suggested number of features of ET is 11
CPU times: user 12.8 s, sys: 66.1 ms, total: 12.9 s
Wall time: 12.9 s


In [15]:
ET_selector = RFE(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                 max_depth=3),
                 n_features_to_select=11)

ET_to_select = ET_selector.fit(X_trans, Y)
ET_X_to_select = X_trans.columns[ET_to_select.support_].tolist()
ET_X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-4__interaction_2__promotion_last_5years',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [16]:
class ET_Objective:

    def __init__(self, X, Y, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.X = X
        self.Y = Y
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 500),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10) 
                     )
        
        skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=self.seed)

        cv_run = cross_val_score(ExtraTreesClassifier(**params),
                                 self.X, 
                                 self.Y, 
                                 scoring = 'roc_auc', 
                                 cv = skf,
                                 n_jobs = -1)

        return cv_run.mean()
        
SEED=42
N_TRIALS=50

# Execute an optimization
ET_study = optuna.create_study(direction='maximize')
ET_study.optimize(RF_Objective(X_trans[RF_X_to_select], Y, SEED), n_trials=N_TRIALS, n_jobs=-1)

[32m[I 2024-03-18 12:36:48,607][0m A new study created in memory with name: no-name-8e4ee615-20bf-4259-8583-1f635045eb1d[0m
[32m[I 2024-03-18 12:36:51,201][0m Trial 2 finished with value: 0.9732398618715961 and parameters: {'n_estimators': 169, 'min_samples_split': 19, 'min_samples_leaf': 27, 'max_depth': 4}. Best is trial 2 with value: 0.9732398618715961.[0m
[32m[I 2024-03-18 12:36:51,554][0m Trial 1 finished with value: 0.9836998084624401 and parameters: {'n_estimators': 184, 'min_samples_split': 28, 'min_samples_leaf': 23, 'max_depth': 10}. Best is trial 1 with value: 0.9836998084624401.[0m
[32m[I 2024-03-18 12:36:55,402][0m Trial 0 finished with value: 0.9837135575275961 and parameters: {'n_estimators': 171, 'min_samples_split': 11, 'min_samples_leaf': 16, 'max_depth': 9}. Best is trial 0 with value: 0.9837135575275961.[0m
[32m[I 2024-03-18 12:36:56,430][0m Trial 3 finished with value: 0.9803889341768464 and parameters: {'n_estimators': 139, 'min_samples_split': 16, '

In [17]:
ET_study.best_params

{'n_estimators': 346,
 'min_samples_split': 21,
 'min_samples_leaf': 6,
 'max_depth': 10}

In [18]:
ET_study.best_trial.value

0.9870503637583982

# VotingClassifier

In [19]:
RF_X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-5__remainder__Work_accident',
 'step-5__remainder__RandD',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [20]:
ET_X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-4__interaction_2__promotion_last_5years',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [21]:
RF_md = Pipeline([('RF-features', ColumnTransformer([('pass', 'passthrough', RF_X_to_select)])), 
                  ('RF', RandomForestClassifier(**RF_study.best_params))])

ET_md = Pipeline([('ET-features', ColumnTransformer([('pass', 'passthrough', ET_X_to_select)])), 
                  ('ET', ExtraTreesClassifier(**ET_study.best_params))])

voting_md = VotingClassifier(estimators=[('RF', RF_md),
                                         ('ET', ET_md)],
                             voting='soft', 
                             weights=[0.6, 0.4], 
                             n_jobs=-1)
voting_md

In [22]:
skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

vot_cv = cross_val_score(voting_md, X_trans, Y, 
                         scoring='roc_auc', 
                         cv=skf,
                         n_jobs=-1)

print(f"The 10-fold ROC-AUC score of the VotingClassifier is {vot_cv.mean()}")

The 10-fold ROC-AUC score of the VotingClassifier is 0.9907647415935621


In [None]:
# Based on my results, I would use the RandomForestClassifier model to predict left.