# Loading Libraries

In [22]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import optuna

# Reading the data

In [23]:
turnover = pd.read_csv('turnover.csv')
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [24]:
sales = pd.get_dummies(turnover['sales'], drop_first = True, dtype = int)
sales.head()

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [25]:
salary = pd.get_dummies(turnover['salary'], drop_first = True, dtype = int)
salary.head()

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [26]:
turnover = turnover.drop(columns = ['sales', 'salary'], axis = 1)
turnover = pd.concat([turnover, sales, salary], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


# Defining X & Y 

In [27]:
X = turnover.drop(columns=['left'], axis=1)
Y = turnover['left']

skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)

# Random Forest

In [28]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

RF_numb_features = RFECV(estimator=RandomForestClassifier(n_estimators=100, 
                                                          max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md1 = Pipeline([('transformations', all_transformations),
                ('RF-numb-features', RF_numb_features)])

md1

In [8]:
%%time
md1_num_features = md1.fit(X, Y)
print(f"The suggested number of features of RF is {sum(md1_num_features['RF-numb-features'].support_)}")

The suggested number of features of RF is 10
CPU times: user 20.7 s, sys: 117 ms, total: 20.9 s
Wall time: 20.9 s


In [9]:
RF_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, 
                                                   max_depth=3),
                 n_features_to_select=10)

X_trans = all_transformations.fit_transform(X)
X_trans.head()

Unnamed: 0,step-1__power-tran__time_spend_company,step-2__0-1-tran__number_project,step-2__0-1-tran__average_montly_hours,step-3__interaction_1__satisfaction_level,step-3__interaction_1__time_spend_company,step-3__interaction_1__satisfaction_level time_spend_company,step-4__interaction_2__last_evaluation,step-4__interaction_2__promotion_last_5years,step-4__interaction_2__last_evaluation promotion_last_5years,step-5__remainder__Work_accident,...,step-5__remainder__accounting,step-5__remainder__hr,step-5__remainder__management,step-5__remainder__marketing,step-5__remainder__product_mng,step-5__remainder__sales,step-5__remainder__support,step-5__remainder__technical,step-5__remainder__low,step-5__remainder__medium
0,-0.143951,0.0,0.285047,0.38,3.0,1.14,0.53,0.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
1,1.575753,0.6,0.775701,0.8,6.0,4.8,0.86,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.662038,1.0,0.82243,0.11,4.0,0.44,0.88,0.0,0.0,0,...,0,0,0,0,0,1,0,0,0,1
3,1.196143,0.6,0.593458,0.72,5.0,3.6,0.87,0.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
4,-0.143951,0.0,0.294393,0.37,3.0,1.11,0.52,0.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0


In [10]:
RF_to_select = RF_selector.fit(X_trans, Y)
X_to_select = X_trans.columns[RF_to_select.support_].tolist()
X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [18]:
class RF_Objective:

    def __init__(self, X, Y, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.X = X
        self.Y = Y
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 500),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10) 
                     )
        
        skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=self.seed)

        cv_run = cross_val_score(RandomForestClassifier(**params),
                                 self.X, 
                                 self.Y, 
                                 scoring = 'roc_auc', 
                                 cv = skf,
                                 n_jobs = -1)

        return cv_run.mean()
        
SEED = 42
N_TRIALS = 50

# Execute an optimization
study = optuna.create_study(direction = 'maximize')
study.optimize(RF_Objective(X_trans[X_to_select], Y, SEED), n_trials = N_TRIALS, n_jobs = -1)

[32m[I 2024-02-28 06:15:15,977][0m A new study created in memory with name: no-name-0a3a62a9-d044-4d9c-ae66-cacdaa6451d4[0m
[32m[I 2024-02-28 06:15:23,586][0m Trial 2 finished with value: 0.943102392029331 and parameters: {'n_estimators': 282, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_depth': 2}. Best is trial 2 with value: 0.943102392029331.[0m
[32m[I 2024-02-28 06:15:26,491][0m Trial 5 finished with value: 0.97532532810721 and parameters: {'n_estimators': 193, 'min_samples_split': 23, 'min_samples_leaf': 9, 'max_depth': 3}. Best is trial 5 with value: 0.97532532810721.[0m
[32m[I 2024-02-28 06:15:31,216][0m Trial 4 finished with value: 0.9426542186762463 and parameters: {'n_estimators': 417, 'min_samples_split': 25, 'min_samples_leaf': 14, 'max_depth': 2}. Best is trial 5 with value: 0.97532532810721.[0m
[32m[I 2024-02-28 06:15:32,871][0m Trial 6 finished with value: 0.9874826296853254 and parameters: {'n_estimators': 463, 'min_samples_split': 26, 'min_samples

In [19]:
study.best_params

{'n_estimators': 383,
 'min_samples_split': 9,
 'min_samples_leaf': 7,
 'max_depth': 9}

In [20]:
study.best_trial.value

0.9917360212861854

# Extra Trees

In [29]:
step_1 = ColumnTransformer([('power-tran', PowerTransformer(), ['time_spend_company'])])

step_2 = ColumnTransformer([('0-1-tran', MinMaxScaler(), ['number_project', 'average_montly_hours'])])

step_3 = ColumnTransformer([('interaction_1', PolynomialFeatures(interaction_only=True, include_bias=False), ['satisfaction_level', 'time_spend_company'])])

step_4 = ColumnTransformer([('interaction_2', PolynomialFeatures(interaction_only=True, include_bias=False), ['last_evaluation', 'promotion_last_5years'])])

step_5 = ColumnTransformer([('drop-out', 'drop', ['time_spend_company', 'number_project', 'average_montly_hours',
                                                  'satisfaction_level', 'last_evaluation', 'promotion_last_5years'])],
                           remainder='passthrough')

all_transformations = FeatureUnion([('step-1', step_1),
                                    ('step-2', step_2), 
                                    ('step-3', step_3), 
                                    ('step-4', step_4),
                                    ('step-5', step_5)]).set_output(transform='pandas')

ET_numb_features = RFECV(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                        max_depth=3),
                         min_features_to_select=10,
                         cv=5,
                         scoring='roc_auc')

md2 = Pipeline([('transformations', all_transformations),
                ('ET-numb-features', ET_numb_features)])

md2

In [31]:
%%time
md2_num_features = md2.fit(X, Y)
print(f"The suggested number of features of ET is {sum(md2_num_features['ET-numb-features'].support_)}")

The suggested number of features of ET is 17
CPU times: user 11.5 s, sys: 53.5 ms, total: 11.6 s
Wall time: 11.6 s


In [32]:
ET_selector = RFE(estimator=ExtraTreesClassifier(n_estimators=100, 
                                                 max_depth=3),
                 n_features_to_select=17)

ET_to_select = ET_selector.fit(X_trans, Y)
X_to_select = X_trans.columns[RF_to_select.support_].tolist()
X_to_select

['step-1__power-tran__time_spend_company',
 'step-2__0-1-tran__number_project',
 'step-2__0-1-tran__average_montly_hours',
 'step-3__interaction_1__satisfaction_level',
 'step-3__interaction_1__time_spend_company',
 'step-3__interaction_1__satisfaction_level time_spend_company',
 'step-4__interaction_2__last_evaluation',
 'step-5__remainder__Work_accident',
 'step-5__remainder__low',
 'step-5__remainder__medium']

In [33]:
class ET_Objective:

    def __init__(self, X, Y, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.X = X
        self.Y = Y
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 500),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10) 
                     )
        
        skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=self.seed)

        cv_run = cross_val_score(ExtraTreesClassifier(**params),
                                 self.X, 
                                 self.Y, 
                                 scoring = 'roc_auc', 
                                 cv = skf,
                                 n_jobs = -1)

        return cv_run.mean()
        
SEED = 42
N_TRIALS = 50

# Execute an optimization
study = optuna.create_study(direction = 'maximize')
study.optimize(ET_Objective(X_trans[X_to_select], Y, SEED), n_trials = N_TRIALS, n_jobs = -1)

[32m[I 2024-02-28 06:27:30,758][0m A new study created in memory with name: no-name-c7cc0512-faa4-46c2-a104-0282b29347ba[0m
[32m[I 2024-02-28 06:27:45,598][0m Trial 1 finished with value: 0.9615019057623089 and parameters: {'n_estimators': 261, 'min_samples_split': 21, 'min_samples_leaf': 27, 'max_depth': 3}. Best is trial 1 with value: 0.9615019057623089.[0m
[32m[I 2024-02-28 06:27:45,609][0m Trial 2 finished with value: 0.9815230186354557 and parameters: {'n_estimators': 143, 'min_samples_split': 29, 'min_samples_leaf': 16, 'max_depth': 7}. Best is trial 2 with value: 0.9815230186354557.[0m
[32m[I 2024-02-28 06:27:46,879][0m Trial 7 finished with value: 0.9852138435151734 and parameters: {'n_estimators': 390, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_depth': 9}. Best is trial 7 with value: 0.9852138435151734.[0m
[32m[I 2024-02-28 06:27:47,054][0m Trial 0 finished with value: 0.9620856997034448 and parameters: {'n_estimators': 411, 'min_samples_split': 22, 'mi

In [17]:
# Based on my results, I would use the Random Forest model to predict left.