In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,cross_val_score,cross_val_predict,KFold, StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE,ADASYN,RandomOverSampler,BorderlineSMOTE,SVMSMOTE,SMOTENC
from imblearn.under_sampling import RandomUnderSampler,TomekLinks
from imblearn.combine import SMOTETomek,SMOTEENN
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow import keras
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

In [None]:
## 4. MODEL

In [None]:
### 4.1 Logistic Classifier

In [None]:
# Baseline classifier: logistic regression using the L-BFGS solver; every
# other hyperparameter is left at its scikit-learn default.
logreg = LogisticRegression(solver="lbfgs")

In [None]:
# 5-fold cross-validated accuracy of the baseline model on the training set.
scores = cross_val_score(
    logreg,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
mean_accuracy_pct = scores.mean() * 100
print('Accuracy {:.2f} %'.format(mean_accuracy_pct))

In [None]:
# Out-of-fold class predictions: each training row is predicted by a model
# fitted on the other four folds.
y_pred = cross_val_predict(
    logreg, X_train, y_train, cv=5
)

In [None]:
# Confusion matrix of the out-of-fold predictions
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_train, y_pred)
# fmt='d' prints the cell counts as plain integers; seaborn's default format
# ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 of the out-of-fold predictions.
report = classification_report(y_train, y_pred)
print(report)

Acceptable precision of 78% for attrition (class 1), which means that when the model identifies a positive, it will be right in 78% of the cases (low rate of false positives). However, the recall is low, only 37%, which means that the model correctly identifies only 37% of the employees who leave (high rate of false negatives). Let's take a look at the precision-recall curve:

In [None]:
# Out-of-fold class probabilities (one column per class), needed to study
# decision thresholds other than the default.
y_proba = cross_val_predict(
    logreg,
    X_train,
    y_train,
    cv=5,
    method='predict_proba',
)

In [None]:
# Precision and recall of class 1 (column 1 of y_proba) at every candidate
# decision threshold.
attrition_scores = y_proba[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_train, attrition_scores)

In [None]:
# Precision and recall as functions of the decision threshold, with the
# operating point of the default 0.5 cut-off highlighted.
DEFAULT_THRESHOLD = 0.5

# Read the highlighted precision/recall from the curve itself instead of
# hard-coding numbers copied from an earlier classification report, so the
# markers stay correct when the data or the model change.
# `thresholds` is sorted in increasing order, so this is the first candidate
# threshold >= 0.5; `precisions`/`recalls` hold one extra trailing element,
# so the index is valid even when no score reaches 0.5.
idx_default = np.searchsorted(thresholds, DEFAULT_THRESHOLD)
precision_default = precisions[idx_default]
recall_default = recalls[idx_default]

plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
# Dotted guide lines from the axes to the two highlighted points.
plt.plot([0, DEFAULT_THRESHOLD], [recall_default, recall_default], "r:")
plt.plot([0, DEFAULT_THRESHOLD], [precision_default, precision_default], "r:")
plt.plot([DEFAULT_THRESHOLD, DEFAULT_THRESHOLD],
         [0, max(precision_default, recall_default)], "r:")
plt.plot([DEFAULT_THRESHOLD], [recall_default], "ro")
plt.plot([DEFAULT_THRESHOLD], [precision_default], "ro")
plt.legend()
plt.title('Precision-Recall vs decision threshold')
plt.xlim(xmin=0)
plt.ylim(ymin=0)
plt.grid(which='both')
plt.show()

In [None]:
# Precision-recall curve: precision (y) against recall (x) over all thresholds.
plt.plot(recalls, precisions, 'b--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
# Anchor both axes at zero; the upper limits are left to matplotlib.
plt.xlim(left=0)
plt.ylim(bottom=0)

The precision-recall curve illustrates the precision/recall trade-off perfectly. We can see that the precision starts to fall sharply at around 20% recall, and if we want to obtain a recall of 50%, the precision will fall to around 60%.

We can tweak the precision and recall of the classifier by modifying the threshold of 0.5 used to decide the class (attrition in this case). For example, let's see what happens if we set a threshold of 0.4 and a threshold of 0.2:

In [None]:
# Re-label the out-of-fold probabilities of class 1 with two lower cut-offs
# (0.4 and 0.2) instead of the default 0.5.
attrition_proba = y_proba[:, 1]
y_pred_04 = (attrition_proba > 0.4).astype(int)
y_pred_02 = (attrition_proba > 0.2).astype(int)

In [None]:
# Classification reports for the 0.4 cut-off first, then the 0.2 cut-off.
for thresholded_pred in (y_pred_04, y_pred_02):
    print(classification_report(y_train, thresholded_pred))

We could ask the HR department what their preference is:

- to be more certain that the employees classified as 'leavers' by the model really are likely to leave, while accepting that many 'leaver' employees will not be classified as such (high precision and low recall). In other words, fewer employees are classified as leavers but a higher percentage of them are true "leavers" (fewer false positives at the expense of more false negatives).
- to be more certain that the classifier is not missing employees who are likely to leave, at the expense of being less able to tell which of the flagged employees are true "leavers" (low precision and high recall). In other words, more employees are classified as 'leavers' but a higher percentage of them are not true 'leavers' (fewer false negatives but more false positives).

To try to improve the results, we will next tune the hyperparameters of the classifier by performing a grid search. We will record several scores and refit the search once on AUC and once on F1:

In [None]:
#Gridsearch AUC
# Carve a stratified 80/20 train/validation split out of the training data:
# the grid search is fitted on the first part and the tuned model is then
# evaluated on the second, which the search never sees.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Pair the labels with the feature rows by position (the same pairing that
# cross_val_score / cross_val_predict use) WITHOUT writing them into X_train:
# adding an 'Attrition' column to X_train would leak the target into every
# later model that is fitted on X_train.
y_labels = pd.Series(np.ravel(y_train), index=X_train.index, name='Attrition')

# n_splits=1, so the generator yields exactly one (train, validation) pair.
train_index, val_index = next(split.split(X_train, y_labels))

# split() returns positional indices, so rows are selected with .iloc; .loc
# would treat the numbers as index labels and pick the wrong rows (or raise)
# whenever X_train does not carry a default 0..n-1 index.
X_train_grid = X_train.iloc[train_index]
y_train_grid = y_labels.iloc[train_index]
X_val_grid = X_train.iloc[val_index]
y_val_grid = y_labels.iloc[val_index]

# Old names kept as aliases in case other cells still refer to them.
train_grid, val_grid = X_train_grid, X_val_grid

In [None]:
# Metrics recorded for every candidate; the final model is refit on 'AUC'.
scoring = {'AUC': 'roc_auc', 'Precision': 'precision', 'Recall': 'recall', 'Accuracy': 'accuracy', 'f1': 'f1'}
# The grid overrides logreg's solver with liblinear, which supports both the
# l1 and the l2 penalty being searched.
params = {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# return_train_score must be the boolean True: the string 'True' only worked
# by being truthy and is rejected by the parameter validation of recent
# scikit-learn releases.
grid_logreg_auc = GridSearchCV(
    logreg,
    params,
    cv=kf,
    scoring=scoring,
    return_train_score=True,
    refit='AUC',
)
grid_logreg_auc.fit(X_train_grid, y_train_grid)

In [None]:
# Hyperparameter combination selected by the AUC-refit grid search.
grid_logreg_auc.best_params_

In [None]:
# Predict the held-out validation rows with the estimator refitted by the
# grid search.
y_val_grid_pred = grid_logreg_auc.predict(
    X_val_grid
)

In [None]:
# Confusion matrix on the validation set
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_val_grid, y_val_grid_pred)
# fmt='d' prints the cell counts as plain integers; seaborn's default format
# ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 on the validation set.
report = classification_report(y_val_grid, y_val_grid_pred)
print(report)

In [None]:
#GridSearch f1
# Same search space and metrics as the AUC search, but the final model is
# refit on the candidate with the best mean F1. The settings are repeated
# here so that this cell can be run on its own.
scoring = {'AUC': 'roc_auc', 'Precision': 'precision', 'Recall': 'recall', 'Accuracy': 'accuracy', 'f1': 'f1'}
params = {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# return_train_score must be the boolean True: the string 'True' only worked
# by being truthy and is rejected by the parameter validation of recent
# scikit-learn releases.
grid = GridSearchCV(
    logreg,
    params,
    cv=kf,
    scoring=scoring,
    return_train_score=True,
    refit='f1',
)
grid.fit(X_train_grid, y_train_grid)

# Evaluate the F1-tuned model on the held-out validation rows.
y_val_grid_pred = grid.predict(X_val_grid)
print(classification_report(y_val_grid, y_val_grid_pred))

Lastly, before moving on to a different classifier, we can explore some sampling techniques. In particular, we will try Random Over-Sampling, the Synthetic Minority Oversampling Technique (SMOTE) and SMOTE-Tomek.

In [None]:
##Random Over Sampling
ros = RandomOverSampler(random_state=2, sampling_strategy='minority')
##Synthetic Minority Oversampling Technique (SMOTE)
sm = SMOTE(random_state=2)
#Combining over- and under-sampling.
#SMOTE Tomek
# `sampling_strategy` replaces the removed `ratio` keyword (already used for
# the random over-sampler above), and the seed matches the other samplers so
# the resampling is reproducible.
smt = SMOTETomek(sampling_strategy='auto', random_state=2)

In [None]:
# Random over-sampling of the minority class followed by the logistic model.
# The sampler is a pipeline step, so it is applied when the pipeline is
# fitted inside each cross-validation split.
ros = RandomOverSampler(sampling_strategy='minority', random_state=2)
steps = [
    ('ros', ros),
    ('model', logreg),
]
pipeline = Pipeline(steps=steps)
y_pred_ros = cross_val_predict(pipeline, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred_ros))

In [None]:
# SMOTE over-sampling followed by the logistic model, evaluated with
# out-of-fold predictions from 5-fold cross-validation.
sm = SMOTE(random_state=2)
steps = [
    ('SMOTE', sm),
    ('model', logreg),
]
pipeline = Pipeline(steps=steps)
y_pred_sm = cross_val_predict(pipeline, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred_sm))

In [None]:
# SMOTE over-sampling combined with Tomek-link cleaning, followed by the
# logistic model.
# The sampler is bound to `smt`, the name the pipeline actually uses; binding
# it to `sm` would overwrite the plain SMOTE object and leave the pipeline
# running on whatever `smt` an earlier cell happened to define.
# `sampling_strategy` replaces the removed `ratio` keyword, and the seed keeps
# the resampling reproducible.
smt = SMOTETomek(sampling_strategy='auto', random_state=2)
steps = [('SMOTETomek', smt), ('model', logreg)]
pipeline = Pipeline(steps=steps)
y_pred_smt = cross_val_predict(pipeline, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred_smt))

By oversampling, we are able to increase the recall of the minority class. However, the precision is penalized and we are not able to improve the overall performance. We are better off keeping the model with the tuned hyperparameters.

In the classification report we can see how both the precision and recall have increased, achieving the best results so far. The last step is to apply this last model to the test set.

### 4.2. Random Forest

In [None]:
# Baseline random forest with 100 trees. The seed is fixed so that the
# cross-validated scores below are reproducible from run to run.
RFC = RandomForestClassifier(n_estimators=100, random_state=42)
# 5-fold cross-validated accuracy on the training set.
scores = cross_val_score(RFC, X_train, y_train, scoring='accuracy', cv=5)
print('Accuracy {:.2f} %'.format(100*scores.mean()))

In [None]:
# Out-of-fold predictions of the random forest and their confusion matrix
# (rows: true class, columns: predicted class).
y_pred = cross_val_predict(RFC, X_train, y_train, cv=5)
cm = confusion_matrix(y_train, y_pred)
# fmt='d' prints the cell counts as plain integers; seaborn's default format
# ('.2g') would show counts of 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 of the random forest's out-of-fold
# predictions.
report = classification_report(y_train, y_pred)
print(report)