In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# The dataset CSV sits one directory above the notebook.
path = os.path.join(os.pardir, "WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
# Load the raw HR attrition table into a DataFrame.
df_td = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the first five rows of the raw data.
df_td.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
# Column dtypes and non-null counts for the raw frame.
df_td.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
# Summary statistics of the numeric columns, one row per column.
df_td.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


#### Dropped the columns below (EmployeeCount, Over18, StandardHours) because each has only one value for all rows. EmployeeNumber is also dropped.

In [7]:
# Remove the single-valued columns and EmployeeNumber.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError for columns
# that have already been removed.
df_td = df_td.drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'],
    errors='ignore',
)

#### Attrition is encoded as 1 for Yes (the employee left the company) and 0 for No (the employee stayed).

In [8]:
# Encode the target as 1 (Yes) / 0 (anything else).
# Matching on both 'Yes' and 1 makes the cell idempotent: comparing only against
# 'Yes' would turn every label into 0 if the cell were executed a second time on
# the already-encoded column.
df_td['Attrition'] = np.where(df_td['Attrition'].isin(['Yes', 1]), 1, 0)

In [9]:
# Class counts of the encoded target (1 = attrition, 0 = stayed).
df_td["Attrition"].value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

In [10]:
# Same class counts expressed as a percentage of all rows.
df_td['Attrition'].value_counts() / len(df_td) * 100

0    83.877551
1    16.122449
Name: Attrition, dtype: float64

In [11]:
# Categorical columns that will be one-hot encoded.
cat_feats = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]

In [12]:
# One-hot encode every column listed in cat_feats; other columns pass through unchanged.
df_new = pd.get_dummies(data=df_td, columns=cat_feats)

In [13]:
# Column dtypes and non-null counts after one-hot encoding.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 52 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Age                                1470 non-null   int64
 1   Attrition                          1470 non-null   int32
 2   DailyRate                          1470 non-null   int64
 3   DistanceFromHome                   1470 non-null   int64
 4   Education                          1470 non-null   int64
 5   EnvironmentSatisfaction            1470 non-null   int64
 6   HourlyRate                         1470 non-null   int64
 7   JobInvolvement                     1470 non-null   int64
 8   JobLevel                           1470 non-null   int64
 9   JobSatisfaction                    1470 non-null   int64
 10  MonthlyIncome                      1470 non-null   int64
 11  MonthlyRate                        1470 non-null   int64
 12  NumCompaniesWorked  

### Preparing data for machine learning

In [14]:
# Separate the target column from the feature matrix.
y = df_new['Attrition']
X = df_new.drop(columns='Attrition')
X.shape

(1470, 51)

In [15]:
# Stratified 70/30 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=2021,
)

In [16]:
# Standardize the features: learn the scaling statistics on the training split
# only, then apply the same transformation to the validation split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)

# Adaboost model

In [17]:
from sklearn.ensemble import AdaBoostClassifier

In [18]:
# Baseline AdaBoost classifier using the library's default hyperparameters.
adabc = AdaBoostClassifier()

In [19]:
# Train the baseline model on the standardized training split.
adabc.fit(X=X_train, y=y_train)

AdaBoostClassifier()

In [20]:
# Baseline predictions for the validation split.
y_pred_adabc = adabc.predict(X_val)

In [21]:
# Validation accuracy of the baseline model.
metrics.accuracy_score(y_true=y_val, y_pred=y_pred_adabc)

0.8458049886621315

In [22]:
# Baseline predictions on the training split, used to compare against validation accuracy.
pred_train_adabc=adabc.predict(X_train)

In [23]:
# Training accuracy of the baseline model.
metrics.accuracy_score(y_true=y_train, y_pred=pred_train_adabc)

0.9115646258503401

In [24]:
# Per-class precision / recall / F1 of the baseline model on the validation split.
print(
    'Adaboost\n',
    metrics.classification_report(y_true=y_val, y_pred=y_pred_adabc),
)

Adaboost
               precision    recall  f1-score   support

           0       0.88      0.94      0.91       370
           1       0.53      0.35      0.42        71

    accuracy                           0.85       441
   macro avg       0.71      0.65      0.67       441
weighted avg       0.83      0.85      0.83       441



In [25]:
# Confusion matrix and summary metrics for the baseline model on the validation split.
print('Confusion Matrix:')
tn, fp, fn, tp = metrics.confusion_matrix(y_val, y_pred_adabc).ravel()
print(metrics.confusion_matrix(y_val, y_pred_adabc))

# Specificity comes straight from the confusion-matrix counts.
specificity = tn / (fp + tn)

# The remaining scores are computed by scikit-learn.
accuracy = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_adabc)
balanced_accuracy = metrics.balanced_accuracy_score(y_true=y_val, y_pred=y_pred_adabc)
recall_score = metrics.recall_score(y_true=y_val, y_pred=y_pred_adabc)
precision_score = metrics.precision_score(y_true=y_val, y_pred=y_pred_adabc)
f1_score = metrics.f1_score(y_true=y_val, y_pred=y_pred_adabc)

print("\nMetrics on test data")
print(f'Recall Score : {round(recall_score, 2)}')
print(f'Specificity : {round(specificity, 2)}')
print(f'Precision Score : {round(precision_score, 2)}')
print(f'Accuracy: {round(accuracy, 2)}')
print(f'Balanced Accuracy: {round(balanced_accuracy, 2)}')
print(f'F1 Score : {round(f1_score, 2)}')

Confusion Matrix:
[[348  22]
 [ 46  25]]

Metrics on test data
Recall Score : 0.35
Specificity : 0.94
Precision Score : 0.53
Accuracy: 0.85
Balanced Accuracy: 0.65
F1 Score : 0.42


#### Apply cross validation

In [26]:
# Shuffled, seeded 5-fold stratified splitter shared by the cross-validation cells.
cv_ab = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

In [27]:
# Cross-validated accuracy of a default AdaBoost model on the training split,
# using the stratified splitter `cv_ab`.
# A fresh estimator is passed inline instead of rebinding `adabc`: cross_val_score
# only fits clones, so rebinding would leave `adabc` pointing at an unfitted model
# and break any re-run of the earlier cells that call `adabc.predict`.
scores = cross_val_score(
    AdaBoostClassifier(),
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv_ab,
    n_jobs=1,
)
print('CV accuracy scores: %s' % scores)

CV accuracy scores: [0.83980583 0.85436893 0.88834951 0.89320388 0.85365854]


In [28]:
# Mean and standard deviation of the fold accuracies.
print(f'CV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')

CV accuracy: 0.866 +/- 0.021


## Adaboost model - pipeline and gridsearch

In [29]:
def print_results(results):
    """Print the best parameter set and a score summary for every candidate.

    Args:
        results: A fitted search object exposing ``best_params_`` and
            ``cv_results_`` (with ``mean_test_score``, ``std_test_score``
            and ``params`` entries).

    Each candidate line shows the mean test score, twice the standard
    deviation of the test score, and the parameter dict; both numbers are
    rounded to three decimals.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv = results.cv_results_
    summary = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, candidate in summary:
        print(f'{round(avg, 3)} (+/-{round(spread * 2, 3)}) for {candidate}')

In [30]:
# Grid-search AdaBoost inside a scaler + classifier pipeline, scored by accuracy
# with the shared stratified splitter `cv_ab`.
#
# The pipeline steps are created inline on purpose. GridSearchCV fits clones of
# the pipeline, so rebinding the notebook-level `ss` / `adabc` here would leave
# them pointing at estimators that are never fitted, replacing the scaler that
# was already fitted on the training split.
# NOTE(review): X_train was standardized in an earlier cell, so the 'ss' step
# re-standardizes already-scaled data; consider feeding unscaled features here.
steps = [('ss', preprocessing.StandardScaler()),
         ('classifier', AdaBoostClassifier())]

# Candidate hyperparameters; keys use the `<step name>__<parameter>` convention.
parameters = {
    'classifier__n_estimators': [50,100,200,300],
    'classifier__learning_rate': [0.1,0.2,1,2],
}
pipe = pipeline.Pipeline(steps)

cv_pipe_ab = GridSearchCV(pipe, parameters, cv=cv_ab, n_jobs=-1, scoring = 'accuracy')
cv_pipe_ab.fit(X_train, y_train)

print_results(cv_pipe_ab)

BEST PARAMS: {'classifier__learning_rate': 1, 'classifier__n_estimators': 100}

0.844 (+/-0.007) for {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 50}
0.849 (+/-0.009) for {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}
0.858 (+/-0.038) for {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200}
0.866 (+/-0.035) for {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 300}
0.851 (+/-0.005) for {'classifier__learning_rate': 0.2, 'classifier__n_estimators': 50}
0.859 (+/-0.033) for {'classifier__learning_rate': 0.2, 'classifier__n_estimators': 100}
0.866 (+/-0.035) for {'classifier__learning_rate': 0.2, 'classifier__n_estimators': 200}
0.866 (+/-0.03) for {'classifier__learning_rate': 0.2, 'classifier__n_estimators': 300}
0.866 (+/-0.042) for {'classifier__learning_rate': 1, 'classifier__n_estimators': 50}
0.874 (+/-0.033) for {'classifier__learning_rate': 1, 'classifier__n_estimators': 100}
0.849 (+/-0.044) for {'classifier__lea

In [31]:
# Pipeline refit by the grid search with the best-scoring parameter combination.
best_estimator_ab_pipe = cv_pipe_ab.best_estimator_
best_estimator_ab_pipe

Pipeline(steps=[('ss', StandardScaler()),
                ('classifier',
                 AdaBoostClassifier(learning_rate=1, n_estimators=100))])

In [32]:
# Predictions of the tuned pipeline for the validation split.
pred_pipe_ab = best_estimator_ab_pipe.predict(X_val)

In [33]:
# Accuracy of the tuned pipeline on the validation split
metrics.accuracy_score(y_val, pred_pipe_ab)

0.8458049886621315

In [34]:
# Accuracy of the tuned pipeline on the training split, for comparison with validation.
pred_train_pipe_ab = best_estimator_ab_pipe.predict(X_train)
metrics.accuracy_score(y_true=y_train, y_pred=pred_train_pipe_ab)

0.9261418853255587

In [35]:
# Per-class precision / recall / F1 of the tuned pipeline on the validation split.
# The printed label previously misspelled "Hyperparameter".
print('Adaboost with Hyperparameter Tuning\n',metrics.classification_report(y_val, pred_pipe_ab))

Adaboost with Hyperparamter Tuning
               precision    recall  f1-score   support

           0       0.89      0.93      0.91       370
           1       0.53      0.39      0.45        71

    accuracy                           0.85       441
   macro avg       0.71      0.66      0.68       441
weighted avg       0.83      0.85      0.84       441



In [36]:
# Confusion matrix and summary metrics for the tuned pipeline on the validation split.
print('\nConfusion Matrix')
tn, fp, fn, tp = metrics.confusion_matrix(y_val, pred_pipe_ab).ravel()
print(metrics.confusion_matrix(y_val, pred_pipe_ab))

# Rates derived straight from the confusion-matrix counts.
specificity = tn / (fp + tn)
fpr = fp / (fp + tn)

# The remaining scores are computed by scikit-learn.
accuracy = metrics.accuracy_score(y_true=y_val, y_pred=pred_pipe_ab)
balanced_accuracy = metrics.balanced_accuracy_score(y_true=y_val, y_pred=pred_pipe_ab)
recall_score = metrics.recall_score(y_true=y_val, y_pred=pred_pipe_ab)
precision_score = metrics.precision_score(y_true=y_val, y_pred=pred_pipe_ab)
f1_score = metrics.f1_score(y_true=y_val, y_pred=pred_pipe_ab)

print("\nMetrics on test data")
print(f'Recall Score : {round(recall_score, 2)}')
print(f'Specificity : {round(specificity, 2)}')
print(f'Precision Score : {round(precision_score, 2)}')
print(f'Accuracy: {round(accuracy, 2)}')
print(f'Balanced Accuracy: {round(balanced_accuracy, 2)}')
print(f'F1 Score : {round(f1_score, 2)}')



Confusion Matrix
[[345  25]
 [ 43  28]]

Metrics on test data
Recall Score : 0.39
Specificity : 0.93
Precision Score : 0.53
Accuracy: 0.85
Balanced Accuracy: 0.66
F1 Score : 0.45
