In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Location of the attrition CSV, one directory above the notebook's working directory.
path = os.path.join(os.pardir, "WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
# Read the CSV at `path` into the working DataFrame.
df_td = pd.read_csv(path)

In [4]:
# Preview the first rows of the loaded frame.
df_td.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
# List every column with its non-null count and dtype.
df_td.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df_td.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


#### Dropped the columns below (EmployeeCount, Over18, StandardHours) because each has only one value across all rows. EmployeeNumber is dropped as well.

In [7]:
# Drop the columns the notebook identifies as single-valued (EmployeeCount, Over18,
# StandardHours) together with EmployeeNumber.
# The frame is re-assigned instead of mutated with `inplace=True`, and only the
# columns that are still present are dropped, so the cell can be re-run safely:
# the in-place version raised KeyError on a second execution because the
# columns were already gone.
cols_to_drop = ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
df_td = df_td.drop(columns=[col for col in cols_to_drop if col in df_td.columns])

#### Attrition is encoded as 1 for Yes (the employee left the company) and 0 for No (the employee stayed).

In [8]:
# Encode the target as integers: 1 where Attrition == 'Yes', 0 otherwise.
# The encoding is applied only while the column is still non-numeric. Without
# this guard, re-running the cell compares the already-encoded integers with
# 'Yes' and silently turns every label into 0.
if not pd.api.types.is_numeric_dtype(df_td['Attrition']):
    df_td['Attrition'] = np.where(df_td['Attrition'] == 'Yes', 1, 0)

In [9]:
# Share of each target class, as a percentage of all rows.
df_td['Attrition'].value_counts().div(len(df_td)).mul(100)

0    83.877551
1    16.122449
Name: Attrition, dtype: float64

In [10]:
# Categorical columns that are expanded into indicator columns below.
cat_feats = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]

In [11]:
# One-hot encode the categorical columns; all other columns pass through unchanged.
df_new = pd.get_dummies(data=df_td, columns=cat_feats)

In [12]:
# List the columns of the encoded frame with their non-null counts and dtypes.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 52 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Age                                1470 non-null   int64
 1   Attrition                          1470 non-null   int32
 2   DailyRate                          1470 non-null   int64
 3   DistanceFromHome                   1470 non-null   int64
 4   Education                          1470 non-null   int64
 5   EnvironmentSatisfaction            1470 non-null   int64
 6   HourlyRate                         1470 non-null   int64
 7   JobInvolvement                     1470 non-null   int64
 8   JobLevel                           1470 non-null   int64
 9   JobSatisfaction                    1470 non-null   int64
 10  MonthlyIncome                      1470 non-null   int64
 11  MonthlyRate                        1470 non-null   int64
 12  NumCompaniesWorked  

### Preparing data for machine learning

In [13]:
# Separate the target column from the feature matrix, then show the feature shape.
y = df_new['Attrition']
X = df_new.drop(columns='Attrition')
X.shape

(1470, 51)

In [14]:
# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=2021,
)

In [15]:
# Standardize the features: the scaler is fitted on the training split only and
# then applied to both the training and validation splits.
ss = StandardScaler().fit(X_train)
X_train, X_val = ss.transform(X_train), ss.transform(X_val)

In [16]:
def print_results(results):
    """Print the best parameters and the per-candidate scores of a fitted search.

    `results` must expose `best_params_` and a `cv_results_` mapping with the
    keys 'mean_test_score', 'std_test_score' and 'params' (as a fitted
    GridSearchCV does). Each candidate is printed as its mean score rounded to
    3 decimals, followed by twice its standard deviation rounded to 3 decimals,
    followed by its parameter dict.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv_table = results.cv_results_
    mean_scores = cv_table['mean_test_score']
    score_stds = cv_table['std_test_score']
    candidates = cv_table['params']
    for idx, (avg, spread) in enumerate(zip(mean_scores, score_stds)):
        if idx >= len(candidates):
            break
        print(f'{round(avg, 3)} (+/-{round(spread * 2, 3)}) for {candidates[idx]}')

## Support vector machine Classifier

#### Fit RBF Kernel SVM Classifier

In [17]:
from sklearn.svm import SVC

In [18]:
# Support vector classifier with an RBF kernel; every other hyperparameter keeps its default.
svc_rbf = SVC(kernel="rbf")

In [19]:
# Display the estimator's repr.
svc_rbf

SVC()

In [20]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
cv_svc = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

In [21]:
# Fit the untuned RBF SVC on the standardized training split.
svc_rbf.fit(X=X_train, y=y_train)

SVC()

In [22]:
# Predict labels for the held-out validation split.
y_pred_svc_rbf = svc_rbf.predict(X=X_val)

In [23]:
# Accuracy of the untuned model on the validation split.
metrics.accuracy_score(y_true=y_val, y_pred=y_pred_svc_rbf)

0.8707482993197279

In [24]:
# Predict labels for the training split, used for the training accuracy.
pred_train_svc_rbf = svc_rbf.predict(X=X_train)

In [25]:
# Accuracy of the untuned model on the training split.
metrics.accuracy_score(y_true=y_train, y_pred=pred_train_svc_rbf)

0.9154518950437318

In [26]:
# Per-class precision / recall / F1 of the untuned model on the validation split.
report_svc_rbf = metrics.classification_report(y_val, y_pred_svc_rbf)
print('SVC-Fit RBF Kernel SVM Classifier\n', report_svc_rbf)

SVC-Fit RBF Kernel SVM Classifier
               precision    recall  f1-score   support

           0       0.87      0.99      0.93       370
           1       0.89      0.23      0.36        71

    accuracy                           0.87       441
   macro avg       0.88      0.61      0.64       441
weighted avg       0.87      0.87      0.84       441



In [27]:
# Evaluate the untuned RBF SVC on the held-out split. The confusion matrix is
# built once and reused for the printout and for the specificity calculation.
print('\nConfusion Matrix')
cm_svc_rbf = metrics.confusion_matrix(y_val, y_pred_svc_rbf)
tn, fp, fn, tp = cm_svc_rbf.ravel()
print(cm_svc_rbf)

recall_score = metrics.recall_score(y_val, y_pred_svc_rbf)
specificity = tn / (tn + fp)  # true-negative rate
precision_score = metrics.precision_score(y_val, y_pred_svc_rbf)
accuracy = metrics.accuracy_score(y_val, y_pred_svc_rbf)
balanced_accuracy = metrics.balanced_accuracy_score(y_val, y_pred_svc_rbf)
f1_score = metrics.f1_score(y_val, y_pred_svc_rbf)

print("\nMetrics on test data")
for metric_label, metric_value in [
    ('Recall Score :', recall_score),
    ('Specificity :', specificity),
    ('Precision Score :', precision_score),
    ('Accuracy:', accuracy),
    ('Balanced Accuracy:', balanced_accuracy),
    ('F1 Score :', f1_score),
]:
    print(metric_label, round(metric_value, 2))


Confusion Matrix
[[368   2]
 [ 55  16]]

Metrics on test data
Recall Score : 0.23
Specificity : 0.99
Precision Score : 0.89
Accuracy: 0.87
Balanced Accuracy: 0.61
F1 Score : 0.36


# Grid Search: Hyperparameter Tuning

In [28]:
# Exhaustive search over 5 gamma values x 4 C values (20 candidates) for the
# RBF SVC, cross-validated with the `cv_svc` splitter and the default scoring.
parameters = {
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'C': [1, 10, 100, 1000],
}
svc_rbf_gs = GridSearchCV(estimator=svc_rbf, param_grid=parameters, cv=cv_svc)
svc_rbf_gs.fit(X_train, y_train)

# Per-candidate scores first, then the winning configuration.
print_results(svc_rbf_gs)
print('Best score for data:', svc_rbf_gs.best_score_)
print('Best C:', svc_rbf_gs.best_estimator_.C)
print('Best Gamma:', svc_rbf_gs.best_estimator_.gamma)

BEST PARAMS: {'C': 1000, 'gamma': 0.0001}

0.839 (+/-0.004) for {'C': 1, 'gamma': 1}
0.839 (+/-0.004) for {'C': 1, 'gamma': 0.1}
0.849 (+/-0.014) for {'C': 1, 'gamma': 0.01}
0.839 (+/-0.004) for {'C': 1, 'gamma': 0.001}
0.839 (+/-0.004) for {'C': 1, 'gamma': 0.0001}
0.839 (+/-0.004) for {'C': 10, 'gamma': 1}
0.838 (+/-0.005) for {'C': 10, 'gamma': 0.1}
0.856 (+/-0.049) for {'C': 10, 'gamma': 0.01}
0.872 (+/-0.019) for {'C': 10, 'gamma': 0.001}
0.839 (+/-0.004) for {'C': 10, 'gamma': 0.0001}
0.839 (+/-0.004) for {'C': 100, 'gamma': 1}
0.838 (+/-0.005) for {'C': 100, 'gamma': 0.1}
0.846 (+/-0.059) for {'C': 100, 'gamma': 0.01}
0.858 (+/-0.035) for {'C': 100, 'gamma': 0.001}
0.872 (+/-0.014) for {'C': 100, 'gamma': 0.0001}
0.839 (+/-0.004) for {'C': 1000, 'gamma': 1}
0.838 (+/-0.005) for {'C': 1000, 'gamma': 0.1}
0.846 (+/-0.059) for {'C': 1000, 'gamma': 0.01}
0.837 (+/-0.049) for {'C': 1000, 'gamma': 0.001}
0.873 (+/-0.029) for {'C': 1000, 'gamma': 0.0001}
Best score for data: 0.87271134

In [29]:
# Keep a handle on the estimator selected by the grid search and display it.
best_estimator_svc_gs = svc_rbf_gs.best_estimator_
best_estimator_svc_gs

SVC(C=1000, gamma=0.0001)

In [30]:
# Predictions of the tuned estimator on the validation split ...
y_pred_svc_rbf_gs = best_estimator_svc_gs.predict(X=X_val)
# ... and on the training split.
pred_train_svc_rbf_gs = best_estimator_svc_gs.predict(X=X_train)

In [31]:
# Accuracy of the tuned model on the validation split.
metrics.accuracy_score(y_true=y_val, y_pred=y_pred_svc_rbf_gs)

0.8843537414965986

In [32]:
# Accuracy of the tuned model on the training split.
metrics.accuracy_score(y_true=y_train, y_pred=pred_train_svc_rbf_gs)

0.8989310009718173

In [33]:
# Per-class precision / recall / F1 of the tuned model on the validation split.
report_svc_rbf_gs = metrics.classification_report(y_val, y_pred_svc_rbf_gs)
print('SVC-Fit RBF Kernel SVM Classifier with tuning\n', report_svc_rbf_gs)

SVC-Fit RBF Kernel SVM Classifier with tuning
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       370
           1       0.75      0.42      0.54        71

    accuracy                           0.88       441
   macro avg       0.82      0.70      0.74       441
weighted avg       0.87      0.88      0.87       441



In [34]:
# Evaluate the tuned RBF SVC on the held-out split. The confusion matrix is
# built once and reused for the printout and for the specificity calculation.
print('\nConfusion Matrix')
cm_svc_rbf_gs = metrics.confusion_matrix(y_val, y_pred_svc_rbf_gs)
tn, fp, fn, tp = cm_svc_rbf_gs.ravel()
print(cm_svc_rbf_gs)

recall_score = metrics.recall_score(y_val, y_pred_svc_rbf_gs)
specificity = tn / (tn + fp)  # true-negative rate
precision_score = metrics.precision_score(y_val, y_pred_svc_rbf_gs)
accuracy = metrics.accuracy_score(y_val, y_pred_svc_rbf_gs)
balanced_accuracy = metrics.balanced_accuracy_score(y_val, y_pred_svc_rbf_gs)
f1_score = metrics.f1_score(y_val, y_pred_svc_rbf_gs)

print("\nMetrics on test data")
for metric_label, metric_value in [
    ('Recall Score :', recall_score),
    ('Specificity :', specificity),
    ('Precision Score :', precision_score),
    ('Accuracy:', accuracy),
    ('Balanced Accuracy:', balanced_accuracy),
    ('F1 Score :', f1_score),
]:
    print(metric_label, round(metric_value, 2))


Confusion Matrix
[[360  10]
 [ 41  30]]

Metrics on test data
Recall Score : 0.42
Specificity : 0.97
Precision Score : 0.75
Accuracy: 0.88
Balanced Accuracy: 0.7
F1 Score : 0.54
