In [88]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df["attrition"] = df["Attrition"]

"""
removing "MonthlyIncome", "TotalWorkingYears", "YearsInCurrentRole", "YearsWithCurrentManager" 
because their correlation with "JobLevel" and "YearsAtCompany" >= 0.75 [found this in EDA.ipynb]
"""
df.drop(columns = ["Attrition", "MonthlyIncome", "TotalWorkingYears", "YearsInCurrentRole", "YearsWithCurrManager"], inplace = True)
df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,attrition
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,...,11,3,1,80,0,0,1,6,0,Yes
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,...,23,4,4,80,1,3,3,10,1,No
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,...,15,3,2,80,0,3,3,0,0,Yes
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,...,11,3,3,80,0,3,3,8,3,No
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,...,12,3,4,80,1,3,3,2,2,No


In [5]:
df.nunique()

Age                           43
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EmployeeCount                  1
EmployeeNumber              1470
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyRate                 1427
NumCompaniesWorked            10
Over18                         1
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsSinceLastPromotion       16
attrition 

In [6]:
"""
EmployeeCount, Over18, StandardHours has only one unique value. So this variables can be ignored/removed.
"""
df.drop(columns = ["EmployeeCount", "Over18", "StandardHours"], inplace = True)
df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,attrition
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,...,Yes,11,3,1,0,0,1,6,0,Yes
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,Male,...,No,23,4,4,1,3,3,10,1,No
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,Male,...,Yes,15,3,2,0,3,3,0,0,Yes
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,Female,...,Yes,11,3,3,0,3,3,8,3,No
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,Male,...,No,12,3,4,1,3,3,2,2,No


In [8]:
df["attrition"] = df["attrition"].astype('category').cat.codes
df["BusinessTravel"] = df["BusinessTravel"].astype('category').cat.codes
df["Department"] = df["Department"].astype('category').cat.codes
df["EducationField"] = df["EducationField"].astype('category').cat.codes
df["Gender"] = df["Gender"].astype('category').cat.codes
df["JobRole"] = df["JobRole"].astype('category').cat.codes
df["MaritalStatus"] = df["MaritalStatus"].astype('category').cat.codes
df["OverTime"] = df["OverTime"].astype('category').cat.codes
df

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,attrition
0,41,2,1102,2,1,2,1,1,2,0,...,1,11,3,1,0,0,1,6,0,1
1,49,1,279,1,8,1,1,2,3,1,...,0,23,4,4,1,3,3,10,1,0
2,37,2,1373,1,2,2,4,4,4,1,...,1,15,3,2,0,3,3,0,0,1
3,33,1,1392,1,3,4,1,5,4,0,...,1,11,3,3,0,3,3,8,3,0
4,27,2,591,1,2,1,3,7,1,1,...,0,12,3,4,1,3,3,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,1,884,1,23,2,3,2061,3,1,...,0,17,3,3,1,3,3,5,0,0
1466,39,2,613,1,6,1,3,2062,4,1,...,0,15,3,1,1,5,3,7,1,0
1467,27,2,155,1,4,3,1,2064,2,1,...,1,20,4,2,1,0,3,6,0,0
1468,49,1,1023,2,2,3,3,2065,4,1,...,0,14,3,4,0,3,2,9,0,0


In [9]:
x = np.array(df.iloc[:, :-1])
y = np.array(df.iloc[:, -1])
x.shape, y.shape

((1470, 27), (1470,))

In [15]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled.shape

(1470, 27)

# Logistic Regression

### without SMOTE

In [61]:
precision_recall_fscore_supports = []
model = LogisticRegression()
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [62]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

41.572462379664174


### with SMOTE

In [59]:
precision_recall_fscore_supports = []
model = LogisticRegression()
cv = KFold(n_splits = 10, random_state = 0)
smote = SMOTE()

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    x_smote, y_smote = smote.fit_resample(x_train, y_train)
    model.fit(x_smote, y_smote)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [60]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.36363636363636365, 0.7692307692307693, 0.4938271604938272, None)
(0.425, 0.8095238095238095, 0.5573770491803278, None)
(0.391304347826087, 0.8181818181818182, 0.5294117647058824, None)
(0.3333333333333333, 0.7142857142857143, 0.4545454545454545, None)
(0.4166666666666667, 0.6896551724137931, 0.5194805194805195, None)
(0.4318181818181818, 0.6333333333333333, 0.5135135135135135, None)
(0.3392857142857143, 0.8260869565217391, 0.4810126582278481, None)
(0.2916666666666667, 0.6363636363636364, 0.4, None)
(0.3333333333333333, 0.5833333333333334, 0.4242424242424242, None)
(0.3404255319148936, 0.8421052631578947, 0.48484848484848486, None)
48.58259029238282


# SVM 

### without SMOTE

In [66]:
precision_recall_fscore_supports = []
model = SVC(kernel = "rbf", class_weight = "balanced", probability = True)
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [67]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.5769230769230769, 0.5769230769230769, 0.5769230769230769, None)
(0.5185185185185185, 0.6666666666666666, 0.5833333333333334, None)
(0.43333333333333335, 0.5909090909090909, 0.5, None)
(0.3611111111111111, 0.6190476190476191, 0.45614035087719296, None)
(0.4523809523809524, 0.6551724137931034, 0.5352112676056338, None)
(0.5416666666666666, 0.43333333333333335, 0.4814814814814815, None)
(0.40625, 0.5652173913043478, 0.4727272727272727, None)
(0.4666666666666667, 0.6363636363636364, 0.5384615384615385, None)
(0.3611111111111111, 0.5416666666666666, 0.43333333333333335, None)
(0.32432432432432434, 0.631578947368421, 0.42857142857142855, None)
50.06183083314293


### with SMOTE

In [70]:
precision_recall_fscore_supports = []
model = SVC(kernel = "rbf")
cv = KFold(n_splits = 10, random_state = 0)
smote = SMOTE()

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    x_smote, y_smote = smote.fit_resample(x_train, y_train)
    model.fit(x_smote, y_smote)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [71]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.5238095238095238, 0.4230769230769231, 0.4680851063829788, None)
(0.5625, 0.42857142857142855, 0.4864864864864864, None)
(0.3684210526315789, 0.3181818181818182, 0.3414634146341463, None)
(0.4444444444444444, 0.5714285714285714, 0.5, None)
(0.45454545454545453, 0.5172413793103449, 0.4838709677419355, None)
(0.35294117647058826, 0.2, 0.25531914893617025, None)
(0.36, 0.391304347826087, 0.37499999999999994, None)
(0.45454545454545453, 0.45454545454545453, 0.45454545454545453, None)
(0.35, 0.2916666666666667, 0.31818181818181823, None)
(0.3684210526315789, 0.3684210526315789, 0.3684210526315789, None)
40.51373449540569


# Decision Tree

### without SMOTE

In [73]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.3, random_state = 0)
x_train.shape, x_test.shape

((1029, 27), (441, 27))

In [77]:
# pre prunning 
model = DecisionTreeClassifier(random_state = 0)
param_grid = {"max_depth" : [8, 12, 16, 20]}

gs = GridSearchCV(estimator = model, param_grid = param_grid)
gs.fit(x_train, y_train)
print("best parameters:", gs.best_estimator_)
print("best score:", gs.best_score_*100)

best parameters: DecisionTreeClassifier(class_weight='balanced', max_depth=16, random_state=0)
best score: 78.81458678664457


In [78]:
precision_recall_fscore_supports = []
model = DecisionTreeClassifier(max_depth = 8, class_weight = "balanced")
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [79]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.2857142857142857, 0.23076923076923078, 0.25531914893617025, None)
(0.35714285714285715, 0.47619047619047616, 0.40816326530612246, None)
(0.36666666666666664, 0.5, 0.423076923076923, None)
(0.2826086956521739, 0.6190476190476191, 0.3880597014925373, None)
(0.34146341463414637, 0.4827586206896552, 0.4000000000000001, None)
(0.25, 0.23333333333333334, 0.2413793103448276, None)
(0.3103448275862069, 0.391304347826087, 0.34615384615384615, None)
(0.25, 0.36363636363636365, 0.2962962962962963, None)
(0.42424242424242425, 0.5833333333333334, 0.4912280701754386, None)
(0.3333333333333333, 0.3684210526315789, 0.35, None)
35.996765617821616


### with SMOTE

In [80]:
precision_recall_fscore_supports = []
model = DecisionTreeClassifier(max_depth = 8, class_weight = "balanced")
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    x_smote, y_smote = smote.fit_resample(x_train, y_train)
    model.fit(x_smote, y_smote)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [81]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.39285714285714285, 0.4230769230769231, 0.4074074074074074, None)
(0.4166666666666667, 0.47619047619047616, 0.4444444444444445, None)
(0.3181818181818182, 0.3181818181818182, 0.3181818181818182, None)
(0.3, 0.42857142857142855, 0.3529411764705882, None)
(0.4411764705882353, 0.5172413793103449, 0.47619047619047616, None)
(0.391304347826087, 0.3, 0.33962264150943394, None)
(0.39285714285714285, 0.4782608695652174, 0.4313725490196078, None)
(0.3333333333333333, 0.4090909090909091, 0.36734693877551017, None)
(0.4230769230769231, 0.4583333333333333, 0.43999999999999995, None)
(0.20588235294117646, 0.3684210526315789, 0.2641509433962264, None)
38.416583953955126


# Random Forest

### without SMOTE

In [82]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.3, random_state = 0)
x_train.shape, x_test.shape

((1029, 27), (441, 27))

In [83]:
# pre prunning 
model = RandomForestClassifier()
param_grid = {"max_depth" : [8, 12, 16, 20],
             "class_weight" : ["balanced", "balanced_subsample"]}

gs = GridSearchCV(estimator = model, param_grid = param_grid)
gs.fit(x_train, y_train)
print("best parameters:", gs.best_estimator_)
print("best score:", gs.best_score_*100)

best parameters: RandomForestClassifier(class_weight='balanced', max_depth=8)
best score: 85.7148946246744


In [84]:
precision_recall_fscore_supports = []
model = RandomForestClassifier(max_depth = 8, class_weight = "balanced")
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [85]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.625, 0.19230769230769232, 0.29411764705882354, None)
(0.8, 0.19047619047619047, 0.3076923076923077, None)
(0.6153846153846154, 0.36363636363636365, 0.4571428571428572, None)
(0.625, 0.23809523809523808, 0.3448275862068965, None)
(0.7692307692307693, 0.3448275862068966, 0.47619047619047616, None)
(0.5, 0.16666666666666666, 0.25, None)
(0.6923076923076923, 0.391304347826087, 0.5, None)
(0.7777777777777778, 0.3181818181818182, 0.45161290322580644, None)
(0.75, 0.125, 0.21428571428571427, None)
(0.5, 0.2631578947368421, 0.3448275862068966, None)
36.40697078009779


### with SMOTE

In [86]:
precision_recall_fscore_supports = []
model = RandomForestClassifier(max_depth = 8, class_weight = "balanced")
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    x_smote, y_smote = smote.fit_resample(x_train, y_train)
    model.fit(x_smote, y_smote)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [87]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.9166666666666666, 0.4230769230769231, 0.5789473684210527, None)
(0.5714285714285714, 0.38095238095238093, 0.4571428571428571, None)
(0.5, 0.3181818181818182, 0.3888888888888889, None)
(0.8333333333333334, 0.47619047619047616, 0.6060606060606061, None)
(0.5789473684210527, 0.3793103448275862, 0.45833333333333337, None)
(0.45454545454545453, 0.16666666666666666, 0.2439024390243902, None)
(0.5217391304347826, 0.5217391304347826, 0.5217391304347826, None)
(0.5333333333333333, 0.36363636363636365, 0.43243243243243246, None)
(0.5333333333333333, 0.3333333333333333, 0.4102564102564102, None)
(0.5, 0.42105263157894735, 0.45714285714285713, None)
45.54846323137611


# XGBoost

### without SMOTE

In [89]:
precision_recall_fscore_supports = []
model = XGBClassifier(n_estimators = 100, max_depth = 8)
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(x_train, y_train)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [90]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.6666666666666666, 0.23076923076923078, 0.3428571428571429, None)
(0.75, 0.14285714285714285, 0.24, None)
(0.6923076923076923, 0.4090909090909091, 0.5142857142857142, None)
(0.5555555555555556, 0.23809523809523808, 0.33333333333333326, None)
(0.6470588235294118, 0.3793103448275862, 0.4782608695652174, None)
(0.4444444444444444, 0.13333333333333333, 0.20512820512820512, None)
(0.6153846153846154, 0.34782608695652173, 0.4444444444444444, None)
(0.6666666666666666, 0.2727272727272727, 0.3870967741935484, None)
(0.6666666666666666, 0.25, 0.36363636363636365, None)
(0.6363636363636364, 0.3684210526315789, 0.4666666666666667, None)
37.75709514110636


### with SMOTE

In [93]:
precision_recall_fscore_supports = []
model = XGBClassifier(n_estimators = 100, max_depth = 8)
cv = KFold(n_splits = 10, random_state = 0)

for train_index, test_index in cv.split(x_scaled):
    #print(train_index, test_index)
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    x_smote, y_smote = smote.fit_resample(x_train, y_train)
    model.fit(x_smote, y_smote)
    precision_recall_fscore_supports.append(precision_recall_fscore_support(y_test, model.predict(x_test), average = "binary"))



In [94]:
f_scores = []
for prfs in precision_recall_fscore_supports:
    print(prfs)
    f_scores.append(prfs[2])
print(np.mean(np.array(f_scores)) * 100)

(0.6666666666666666, 0.3076923076923077, 0.42105263157894735, None)
(0.6666666666666666, 0.38095238095238093, 0.4848484848484849, None)
(0.7142857142857143, 0.45454545454545453, 0.5555555555555556, None)
(0.6363636363636364, 0.3333333333333333, 0.43749999999999994, None)
(0.6842105263157895, 0.4482758620689655, 0.5416666666666666, None)
(0.5454545454545454, 0.2, 0.29268292682926833, None)
(0.4090909090909091, 0.391304347826087, 0.4, None)
(0.5333333333333333, 0.36363636363636365, 0.43243243243243246, None)
(0.6666666666666666, 0.25, 0.36363636363636365, None)
(0.6, 0.47368421052631576, 0.5294117647058824, None)
44.58786826253601
