## Import Libraries and final CSV file

In [20]:
import pandas as pd
import matplotlib as plt
import numpy as np
import time
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

#import data 
data = pd.read_csv("employee_final.csv")

# Check data type
print (data.dtypes)

#display data
print('\n',data.shape)
data.head()

Age                         int64
Attrition                   int64
BusinessTravel              int64
Department                  int64
DistanceFromHome            int64
Education                   int64
EducationField              int64
EnvironmentSatisfaction     int64
Gender                      int64
HourlyRate                  int64
JobInvolvement              int64
JobLevel                    int64
JobRole                     int64
JobSatisfaction             int64
MaritalStatus               int64
MonthlyIncome               int64
MonthlyRate                 int64
NumCompaniesWorked          int64
OverTime                    int64
PercentSalaryHike           int64
PerformanceRating           int64
RelationshipSatisfaction    int64
StockOptionLevel            int64
TotalWorkingYears           int64
TrainingTimesLastYear       int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsInCurrentRole          int64
YearsSinceLastPromotion     int64
YearsWithCurrM

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,2,1,2,1,2,0,94,...,3,1,0,8,0,1,6,4,0,5
1,49,0,1,1,8,1,1,3,0,61,...,4,4,1,10,3,3,10,7,1,7
2,37,1,2,1,2,2,5,4,0,92,...,3,2,0,7,3,3,0,0,0,0
3,33,0,1,1,3,4,1,4,0,56,...,3,3,0,8,3,3,8,7,3,0
4,27,0,2,1,2,1,3,1,0,40,...,3,4,1,6,3,3,2,2,2,2


## Random Forest Classifier

In [15]:
# Split dataset into training and test sets
y = data['Attrition']
X = data.drop(columns=['Attrition'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# Fit Random Forest classifier on training set
random_forest = RandomForestClassifier(n_estimators=200, max_depth=5, max_features= 4, random_state=42)

# Print top 10 Features
random_forest1 = random_forest.fit(X_train,y_train)
top10_features = pd.Series(random_forest1.feature_importances_, index=X.columns).nlargest(10)
print('\n 10 most important features:\n',top10_features)

# Add Grid
param_grid = {'n_estimators': [200],'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8],'criterion' :['gini', 'entropy']}
new_tune = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv= 5)
new_tune.fit(X_train,y_train)
c_score3 = cross_val_score(new_tune, X_train, y_train, cv=2)
print('Accuracy of classifier: ', c_score3.mean())

print('\n Best value for each of the tested parameters:',new_tune.best_params_)
print('\n and accuracy of the model with these best values is',new_tune.best_score_)


 10 most important features:
 Age                   0.095620
OverTime              0.080204
TotalWorkingYears     0.074995
MonthlyIncome         0.066279
DistanceFromHome      0.054946
HourlyRate            0.054791
NumCompaniesWorked    0.052368
YearsAtCompany        0.046083
MonthlyRate           0.045825
StockOptionLevel      0.040250
dtype: float64




Accuracy of classifier:  0.8299319727891157

 Best value for each of the tested parameters: {'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto', 'n_estimators': 200}

 and accuracy of the model with these best values is 0.826530612244898


#### Once we obtain the best parameters, we pass these values to the classifier, and fit it to our training data. Then we compare the accuracy with our testing data

In [16]:
rfc1=RandomForestClassifier(random_state=42, criterion = 'entropy', max_depth = 7, max_features = 'auto', n_estimators = 200)
rfc1.fit(X_train, y_train)
pred=rfc1.predict(X_test)
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))

Accuracy for Random Forest on CV data:  0.8622448979591837


In [17]:
# Finally we print the confusion matrix and classification report for analysis

# Print confusion matrix 
print('\n Confusion Matrix \n',confusion_matrix(y_test,pred))

# Print classification report
print('\n Classification report \n',classification_report(y_test,pred))


 Confusion Matrix 
 [[991   6]
 [156  23]]

 Classification report 
               precision    recall  f1-score   support

           0       0.86      0.99      0.92       997
           1       0.79      0.13      0.22       179

    accuracy                           0.86      1176
   macro avg       0.83      0.56      0.57      1176
weighted avg       0.85      0.86      0.82      1176



# KNN Classifier

In [23]:
# define a pipeline to search for best combination of PCA dimensions and n_neighbors
scaler = MinMaxScaler()
pca = PCA()
knn = KNeighborsClassifier()

# create a pipeline
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])

# set up parameters to tune for each step in pipeline
param_grid = {
    'pca__n_components': list(range(1, 19)), # find how many principal componenet to keep
    'knn__n_neighbors': list(range(1, 30)),  # find the best value of k
}

# pass pipeline into gridsearchcv
grid_pipe = GridSearchCV(pipe,param_grid,cv=5)

# call fit on grid_pipe and pass in unscaled data
grid_pipe = grid_pipe.fit(X,y)

# print out the best_score_ and best_params_ from the GridSearchCV
print("best_score",grid_pipe.best_score_)
print("best_params",grid_pipe.best_params_)

# display accuracy on model
scores = cross_val_score(grid_pipe,X,y,cv=3,verbose=2)
print("Accuracy:", scores.mean()*100)

best_score 0.8537414965986394
best_params {'knn__n_neighbors': 13, 'pca__n_components': 4}
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=  42.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.8s remaining:    0.0s


[CV] ................................................. , total=  49.5s
[CV]  ................................................................
[CV] ................................................. , total=  43.1s
Accuracy: 85.10204081632652


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min finished


In [31]:
knn = KNeighborsClassifier()
y_pred = cross_val_predict(knn,X,y,cv=3)
print("Confusion matrix:\n",confusion_matrix(y,y_pred))
print("Classification Report:\n",classification_report(y,y_pred))

Confusion matrix:
 [[1194   39]
 [ 217   20]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.97      0.90      1233
           1       0.34      0.08      0.14       237

    accuracy                           0.83      1470
   macro avg       0.59      0.53      0.52      1470
weighted avg       0.76      0.83      0.78      1470



# Decision Tree Classifier

In [32]:
# Fit a decision tree classifier on the training set
decision_tree = DecisionTreeClassifier(random_state=0)

# Add Grid
new_grid = {'max_depth':[5,10,15,20], 'min_samples_leaf':[5,10,15,20], 'max_features':[5,10,15]}
new_tune2 = GridSearchCV(estimator = decision_tree, param_grid = new_grid, cv = 5)
new_tune2.fit(X_train,y_train)

# Run Cross Validation loop and print accuracy
c_score = cross_val_score(decision_tree, X_train, y_train, cv=10)
print('Accuracy of classifier: ', c_score.mean())

print('\n Best value for each of the tested parameters:',new_tune2.best_params_)
print('\n and accuracy of the model with these best values is',new_tune2.best_score_)

Accuracy of classifier:  0.7489737274220032

 Best value for each of the tested parameters: {'max_depth': 5, 'max_features': 5, 'min_samples_leaf': 5}

 and accuracy of the model with these best values is 0.8231292517006803




In [35]:
# Once we obtain the best parameters, we pass these values to the classifier, and fit it to our training data. 
# Then we compare the accuracy with our testing data

dt = DecisionTreeClassifier(min_samples_leaf = 5, max_depth = 5, max_features = 5)
dt.fit(X_train, y_train)
pred1 = dt.predict(X_test)
print("Accuracy for Decision Trees on CV data: ",accuracy_score(y_test,pred1))

Accuracy for Decision Trees on CV data:  0.8061224489795918


In [37]:
# Print confusion matrix 
print('\n Confusion Matrix \n',confusion_matrix(y_test,pred1))

# Print classification report
print('\n Classification report \n',classification_report(y_test,pred1))


 Confusion Matrix 
 [[889 108]
 [120  59]]

 Classification report 
               precision    recall  f1-score   support

           0       0.88      0.89      0.89       997
           1       0.35      0.33      0.34       179

    accuracy                           0.81      1176
   macro avg       0.62      0.61      0.61      1176
weighted avg       0.80      0.81      0.80      1176

