In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from urllib.request import urlretrieve

In [2]:
url="https://assets.datacamp.com/production/repositories/1765/datasets/ae888d00f9b36dd7d50a4afbc112761e2db766d2/turnover.csv"
urlretrieve(url,"hr.csv")
hr=pd.read_csv("hr.csv")
hr.head()

Unnamed: 0,satisfaction,evaluation,number_of_projects,average_montly_hours,time_spend_company,work_accident,churn,promotion,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction            14999 non-null float64
evaluation              14999 non-null float64
number_of_projects      14999 non-null int64
average_montly_hours    14999 non-null int64
time_spend_company      14999 non-null int64
work_accident           14999 non-null int64
churn                   14999 non-null int64
promotion               14999 non-null int64
department              14999 non-null object
salary                  14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [4]:
hr.department.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [5]:
hr.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

In [6]:
hr.salary= hr.salary.astype('category')
hr.salary = hr.salary.cat.reorder_categories(['low', 'medium', 'high'])
hr.salary = hr.salary.cat.codes

In [7]:
departments=pd.get_dummies(hr.department)
departments.head()

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0


In [8]:
departments=departments.drop("accounting",axis=1)

In [9]:
hr=hr.drop("department",axis=1)

In [10]:
print(hr.join(departments).head())

   satisfaction  evaluation  number_of_projects  average_montly_hours  \
0          0.38        0.53                   2                   157   
1          0.80        0.86                   5                   262   
2          0.11        0.88                   7                   272   
3          0.72        0.87                   5                   223   
4          0.37        0.52                   2                   159   

   time_spend_company  work_accident  churn  promotion  salary  IT  RandD  hr  \
0                   3              0      1          0       0   0      0   0   
1                   6              0      1          0       1   0      0   0   
2                   4              0      1          0       1   0      0   0   
3                   5              0      1          0       0   0      0   0   
4                   3              0      1          0       0   0      0   0   

   management  marketing  product_mng  sales  support  technical  
0      

In [11]:
n_employee=len(hr)

In [12]:
print(hr.churn.value_counts())

0    11428
1     3571
Name: churn, dtype: int64


In [13]:
print(hr.churn.value_counts()/n_employee *100)

0    76.191746
1    23.808254
Name: churn, dtype: float64


In [14]:
y=hr.churn
X=hr.drop("churn",axis=1)

# Predicting Employee turnover

In [15]:
from sklearn.model_selection import train_test_split as tts

In [16]:
X_train,X_test,y_train,y_test=tts(X,y,test_size=.25,random_state=42)

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
model=DecisionTreeClassifier(random_state=42)

In [19]:
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [20]:
model.score(X_train,y_train)

1.0

In [21]:
model.score(X_test,y_test)

0.9752

# Pruning model

In [22]:
model_depth5=DecisionTreeClassifier(max_depth=5,random_state= 42)
model_depth5.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [23]:
print("Model accuracy Score on training set:",model_depth5.score(X_train,y_train)*100)
print("Model accuracy Score on testing set:",model_depth5.score(X_test,y_test)*100)

Model accuracy Score on training set: 97.68868343852787
Model accuracy Score on testing set: 97.11999999999999


In [24]:
model_sample_100=DecisionTreeClassifier(min_samples_leaf=100, random_state=42)
model_sample_100.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [25]:
print("Model accuracy score on training set:", model_sample_100.score(X_train,y_train)*100)
print("Model accuracy score on testing set:", model_sample_100.score(X_test,y_test)*100)

Model accuracy score on training set: 96.57747355320473
Model accuracy score on testing set: 96.13333333333334


In [26]:
from sklearn.metrics import precision_score

In [27]:
predictions=model.predict(X_test)

In [28]:
print("Precision Score:", precision_score(y_test,predictions))

Precision Score: 0.935064935064935


In [29]:
from sklearn.metrics import recall_score

In [30]:
predictions_recall=model.predict(X_test)

In [31]:
print("Recall Score:", recall_score(y_test,predictions_recall))

Recall Score: 0.9632107023411371


In [32]:
from sklearn.metrics import roc_auc_score

In [33]:
predictions3=model.predict(X_test)

In [34]:
print("ROC AUC Score:",roc_auc_score(y_test,predictions3))

ROC AUC Score: 0.9710901040622616


In [35]:
model_bal=DecisionTreeClassifier(max_depth=5,class_weight="balanced",random_state=42)
model_bal.fit(X_train,y_train)
model_bal_pred=model_bal.predict(X_test)

In [36]:
print("Accuracy Score of balanced class:", model_bal.score(X_test,y_test)*100)
print("Recall score of balanced class:", recall_score(y_test,model_bal_pred)*100)
print("ROC AUC of balanced class:",roc_auc_score(y_test,model_bal_pred)*100)

Accuracy Score of balanced class: 93.57333333333332
Recall score of balanced class: 93.19955406911929
ROC AUC of balanced class: 93.44520290206752


In [37]:
model_bal_p=DecisionTreeClassifier(max_depth=7,class_weight="balanced",random_state=42)
model_bal_p.fit(X_train,y_train)
model4=model_bal_p.predict(X_test)

In [38]:
print("Accuracy Score of balanced class:", model_bal_p.score(X_test,y_test)*100)
print("Recall score of balanced class:", recall_score(y_test,model4)*100)
print("ROC AUC of balanced class:",roc_auc_score(y_test,model4)*100)

Accuracy Score of balanced class: 97.44
Recall score of balanced class: 93.31103678929766
ROC AUC of balanced class: 96.0246035681504


# Hypertuning parameter

In [39]:
from sklearn.model_selection import cross_val_score

In [40]:
print(cross_val_score(model,X,y,cv=10))

[0.98334444 0.982      0.97333333 0.96266667 0.95733333 0.974
 0.988      0.992      1.         1.        ]


In [41]:
depth = [i for i in range(5,21,1)]
samples = [i for i in range(50 ,500,50)]

In [42]:
parameters = dict(max_depth=depth, min_samples_leaf= samples)

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
param_search=GridSearchCV(model,parameters)

In [45]:
param_search.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                       16, 17, 18, 19, 

In [46]:
print("Best parameter:",param_search.best_params_)
print("Best Score :",param_search.best_score_)

Best parameter: {'max_depth': 5, 'min_samples_leaf': 50}
Best Score : 0.9670192906036092


# Feature Importance

In [47]:
model_best=DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=150, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
model_best.fit(X_train,y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=150, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [48]:
feature_importance=model_best.feature_importances_

In [49]:
feature_list=list(X)

In [51]:
relative_importances = pd.DataFrame(index=feature_list, data=feature_importance, columns=["importance"])

In [52]:
relative_importances.sort_values(by="importance", ascending=False)

Unnamed: 0,importance
satisfaction,0.481711
time_spend_company,0.358765
evaluation,0.088027
average_montly_hours,0.038531
number_of_projects,0.032966
work_accident,0.0
promotion,0.0
salary,0.0


In [55]:
selected_feature=relative_importances[relative_importances.values>.01]

In [56]:
selected_list=selected_feature.index

In [57]:
X_train_selected=X[selected_list]
X_test_selected=X[selected_list]

In [61]:
print("Importance feature:\n",X_train_selected.head(2))

Importance feature:
    satisfaction  evaluation  number_of_projects  average_montly_hours  \
0          0.38        0.53                   2                   157   
1          0.80        0.86                   5                   262   

   time_spend_company  
0                   3  
1                   6  


In [62]:
print("Importance feature:\n",X_test_selected.head(2))

Importance feature:
    satisfaction  evaluation  number_of_projects  average_montly_hours  \
0          0.38        0.53                   2                   157   
1          0.80        0.86                   5                   262   

   time_spend_company  
0                   3  
1                   6  


# Develop and test best model

In [67]:
model_best=DecisionTreeClassifier(max_depth=8,min_samples_leaf=150,class_weight="balanced",random_state=42)

In [78]:
model_best.fit(X_train_selected, y_train)

In [77]:
best_prediction=model_best.predict(X_test_selected)

In [80]:
print(model_best.score(X_test_selected, y_test) * 100)
print(recall_score(y_test, prediction_best) * 100)
print(roc_auc_score(y_test, prediction_best) * 100)

In [None]:
importances = pd.Series(data=feature_importance,
                        index= X_train.columns)

# Sort importances
importances_sorted = importances.sort_values()

# Draw a horizontal barplot of importances_sorted
importances_sorted.plot(kind='barh', color = 'lightgreen')
plt.title('Features Importances')
plt.show()