In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate,cross_val_score,KFold,StratifiedKFold,GridSearchCV

### Datasets

In [16]:
HR = pd.read_csv('C:/Users/Nithin/Downloads/ML_DT/HR_comma_sep.csv')

### Data Prep

In [17]:
HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [18]:
HR.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64

In [19]:
#Ordinal Label Encoding
HR.salary = np.where(HR.salary == 'low',1,
            np.where(HR.salary == 'medium',2,3))

In [20]:
#One hot encoding
HR.department.value_counts()

dummy_df = pd.get_dummies(HR.department,drop_first=True,prefix='Dept')

In [21]:
HR = pd.concat([HR,dummy_df],axis=1)
del HR['department']

### Train & Test Model

In [22]:
feat = ['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'Dept_RandD', 'Dept_accounting',
       'Dept_hr', 'Dept_management', 'Dept_marketing', 'Dept_product_mng',
       'Dept_sales', 'Dept_support', 'Dept_technical']

In [23]:
train_x,test_x,train_y,test_y = train_test_split(HR[feat],HR['left'],test_size=0.3,random_state=12345)

### Buildiing a Model

In [24]:
#Init Model
DTC = DecisionTreeClassifier()

#Fit Model on train data
DTC.fit(train_x,train_y)

DecisionTreeClassifier()

In [25]:
DTC_feat_imp = pd.concat([pd.Series(train_x.columns),pd.Series(DTC.feature_importances_)],axis = 1)
DTC_feat_imp.columns = ['feat','value']

In [26]:
DTC_feat_imp

Unnamed: 0,feat,value
0,satisfaction_level,0.488269
1,last_evaluation,0.149566
2,number_project,0.107791
3,average_montly_hours,0.093011
4,time_spend_company,0.142808
5,Work_accident,0.001346
6,promotion_last_5years,2.6e-05
7,salary,0.0062
8,Dept_RandD,0.000889
9,Dept_accounting,0.001177


### Model Scoring

In [27]:
train_pred = pd.DataFrame(DTC.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

1.0

In [28]:
test_pred = pd.DataFrame(DTC.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9773912985166707

### K Fold Cross Validation -- Without Hyper parameter tunning

In [29]:
#Complete data
kfold = KFold(n_splits = 10)
scoring = 'roc_auc'

In [30]:
#Init Model
DTC = DecisionTreeClassifier()

X = HR[feat]
Y = HR['left']

In [31]:
cross_val_score(DTC,X,Y,cv=kfold,scoring=scoring)

Traceback (most recent call last):
  File "C:\Users\Nithin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Nithin\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "C:\Users\Nithin\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 362, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\Nithin\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\Nithin\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 542, in roc_auc_score
    return _average_binary_score(partial(_binary_roc_auc_score,
  File "C:\Users\Nithin\anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 77, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_wei

array([  nan, 0.959,   nan,   nan,   nan,   nan,   nan,   nan, 1.   ,
       1.   ])

In [32]:
kfold_s = StratifiedKFold(n_splits = 10)
scoring = 'roc_auc'

In [33]:
result = cross_val_score(DTC,X,Y,cv=kfold_s,scoring=scoring)
act_acc = np.mean(result)
act_acc

0.9800621962406136

In [34]:
#Train Data
result = cross_val_score(DTC,train_x,train_y,cv=kfold_s,scoring=scoring)
act_acc = np.mean(result)
act_acc

0.9724062447558826

### With Hyper parameter tunning

In [35]:
#One hold out
DTC = DecisionTreeClassifier(criterion='entropy',max_depth=10,min_samples_split=5,min_samples_leaf=2)

DTC.fit(train_x,train_y)

DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_leaf=2,
                       min_samples_split=5)

In [36]:
train_pred = pd.DataFrame(DTC.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9954012876513327

In [37]:
test_pred = pd.DataFrame(DTC.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9860527145194603

In [38]:
#K fold + One hold out
result = cross_val_score(DTC,train_x,train_y,cv=kfold_s,scoring=scoring)
act_acc = np.mean(result)
act_acc

0.9830064187217873

In [39]:
#with Hyperparameter tuning
params = {'criterion': ['gini','entropy'],
         'max_depth': [5,6,7],
         'min_samples_split': [2,3,4],
         'min_samples_leaf':  [3,4,5],
         'max_features': ["auto", "sqrt"]}

In [40]:
DTC_H = GridSearchCV(DecisionTreeClassifier(),param_grid=params,cv=5,scoring=scoring)

DTC_H.fit(train_x,train_y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 6, 7],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [2, 3, 4]},
             scoring='roc_auc')

In [41]:
DTC_H.best_params_

{'criterion': 'gini',
 'max_depth': 7,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 3}

##### Using Best Params

In [42]:
DTC = DecisionTreeClassifier(criterion='gini',max_depth=7,min_samples_split=4,min_samples_leaf=3,max_features='sqrt')

DTC.fit(train_x,train_y)

DecisionTreeClassifier(max_depth=7, max_features='sqrt', min_samples_leaf=3,
                       min_samples_split=4)

In [43]:
train_pred = pd.DataFrame(DTC.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9720694601404027

In [124]:
test_pred = pd.DataFrame(DTC.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9747082685103223