In [1]:
# pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.1-py3-none-win_amd64.whl (89.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate,cross_val_score,KFold,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import StandardScaler,minmax_scale

### Datasets

In [4]:
HR = pd.read_csv('C:/Users/Nithin/Downloads/ML_DT/HR_comma_sep.csv')

### Data Prep

In [5]:
HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [6]:
HR.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64

In [7]:
#Ordinal Label Encoding
HR.salary = np.where(HR.salary == 'low',1,
            np.where(HR.salary == 'medium',2,3))

In [8]:
#One hot encoding
HR.department.value_counts()

dummy_df = pd.get_dummies(HR.department,drop_first=True,prefix='Dept')

In [9]:
HR = pd.concat([HR,dummy_df],axis=1)
del HR['department']

### Train & Test Model

In [10]:
feat = ['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'Dept_RandD', 'Dept_accounting',
       'Dept_hr', 'Dept_management', 'Dept_marketing', 'Dept_product_mng',
       'Dept_sales', 'Dept_support', 'Dept_technical']

In [11]:
train_x,test_x,train_y,test_y = train_test_split(HR[feat],HR['left'],test_size=0.3,random_state=12345)

# Random Forest

### Without Hyperparameter Tuning

In [12]:
RAND = RandomForestClassifier(n_estimators=10,oob_score=True,n_jobs=-1)

RAND.fit(train_x,train_y)

  warn("Some inputs do not have OOB scores. "
  decision = (predictions[k] /


RandomForestClassifier(n_estimators=10, n_jobs=-1, oob_score=True)

In [13]:
RAND.oob_score_

0.9799028478902753

In [14]:
train_pred = pd.DataFrame(RAND.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.999993443982417

In [15]:
test_pred = pd.DataFrame(RAND.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9921804449093818

### With Hyperparameter Tuning

In [16]:
params = {'n_estimators':[50,60,70,80,90,100],
          'max_features':[5,6,7,8,9]}

In [17]:
RAND_H = GridSearchCV(RandomForestClassifier(),param_grid=params,cv=5,scoring='roc_auc')

RAND_H.fit(train_x,train_y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': [5, 6, 7, 8, 9],
                         'n_estimators': [50, 60, 70, 80, 90, 100]},
             scoring='roc_auc')

In [18]:
RAND_H.best_params_

{'max_features': 8, 'n_estimators': 100}

### Using Best Params

In [19]:
RAND = RandomForestClassifier(n_estimators=80,oob_score=True,n_jobs=-1,max_features=6)

RAND.fit(train_x,train_y)

RandomForestClassifier(max_features=6, n_estimators=80, n_jobs=-1,
                       oob_score=True)

In [20]:
RAND.oob_score_

0.9903800361939232

In [21]:
train_pred = pd.DataFrame(RAND.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

1.0

In [22]:
test_pred = pd.DataFrame(RAND.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9926069635560563

# Adaboost / Adaptive Boosting

### Without Hyperparameter Tuning

In [23]:
ADA = AdaBoostClassifier(n_estimators=10,learning_rate=0.01)

ADA.fit(train_x,train_y)

AdaBoostClassifier(learning_rate=0.01, n_estimators=10)

In [24]:
train_pred = pd.DataFrame(ADA.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.7793395641064718

In [25]:
test_pred = pd.DataFrame(ADA.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.7886943080925942

### With Hyperparameter Tuning

In [26]:
params = {'n_estimators':[50,100,200,300,400],
          'learning_rate':[0.001,0.01,0.1]}

In [27]:
ADA_H = GridSearchCV(AdaBoostClassifier(),param_grid=params,cv=5,scoring='roc_auc')

ADA_H.fit(train_x,train_y)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'n_estimators': [50, 100, 200, 300, 400]},
             scoring='roc_auc')

In [28]:
ADA_H.best_params_

{'learning_rate': 0.1, 'n_estimators': 400}

### Using Best Params

In [29]:
ADA = AdaBoostClassifier(n_estimators=400,learning_rate=0.1)

ADA.fit(train_x,train_y)

AdaBoostClassifier(learning_rate=0.1, n_estimators=400)

In [30]:
train_pred = pd.DataFrame(ADA.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.981140563985742

In [31]:
test_pred = pd.DataFrame(ADA.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9842540156737569

# Bagging

### Without Hyperparameter Tuning

In [32]:
BAG = BaggingClassifier(n_estimators=10,oob_score=True,n_jobs=-1)

BAG.fit(train_x,train_y)

  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /


BaggingClassifier(n_jobs=-1, oob_score=True)

In [33]:
BAG.oob_score_

0.9796171063910849

In [34]:
train_pred = pd.DataFrame(BAG.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9999905741784939

In [35]:
test_pred = pd.DataFrame(BAG.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9892367258120758

### With Hyperparameter Tuning

In [36]:
params = {'n_estimators':[50,60,70,80,90,100],
          'max_samples':[1,10,20,50]}

In [37]:
BAG_H = GridSearchCV(BaggingClassifier(),param_grid=params,cv=5,scoring='roc_auc')

BAG_H.fit(train_x,train_y)

GridSearchCV(cv=5, estimator=BaggingClassifier(),
             param_grid={'max_samples': [1, 10, 20, 50],
                         'n_estimators': [50, 60, 70, 80, 90, 100]},
             scoring='roc_auc')

In [38]:
BAG_H.best_params_

{'max_samples': 50, 'n_estimators': 60}

### Using Best Params

In [39]:
BAG = BaggingClassifier(n_estimators=100,oob_score=True,n_jobs=-1,max_samples=50)

BAG.fit(train_x,train_y)

BaggingClassifier(max_samples=50, n_estimators=100, n_jobs=-1, oob_score=True)

In [40]:
BAG.oob_score_

0.9431374416611106

In [41]:
train_pred = pd.DataFrame(BAG.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9723086434535814

In [42]:
test_pred = pd.DataFrame(BAG.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.973562689439661

# Gradient Boosting

### Without Hyperparameter Tuning

In [43]:
GRAD = GradientBoostingClassifier(n_estimators=10,learning_rate=0.01)

GRAD.fit(train_x,train_y)

GradientBoostingClassifier(learning_rate=0.01, n_estimators=10)

In [44]:
train_pred = pd.DataFrame(GRAD.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9675027856889806

In [45]:
test_pred = pd.DataFrame(GRAD.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9696590589039993

### With Hyperparameter Tuning

In [46]:
params = {'n_estimators':[50,100,200,300,400],
          'learning_rate':[0.001,0.01,0.1]}

In [47]:
GRAD_H = GridSearchCV(GradientBoostingClassifier(),param_grid=params,cv=5,scoring='roc_auc')

%time GRAD_H.fit(train_x,train_y)

Wall time: 1min 58s


GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'n_estimators': [50, 100, 200, 300, 400]},
             scoring='roc_auc')

In [48]:
GRAD_H.best_params_

{'learning_rate': 0.1, 'n_estimators': 400}

### Using Best Params

In [49]:
GRAD = GradientBoostingClassifier(n_estimators=400,learning_rate=0.1)

GRAD.fit(train_x,train_y)

GradientBoostingClassifier(n_estimators=400)

In [50]:
train_pred = pd.DataFrame(GRAD.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.997639759451056

In [51]:
test_pred = pd.DataFrame(GRAD.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9937565940648945

# XG Boosting

### Without Hyperparameter Tuning

In [61]:
? XGBClassifier

In [58]:
XG = XGBClassifier(n_estimators=10,learning_rate=0.01)

XG.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [59]:
train_pred = pd.DataFrame(XG.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9778550343287924

In [60]:
test_pred = pd.DataFrame(XG.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9796063175612242

### With Hyperparameter Tuning

In [62]:
params = {'n_estimators':[50,100,200,300,400],
          'learning_rate':[0.001,0.01,0.1]}

In [63]:
XG_H = GridSearchCV(XGBClassifier(),param_grid=params,cv=5,scoring='roc_auc')

%time XG_H.fit(train_x,train_y)

Wall time: 42.3 s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...ne,
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=

In [64]:
XG_H.best_params_

{'learning_rate': 0.1, 'n_estimators': 200}

### Using Best Params

In [65]:
XG = XGBClassifier(n_estimators=200,learning_rate=0.1)

XG.fit(train_x,train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [66]:
train_pred = pd.DataFrame(XG.predict_proba(train_x))[1]
roc_auc_score(train_y,train_pred)

0.9995646804324894

In [67]:
test_pred = pd.DataFrame(XG.predict_proba(test_x))[1]
roc_auc_score(test_y,test_pred)

0.9945126762515312