## IBM HR Analytics
https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [2]:
df = pd.read_csv(r"./WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7


In [3]:
df.shape

(1470, 35)

### Droping the feature set which is not much relevant as per use case understanding

In [9]:
print(df['StandardHours'].unique())
print(df['EmployeeCount'].unique())
print(df['EmployeeNumber'].unique())
print(df['Over18'].unique())
## Same values in columns make these as irrelevant 

[80]
[1]
[   1    2    4 ... 2064 2065 2068]
['Y']


In [10]:
df.drop('StandardHours', axis=1, inplace=True)
df.drop('EmployeeCount', axis=1, inplace=True)
df.drop('EmployeeNumber', axis=1, inplace=True)
df.drop('Over18', axis=1, inplace=True)

In [11]:
## Check remaining feature set
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
MonthlyRate                 1470 non-null int64
NumCompaniesWorked    

### Creating numeric data for discrete non numeric values in feature set

In [14]:
y = df['Attrition']
X = df.drop('Attrition',axis=1)

In [15]:
y.unique()

array(['Yes', 'No'], dtype=object)

In [16]:
y = pd.get_dummies(y, drop_first=True)
y.head()

Unnamed: 0,Yes
0,1
1,0
2,1
3,0
4,0


In [17]:
X.head(2)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,...,3,1,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,...,4,4,1,10,3,3,10,7,1,7


In [18]:
X.select_dtypes(['object'])

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No
5,Travel_Frequently,Research & Development,Life Sciences,Male,Laboratory Technician,Single,No
6,Travel_Rarely,Research & Development,Medical,Female,Laboratory Technician,Married,Yes
7,Travel_Rarely,Research & Development,Life Sciences,Male,Laboratory Technician,Divorced,No
8,Travel_Frequently,Research & Development,Life Sciences,Male,Manufacturing Director,Single,No
9,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married,No


In [20]:
features = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']

In [22]:
from sklearn.preprocessing import LabelEncoder
for feature in features:
        label_enc = LabelEncoder()
        #label_enc = label_enc.fit(X[feature])
        X[feature] = label_enc.fit_transform(X[feature])

In [23]:
X.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,2,1102,2,1,2,1,2,0,94,...,3,1,0,8,0,1,6,4,0,5
1,49,1,279,1,8,1,1,3,1,61,...,4,4,1,10,3,3,10,7,1,7
2,37,2,1373,1,2,2,4,4,1,92,...,3,2,0,7,3,3,0,0,0,0
3,33,1,1392,1,3,4,1,4,0,56,...,3,3,0,8,3,3,8,7,3,0
4,27,2,591,1,2,1,3,1,1,40,...,3,4,1,6,3,3,2,2,2,2


### Normalization & Feature Selection

In [50]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [51]:
X_std = StandardScaler().fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [52]:
X_scaled = pd.DataFrame(X_std, index=X.index, columns=X.columns)

In [53]:
X_scaled.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.44635,0.590048,0.742527,1.401512,-1.010909,-0.891688,-0.937414,-0.660531,-1.224745,1.383138,...,-0.42623,-1.584178,-0.932014,-0.421642,-2.171982,-2.49382,-0.164613,-0.063296,-0.679146,0.245834
1,1.322365,-0.913194,-1.297775,-0.493817,-0.14715,-1.868426,-0.937414,0.254625,0.816497,-0.240677,...,2.346151,1.191438,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,-0.368715,0.806541
2,0.008343,0.590048,1.414363,-0.493817,-0.887515,-0.891688,1.316673,1.169781,0.816497,1.284725,...,-0.42623,-0.658973,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-0.679146,-1.155935
3,-0.429664,-0.913194,1.461466,-0.493817,-0.764121,1.061787,-0.937414,1.169781,-1.224745,-0.486709,...,-0.42623,0.266233,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,-1.155935
4,-1.086676,0.590048,-0.524295,-0.493817,-0.887515,-1.868426,0.565311,-1.575686,0.816497,-1.274014,...,-0.42623,1.191438,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.058285,-0.595227


In [54]:
feature_selector = SelectKBest(f_classif, k=20)

In [55]:
X_feature_strip = feature_selector.fit_transform(X_scaled, y)

  y = column_or_1d(y, warn=True)


In [56]:
X_feature_strip.shape

(1470, 20)

In [59]:
cols_stripped = X_scaled.columns[feature_selector.get_support()]
print(cols_stripped)

Index(['Age', 'DailyRate', 'Department', 'DistanceFromHome',
       'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'OverTime',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsWithCurrManager'],
      dtype='object')


In [60]:
X_Final = X_scaled[cols_stripped]

In [61]:
X_Final.head()`

Unnamed: 0,Age,DailyRate,Department,DistanceFromHome,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsWithCurrManager
0,0.44635,0.742527,1.401512,-1.010909,-0.660531,0.379672,-0.057788,1.032716,1.153254,1.23682,-0.10835,1.591746,-1.584178,-0.932014,-0.421642,-2.171982,-2.49382,-0.164613,-0.063296,0.245834
1,1.322365,-1.297775,-0.493817,-0.14715,0.254625,-1.026167,-0.057788,0.626374,-0.660853,-0.133282,-0.291719,-0.628241,1.191438,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,0.806541
2,0.008343,1.414363,-0.493817,-0.887515,1.169781,-1.026167,-0.961486,-0.998992,0.2462,1.23682,-0.937654,1.591746,-0.658973,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-1.155935
3,-0.429664,1.461466,-0.493817,-0.764121,1.169781,0.379672,-0.961486,0.626374,0.2462,-0.133282,-0.763634,1.591746,0.266233,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,-1.155935
4,-1.086676,-0.524295,-0.493817,-0.887515,-1.575686,0.379672,-0.961486,-0.998992,-0.660853,-0.133282,-0.644858,-0.628241,1.191438,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.595227


In [63]:
X_Final.shape

(1470, 20)

### Creating generic function to evaluate various classifier's scores

In [80]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        y_train_pred = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train,y_train_pred )))
        print("Classification Report: \n {}\n".format(classification_report(y_train, y_train_pred)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train,y_train_pred)))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        y_test_pred = clf.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test,y_test_pred)))
        print("Classification Report: \n {}\n".format(classification_report(y_test,y_test_pred)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test,y_test_pred)))

### Create the Test - Train dataframe for various models

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_Final, y, test_size=0.3, random_state=42)

### Using Decision Tree Classifier

In [73]:
from sklearn.tree import DecisionTreeClassifier

In [74]:
from sklearn.model_selection import GridSearchCV

In [69]:
dt_clf = DecisionTreeClassifier()

In [70]:
params_grid = {"max_depth": [3, 5, None],
               "min_samples_split": [2, 3, 10],
               "min_samples_leaf": [1, 3, 10]}

In [75]:
grid_search = GridSearchCV(dt_clf, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=False, scoring='accuracy')

In [76]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 5, None], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=False)

In [77]:
print(grid_search.best_score_)    
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.8347910592808552
{'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [78]:
# Set the clf to the best combination of parameters
dt_clf = grid_search.best_estimator_
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [81]:
# Evaluating the classifier scores
print_score(dt_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.8814

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       853
           1       0.75      0.47      0.57       176

   micro avg       0.88      0.88      0.88      1029
   macro avg       0.82      0.72      0.75      1029
weighted avg       0.87      0.88      0.87      1029


Confusion Matrix: 
 [[825  28]
 [ 94  82]]

Average Accuracy: 	 0.8221
Accuracy SD: 		 0.0369


In [82]:
print_score(dt_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.8707

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.96      0.93       380
           1       0.56      0.31      0.40        61

   micro avg       0.87      0.87      0.87       441
   macro avg       0.73      0.64      0.66       441
weighted avg       0.85      0.87      0.85       441


Confusion Matrix: 
 [[365  15]
 [ 42  19]]



### Using AdaBoost with Random Forest Classifier

In [83]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [84]:
ada_clf = AdaBoostClassifier(RandomForestClassifier())

In [85]:
params_grid = {"base_estimator__max_depth": [3, 5, 8, None],
               "base_estimator__min_samples_split": [2, 3, 10],
               "base_estimator__min_samples_leaf": [1, 3, 10],
			   "base_estimator__bootstrap" : [True,False],
			   "learning_rate": [0.5,0.8,1.0]}

In [86]:
grid_search = GridSearchCV(ada_clf, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=False, scoring='accuracy')

In [87]:
grid_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)




GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min...e=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'base_estimator__max_depth': [3, 5, 8, None], 'base_estimator__min_samples_split': [2, 3, 10], 'base_estimator__min_samples_leaf': [1, 3, 10], 'base_estimator__bootstrap': [True, False], 'learning_rate': [0.5, 0.8, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=False)

In [88]:
print(grid_search.best_score_)    
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.8756073858114675
{'base_estimator__bootstrap': True, 'base_estimator__max_depth': 5, 'base_estimator__min_samples_leaf': 10, 'base_estimator__min_samples_split': 3, 'learning_rate': 1.0}
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)


In [89]:
# Set the clf to the best combination of parameters
ada_clf = grid_search.best_estimator_
ada_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)




AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [90]:
# Evaluating the classifier scores
print_score(ada_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       853
           1       1.00      1.00      1.00       176

   micro avg       1.00      1.00      1.00      1029
   macro avg       1.00      1.00      1.00      1029
weighted avg       1.00      1.00      1.00      1029


Confusion Matrix: 
 [[853   0]
 [  0 176]]



  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)




  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)




Average Accuracy: 	 0.8697
Accuracy SD: 		 0.0174


In [91]:
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.8639

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       380
           1       0.52      0.28      0.36        61

   micro avg       0.86      0.86      0.86       441
   macro avg       0.70      0.62      0.64       441
weighted avg       0.84      0.86      0.85       441


Confusion Matrix: 
 [[364  16]
 [ 44  17]]



### XGBoost Classifier

In [92]:
import xgboost as xgb

In [93]:
xgb_clf = xgb.XGBClassifier(learning_rate=0.05,silent=True)

In [94]:
params_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0,0.5,1],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [95]:
grid_search = GridSearchCV(xgb_clf, params_grid,n_jobs=-1, cv=5,verbose=False, scoring='accuracy')

In [96]:
grid_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.05,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'min_child_weight': [1, 5, 10], 'gamma': [0, 0.5, 1], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=False)

In [97]:
print(grid_search.best_score_)    
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.8736637512147716
{'colsample_bytree': 0.8, 'gamma': 0, 'max_depth': 4, 'min_child_weight': 5}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, verbosity=1)


In [98]:
# Set the clf to the best combination of parameters
xgb_clf = grid_search.best_estimator_
xgb_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, verbosity=1)

In [99]:
# Evaluating the classifier scores
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9086

Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.99      0.95       853
           1       0.92      0.51      0.66       176

   micro avg       0.91      0.91      0.91      1029
   macro avg       0.91      0.75      0.80      1029
weighted avg       0.91      0.91      0.90      1029


Confusion Matrix: 
 [[845   8]
 [ 86  90]]



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Average Accuracy: 	 0.8726
Accuracy SD: 		 0.0259


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [100]:
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.8685

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.98      0.93       380
           1       0.58      0.18      0.28        61

   micro avg       0.87      0.87      0.87       441
   macro avg       0.73      0.58      0.60       441
weighted avg       0.84      0.87      0.84       441


Confusion Matrix: 
 [[372   8]
 [ 50  11]]



In [101]:
xgb_clf_base = xgb.XGBClassifier()

In [102]:
xgb_clf_base.fit(X_train, y_train.values.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [103]:
print_score(xgb_clf_base, X_train, y_train.values.ravel(), X_test, y_test.values.ravel(), train=True)

Train Result:

accuracy score: 0.9261

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.99      0.96       853
           1       0.95      0.60      0.74       176

   micro avg       0.93      0.93      0.93      1029
   macro avg       0.94      0.80      0.85      1029
weighted avg       0.93      0.93      0.92      1029


Confusion Matrix: 
 [[847   6]
 [ 70 106]]

Average Accuracy: 	 0.8677
Accuracy SD: 		 0.0260


In [105]:
print_score(xgb_clf_base, X_train, y_train.values.ravel(), X_test, y_test.values.ravel(), train=False)

Test Result:

accuracy score: 0.8798

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       380
           1       0.64      0.30      0.40        61

   micro avg       0.88      0.88      0.88       441
   macro avg       0.77      0.63      0.67       441
weighted avg       0.86      0.88      0.86       441


Confusion Matrix: 
 [[370  10]
 [ 43  18]]

