In [1]:
!pip install xgboost



## XGBoost - eXtreme Gradient BOOSTing package

* Written in C++, very fast, faster than all other ensemble classifiers
* Parallelizable - on GPUs as well, run distributed on Hadoop as well
* Hyper-parameter tuning is much effective since it has availability of wide variety of tuning parameters
* Handles missing values automatically
* Tree pruning available - max_depth param
* Built-in cross-validation

In [5]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [6]:
df = sns.load_dataset('titanic')

In [7]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [9]:
df['pclass'].unique()

array([3, 1, 2])

In [10]:
X = df[['pclass','sex','age']]

In [11]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

X['sex']=lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [12]:
y=df['survived']

In [13]:
X.head()

Unnamed: 0,pclass,sex,age
0,3,1,22.0
1,1,0,38.0
2,3,0,26.0
3,1,0,35.0
4,3,1,35.0


In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [19]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [20]:
import xgboost as xgb

In [21]:
xgb_clf = xgb.XGBClassifier(max_depth=5,n_estimators=10000,learning_rate=0.3, n_jobs=-1)

In [22]:
xgb_clf.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [23]:
cv_res = cross_val_score(xgb_clf,x_train,y_train,scoring='accuracy',cv=10)
avg_acc = np.mean(cv_res)
sd = np.std(cv_res)

In [24]:
print(f'Average Accuracy : {avg_acc}')
print(f'Accuracy Standard Deviation : {sd}')

Average Accuracy : 0.7656682027649769
Accuracy Standard Deviation : 0.05193840565181309


In [25]:
print('Accuracy Score : ',accuracy_score(y_test,xgb_clf.predict(x_test)))
print('\n\nConfusion Matrix : \n',confusion_matrix(y_test,xgb_clf.predict(x_test)))
print('\n\nClassification Report : \n',classification_report(y_test,xgb_clf.predict(x_test)))

Accuracy Score :  0.8171641791044776


Confusion Matrix : 
 [[146  18]
 [ 31  73]]


Classification Report : 
               precision    recall  f1-score   support

           0       0.82      0.89      0.86       164
           1       0.80      0.70      0.75       104

    accuracy                           0.82       268
   macro avg       0.81      0.80      0.80       268
weighted avg       0.82      0.82      0.81       268



# Take Home : Since XGB doesn't require any missing value imputation, run the last section with all the variables. Make sure to convert all the values into numeric representation before putting into algo.