In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score
# One fixed seed, reused as random_state for data generation and for the
# train/test split, so the notebook produces the same numbers on every run.
seed = 123

In [3]:
# Generate a small synthetic two-class dataset. The 0.9 / 0.1 class weights
# make the positive class the minority.
X, y = make_classification(
    n_samples=200, n_features=5, n_informative=3,
    n_classes=2, weights=[0.9, 0.1],
    shuffle=True, random_state=seed,
)

# With two classes labelled 0/1, the label sum is the positive count.
print('There are {} positive instances.'.format(y.sum()))

There are 20 positive instances.


In [7]:
# Hold out 33% of the rows for testing. Stratifying on y keeps the class
# proportions alike in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=seed,
)

# Report how many positive (label 1) rows ended up in each part.
print('Total number of postivie train instances: {}'.format(y_train.sum()))
print('Total number of positive test instances: {}'.format(y_test.sum()))

Total number of postivie train instances: 13
Total number of positive test instances: 7


In [9]:
# Wrap the splits in XGBoost DMatrix objects. Only the training matrix is
# given labels; the test matrix holds features alone and is used for prediction.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [10]:
# Base booster settings passed to xgb.train by the training cells below.
params = {
    # Binary classification; downstream cells threshold the predictions at 0.5.
    'objective': 'binary:logistic',
    # Maximum tree depth of 1.
    'max_depth': 1,
    # 'silent' is deprecated in XGBoost in favour of 'verbosity'; newer
    # releases ignore 'silent' and warn about an unused parameter.
    # verbosity=0 is the documented equivalent of the old silent=1.
    'verbosity': 0,
    # Learning rate.
    'eta': 1,
}

# Number of boosting rounds used for every model trained in this notebook.
num_rounds = 15

In [19]:
# Fit the baseline booster, then turn its predicted scores into hard 0/1
# labels by thresholding at 0.5.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)
y_test_preds = np.asarray(bst.predict(dtest) > 0.5, dtype=int)

In [20]:
# Echo the thresholded 0/1 predictions for the test rows.
y_test_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [21]:
# Echo the true test labels so they can be compared with y_test_preds.
y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [27]:
# Confusion table: rows are the true labels, columns the predicted labels,
# and margins=True appends the "All" totals row and column.
pd.crosstab(
    pd.Series(y_test, name='Actual'),
    pd.Series(y_test_preds, name='Predicted'),
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,58,1,59
1,5,2,7
All,63,3,66


In [30]:
# Narrative note written as a bare string expression, so the cell echoes it as
# its output. It introduces the three metrics printed by the following cell.
"""We can also present the performance using 3 different evaluation metrics:

accuracy,
precision (the ability of the classifier not to label as positive a sample that is negative),
recall (the ability of the classifier to find all the positive samples)."""

'We can also present the performance using 3 different evaluation metrics:\n\naccuracy,\nprecision (the ability of the classifier not to label as positive a sample that is negative),\nrecall (the ability of the classifier to find all the positive samples).'

In [31]:
# Print accuracy, precision and recall of the current test predictions,
# each rounded to two decimals, in that order.
for metric_template, metric_fn in (
    ('Accuracy: {0:.2f}', accuracy_score),
    ('Precision: {0:.2f}', precision_score),
    ('Recall: {0:.2f}', recall_score),
):
    print(metric_template.format(metric_fn(y_test, y_test_preds)))

Accuracy: 0.91
Precision: 0.67
Recall: 0.29


In [39]:
# Per-row training weights: negatives (label 0) count once, positives
# (label 1) count five times. Any other label value would get weight 0.
weights = np.select([y_train == 0, y_train == 1], [1.0, 5.0], default=0.0)

In [41]:
# Rebuild the training matrix, this time attaching the per-row weights.
# The test matrix is recreated unchanged (features only).
dtrain = xgb.DMatrix(data=X_train, label=y_train, weight=weights)
dtest = xgb.DMatrix(data=X_test)

In [44]:
# Retrain on the current (instance-weighted) training matrix and threshold
# the predicted scores at 0.5 to obtain 0/1 labels.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)
y_test_preds = np.asarray(bst.predict(dtest) > 0.5, dtype=int)

# Confusion table of true vs. predicted labels, with totals.
pd.crosstab(
    pd.Series(y_test, name='Actual'),
    pd.Series(y_test_preds, name='Predicted'),
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,57,2,59
1,4,3,7
All,61,5,66


In [45]:
# Print accuracy, precision and recall of the current test predictions,
# each rounded to two decimals, in that order.
for metric_template, metric_fn in (
    ('Accuracy: {0:.2f}', accuracy_score),
    ('Precision: {0:.2f}', precision_score),
    ('Recall: {0:.2f}', recall_score),
):
    print(metric_template.format(metric_fn(y_test, y_test_preds)))

Accuracy: 0.91
Precision: 0.60
Recall: 0.43


In [46]:
# Alternative approach: the scale_pos_weight booster parameter, set from the
# ratio between negative and positive classes. The matrices are rebuilt here
# without per-row weights.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [59]:
# Read the labels back from the training matrix and compute the
# negatives-to-positives count ratio.
train_labels = dtrain.get_label()
ratio = float((train_labels == 0).sum()) / (train_labels == 1).sum()

# Store the ratio in the shared params dict (mutated in place) so later
# xgb.train calls pick it up as scale_pos_weight.
params.update(scale_pos_weight=ratio)

In [60]:
# Retrain with the current params (which now carry scale_pos_weight) and
# threshold the predicted scores at 0.5 to obtain 0/1 labels.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)
y_test_preds = np.asarray(bst.predict(dtest) > 0.5, dtype=int)

# Confusion table of true vs. predicted labels, with totals.
pd.crosstab(
    pd.Series(y_test, name='Actual'),
    pd.Series(y_test_preds, name='Predicted'),
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,56,3,59
1,4,3,7
All,60,6,66


In [61]:
# Print accuracy, precision and recall of the current test predictions,
# each rounded to two decimals, in that order.
for metric_template, metric_fn in (
    ('Accuracy: {0:.2f}', accuracy_score),
    ('Precision: {0:.2f}', precision_score),
    ('Recall: {0:.2f}', recall_score),
):
    print(metric_template.format(metric_fn(y_test, y_test_preds)))

Accuracy: 0.89
Precision: 0.50
Recall: 0.43
