# XGBoost

### Image Mode

In [61]:
IMAGE_MODE = 0      #1 for colour, 0 for grayscale, unchanged for -1

### Datasets locations:

In [62]:
import os

height = 25
width = 25

train_csv = os.path.join(os.path.pardir, "Datasets", 'Resized_data_{}_{}'.format(height, width), 'train.csv')
test_csv = os.path.join(os.path.pardir, "Datasets", 'Resized_data_{}_{}'.format(height, width), 'test.csv')

N_CLASSES = 2

## Imports

In [63]:
import xgboost as xgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import cv2

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

## Loading the dataset

In [64]:
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [65]:
train_df.head()

Unnamed: 0,Image_Path,Parasitized
0,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
1,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
2,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
3,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
4,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0


In [66]:
train_df.dtypes

Image_Path      object
Parasitized    float64
dtype: object

## Data and Labels

In [67]:
train_x = train_df['Image_Path'].to_numpy()
train_y = train_df['Parasitized'].to_numpy()

test_x  = test_df['Image_Path'].to_numpy()
test_y  = test_df['Parasitized'].to_numpy()

#### Load images from paths

In [68]:
def load_images(path_arr):
    '''Reads and loads images into a numpy array
    Returns: a numpy array'''
    arr = []
    for path in path_arr:
        arr.append(cv2.imread(path, IMAGE_MODE))
    
    return np.array(arr)


In [69]:
train_x = load_images(train_x)
test_x = load_images(test_x)

#### Checking shape

In [70]:
print(train_x.shape, test_x.shape, sep = '\n')

(24802, 25, 25)
(2756, 25, 25)


## Flattening

In [71]:
num_train, num_test = train_x.shape[0], test_x.shape[0]

dim = 1
for i in train_x.shape[1:]: dim*=i

print(num_train, dim)

24802 625


In [72]:
train_x = train_x.reshape([num_train, dim])
test_x = test_x.reshape([num_test, dim])

## XGBoost

### Data prep

In [73]:
dtrain = xgb.DMatrix(train_x, label = train_y)
dtest = xgb.DMatrix(test_x, label = test_y)

In [148]:
params = {"max_depth": 3, "objective": 'reg:logistic','verbosity': 1, 'num_parallel_tree' : 1, 'booster': 'gbtree', 'colsamplebytree': 0.1, 'gamma': 0.75}
xgboosto = xgb.train(params, dtrain, 1000, [(dtest, 'eval'), (dtrain, 'train')])

Parameters: { "colsamplebytree" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	eval-rmse:0.48447	train-rmse:0.48295
[1]	eval-rmse:0.47595	train-rmse:0.47246
[2]	eval-rmse:0.46912	train-rmse:0.46483
[3]	eval-rmse:0.46430	train-rmse:0.45930
[4]	eval-rmse:0.46148	train-rmse:0.45505
[5]	eval-rmse:0.45902	train-rmse:0.45180
[6]	eval-rmse:0.45660	train-rmse:0.44899
[7]	eval-rmse:0.45418	train-rmse:0.44649
[8]	eval-rmse:0.45200	train-rmse:0.44379
[9]	eval-rmse:0.44993	train-rmse:0.44137
[10]	eval-rmse:0.44891	train-rmse:0.43977
[11]	eval-rmse:0.44717	train-rmse:0.43783
[12]	eval-rmse:0.44619	train-rmse:0.43661
[13]	eval-rmse:0.44406	train-rmse:0.43460
[14]	eval-rmse:0.44398	train-rmse:0.43362
[15]	eval-rmse:0.44286	train-rmse:0.43211
[16]	eval-rmse:0

## Performance Evaluation

In [149]:
y_pred_train = xgboosto.predict(xgb.DMatrix(train_x))
y_pred_test  = xgboosto.predict(xgb.DMatrix(test_x))

#### Accuracy

In [150]:
y_pred_train[np.where(y_pred_train >= 0.5)] = 1
y_pred_train[np.where(y_pred_train < 0.5)] = 0

In [151]:
y_pred_test[np.where(y_pred_test >= 0.5)] = 1
y_pred_test[np.where(y_pred_test < 0.5)] = 0

In [152]:
y_pred_train

array([1., 1., 0., ..., 0., 0., 1.], dtype=float32)

In [153]:
acc_train = accuracy_score(train_y, y_pred_train)
acc_test  = accuracy_score(test_y , y_pred_test)

print("Training Accuracy is: {:.4f} and Validation Accuracy is: {:.4f}".format(acc_train, acc_test))

Training Accuracy is: 0.9894 and Validation Accuracy is: 0.8846


#### Precision

In [154]:
prec_train = precision_score(train_y, y_pred_train)
prec_test  = precision_score(test_y , y_pred_test)

print("Training Precision is: {:.4f} and Validation Precision is: {:.4f}".format(prec_train, prec_test))

Training Precision is: 0.9978 and Validation Precision is: 0.8949


#### Recall

In [155]:
recall_train = recall_score(train_y, y_pred_train)
recall_test  = recall_score(test_y , y_pred_test)

print("Training Recall is: {:.4f} and Validation Recall is: {:.4f}".format(recall_train, recall_test))

Training Recall is: 0.9809 and Validation Recall is: 0.8716


#### F1 Score

In [156]:
f1_train = f1_score(train_y, y_pred_train)
f1_test  = f1_score(test_y , y_pred_test)

print("Training F1-Score is: {:.4f} and Validation F1-Score is: {:.4f}".format(f1_train, f1_test))

Training F1-Score is: 0.9893 and Validation F1-Score is: 0.8831


#### ROC AUC

In [157]:
y_score = nb.predict_proba(test_x)

fpr, tpr, _ = roc_curve(test_y, y_score[:, 1], pos_label = 1)
roc_auc = roc_auc_score(test_y, y_score[:, 1])

plt.plot(fpr, tpr, color = 'orange')
plt.plot([0,1], [0,1], color = 'navy', linestyle = '--')

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve\nAuC = {:.4f}".format(roc_auc))

plt.show()


NameError: name 'nb' is not defined