# XGBoost

### Image Mode

In [24]:
IMAGE_MODE = 0      #1 for colour, 0 for grayscale, unchanged for -1

### Datasets locations:

In [25]:
import os


train_csv = os.path.join(os.path.pardir, "Datasets", 'Resized_data_50_50', 'train.csv')
test_csv = os.path.join(os.path.pardir, "Datasets", 'Resized_data_50_50', 'test.csv')

N_CLASSES = 2

## Imports

In [26]:
import xgboost as xgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import cv2

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

## Loading the dataset

In [27]:
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [28]:
train_df.head()

Unnamed: 0,Image_Path,Parasitized
0,..\Datasets\Resized_data_50_50\Parasitized\50x...,1.0
1,..\Datasets\Resized_data_50_50\Parasitized\50x...,1.0
2,..\Datasets\Resized_data_50_50\Uninfected\50x5...,0.0
3,..\Datasets\Resized_data_50_50\Parasitized\50x...,1.0
4,..\Datasets\Resized_data_50_50\Parasitized\50x...,1.0


In [29]:
train_df.dtypes

Image_Path      object
Parasitized    float64
dtype: object

## Data and Labels

In [30]:
train_x = train_df['Image_Path'].to_numpy()
train_y = train_df['Parasitized'].to_numpy()

test_x  = test_df['Image_Path'].to_numpy()
test_y  = test_df['Parasitized'].to_numpy()

#### Load images from paths

In [31]:
def load_images(path_arr):
    '''Reads and loads images into a numpy array
    Returns: a numpy array'''
    arr = []
    for path in path_arr:
        arr.append(cv2.imread(path, IMAGE_MODE))
    
    return np.array(arr)


In [32]:
train_x = load_images(train_x)
test_x = load_images(test_x)

#### Checking shape

In [33]:
print(train_x.shape, test_x.shape, sep = '\n')

(24802, 50, 50)
(2756, 50, 50)


## Flattening

In [34]:
num_train, num_test = train_x.shape[0], test_x.shape[0]

dim = 1
for i in train_x.shape[1:]: dim*=i

print(num_train, dim)

24802 2500


In [35]:
train_x = train_x.reshape([num_train, dim])
test_x = test_x.reshape([num_test, dim])

## XGBoost

### Data prep

In [36]:
dtrain = xgb.DMatrix(train_x, label = train_y)
dtest = xgb.DMatrix(test_x, label = test_y)

In [78]:
params = {"max_depth": 50, "objective": 'reg:logistic','verbosity': 1, 'num_parallel_tree' : 1, 'booster': 'gbtree'}
xgboosto = xgb.train(params, dtrain, 100, [(dtest, 'eval'), (dtrain, 'train')])

[0]	eval-rmse:0.46077	train-rmse:0.39632
[1]	eval-rmse:0.43838	train-rmse:0.32282
[2]	eval-rmse:0.42457	train-rmse:0.26580
[3]	eval-rmse:0.41451	train-rmse:0.22221
[4]	eval-rmse:0.41021	train-rmse:0.18866
[5]	eval-rmse:0.40531	train-rmse:0.16056
[6]	eval-rmse:0.40209	train-rmse:0.13787
[7]	eval-rmse:0.39831	train-rmse:0.11965
[8]	eval-rmse:0.39411	train-rmse:0.10586
[9]	eval-rmse:0.39070	train-rmse:0.09392
[10]	eval-rmse:0.38787	train-rmse:0.08391
[11]	eval-rmse:0.38615	train-rmse:0.07474
[12]	eval-rmse:0.38411	train-rmse:0.06706
[13]	eval-rmse:0.38272	train-rmse:0.06070
[14]	eval-rmse:0.38037	train-rmse:0.05520
[15]	eval-rmse:0.37712	train-rmse:0.05038
[16]	eval-rmse:0.37259	train-rmse:0.04612
[17]	eval-rmse:0.36918	train-rmse:0.04288
[18]	eval-rmse:0.36780	train-rmse:0.03974
[19]	eval-rmse:0.36429	train-rmse:0.03719
[20]	eval-rmse:0.36418	train-rmse:0.03470
[21]	eval-rmse:0.36330	train-rmse:0.03254
[22]	eval-rmse:0.36238	train-rmse:0.03075
[23]	eval-rmse:0.36218	train-rmse:0.02908
[2

## Performance Evaluation

In [79]:
y_pred_train = xgboosto.predict(xgb.DMatrix(train_x))
y_pred_test  = xgboosto.predict(xgb.DMatrix(test_x))

#### Accuracy

In [80]:
y_pred_train[np.where(y_pred_train >= 0.5)] = 1
y_pred_train[np.where(y_pred_train < 0.5)] = 0

In [81]:
y_pred_test[np.where(y_pred_test >= 0.5)] = 1
y_pred_test[np.where(y_pred_test < 0.5)] = 0

In [82]:
y_pred_train

array([1., 1., 0., ..., 0., 0., 1.], dtype=float32)

In [83]:
acc_train = accuracy_score(train_y, y_pred_train)
acc_test  = accuracy_score(test_y , y_pred_test)

print("Training Accuracy is: {:.4f} and Validation Accuracy is: {:.4f}".format(acc_train, acc_test))

Training Accuracy is: 1.0000 and Validation Accuracy is: 0.8520


#### Precision

In [84]:
prec_train = precision_score(train_y, y_pred_train)
prec_test  = precision_score(test_y , y_pred_test)

print("Training Precision is: {:.4f} and Validation Precision is: {:.4f}".format(prec_train, prec_test))

Training Precision is: 1.0000 and Validation Precision is: 0.8499


#### Recall

In [85]:
recall_train = recall_score(train_y, y_pred_train)
recall_test  = recall_score(test_y , y_pred_test)

print("Training Recall is: {:.4f} and Validation Recall is: {:.4f}".format(recall_train, recall_test))

Training Recall is: 1.0000 and Validation Recall is: 0.8549


#### F1 Score

In [86]:
f1_train = f1_score(train_y, y_pred_train)
f1_test  = f1_score(test_y , y_pred_test)

print("Training F1-Score is: {:.4f} and Validation F1-Score is: {:.4f}".format(f1_train, f1_test))

Training F1-Score is: 1.0000 and Validation F1-Score is: 0.8524


#### ROC AUC

In [87]:
y_score = nb.predict_proba(test_x)

fpr, tpr, _ = roc_curve(test_y, y_score[:, 1], pos_label = 1)
roc_auc = roc_auc_score(test_y, y_score[:, 1])

plt.plot(fpr, tpr, color = 'orange')
plt.plot([0,1], [0,1], color = 'navy', linestyle = '--')

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve\nAuC = {:.4f}".format(roc_auc))

plt.show()


NameError: name 'nb' is not defined