# XGBoost

### Image Mode

In [1]:
IMAGE_MODE = 0      #1 for colour, 0 for grayscale, unchanged for -1

### Datasets locations:

In [2]:
import os

height = 100
width  = 100

data_dir = os.path.join(os.path.pardir, "Datasets", "Resized_data_{}_{}".format(height, width))
train_csv = os.path.join(data_dir, "train.csv")
test_csv = os.path.join(data_dir, "test.csv")

N_CLASSES = 2

## Imports

In [3]:
import xgboost as xgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import cv2

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

  from pandas import MultiIndex, Int64Index


## Loading the dataset

In [4]:
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [5]:
train_df.head()

Unnamed: 0,Image_Path,Parasitized
0,..\Datasets\Resized_data_100_100\Parasitized\1...,1.0
1,..\Datasets\Resized_data_100_100\Parasitized\1...,1.0
2,..\Datasets\Resized_data_100_100\Uninfected\10...,0.0
3,..\Datasets\Resized_data_100_100\Parasitized\1...,1.0
4,..\Datasets\Resized_data_100_100\Parasitized\1...,1.0


In [6]:
train_df.dtypes

Image_Path      object
Parasitized    float64
dtype: object

## Data and Labels

In [7]:
train_x = train_df['Image_Path'].to_numpy()
train_y = train_df['Parasitized'].to_numpy()

test_x  = test_df['Image_Path'].to_numpy()
test_y  = test_df['Parasitized'].to_numpy()

#### Load images from paths

In [8]:
def load_images(path_arr):
    '''Reads and loads images into a numpy array
    Returns: a numpy array'''
    arr = []
    for path in path_arr:
        arr.append(cv2.imread(path, IMAGE_MODE))
    
    return np.array(arr)


In [9]:
train_x = load_images(train_x)
test_x = load_images(test_x)

#### Checking shape

In [10]:
print(train_x.shape, test_x.shape, sep = '\n')

(24802, 100, 100)
(2756, 100, 100)


## Flattening

In [11]:
num_train, num_test = train_x.shape[0], test_x.shape[0]

dim = 1
for i in train_x.shape[1:]: dim*=i

print(num_train, dim)

24802 10000


In [12]:
train_x = train_x.reshape([num_train, dim])
test_x = test_x.reshape([num_test, dim])

## XGBoost

### Data prep

In [13]:
dtrain = xgb.DMatrix(train_x, label = train_y)
dtest = xgb.DMatrix(test_x, label = test_y)

In [14]:
params = {"max_depth": 3, "objective": 'reg:logistic','verbosity': 1, 'num_parallel_tree' : 1, 'booster': 'gbtree', 'colsamplebytree': 0.1, 'gamma': 0.75}
xgboosto = xgb.train(params, dtrain, 1000, [(dtest, 'eval'), (dtrain, 'train')])

Parameters: { "colsamplebytree" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	eval-rmse:0.48435	train-rmse:0.48294
[1]	eval-rmse:0.47538	train-rmse:0.47191
[2]	eval-rmse:0.46885	train-rmse:0.46461
[3]	eval-rmse:0.46405	train-rmse:0.45903
[4]	eval-rmse:0.46094	train-rmse:0.45491
[5]	eval-rmse:0.45847	train-rmse:0.45125
[6]	eval-rmse:0.45579	train-rmse:0.44822
[7]	eval-rmse:0.45326	train-rmse:0.44505
[8]	eval-rmse:0.45115	train-rmse:0.44282
[9]	eval-rmse:0.44954	train-rmse:0.44073
[10]	eval-rmse:0.44668	train-rmse:0.43833
[11]	eval-rmse:0.44551	train-rmse:0.43677
[12]	eval-rmse:0.44427	train-rmse:0.43514
[13]	eval-rmse:0.44289	train-rmse:0.43306
[14]	eval-rmse:0.44190	train-rmse:0.43166
[15]	eval-rmse:0.44058	train-rmse:0.42961
[16]	eval-rmse:0

## Performance Evaluation

In [15]:
y_pred_train = xgboosto.predict(xgb.DMatrix(train_x))
y_pred_test  = xgboosto.predict(xgb.DMatrix(test_x))

#### Accuracy

In [16]:
y_pred_train[np.where(y_pred_train >= 0.5)] = 1
y_pred_train[np.where(y_pred_train < 0.5)] = 0

In [17]:
y_pred_test[np.where(y_pred_test >= 0.5)] = 1
y_pred_test[np.where(y_pred_test < 0.5)] = 0

In [18]:
y_pred_train

array([1., 1., 0., ..., 0., 0., 1.], dtype=float32)

In [19]:
acc_train = accuracy_score(train_y, y_pred_train)
acc_test  = accuracy_score(test_y , y_pred_test)

print("Training Accuracy is: {:.4f} and Validation Accuracy is: {:.4f}".format(acc_train, acc_test))

Training Accuracy is: 0.9991 and Validation Accuracy is: 0.8875


#### Precision

In [20]:
prec_train = precision_score(train_y, y_pred_train)
prec_test  = precision_score(test_y , y_pred_test)

print("Training Precision is: {:.4f} and Validation Precision is: {:.4f}".format(prec_train, prec_test))

Training Precision is: 1.0000 and Validation Precision is: 0.9033


#### Recall

In [21]:
recall_train = recall_score(train_y, y_pred_train)
recall_test  = recall_score(test_y , y_pred_test)

print("Training Recall is: {:.4f} and Validation Recall is: {:.4f}".format(recall_train, recall_test))

Training Recall is: 0.9982 and Validation Recall is: 0.8679


#### F1 Score

In [22]:
f1_train = f1_score(train_y, y_pred_train)
f1_test  = f1_score(test_y , y_pred_test)

print("Training F1-Score is: {:.4f} and Validation F1-Score is: {:.4f}".format(f1_train, f1_test))

Training F1-Score is: 0.9991 and Validation F1-Score is: 0.8853


#### ROC AUC

In [23]:
y_score = nb.predict_proba(test_x)

fpr, tpr, _ = roc_curve(test_y, y_score[:, 1], pos_label = 1)
roc_auc = roc_auc_score(test_y, y_score[:, 1])

plt.plot(fpr, tpr, color = 'orange')
plt.plot([0,1], [0,1], color = 'navy', linestyle = '--')

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve\nAuC = {:.4f}".format(roc_auc))

plt.show()


NameError: name 'nb' is not defined