# XGBoost Implementation

## Data Download and Processing

### RUN THE BELOW CELLS IF USING LOCAL MACHINE

In [112]:
import sys, os
sys.path.append(os.path.dirname(os.path.join((os.path.pardir), "Modules")))

origin_dir = os.path.join(os.path.pardir, 'Data')
new_dir_path = os.path.join(os.path.pardir, 'Datasets')

#for local systems

height, width = 25, 25
csv_dir = os.path.join(os.path.pardir, 'Datasets', 'Resized_data_{}_{}'.format(height, width))
train_csv = os.path.join(csv_dir, 'train.csv')
test_csv = os.path.join(csv_dir, 'test.csv')
val_csv = os.path.join(csv_dir, 'val.csv')

### **RUN THE BELOW CELL IF USING COLAB** (overrides paths and installs packages)

For the git clone segment, please add the Personal Access Token instead of the {pat} segment 

In [113]:
# # RUN THIS CELL IF COLAB

# from google.colab import drive

# drive.mount('/content/gdrive')
# !git clone "https://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@github.com/madhava20217/Malaria-Detection-from-Cells.git"

# !pip install -q -r "/content/Malaria-Detection-from-Cells/requirements_versionless.txt" 

# sys.path.append(os.path.dirname(os.path.join(os.path.curdir, "Malaria-Detection-from-Cells", "Modules")))

# origin_dir = "/content/Data"
# new_dir_path = "/content/Datasets/"

In [114]:
# from Modules.data_download import Data_Download
# from Modules.labelling import Labelling

# download = Data_Download(origin_dir)
# data_dir = download.resize_image(new_dir_path, height, width)

# lab = Labelling()
# lab.label(data_dir)
# train_csv, val_csv, test_csv = lab.train_test_val_split(data_dir, train_split = 0.7, test_split = 0.15, labels = "labels.csv")

### Image Mode

In [115]:
IMAGE_MODE = 1      #1 for colour, 0 for grayscale, unchanged for -1

In [116]:
N_CLASSES = 2

## Imports

In [117]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import cv2

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

## Loading the dataset

In [118]:
train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)

In [119]:
train_df.head()

Unnamed: 0,Image_Path,Parasitized
0,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
1,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
2,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
3,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
4,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0


In [120]:
train_df.dtypes

Image_Path      object
Parasitized    float64
dtype: object

## Data and Labels

In [121]:
train_x = train_df['Image_Path'].to_numpy()
train_y = train_df['Parasitized'].to_numpy()

val_x  = val_df['Image_Path'].to_numpy()
val_y  = val_df['Parasitized'].to_numpy()

#### Load images from paths

In [122]:
def load_images(path_arr):
    '''Reads and loads images into a numpy array
    Returns: a numpy array'''
    arr = []
    for path in path_arr:
        arr.append(cv2.imread(path, IMAGE_MODE))
    
    return np.array(arr)


In [123]:
train_x = load_images(train_x)
val_x = load_images(val_x)

#### Checking shape

In [124]:
print(train_x.shape, val_x.shape, sep = '\n')

(17471, 25, 25, 3)
(4765, 25, 25, 3)


## Flattening

In [125]:
num_train, num_val = train_x.shape[0], val_x.shape[0]

dim = 1
for i in train_x.shape[1:]: dim*=i

print(num_train, dim)

17471 1875


In [126]:
train_x = train_x.reshape([num_train, dim])
val_x = val_x.reshape([num_val, dim])

## XGBoost

In [127]:
dtrain = xgb.DMatrix(train_x, label = train_y)
dval = xgb.DMatrix(val_x, label = val_y)

In [129]:
params = {  "max_depth": 3, 
            "objective": 'reg:logistic',
            'verbosity': 1, 
            'num_parallel_tree' : 20, 
            'booster': 'gbtree', 
            'gamma': 0.5, 
            'tree_method': 'gpu_hist',
            'subsample': 0.5,
            'lambda' : 1
        }
xgboosto = xgb.train(params, dtrain, 2000, [(dval, 'eval'), (dtrain, 'train')])

[0]	eval-rmse:0.47682	train-rmse:0.47807
[1]	eval-rmse:0.46353	train-rmse:0.46506
[2]	eval-rmse:0.45466	train-rmse:0.45610
[3]	eval-rmse:0.44859	train-rmse:0.44953
[4]	eval-rmse:0.44329	train-rmse:0.44407
[5]	eval-rmse:0.43953	train-rmse:0.43976
[6]	eval-rmse:0.43631	train-rmse:0.43608
[7]	eval-rmse:0.43358	train-rmse:0.43273
[8]	eval-rmse:0.43111	train-rmse:0.42978
[9]	eval-rmse:0.42889	train-rmse:0.42704
[10]	eval-rmse:0.42692	train-rmse:0.42454
[11]	eval-rmse:0.42531	train-rmse:0.42246
[12]	eval-rmse:0.42393	train-rmse:0.42060
[13]	eval-rmse:0.42233	train-rmse:0.41844
[14]	eval-rmse:0.42087	train-rmse:0.41658
[15]	eval-rmse:0.41928	train-rmse:0.41465
[16]	eval-rmse:0.41806	train-rmse:0.41286
[17]	eval-rmse:0.41676	train-rmse:0.41111
[18]	eval-rmse:0.41567	train-rmse:0.40951
[19]	eval-rmse:0.41472	train-rmse:0.40793
[20]	eval-rmse:0.41373	train-rmse:0.40655
[21]	eval-rmse:0.41251	train-rmse:0.40495
[22]	eval-rmse:0.41166	train-rmse:0.40368
[23]	eval-rmse:0.41077	train-rmse:0.40239
[2

## Performance Evaluation

In [130]:
y_pred_train = xgboosto.predict(xgb.DMatrix(train_x))
y_pred_val  = xgboosto.predict(xgb.DMatrix(val_x))


# y_pred_train = xgboosto.predict(train_x)
# y_pred_val  = xgboosto.predict(val_x)

In [131]:
y_pred_train[np.where(y_pred_train >= 0.5)] = 1
y_pred_train[np.where(y_pred_train < 0.5)] = 0


y_pred_val[np.where(y_pred_val >= 0.5)] = 1
y_pred_val[np.where(y_pred_val < 0.5)] = 0

In [132]:
print(y_pred_val)

[0. 0. 1. ... 1. 0. 0.]


#### Accuracy

In [133]:
acc_train = accuracy_score(train_y, y_pred_train)
acc_val  = accuracy_score(val_y , y_pred_val)

print("Training Accuracy is: {:.4f} and Validation Accuracy is: {:.4f}".format(acc_train, acc_val))

Training Accuracy is: 1.0000 and Validation Accuracy is: 0.9047


#### Precision

In [134]:
prec_train = precision_score(train_y, y_pred_train)
prec_val  = precision_score(val_y , y_pred_val)

print("Training Precision is: {:.4f} and Validation Precision is: {:.4f}".format(prec_train, prec_val))

Training Precision is: 1.0000 and Validation Precision is: 0.9173


#### Recall

In [135]:
recall_train = recall_score(train_y, y_pred_train)
recall_val  = recall_score(val_y , y_pred_val)

print("Training Recall is: {:.4f} and Validation Recall is: {:.4f}".format(recall_train, recall_val))

Training Recall is: 1.0000 and Validation Recall is: 0.8905


#### F1 Score

In [136]:
f1_train = f1_score(train_y, y_pred_train)
f1_val  = f1_score(val_y , y_pred_val)

print("Training F1-Score is: {:.4f} and Validation F1-Score is: {:.4f}".format(f1_train, f1_val))

Training F1-Score is: 1.0000 and Validation F1-Score is: 0.9037


#### ROC AUC

In [137]:
y_score = xgboosto.predict(xgb.DMatrix(val_x))

fpr, tpr, _ = roc_curve(val_y, y_score[:, 1], pos_label = 1)
roc_auc = roc_auc_score(val_y, y_score[:, 1])

plt.plot(fpr, tpr, color = 'orange')
plt.plot([0,1], [0,1], color = 'navy', linestyle = '--')

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve\nAuC = {:.4f}".format(roc_auc))

plt.show()


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed