In [128]:
#Import Libraries
import cudf as pd
import cupy as np
import os
import cv2
from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.metrics.confusion_matrix import confusion_matrix
from sklearn.metrics import accuracy_score
from tensorflow.keras.datasets import fashion_mnist
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Preprocessing

### 1. Loading the dataset

In [129]:
# side length (in pixels) every image is resized to
image_size = 200
# class names; a name's position in this list is its integer label
# (PNEUMONIA -> 0, NORMAL -> 1)
labels = ['PNEUMONIA', 'NORMAL']


def data_loader(data_dir):
    """Load every image found in ``data_dir/<label>`` for each label in ``labels``.

    Each file is read as a single-channel grayscale image and resized to
    ``image_size`` x ``image_size``.

    Parameters
    ----------
    data_dir : str
        Directory that contains one sub-folder per entry of ``labels``.

    Returns
    -------
    list
        One ``[image, class_num]`` entry per readable image, where ``image`` is
        the array returned by ``cv2.resize`` and ``class_num`` is the index of
        the image's folder name in ``labels``.

    Notes
    -----
    Files that OpenCV cannot decode (``cv2.imread`` returns ``None`` for them,
    e.g. hidden or non-image files) are reported and skipped instead of
    crashing ``cv2.resize``. ``os.listdir`` still raises if a class folder is
    missing.
    """
    data = list()
    for class_num, label in enumerate(labels):
        path = os.path.join(data_dir, label)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and therefore e.g. train[2]) reproducible between runs
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)
            # flag 0 asks OpenCV for a single-channel grayscale image
            img_arr = cv2.imread(img_path, 0)
            if img_arr is None:
                print("skipping unreadable file: %s" % img_path)
                continue
            # resizing the image to image_size x image_size for analysis
            resized_arr = cv2.resize(img_arr, (image_size, image_size))
            data.append([resized_arr, class_num])
    return data

In [130]:
# data_loader returns a list with one entry per image
# list[0] is a two-element list of the form [image, class]
# list[0][0] is the image itself: the array returned by cv2.resize (image_size x image_size)

val = data_loader('../input/chest-xray-pneumonia/chest_xray/val')
test = data_loader('../input/chest-xray-pneumonia/chest_xray/test')
train = data_loader('../input/chest-xray-pneumonia/chest_xray/train')

In [131]:
# displaying one stored image from the training set
plt.figure(figsize = (6,6))
# the images were read as single-channel grayscale, so draw them with a gray
# colormap instead of matplotlib's default false-colour map
plt.imshow(np.asnumpy(train[2][0]), cmap='gray')

### 2. Normalizing the images

In [132]:
# scaling the pixel values of every image in a dataset list by 1/255
def normalize_list(train):
    """Divide the image of every ``[image, label]`` entry by 255, in place.

    The image is first converted with ``np.array`` (``np`` is the array module
    imported at the top of the notebook), then divided by 255 and written back
    into slot 0 of its entry. The same list object that was passed in is
    returned, so the caller's list is modified as well.
    """
    for sample in train:
        pixels = np.array(sample[0])
        sample[0] = pixels / 255
    return train

# normalise the three splits in the same order as before: train, test, val.
# NOTE: normalize_list divides by 255 every time it is called, so re-running
# this cell without reloading the data scales the images again.
train, test, val = (normalize_list(split) for split in (train, test, val))

In [133]:
# checking if the values were normalised properly: after dividing by 255 every
# pixel should lie in [0, 1]. Assert that and print a compact summary instead
# of dumping the whole image array into the cell output.
first_image, first_label = train[0]
assert float(first_image.min()) >= 0.0 and float(first_image.max()) <= 1.0
print(first_image.shape, first_image.dtype, float(first_image.min()), float(first_image.max()), first_label)

# Splitting the dataset

In [134]:
# splitting a dataset list into a feature array and a label array
def make_x_y(data):
    """Separate ``[image, label]`` entries into two stacked arrays.

    Returns ``(X, Y)``: ``X`` is ``np.array`` of all images (slot 0 of every
    entry) and ``Y`` is ``np.array`` of all labels (slot 1), both in the order
    the entries appear in ``data``.
    """
    images = [entry[0] for entry in data]
    targets = [entry[1] for entry in data]
    return np.array(images), np.array(targets)
# build the feature / label arrays for the train, test and validation splits
(x_train, y_train), (x_test, y_test), (x_val, y_val) = (
    make_x_y(split) for split in (train, test, val)
)

In [135]:
# checking shapes of the data: one line per array, features then labels,
# for train, test and val
print(
    x_train.shape,
    y_train.shape,
    x_test.shape,
    y_test.shape,
    x_val.shape,
    y_val.shape,
    sep='\n',
)

### 1. flattening / reshaping the images

In [136]:
# using reshape to flatten every image into one row of image_size*image_size
# columns (200*200 = 40,000 with the current setting); deriving the width from
# image_size keeps this cell correct if the resize target is changed
n_pixels = image_size * image_size
x_train = x_train.reshape(len(x_train), n_pixels)
x_test = x_test.reshape(len(x_test), n_pixels)
x_val = x_val.reshape(len(x_val), n_pixels)

print(x_train.shape)
print(x_test.shape)
print(x_val.shape)

# Model Creation & Training

### 1. testing the RFC model on the dataset

In [137]:
from cuml.ensemble import RandomForestClassifier as cuRFC

In [138]:
# n_estimators is number of trees in forest,
# max_features = number of features to look at for best split 1.0 means look at all features
# random_state seeds the forest so repeated runs are comparable
# (see the cuML docs for the remaining caveats on exact reproducibility)

cuml_model = cuRFC(max_features=1.0, n_bins=8, n_estimators=40, random_state=42)
# making the model and fitting the training data on the model;
# features are passed as float32 and the class labels as int32, which is the
# label dtype the cuML classifier documents (they are class ids, not floats)
cuml_model.fit(x_train.astype('float32'), y_train.astype('int32'))

In [139]:
# using the model trained above to predict the class of every test image,
# then showing the type and shape of the returned predictions
x_test_input = x_test.astype('float32')
y_rfc_predict = cuml_model.predict(x_test_input)
print(type(y_rfc_predict), y_rfc_predict.shape, sep='\n')

# Model Evaluation | Test Set

In [140]:
# flattening y_test into a 1-D vector with one entry per test sample,
# the same layout used for the predictions
y_test_temp = y_test.reshape(-1)
print(type(y_test_temp), y_test_temp.shape, sep='\n')

In [141]:
# np is cupy in this notebook; np.asnumpy copies both label vectors into host
# numpy arrays so that the sklearn metric functions can consume them
y_np_test, y_np_rfc_predict = (
    np.asnumpy(device_arr) for device_arr in (y_test_temp, y_rfc_predict)
)
print(type(y_np_test), type(y_np_rfc_predict), sep='\n')

### 1. Accuracy of model

In [142]:
# fraction of test images whose predicted class equals the true class
accu = accuracy_score(y_true=y_np_test, y_pred=y_np_rfc_predict)
print("accuracy of model is: %f" % (accu,))

### 2. Confusion matrix

In [143]:
import seaborn as sns

In [144]:
print("confusion matrix of model: \n")
# rows of the matrix are the true classes and columns the predicted classes;
# class ids follow the loading order: 0 = PNEUMONIA, 1 = NORMAL
cmap = confusion_matrix(y_np_test, y_np_rfc_predict)
plt.figure(figsize = (4, 4), dpi = 150)
hm = sns.heatmap(data=cmap,annot=True,fmt='g')
# label the axes so the reader can tell true from predicted counts
hm.set_xlabel('Predicted label')
hm.set_ylabel('True label')
plt.show()

### 3. classification report of model

In [145]:
# per-class precision / recall / F1, with the class names from `labels` as row titles
print("classification report of model: \n")
print(
    classification_report(
        y_true=y_np_test,
        y_pred=y_np_rfc_predict,
        target_names=labels,
    )
)

### 4. ROC curve

In [146]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [147]:
# plotting the ROC curve and associated metrics
# An ROC curve is traced by sweeping a threshold over a continuous score.
# Hard 0/1 predictions give only one operating point, so the class
# probabilities of the forest are used as the score instead.
pred_proba = np.asnumpy(cuml_model.predict_proba(x_test.astype('float32')))
# column 0 holds the probability of class 0, which is PNEUMONIA in the label
# encoding used when loading the data; treat it as the positive class
pred = pred_proba[:, 0]

fpr, tpr, threshold = metrics.roc_curve(y_np_test, pred, pos_label=0)
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(6, 4), dpi = 150)
plt.title('ROC curve')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# dashed diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

### 5. All Metrics 

In [148]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [149]:
# Class 0 is PNEUMONIA in the label encoding used when loading the data, so it
# is scored as the positive class. sklearn's default (pos_label=1) would report
# precision / recall / F1 for the NORMAL class instead.
accuracy  = accuracy_score(y_np_test, y_np_rfc_predict)
precision  = precision_score(y_np_test, y_np_rfc_predict, pos_label=0)
recall  = recall_score(y_np_test, y_np_rfc_predict, pos_label=0)
f1  = f1_score(y_np_test, y_np_rfc_predict, pos_label=0)
# computed from the hard 0/1 predictions, not from probabilities
roc  = roc_auc_score(y_np_test, y_np_rfc_predict)

value = [accuracy, precision, recall, f1, roc]
# Use a dedicated name for the bar labels: rebinding the global `labels` here
# would overwrite the class names that data_loader and classification_report
# read, breaking those cells on any re-run.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC Score']

plt.figure(figsize = (6, 4), dpi=150)
plt.bar(metric_names, value)
plt.title('Metrics')
plt.show()

## model results
>  The model reaches an accuracy of **76.9%** on the test set. <br/>
>  Recall for the pneumonia class is high (**99%**), which is desired. <br/>
>  The F1 score for the pneumonia class is also high (**84%**). <br/>


### note
>  The model is not evaluated on the validation set: it contains too few examples to be a reliable measure of the model's performance.