# Notes

1. Seed Averaging
2. Maybe use the output of an autoencoder (AE) as input features.

In [1]:
from time import time

# Timestamp taken at the very top of the notebook; the final cell subtracts
# it from a fresh time() call to report the total run time in minutes.
notebook_start_time = time()

#Only Uncomment When Committing

# Directory Setup

In [2]:
# Root folders of the train and test splits of the dataset.
tr_dir_base = "/kaggle/input/labeled-chest-xray-images/chest_xray/train/"
ts_dir_base = "/kaggle/input/labeled-chest-xray-images/chest_xray/test/"

# One sub-folder per class inside each split. Every path keeps its trailing
# slash because file names are appended to it by plain string concatenation.
tr_dir_normal, ts_dir_normal = (f"{base}NORMAL/" for base in (tr_dir_base, ts_dir_base))
tr_dir_pnemon, ts_dir_pnemon = (f"{base}PNEUMONIA/" for base in (tr_dir_base, ts_dir_base))

# Library Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.image as img
from PIL import Image
import cv2

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, log_loss, f1_score

import random as r
import os
from time import time

# Divisor applied to the stacked image arrays to rescale pixel intensities
# (assumes 8-bit images whose largest possible value is 255 — TODO confirm).
MAX_VALUE = 255

# Helper Functions

In [4]:
def breaker():
    """Print a 30-dash horizontal rule with a blank line above and below it."""
    rule = "-" * 30
    print("\n" + rule + "\n")
    
def head(x, no_of_ele=5):
    """Print the leading items of ``x`` framed by two separator rules.

    Args:
        x: Any object that supports slicing (list, string, array, ...).
        no_of_ele: Upper bound of the slice ``x[:no_of_ele]`` that is printed.
    """
    breaker()
    preview = x[:no_of_ele]
    print(preview)
    breaker()

def getFileNames(root_dir=None):
    """Collect every file found under ``root_dir``, searching recursively.

    Each entry is the file's path relative to ``root_dir``. For a file that
    sits directly in ``root_dir`` this is just its file name; for a file in a
    sub-folder the sub-folder is kept, so prepending ``root_dir`` to an entry
    always points at the real file.

    Args:
        root_dir: Directory to scan. A directory that does not exist yields
            an empty list because ``os.walk`` ignores listing errors by
            default.

    Returns:
        list[str]: Relative file paths in sorted order.
    """
    f_name = []
    for dirname, _, filenames in os.walk(root_dir):
        for filename in filenames:
            # Keep the sub-folder part so nested files stay locatable.
            f_name.append(os.path.relpath(os.path.join(dirname, filename), root_dir))
    # os.walk lists entries in arbitrary order; sort so that repeated runs
    # see the files in the same sequence.
    return sorted(f_name)

def removeChannelInfo(file_path=None, file_names=None, size=None):
    """Load images as single-channel arrays and resize them to a square.

    Args:
        file_path: Directory prefix. Each name is appended to it by plain
            string concatenation, so it must end with a path separator.
        file_names: Iterable of file names located under ``file_path``.
        size: Side length, in pixels, of the square every image is resized to.

    Returns:
        tuple[list, list]: ``(images, sizes)``, two parallel lists.
        ``images[i]`` is the resized single-channel image and ``sizes[i]`` is
        its 2-D shape before resizing.

    Raises:
        OSError: If OpenCV cannot read one of the files.
    """
    sizes = []
    images = []
    for name in file_names:
        full_path = file_path + name
        image = cv2.imread(full_path)
        # cv2.imread does not raise for a missing or undecodable file; it
        # returns None. Fail here with the offending path instead of an
        # opaque AttributeError on `.shape` below.
        if image is None:
            raise OSError("Could not read image file: {}".format(full_path))
        if len(image.shape) > 2:
            # Keep only the last channel.
            # NOTE(review): OpenCV loads colour images in BGR order, so this
            # is presumably the red channel; confirm that is intended.
            image = image[:, :, -1]
        sizes.append(image.shape)
        images.append(cv2.resize(image, dsize=(size, size), interpolation=cv2.INTER_LANCZOS4))
    return images, sizes

# Data Processing

**File Name Handling**

In [5]:
# List the image files of every (split, class) folder.
tr_normal_file_names, tr_pnemon_file_names = (
    getFileNames(folder) for folder in (tr_dir_normal, tr_dir_pnemon)
)
ts_normal_file_names, ts_pnemon_file_names = (
    getFileNames(folder) for folder in (ts_dir_normal, ts_dir_pnemon)
)

**Basic Info (used to confirm correctness before building the dataset NumPy arrays)**

In [6]:
# Sanity check: number of files found per split, before any image is loaded.
_split_totals = (
    ("Total Training Set Size        :", len(tr_normal_file_names) + len(tr_pnemon_file_names)),
    ("Total Test Set Size            :", len(ts_normal_file_names) + len(ts_pnemon_file_names)),
)
for _caption, _total in _split_totals:
    breaker()
    print(_caption, repr(_total))
breaker()


------------------------------

Total Training Set Size        : 5232

------------------------------

Total Test Set Size            : 624

------------------------------



**Reading Image Data**

In [7]:
n_size = 127  # side length (pixels) every image is resized to
start_time = time()

# Training split: one call per class folder.
trn_images, trn_sizes = removeChannelInfo(
    file_path=tr_dir_normal, file_names=tr_normal_file_names, size=n_size
)
trp_images, trp_sizes = removeChannelInfo(
    file_path=tr_dir_pnemon, file_names=tr_pnemon_file_names, size=n_size
)

# Test split: same procedure.
tsn_images, tsn_sizes = removeChannelInfo(
    file_path=ts_dir_normal, file_names=ts_normal_file_names, size=n_size
)
tsp_images, tsp_sizes = removeChannelInfo(
    file_path=ts_dir_pnemon, file_names=ts_pnemon_file_names, size=n_size
)

elapsed_minutes = (time() - start_time) / 60
print("Time Taken to process data : {:.2f} minutes".format(elapsed_minutes))

Time Taken to process data : 2.03 minutes


**Consolidating Train Images and creating Labels**

In [8]:
# Stack the NORMAL images followed by the PNEUMONIA images, then rescale
# the pixel values by MAX_VALUE.
tr_images = np.concatenate((trn_images, trp_images), axis=0) / MAX_VALUE

# Labels follow the same order: 0 for each NORMAL image, 1 for each PNEUMONIA image.
n_tr_normal, n_tr_pnemon = len(trn_images), len(trp_images)
tr_labels = np.concatenate((np.zeros(n_tr_normal), np.ones(n_tr_pnemon)))

print(tr_images.shape)
print(tr_labels.shape)

(5232, 127, 127)
(5232,)


**Consolidating Test Images and creating Labels**

In [9]:
# Stack the NORMAL images followed by the PNEUMONIA images, then rescale
# the pixel values by MAX_VALUE.
ts_images = np.concatenate((tsn_images, tsp_images), axis=0) / MAX_VALUE

# Labels follow the same order: 0 for each NORMAL image, 1 for each PNEUMONIA image.
n_ts_normal, n_ts_pnemon = len(tsn_images), len(tsp_images)
ts_labels = np.concatenate((np.zeros(n_ts_normal), np.ones(n_ts_pnemon)))

print(ts_images.shape)
print(ts_labels.shape)

(624, 127, 127)
(624,)


# XGB

In [10]:
# NOTE(review): "gpu_hist" needs a GPU-enabled XGBoost build; newer XGBoost
# releases reportedly replace it with tree_method="hist" plus device="cuda".
# Confirm against the installed version before changing it.
xgb = XGBClassifier(n_estimators=100, random_state=0, tree_method="gpu_hist")

# Flatten every image into one row of pixel features.
train_features = tr_images.reshape(len(tr_images), -1)
xgb.fit(train_features, tr_labels)

test_features = ts_images.reshape(len(ts_images), -1)
y_pred = xgb.predict(test_features)

breaker()
print("XGB Model Accuracy : {:.2f} %".format(100 * accuracy_score(ts_labels, y_pred)))
breaker()


------------------------------

XGB Model Accuracy : 80.61 %

------------------------------



# Final Cell

In [11]:
# Report the total wall-clock time since the first cell ran.
breaker()
notebook_minutes = (time() - notebook_start_time) / 60
print("Time taken to run Notebook : {:.2f} minutes".format(notebook_minutes))
breaker()

#Only Uncomment When Committing


------------------------------

Time taken to run Notebook : 2.43 minutes

------------------------------

