<a href="https://colab.research.google.com/github/mthomp89/NU_489_capstone/blob/develop_thompson/NIH_with_tensorboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Modeling NIH Dataset**

In [0]:
# Delete /content/sample_data; nothing else in this notebook references it
!rm -rf /content/sample_data

## Imports

In [0]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [0]:
# Load general libraries

import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from os import path, listdir
import zipfile
import warnings
from collections import Counter

# Silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Plot styling: seaborn "whitegrid" theme, figures rendered inline
sns.set_style('whitegrid')
%matplotlib inline

In [0]:
# Load library models

from tensorflow.keras import layers, models
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import Callback, EarlyStopping
from tensorflow.keras.callbacks import TensorBoard

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

#from imblearn.keras import balanced_batch_generator
#from imblearn.over_sampling import SMOTE

In [0]:
from packaging import version

#import tensorflow as tf
#from tensorflow import keras

# Report the installed TensorFlow version and stop right away when its
# major version is below 2.
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."


## Mount Drive and Kaggle

In [0]:
# Mount Google Drive at /content/drive (Colab-only helper)

from google.colab import drive
drive.mount('/content/drive')
# Base folder of the mounted Drive; not referenced again in this notebook
root_path = '/content/drive/My Drive/'

In [0]:
# Upload the Kaggle API token (kaggle.json) from the local machine.
# The file has to be downloaded from your own Kaggle account; a later cell
# copies kaggle.json from the working directory into ~/.kaggle.

from google.colab import files

api = files.upload() 

In [0]:
# Install the Kaggle package, which provides the `kaggle` command used below

!pip install -q Kaggle

In [0]:
# make a directory to save the .json file

! mkdir =p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! ls ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

## Load Data

In [0]:
# load Kaggle data
%timeit
! kaggle datasets download -d nih-chest-xrays/sample

# Not enough disk space on Colab to unzip the whole dataset
#! kaggle datasets download -d nih-chest-xrays/data

In [0]:
# save files in .zip folder to G Drive

!unzip -q sample.zip -d image_path

## Curate Data

In [0]:
# Read the per-image label table from the extracted sample archive.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-type column inference.

labels_df = pd.read_csv('/content/image_path/sample_labels.csv', low_memory=False)

In [0]:
# Binary "is any condition present?" flag derived from the finding text:
# an image either has a finding or it does not.

def conditions(x):
  """Return 'N' when the label text contains 'No Finding', otherwise 'Y'."""
  return 'N' if 'No Finding' in x else 'Y'

# Apply the Y/N rule to each row's 'Finding Labels' text
labels_df['HasCondition'] = labels_df['Finding Labels'].map(conditions)

In [0]:
# One-hot encode HasCondition into Condition_N / Condition_Y columns and
# put them in place of the original HasCondition column.
# The dummy dtype is pinned to uint8: pandas >= 2.0 produces boolean dummies
# by default, and these columns later become the numeric label array fed to
# the model.

labels_df_cat = pd.concat(
    [
        labels_df.drop(columns=['HasCondition']),
        pd.get_dummies(labels_df['HasCondition'], prefix='Condition', dtype=np.uint8),
    ],
    axis=1,
)

In [0]:
# Add one 0/1 indicator column per finding, so images that list several
# conditions are represented.
# The list is named `condition_names` rather than `conditions`: the old name
# overwrote the conditions() helper defined earlier, so re-running the
# HasCondition cell afterwards passed this list to .map() instead of the
# function.

condition_names = ['No Finding','Infiltration','Atelectasis','Effusion','Nodule','Pneumothorax','Mass','Consolidation','Pleural_Thickening','Cardiomegaly','Emphysema','Fibrosis','Edema','Pneumonia','Hernia']

# Same plain-substring test as before (regex=False), done column-wise
# instead of through a per-row Python lambda.
for name in condition_names:
    labels_df[name] = labels_df['Finding Labels'].str.contains(name, regex=False).astype(int)

In [0]:
# Preview the first rows, including the indicator columns added above
labels_df.head()

In [0]:
# Load every file in the image folder, resized to image_shape, and keep the
# pixel arrays and the file names in two parallel lists.

img_path = '/content/image_path/sample/images/'
image_shape = (128,128)
scans = list()
img_id = list()

# listdir() returns names in arbitrary order. Sorting fixes the order so the
# seeded train/test split later in the notebook picks the same images on
# every run.
for file_name in sorted(listdir(img_path)):
  picture = load_img(path.join(img_path, file_name), target_size=image_shape)
  scans.append(img_to_array(picture))
  img_id.append(file_name)

In [0]:
# Attach the Condition_N / Condition_Y labels to the loaded images by
# matching each file name against the 'Image Index' column.

img_id_df = pd.DataFrame(img_id, columns = ['IMG_ID'])
image_labels = labels_df_cat.loc[:, ['Image Index','Condition_N','Condition_Y']]

# Left join keeps every loaded image, in load order
img_id_df = img_id_df.merge(
    image_labels,
    how = 'left',
    left_on = 'IMG_ID',
    right_on = 'Image Index',
)

In [0]:
# Keep only the two one-hot label columns; this frame is the model target

labels = img_id_df[['Condition_N','Condition_Y']]

## Prepare Data

In [0]:
# Clear any logs from previous runs (TensorBoard output is written under ./logs/fit)
!rm -rf ./logs/ 

In [0]:
# Convert the image list and the label frame to NumPy arrays, the form
# passed to train_test_split and the Keras generators below

all_scans = np.asarray(scans)
all_labels = np.asarray(labels)

In [0]:
# set up data for the model

seed = 42

X = all_scans
y = all_labels

# Hold out 20% of the images; the fixed seed makes the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = seed)

# Training images: divide pixel values by 255 and apply random augmentation
# (shear, zoom, horizontal flip).
train_data_gen = ImageDataGenerator(
    rescale = 1.0/255.0,
    shear_range = 0.2,
    zoom_range = 0.2,
    horizontal_flip = True)

# Held-out images: rescale only. Random shear/zoom/flip belongs to training;
# applying it here made the validation metrics depend on random distortions
# of the held-out set.
test_data_gen = ImageDataGenerator(rescale = 1.0/255.0)


In [0]:
# define data for model

train_samples = len(Xtrain)
test_samples = len(Xtest)
batch_size = 10

# final staging of data
# - Xtrain / Xtest are already NumPy arrays, so they are passed straight
#   through instead of being copied again with np.array().
# - The training iterator reuses `seed` so shuffling and augmentation are
#   repeatable between runs.
# - The held-out iterator is not shuffled, so its batches stay in the same
#   order as ytest.

train_data = train_data_gen.flow(Xtrain, ytrain, batch_size = batch_size, seed = seed)
test_data = test_data_gen.flow(Xtest, ytest, batch_size = batch_size, shuffle = False)

## Model Build

In [0]:
# Build the model: three blocks of two 3x3 convolutions followed by 2x2
# max-pooling (32, 64, then 128 filters), dropout of 0.2 between blocks,
# then a 128-unit dense layer and a 2-unit output.

model = models.Sequential()
# Input: 128x128 images with 3 channels, matching the size the images were loaded at
model.add(layers.Conv2D(32, (3,3), activation = 'relu', input_shape = (128,128,3)))
model.add(layers.Conv2D(32, (3,3), activation = 'relu'))
model.add(layers.MaxPooling2D((2,2)))

model.add(layers.Dropout(0.2))
model.add(layers.Conv2D(64, (3,3), activation = 'relu'))
model.add(layers.Conv2D(64, (3,3), activation = 'relu'))
model.add(layers.MaxPooling2D((2,2)))

model.add(layers.Dropout(0.2))
model.add(layers.Conv2D(128, (3,3), activation = 'relu'))
model.add(layers.Conv2D(128, (3,3), activation = 'relu'))
model.add(layers.MaxPooling2D((2,2)))

model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation = 'relu'))

# Final output layer: two units, one per one-hot label column
# (Condition_N, Condition_Y). Every image belongs to exactly one of the two
# classes, so softmax is used to make the two scores sum to 1; independent
# sigmoid units could score both classes high (or both low) for one image.
model.add(layers.Dense(2, activation = 'softmax'))



In [0]:
# Compile with RMSprop (learning rate 0.001), binary cross-entropy loss and
# accuracy as the reported metric.
model.compile(
    optimizer = RMSprop(learning_rate = 0.001),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

# Whole batches per epoch for training and validation; floor division
# leaves out any partial final batch.
training_steps = train_samples // batch_size
test_steps = test_samples // batch_size

# Each execution logs to its own timestamped folder under logs/fit
log_dir = "logs/fit/{}".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1)





In [0]:
startTrainTime = time.time()

# Train for 15 epochs on the augmented training iterator, validating on the
# held-out iterator and logging to TensorBoard.
# Model.fit accepts generator / Sequence input directly; fit_generator is
# deprecated and is no longer available in newer TensorFlow releases.
# Note: no balanced batch generator is involved here (the imblearn import at
# the top of the notebook is commented out).

model_train = model.fit(train_data,
                        steps_per_epoch = training_steps,
                        epochs = 15,
                        validation_data = test_data,
                        validation_steps = test_steps,
                        callbacks = [tensorboard_callback])

# Wall-clock duration of the fit call above
endTrainTime = time.time()
trainTime = endTrainTime - startTrainTime
print()
print('Total Training Time (sec): {}'.format(trainTime))

In [0]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

### Start TensorBoard 

In [0]:
# Open the TensorBoard UI inline, reading the runs written under logs/fit
%tensorboard --logdir logs/fit

# Evaluate Model

In [0]:
#y_pred = model.predict(Xtest)
#y_pred[y_pred >= 0.5] = 1
#y_pred[y_pred < 0.5] = 0

### ROC score

In [0]:
# ROC score

#roc = roc_auc_score(ytest.astype(int), y_pred.astype(int))
#print('ROC AUC: {:.3f}'.format(roc))