In [1]:
"""
This notebook tunes a ResNet CNN to images of chest x-rays. 
The code is set up to run in GoogleColab in order to utilise GPU. 
The input data is loaded from GDrive. 
"""

'\nThis notebook tunes a ResNet CNN to images of chest x-rays. \nThe code is set up to run in GoogleColab in order to utilise GPU. \nThe input data is loaded from GDrive. \n'

In [2]:
import numpy as np
import pandas as pd
import os
import shutil
import zipfile
import numpy as np
from google.colab import drive, files
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from keras.models import Sequential
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix
import keras
import pickle


In [3]:
# Check access to a GPU & print specs
def check_gpu():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
  else:
    print(gpu_info)

In [4]:
def mount_to_Gdrive(path='/content/gdrive'):
  drive.mount(path)

In [5]:
def copy_files():
  #Copy files folder
  %cp -av 'gdrive/MyDrive/files' '/content'

  # Move files from GDrive
  if 'Train' not in os.listdir():
    os.makedirs('Train')
  if 'Test' not in os.listdir():
    os.makedirs('Test')

  shutil.move('files/train_resize.zip', 'Train/')   
  shutil.move('files/test_resize.zip', 'Test/')   

In [6]:
def unzip_files():
  # Unzip train files
  with zipfile.ZipFile('Train/train_resize.zip', 'r') as zip_ref:
      zip_ref.extractall('Train/')
  # Unzip test files
  with zipfile.ZipFile('Test/test_resize.zip', 'r') as zip_ref:
      zip_ref.extractall('Test/')

In [7]:
def undersample_data(df):
  # Get list of classes in training set 
  train_list=pd.read_csv('files/train_val_list.txt', header=None)[0].tolist()
  # Get list of 'No Finding' class which is majority 
  nf_index=df[(df['Finding Labels']=='No Finding') & (df['Image Index'].isin(train_list))].index
  # Randomly drop a subset of these 
  nf_index_drop=np.random.choice(nf_index, 40000, replace=False)
  nf_images_drop=df[df.index.isin(nf_index_drop)]['Image Index'].tolist()
  # Remove images designated to be dropped from folder directory 
  for i in nf_images_drop:
    os.remove('Train/train_resize/'+str(i))
  # Remove images designated to be droppped from tabular data 
  df=df[~df.index.isin(nf_index_drop)]
  df.reset_index(drop=True, inplace=True)
  print(len(df))
  return df

In [8]:
def process_data(train_path, test_path):
  df=pd.read_csv(r'files/Data_Entry_2017_v2020.csv')
  # Get Labels of train val images as they will be read in order 
  labels=pd.DataFrame([*os.walk(train_path)][0][2])
  labels.columns=['Image']

  # #Merge labels with the df file. 
  labels=labels.merge(df[['Image Index', 'Finding Labels']], left_on='Image', right_on='Image Index')
  labels.drop('Image Index', axis=1, inplace=True)
  #One Hot Encode Labels 
  labels=pd.concat([labels, labels['Finding Labels'].str.get_dummies(sep="|")], axis=1)
  labels['Path']=labels['Image'].apply(lambda x: train_path+str(x) )

  # Test Set
  test=pd.DataFrame([*os.walk(test_path)][0][2])
  test.columns=['Image']

  # Merge labels with the df file.
  test=test.merge(df[['Image Index', 'Finding Labels']], left_on='Image', right_on='Image Index')
  test.drop('Image Index', axis=1, inplace=True)
  #One Hot Encode
  test=pd.concat([test, test['Finding Labels'].str.get_dummies(sep="|")], axis=1)
  test['Path']=test['Image'].apply(lambda x: train_path+str(x) )

  return labels, test, df

In [9]:
def compute_class_weights(labels):
  # Separate the labels df into those where there are more than 1 entry, and those where there is a single entry 
  labels_ = labels[labels['Finding Labels'].apply(lambda x: x.count("|"))==0]
  labels__= labels[labels['Finding Labels'].apply(lambda x: x.count("|"))!=0]

  # Encode class labels 
  ord_enc = OrdinalEncoder()
  labels_["Encoded"] = ord_enc.fit_transform(labels_[["Finding Labels"]])
  print(labels_['Encoded'].nunique())
  labels__['Encoded']=16

  labels=pd.concat([labels_, labels__])

  # Compute Class weights based on distribution of classes 
  class_weights = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(labels['Encoded']), y=labels['Encoded'])
  vals=class_weights[:-1]
  # Store in dictionary 
  class_weights_dict=dict(enumerate(vals))

  # Create df to summarise number of observations & weight per class 
  classes=pd.merge(labels['Encoded'].value_counts(), pd.DataFrame(data=class_weights_dict.items()).iloc[:, -1], left_index=True, right_index=True)
  classes.columns=['Observations', 'Class weight']

  print(classes)
  
  return labels, class_weights_dict

In [10]:
def get_image_generators(labels, train_path, test_path):
  # Create generators per training, val & test datasets
  datagen=ImageDataGenerator(rescale=1./255, validation_split = 0.2)
  val_datagen=ImageDataGenerator(rescale=1./255, validation_split = 0.2)
  test_datagen=ImageDataGenerator(rescale=1./255)

  # Define generators per set. Set batch size to 64 
  train_generator=datagen.flow_from_dataframe(dataframe=labels.loc[:, :], x_col='Image', y_col=labels.columns[2:-2], directory=train_path, class_mode='raw', batch_size=64, target_size=(224, 224), subset='training')
  val_generator=val_datagen.flow_from_dataframe(dataframe=labels.loc[:, :], x_col='Image', y_col=labels.columns[2:-2], directory=train_path, class_mode='raw', batch_size=64, target_size=(224, 224), subset='validation')
  test_generator=test_datagen.flow_from_dataframe(dataframe=test, x_col='Image', y_col=test.columns[2:-2], directory=test_path, class_mode='raw', batch_size=64, target_size=(224, 224), shuffle=False)
  return train_generator, val_generator, test_generator

In [26]:
def create_model(trainable=False):
  # Download pretrained resnet for feature reduction (no classification layer)
  resnet = tf.keras.applications.ResNet50V2(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(224, 224, 3),
    pooling=None )
  resnet.trainable = trainable

  # Append layers for training classification
  model = tf.keras.Sequential([
    resnet,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(50),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(30),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(15),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('sigmoid')])
  return model 

In [None]:
def train(train_generator, val_generator, test_generator, model, my_callbacks ):
  res=pd.DataFrame() # Create df to store results 
  count=1
  for lr in [0.0001]: # a range of learning rates were tested , including [0.001, 0.0001, 0.00001] with training done in seperate notebooks to speed up tuning 
    for optimizer in [tf.keras.optimizers.Adam(learning_rate=lr), tf.keras.optimizers.SGD(learning_rate=lr)]:
      for batch in [32, 256, 512]:
        print(lr, optimizer, batch)

        key=str(lr)+"_"+str(count)+"_"+str(batch)

        # Estimate step size for generator 
        STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size

        # Compile Model with metrics = Accuracy & AUC
        model.compile(optimizer=optimizer,loss="binary_crossentropy",metrics=[tf.keras.metrics.binary_accuracy, tf.keras.metrics.AUC(multi_label=True)])

        # Fit Model 
        history = model.fit(train_generator, steps_per_epoch=STEP_SIZE_TRAIN, validation_data=val_generator, class_weight=class_weights_dict, #callbacks=my_callbacks,
                            epochs=50)
        # Predict 
        preds=model.predict(test_generator)

        # Store ROC_AUC scores for each class 
        for i in range(15):
          res.loc[i, 'Disease'+str(key)]= test.columns[i+2]
          res.loc[i, 'AUC'+str(key)] = roc_auc_score(test.iloc[:,i+2], preds[:,i], average='samples')
        # Store in Colab folder 
        with open('/ResNetDict_0_0001', 'ab') as file_pi:
            pickle.dump(history.history, file_pi)

        count=count+1
 

In [11]:
# Check access to GPU within COlab & print specs 
check_gpu()

Sun Feb  6 21:23:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Mount to GDrive where data is stored 
mount_to_Gdrive()

Mounted at /content/gdrive


In [13]:
# Copy files within Gdrive to relevant folders
copy_files()

'gdrive/MyDrive/files' -> '/content/files'
'gdrive/MyDrive/files/train_val_list.txt' -> '/content/files/train_val_list.txt'
'gdrive/MyDrive/files/test_list.txt' -> '/content/files/test_list.txt'
'gdrive/MyDrive/files/Data_Entry_2017_v2020.csv' -> '/content/files/Data_Entry_2017_v2020.csv'
'gdrive/MyDrive/files/test_resize.zip' -> '/content/files/test_resize.zip'
'gdrive/MyDrive/files/train_resize.zip' -> '/content/files/train_resize.zip'
'gdrive/MyDrive/files/ResNetDict_0_00001' -> '/content/files/ResNetDict_0_00001'
'gdrive/MyDrive/files/res_ResNet_0_00001.csv' -> '/content/files/res_ResNet_0_00001.csv'
'gdrive/MyDrive/files/res_ResNet_0_001.csv' -> '/content/files/res_ResNet_0_001.csv'
'gdrive/MyDrive/files/ResNetDict_0_001' -> '/content/files/ResNetDict_0_001'
'gdrive/MyDrive/files/res_ResNet_0_0001.csv' -> '/content/files/res_ResNet_0_0001.csv'
'gdrive/MyDrive/files/ResNetDict_0_0001' -> '/content/files/ResNetDict_0_0001'


In [14]:
# Unzip folders of images 
unzip_files()

In [15]:
# Import tabular data
df=pd.read_csv(r'files/Data_Entry_2017_v2020.csv')

In [17]:
# Undersample majority class 'No Finding' (class weights to be applied during training too)
undersample=True
if undersample==True:
  df=undersample_data(df)

72120


In [18]:
train_path='Train/train_resize'
test_path='Test/test_resize'

In [19]:
labels, test, df=process_data(train_path=train_path, test_path=test_path)

In [20]:
labels, class_weights_dict=compute_class_weights(labels)

15
      Observations  Class weight
10.0         10500      0.276929
8.0           7327      0.396854
0.0           3414      0.851714
4.0           2788      1.042952
11.0          2248      1.293483
9.0           1696      1.714475
14.0          1241      2.343070
2.0            829      3.507539
12.0           817      3.559058
1.0            777      3.742278
5.0            587      4.953578
6.0            551      5.277223
3.0            397      7.324307
13.0           234     12.426282
7.0             65     44.734615


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [21]:
train_generator, val_generator, test_generator=get_image_generators(labels=labels, train_path=train_path, test_path=test_path)

Found 37220 validated image filenames.
Found 9304 validated image filenames.
Found 25596 validated image filenames.


In [27]:
model=create_model(trainable=False)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50v2 (Functional)     (None, 7, 7, 2048)        23564800  
                                                                 
 flatten (Flatten)           (None, 100352)            0         
                                                                 
 dense (Dense)               (None, 50)                5017650   
                                                                 
 batch_normalization (BatchN  (None, 50)               200       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 50)                0         
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                        

In [23]:
my_callbacks = [ tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, verbose=0, mode="auto", baseline=None, min_delta=0.01),]

In [34]:
def train(train_generator, val_generator, test_generator, model, my_callbacks ):
  res=pd.DataFrame() # Create df to store results 
  count=1
  for lr in [0.0001]: # a range of learning rates were tested , including [0.001, 0.0001, 0.00001] with training done in seperate notebooks to speed up tuning 
    for optimizer in [tf.keras.optimizers.Adam(learning_rate=lr), tf.keras.optimizers.SGD(learning_rate=lr)]:
      for batch in [32, 256, 512]:
        print(lr, optimizer, batch)

        key=str(lr)+"_"+str(count)+"_"+str(batch)

        # Estimate step size for generator 
        STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size

        # Compile Model with metrics = Accuracy & AUC
        model.compile(optimizer=optimizer,loss="binary_crossentropy",metrics=[tf.keras.metrics.binary_accuracy, tf.keras.metrics.AUC(multi_label=True)])

        # Fit Model 
        history = model.fit(train_generator, steps_per_epoch=STEP_SIZE_TRAIN, validation_data=val_generator, class_weight=class_weights_dict, #callbacks=my_callbacks,
                            epochs=50)
        # Predict 
        preds=model.predict(test_generator)

        # Store ROC_AUC scores for each class 
        for i in range(15):
          res.loc[i, 'Disease'+str(key)]= test.columns[i+2]
          res.loc[i, 'AUC'+str(key)] = roc_auc_score(test.iloc[:,i+2], preds[:,i], average='samples')
        # Store in Colab folder 
        with open('/ResNetDict_0_0001', 'ab') as file_pi:
            pickle.dump(history.history, file_pi)

        count=count+1
 

In [None]:
train(train_generator, val_generator, test_generator, model, my_callbacks )

0.0001 <tensorflow.python.keras.optimizer_v2.adam.Adam object at 0x7fe53e085a90> 32
Found 37220 validated image filenames.
Found 9304 validated image filenames.
Found 25596 validated image filenames.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
0.0001 <tensorflow.python.keras.optimizer_v2.adam.Adam object at 0x7fe53e085a90> 256
Found 37220 validated image filenames.
Found 9304 validated image filenames.
Found 25596 validated image filenames.
Epoch 1/

In [None]:
# Copy results from Colab to Gdrive 
%cp -av '/ResNetDict_0_0001' 'gdrive/MyDrive/files' 
res=res.to_csv('res_ResNet_0_0001.csv')
%cp -av 'res_ResNet_0_0001.csv' 'gdrive/MyDrive/files' 
