In [1]:
# standard library
import os
import time
import zipfile
from datetime import timedelta

# basics
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

#Models and layers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout
from keras.applications.vgg16 import VGG16
#optimizers TO be extended
from keras.optimizers import Adam, RMSprop, SGD

#stopping
from keras.callbacks import EarlyStopping, ModelCheckpoint

#visualizing
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report,ConfusionMatrixDisplay
from keras_preprocessing.image import img_to_array, array_to_img, load_img, ImageDataGenerator
#gridsearch
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from keras.utils.np_utils import to_categorical

import talos
from talos.utils import lr_normalizer

In [27]:
# Project root. Defaults to the original workstation layout; set the PATREC_ROOT
# environment variable to run the notebook on another machine without editing paths.
# The root is normalised to end with exactly one "/" because the paths below (and
# the cells that use them) are built by plain string concatenation.
data_root = os.environ.get("PATREC_ROOT", "E:/School/UU/PATREC/").rstrip("/\\") + "/"

# Directory holding the zipped train/val/test CSV splits.
ziploc = data_root + "CXR_project/NEW_DATA/"
# Root directory of the augmented images.
augmented_image_loc = data_root + "PR_images/"
# Original (non-augmented) dataset images.
og_image_loc = data_root + ".darwin/datasets/v7-labs/covid-19-chest-x-ray-dataset/images/"
# Where model checkpoints are stored.
store_model_loc = data_root + "checkpoints/"

# Single folder containing every image referenced by the split CSVs.
all_img_loc = augmented_image_loc + "ALL/"

In [3]:
# Load the train / validation / test split tables from their zip archives.
# Each archive and its CSV member are opened in a `with` block so the file handles
# are released as soon as pandas has read the table (read_csv loads it eagerly).
# The *zip names stay bound after the block, but refer to closed archives.
with zipfile.ZipFile(ziploc + 'final_train_data.zip') as trainzip, \
        trainzip.open('train_final.csv') as train_csv:
    train_df = pd.read_csv(train_csv)

with zipfile.ZipFile(ziploc + 'final_val_data.zip') as valzip, \
        valzip.open('val_final.csv') as val_csv:
    val_df = pd.read_csv(val_csv)

with zipfile.ZipFile(ziploc + 'final_test_data.zip') as testzip, \
        testzip.open('test_final.csv') as test_csv:
    test_df = pd.read_csv(test_csv)

In [4]:
# Preview the validation split; the recorded output below is pandas' truncated head/tail view.
val_df

Unnamed: 0,ogfilename,label,filename,type,lung1,lung2,view,Covid,width,height,json_filename
0,aug_1-s2.0-S0196070920301691-gr3_lrg.jpg_0_258...,C,,,,,,,,,
1,aug_1-s2.0-S0196070920301691-gr3_lrg.jpg_0_285...,C,,,,,,,,,
2,aug_1-s2.0-S0196070920301691-gr3_lrg.jpg_0_329...,C,,,,,,,,,
3,aug_1-s2.0-S0196070920301691-gr3_lrg.jpg_0_467...,C,,,,,,,,,
4,aug_1-s2.0-S0196070920301691-gr3_lrg.jpg_0_704...,C,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1134,person1377_virus_2369.jpeg,D,00001225.jpeg,Viral Pneumonia,"{'path': [{'x': 617, 'y': 86}, {'x': 605, 'y':...","{'path': [{'x': 791, 'y': 54}, {'x': 789, 'y':...",,False,1584.0,720.0,person1377_virus_2369.json
1135,person1488_virus_2592.jpeg,D,00001170.jpeg,Viral Pneumonia,"{'path': [{'x': 568, 'y': 138}, {'x': 563, 'y'...","{'path': [{'x': 789, 'y': 133}, {'x': 789, 'y'...",,False,1504.0,1096.0,person1488_virus_2592.json
1136,person500_virus_1009.jpeg,D,00000745.jpeg,Viral Pneumonia,"{'path': [{'x': 598.0, 'y': 81}, {'x': 596.0, ...","{'path': [{'x': 399, 'y': 90}, {'x': 398, 'y':...",,False,968.0,632.0,person500_virus_1009.json
1137,person1270_virus_2163.jpeg,D,00001275.jpeg,Viral Pneumonia,"{'path': [{'x': 533.0, 'y': 76}, {'x': 530, 'y...","{'path': [{'x': 778, 'y': 113.0}, {'x': 778, '...",,False,1280.0,920.0,person1270_virus_2163.json


### Model

In [5]:
def get_data(df, imgloc, target_size=(224, 224)):
    """Load the images listed in ``df['ogfilename']`` into one array.

    Each file is read from ``imgloc + "ALL/" + filename``, converted to an array
    and divided by 255. Images whose size differs from ``target_size`` are resized
    with padding (aspect ratio preserved, nearest-neighbour interpolation).

    Parameters
    ----------
    df : DataFrame with an ``ogfilename`` column of image file names.
    imgloc : str, directory prefix (must end with a path separator) that contains
        the ``ALL/`` image folder.
    target_size : (height, width) every returned image is brought to.
        Defaults to (224, 224), the value previously hard-coded.

    Returns
    -------
    numpy.ndarray stacking all images; an empty array when ``df`` has no rows.
    """
    target_height, target_width = target_size
    data = []
    for filename in df['ogfilename']:
        img = load_img(imgloc + "ALL/" + filename)
        x = img_to_array(img) / 255
        # PIL reports size as (width, height). The previous check
        # (`size > (224,224) or size < (244,244)`) compared tuples lexicographically
        # and was true for every image, so the "already the right size" branch
        # could never run.
        if img.size != (target_width, target_height):
            x = tf.image.resize_with_pad(x, target_height, target_width, method="nearest")
        # Store plain arrays so the list never mixes tensors and ndarrays.
        data.append(np.asarray(x))
    return np.asarray(data)

In [28]:
seed = 4
# Only rescaling is applied here: pixel values are multiplied by 1/255.
datagenerator = ImageDataGenerator(rescale=1/255)

# Settings shared by the train, validation and test iterators, kept in one place so
# the three splits cannot drift apart.
# NOTE(review): `keep_aspect_ratio` is passed through unchanged from the original
# cell; whether the installed keras_preprocessing version honours it or silently
# ignores it should be confirmed.
flow_kwargs = dict(
    directory=all_img_loc,
    x_col="ogfilename",
    seed=seed,
    target_size=(224, 224),  # input resolution used for VGG16
    keep_aspect_ratio=True,
    validate_filenames=True,
)

# Training data: rescaled, shuffled each epoch.
train_data = datagenerator.flow_from_dataframe(
        dataframe=train_df,
        y_col="label",
        class_mode="categorical",
        batch_size=40,
        shuffle=True,
        **flow_kwargs)

# Validation data: not shuffled, so predictions made from this iterator line up
# with its label order. Metrics computed over a full pass do not depend on order.
val_data = datagenerator.flow_from_dataframe(
        dataframe=val_df,
        y_col="label",
        class_mode="categorical",
        batch_size=40,
        shuffle=False,
        **flow_kwargs)

# Test data: images only (no labels), one image per batch, fixed order.
test_data = datagenerator.flow_from_dataframe(
        dataframe=test_df,
        y_col=None,
        class_mode=None,
        batch_size=1,
        shuffle=False,
        **flow_kwargs)

Found 9119 validated image filenames belonging to 4 classes.
Found 1139 validated image filenames belonging to 4 classes.
Found 635 validated image filenames.


In [6]:
#train_data = get_data(train_df, augmented_image_loc)

In [7]:
#val_data = get_data(val_df, augmented_image_loc)

In [8]:
#test_data = get_data(test_df, augmented_image_loc)

In [9]:
#train_cats = train_df.label
#val_cats = val_df.label
#test_cats = test_df.label

In [11]:
#train_data.shape

In [12]:
#len(train_df)

9119

In [13]:
# Hyperparameter grid for the Talos scan.
# Every entry is a list of discrete candidate values. Talos documents tuples as
# (min, max, steps) ranges, so the earlier tuple form would not have been read as
# "try each of these values" (e.g. batch_size (10, 20, 30) is a range, not 3 options).
p = {'lr': [0.001, 0.01, 0.1, 0.2],
     'batch_size': [10, 20, 30],
     'epochs': [50],
     'dropout': [0, 0.2, 0.5],
     'optimizer': [Adam, RMSprop, SGD]}

In [32]:
def get_model(train_data, val_data, params):
    """Build, compile and fit a VGG16-based 4-class classifier.

    The ImageNet-pretrained VGG16 convolutional base (no top) is frozen and followed
    by Dropout -> Flatten -> Dropout -> Dense(4, softmax).

    Parameters
    ----------
    train_data : training input handed to ``model.fit``.
    val_data : validation input handed to ``model.fit`` as ``validation_data``.
    params : dict with keys ``'dropout'``, ``'optimizer'`` (an optimizer class),
        ``'batch_size'``, ``'epochs'`` and optionally ``'lr'`` (learning rate,
        defaults to 0.001 — the value that used to be hard-coded).

    Returns
    -------
    (history, model) : the object returned by ``model.fit`` and the fitted model.

    NOTE(review): talos.Scan is documented to call its model function as
    (x_train, y_train, x_val, y_val, params); this three-argument signature is kept
    for backward compatibility — confirm before using it with Scan.
    """
    vgg = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
    # Freeze the pretrained base so only the new classification head is trained.
    vgg.trainable = False

    model = Sequential()
    model.add(vgg)
    model.add(Dropout(params['dropout']))
    model.add(Flatten())
    model.add(Dropout(params['dropout']))
    model.add(Dense(4, activation='softmax'))

    # Use the scanned learning rate instead of a constant, and the `learning_rate`
    # keyword instead of the deprecated `lr` alias.
    learning_rate = params.get('lr', 0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=params['optimizer'](learning_rate=learning_rate),
                  metrics=['accuracy'])

    fit_kwargs = dict(validation_data=val_data,
                      epochs=params['epochs'],
                      verbose=0)
    if hasattr(train_data, 'batch_size'):
        # Batch-yielding iterators (e.g. the result of flow_from_dataframe) do their
        # own batching and fit() does not accept/apply `batch_size` for them, so the
        # scanned value is set on the iterator itself.
        # NOTE(review): this mutates the caller's iterator — confirm that is acceptable.
        train_data.batch_size = params['batch_size']
    else:
        fit_kwargs['batch_size'] = params['batch_size']

    history = model.fit(train_data, **fit_kwargs)
    return history, model

In [33]:
# Run the Talos hyperparameter scan over the grid `p`, using `get_model` as the model function.
# NOTE(review): this call currently fails with the ValueError recorded below. `x` and `y`
# receive the training and validation iterators rather than feature/label arrays, which is
# presumably what trips the array truth-value check — confirm against the talos.Scan docs.
# NOTE(review): Talos documents the model function signature as
# (x_train, y_train, x_val, y_val, params), while `get_model` takes
# (train_data, val_data, params) — verify before relying on this cell.
t = talos.Scan(x=train_data,
            y=val_data,
            params=p,
            model=get_model,
            experiment_name='imagenet')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

# NOT WORKING

In [None]:
# Toggle for the (slow) scikit-learn grid search; when False the pre-selected
# values below are used unchanged.
run_gridsearch = False
# Print the per-candidate scores and elapsed time after a grid search.
verbose = True

# Pre-selected parameters. They also serve as fallbacks for values the grid search
# does not tune ('init' and 'optimizer' are not part of param_grid below).
best_epochs = 200
best_batch_size = 5
best_init = 'glorot_uniform'
best_optimizer = 'rmsprop'

if run_gridsearch:
    start_time = time.time()
    epochs = [50, 100, 200, 400]
    batches = [5, 10, 20]

    # build_fn must be the builder callable itself, not the result of calling it.
    # NOTE(review): this branch is still not runnable end to end. `get_model` takes
    # (train_data, val_data, params) and returns (history, model), whereas a
    # KerasClassifier builder is expected to return just a model; `train_cats` is only
    # assigned in a commented-out cell; and `train_data` is a batch iterator rather
    # than an array GridSearchCV can split. Resolve these before enabling the flag.
    model = KerasClassifier(build_fn=get_model, verbose=0)

    param_grid = dict(epochs=epochs, batch_size=batches)
    grid = GridSearchCV(estimator=model, param_grid=param_grid)
    grid_result = grid.fit(train_data, train_cats)

    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    if verbose:
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
        elapsed_time = time.time() - start_time
        print ("Time elapsed: ",timedelta(seconds=elapsed_time))

    best_params = grid_result.best_params_
    best_epochs = best_params['epochs']
    best_batch_size = best_params['batch_size']
    # Not in param_grid, so fall back to the pre-selected values instead of raising KeyError.
    best_init = best_params.get('init', best_init)
    best_optimizer = best_params.get('optimizer', best_optimizer)

In [None]:
# Draft: grid search over optimizer, learning rate and momentum with 3-fold CV.
# NOTE(review): this cell cannot run as written:
#   - `get_model` in this notebook takes (train_data, val_data, params); the call below
#     passes five positional arguments.
#   - `build_fn` receives the result of calling `get_model(...)` rather than a callable.
#   - `train_cats` is only assigned in a commented-out cell above.
#   - `train_data` comes from flow_from_dataframe; GridSearchCV presumably needs
#     array-like X/y it can index for the CV splits — confirm.
model = KerasClassifier(build_fn=get_model("imagenet", False, (224,224,3), False, 4), epochs=100, batch_size=10, verbose=0)

#batch_size = [20,30,40]
#epochs = [20,30,40,50,60]
# Candidate values for the search.
optimizers = ["Adam", "RMSprop"]
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]

# NOTE(review): the `optimizer__*` keys look like SciKeras routed parameters; verify that
# `optimizer__optimizer` is a valid name and that `momentum` is accepted by every listed optimizer.
param_grid = dict(optimizer__optimizer = optimizers, 
                  optimizer__learning_rate=learn_rate, optimizer__momentum=momentum)

# n_jobs=-1 asks scikit-learn to use all available cores for the search.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_data, train_cats)

# Report the best configuration, then mean/std test score for every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))