In [1]:
import matplotlib.pyplot as plt
%matplotlib inline 

import pandas as pd

import os
import inspect
import random
import shutil

import json

In [2]:
this_filename = inspect.getframeinfo(inspect.currentframe()).filename
working_path = os.path.dirname(os.path.abspath(this_filename))
data_path = working_path+"/../input/covid-chest-xray"
images_path = data_path+"/images"
data = pd.read_csv(data_path+"/metadata.csv")
print(images_path)

/home/petejb88/mlenv/transfer-learning-image-classifier/working/../input/covid-chest-xray/images


In [None]:
data['finding'].value_counts()

In [3]:
data['is_covid'] = [str(data['finding'][index] == 'COVID-19') for index in data.index]

Create train, valid, and category FOLDERS

In [4]:
# new_folders = ['train', 'valid']

def delete_cat_folders(images_path,new_folders):
    for folder in new_folders:
        try:
            shutil.rmtree(images_path+"/"+folder)
            print("Folder {} deleted".format(folder))
        except:
            print("Folder {} doesn't exist!".format(folder))
    
def make_cat_folders(images_path,new_folders,names,name_to_cat):
    for folder in new_folders:
        try:
            os.mkdir(images_path+"/"+folder)
            print("Folder {} created".format(folder))
        except:
            print('Folder {} already exists!'.format(folder))
        for name in names:
            try:
                os.mkdir(images_path+"/"+folder+"/"+name_to_cat[name])
                print("Folder {}/{} created".format(folder,name_to_cat[name]))
            except:
                print("Folder {}/{} already exists!".format(folder,name_to_cat[name]))
                
def make_cat_dicts(images_path,names):
    cat_to_name = {}
    name_to_cat = {}
    names.sort()
    for i,name in enumerate(names):
            cat_to_name[str(i)] = name
            name_to_cat[name] = str(i)
    
    json_object = json.dumps(cat_to_name, indent = 4)
    with open(data_path+"/cat_to_name.json", "w") as outfile:
        outfile.write(json_object)
        
    print("cat_to_name, name_to_cat created!")    
    print(name_to_cat)
    return cat_to_name, name_to_cat
            



# Shuffle Images
Now, we shuffle the images into these folders, while classifying them as we go

First, we check to see if any images are missing: (qualitative)

In [5]:
def check_missing(data,images_path):
    not_in_data = []
    in_data = []
    for file in os.listdir(images_path):
        if os.path.isfile(images_path+"/"+file):
            file_data = data[data['filename'] == file]
            if len(file_data) == 0:
                not_in_data.append(file)
            else:
                in_data.append(file)

    return not_in_data, in_data

In [6]:
def list_images_in_data(images_path):
    images = [image for image in os.listdir(images_path) if os.path.isfile(images_path+"/"+image)]
    images_not_in_data = []
    data_not_in_images = []
    for image in images:
        image_data = data[data['filename']==image]
        if len(image_data) == 0:
            images_not_in_data.append(image)
    images_in_data = [image for image in images if (image not in images_not_in_data)]
    for filename in data['filename']:
        if filename not in os.listdir(images_path):
            data_not_in_images.append(filename)
    print("How many in folder: {}, How many not in data: {}, How many in data: {}, How many data not in images: {}".format(
        len(images),
        len(images_not_in_data),
        len(images_in_data),
        len(data_not_in_images)))
    return images, images_in_data, data_not_in_images

### Todo: clean data more at startup 
Noticed a variation in the number of validation photos, coming from 21 missing images and 6 missing data entries.

Test that we have shuffled property:

In [7]:
def test_folder(p,images_in_data,column,name_to_cat,folder):
    count = 0
    wrong_cat = []
    for subfolder in os.listdir(images_path+"/"+folder):
        for image in os.listdir(images_path+"/"+folder+"/"+subfolder):
            count += 1
            image_data = data[data['filename']==image]
            image_index = image_data.index[0]
            image_column = image_data[column][image_index]
            image_cat = name_to_cat[image_column]
            if image_cat != subfolder:
                wrong_cat.append(image)
    return count, wrong_cat

def test(p,images_in_data,column,name_to_cat,new_folders):
    count = []
    wrong_cat = []
    for i,folder in enumerate(new_folders):
        count.append(test_folder(p,images_in_data,column,name_to_cat,folder)[0])
        wrong_cat.append(test_folder(p,images_in_data,column,name_to_cat,folder)[1])
    print("Valid Count: {}, Valid Expected: {}".format(count[1], int((p*len(images_in_data)))))
    print("Train Count: {}, Train Expected: {}".format(count[0], int((1-p)*len(images_in_data))))
    print("Total Count: {}, Total Expected: {}".format(sum(count), len(images_in_data)))
    print("Valid Wrong Cat: {}".format(wrong_cat[1]))
    print("Train Wrong Cat: {}".format(wrong_cat[0]))

In [8]:
def shuffle_images(data,images_path,p,column='finding'):
    '''
    Shuffle the images (as symbolic links) into training and validation folders.
    
    Inputs: 
        - p = percentage split into validation
        - data = metadata
        - images = list of images
        - images_path = location of images
        
    Warning: there will always be some variability in the number of validation photos, as there are 
    '''
    
    names = data[column].unique()
    
    cat_to_name, name_to_cat = make_cat_dicts(images_path,names)
    delete_cat_folders(images_path,['train', 'valid'])
    make_cat_folders(images_path,['train', 'valid'], names,name_to_cat)
    
    images, images_in_data, data_not_in_images = list_images_in_data(images_path)

    valid_nums = random.sample(range(len(images)),int(len(images_in_data)*p))
    for image in images_in_data:    
        image_data = data[data['filename']==image]
        image_index = image_data.index[0]
        image_column = image_data[column][image_index]
        image_cat = name_to_cat[image_column]
        if image_index in valid_nums:
            os.symlink(images_path+"/"+image, images_path+"/valid/"+image_cat+"/"+image)
        else:
            os.symlink(images_path+"/"+image, images_path+"/train/"+image_cat+"/"+image)
    
    test(p,images_in_data,column,name_to_cat,['train','valid'])

In [None]:
shuffle_images(data,images_path,0.2,'is_covid')

# Create Model

In [None]:
os.symlink(working_path+"/../train.py", working_path+"/train.py")
os.symlink(working_path+"/../predict.py", working_path+"/predict.py")
os.symlink(working_path+"/../model_functions.py", working_path+"/model_functions.py")
os.symlink(working_path+"/../predict_functions.py", working_path+"/predict_functions.py")

In [None]:
os.symlink(working_path+"/../predict_functions.py", working_path+"/temp.py")
os.rename(working_path+"/temp.py", working_path+"/predict_functions.py")

In [9]:
from model_functions import *
from predict_functions import *

In [None]:
import importlib,sys
importlib.reload(sys.modules['model_functions'])
importlib.reload(sys.modules['predict_functions'])

In [None]:
train_data, valid_data, trainloader, validloader = build_loader(images_path,16)

In [None]:
model, criterion, optimizer = build_model("alexnet",0.01,[1024,1024],train_data)

In [None]:
model, optimizer, train_losses, valid_losses, accuracy_data = train_model(model,criterion,optimizer,trainloader,validloader,6,5,False)

In [None]:
plot_stats(train_losses, valid_losses)

In [None]:
len(valid_losses) == len(train_losses)

In [None]:
n = 2
plt.plot(range(n,len(train_losses)), train_losses[n:], label="train losses")
plt.plot(range(n,len(train_losses)), valid_losses[n:], label="valid losses")

In [None]:
create_checkpoint(model,optimizer,"alexnet",0.01,6,working_path)

In [35]:
accuracy = 0
column="is_covid"
cat_to_name, name_to_cat = make_cat_dicts(images_path,data[column].unique())
images, images_in_data, data_not_in_images = list_images_in_data(images_path)
predicted_covid = 0
actual_covid = 0
for image in images_in_data:
    image_data = data[data['filename']==image]
    image_index = image_data.index[0]
    image_column = image_data[column][image_index]
    image_cat = name_to_cat[image_column]

    top_prob, top_class = predict(images_path+"/"+image,model,1,False)       
    accuracy += (top_class[0] == image_cat) 
    if top_class[0] == "1":
        predicted_covid += 1
    if image_cat == "1":
        actual_covid += 1
print("Accuracy: {}".format(accuracy / len(images_in_data)))
print("Predicted Covid: {}, Actual Covid: {}".format(predicted_covid,actual_covid))

cat_to_name, name_to_cat created!
{'False': '0', 'True': '1'}
How many in folder: 357, How many not in data: 6, How many in data: 351, How many data not in images: 21
Accuracy: 0.48717948717948717
Predicted Covid: 191, Actual Covid: 275


In [10]:
model = load_checkpoint(working_path+"/../workingcheckpoint.pth")

Checkpoint loaded
Model created


In [31]:
image = images_in_data[0]
image_data = data[data['filename']==image]
image_index = image_data.index[0]
image_column = image_data[column][image_index]
image_cat = name_to_cat[image_column]

top_prob, top_class = predict(images_path+"/"+image,model,1,False)
print(top_prob, top_class)
print(image_cat)
print(top_class[0] == image_cat)
print_predictions(top_prob, top_class,images_path+"/../cat_to_name.json")
data[data['filename']==image]

[0.9114857316017151] ['1']
1
True
Object: True, Probability: 0.9114857316017151


Unnamed: 0,patientid,offset,sex,age,finding,survival,intubated,intubation_present,went_icu,in_icu,...,location,folder,filename,doi,url,license,clinical_notes,other_notes,Unnamed: 28,is_covid
96,46,5.0,F,55.0,COVID-19,,,,,,...,Italy,images,01E392EE-69F9-4E33-BFCE-E5C968654078.jpeg,,https://www.sirm.org/2020/03/10/covid-19-caso-26/,,"Woman, 55 years old, reports dyspnea for a few...","Credit to Izzo Andrea, D'Aversa Lucia, Ceremon...",,True


In [34]:
275/len(images_in_data)

0.7834757834757835

In [39]:
precision = 0
for i in range(1000):
    top_prob, top_cat = predict(images_path+"/"+image,model,1,False)
    precision += int(top_cat[0])
print(precision/100)

4.6
