In [9]:
import matplotlib.pyplot as plt
%matplotlib inline 


import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models

import os
import inspect
import random
import shutil
import json
from collections import OrderedDict
from PIL import Image

In [21]:
# Resolve every path relative to the kernel's current working directory.
# The previous approach asked `inspect` for the filename of the running cell;
# on recent ipykernel versions that filename lives in a temporary
# `ipykernel_<pid>` directory, so the derived data paths pointed nowhere.
# On older kernels the inspected filename resolved to the working directory
# anyway, so `os.getcwd()` gives the same result there.
working_path = os.path.abspath(os.getcwd())
# Dataset root (sibling `input` directory) and the image folder inside it.
data_path = os.path.join(working_path, "..", "input", "covid-chest-xray")
images_path = os.path.join(data_path, "images")

# Create Dataset

In [64]:
class ImageDataSet(torch.utils.data.Dataset):
    """Image dataset backed by a metadata CSV and a directory of image files.

    Each sample is an ``(image, finding)`` pair, where ``finding`` is 1 when the
    row's ``finding`` column equals ``'COVID-19'`` and 0 otherwise.  Only rows
    whose ``view`` is ``'PA'`` and whose ``filename`` exists in ``images_path``
    are kept; surviving rows keep their original CSV index labels in ``self.df``.
    """

    def __init__(self,csv_path,images_path,these_indices=None,transforms=None):
        """
        Input:
            - csv_path: path to csv file with metadata
              (must contain `filename`, `view` and `finding` columns)
            - images_path: directory containing the image files
            - these_indices: optional collection of DataFrame index labels to
              keep in this dataset; None keeps every row, an empty collection
              keeps no rows
            - transforms: optional callable applied to each loaded image
        """
        self.images_path = images_path
        self.these_indices = these_indices
        self.transforms = transforms

        metadata = pd.read_csv(csv_path)

        # Restrict to the requested rows.  Test against None rather than
        # truthiness so an explicitly empty selection yields an empty dataset
        # instead of silently keeping everything.
        if these_indices is not None:
            metadata = metadata.loc[metadata.index.isin(list(these_indices))]

        # Data Processing:
        # keep only PA views whose image file is actually present on disk.
        # The directory is listed once (not once per row) and held in a set.
        available_files = set(os.listdir(images_path))
        is_pa_view = metadata['view'] == 'PA'
        has_image = metadata['filename'].isin(available_files)
        self.df = metadata.loc[is_pa_view & has_image]

    def __len__(self):
        """Number of rows that survived filtering."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, finding)`` for the row at *position* ``idx``.

        ``idx`` is positional (``iloc``), not a CSV index label.  The image is
        loaded from this dataset's own ``images_path`` and converted to RGB
        before the optional transforms run, so every sample has three channels
        regardless of how the file was stored.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Use the path given to this instance, not a notebook-level global.
        image_path = os.path.join(self.images_path, self.df['filename'].iloc[idx])
        # The context manager closes the file; convert() returns an
        # independent, fully loaded copy.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transforms is not None:
            image = self.transforms(image)

        finding = 1 if self.df['finding'].iloc[idx] == 'COVID-19' else 0

        return image, finding

### Create train and valid datasets

In [65]:
# Build the unsplit dataset (no index filter, no transforms) and display the
# CSV index labels of the rows that survived ImageDataSet's filtering.
metadata_csv_path = "/".join([data_path, "metadata.csv"])
all_dataset = ImageDataSet(metadata_csv_path, images_path)
all_dataset.df.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,  12,
            ...
            348, 349, 350, 351, 352, 353, 356, 357, 358, 364],
           dtype='int64', length=195)

Randomly split `all_dataset` into training and validation sets.

In [67]:
# Per-channel mean / std used for normalisation, shared by both pipelines so
# they cannot drift apart.  These are the ImageNet statistics that torchvision
# documents for its pretrained models.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training pipeline: random augmentation, then a 224x224 tensor.
train_transforms = transforms.Compose([
    transforms.RandomRotation(90),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
])

# Validation pipeline: deterministic.  The shorter side is resized to 256
# before the 224 centre crop; without the resize, a centre crop of a
# full-resolution image keeps only a small central patch, which does not match
# the (rescaled) field of view seen in training.
valid_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
])

In [83]:
def build_train_valid_loaders(csv_path,images_path,train_transform,valid_transforms,split_ratio,batchsize,seed=None):
    """
    Randomly split the usable rows of the metadata CSV into training and
    validation datasets and wrap each one in a DataLoader.

    Input:
        - csv_path: path to csv file with metadata
        - images_path: path to images
        - train_transform: transforms to be applied to training data
        - valid_transforms: transforms to be applied to validation data
        - split_ratio: float between 0,1 indicating percentage of data to split into validation
        - batchsize: batch size used by both loaders
        - seed: optional seed for the split; None (default) draws from the
          global `random` state exactly as before
    Returns:
        (trainloader, validloader); only the training loader shuffles.
    """
    # A transform-less dataset is built first only to learn which rows survive
    # ImageDataSet's own filtering; the split is drawn from those rows.
    all_dataset = ImageDataSet(csv_path,images_path)
    usable_indices = list(all_dataset.df.index)
    num_valid = int(split_ratio*len(usable_indices))

    sampler = random if seed is None else random.Random(seed)
    valid_indices = sampler.sample(usable_indices, num_valid)
    # Set lookup keeps building the complementary training list linear.
    valid_lookup = set(valid_indices)
    train_indices = [idx for idx in usable_indices if idx not in valid_lookup]

    valid_dataset = ImageDataSet(csv_path,images_path,valid_indices,valid_transforms)
    # Use the transform passed in by the caller; the body previously read a
    # notebook-level global and ignored this argument.
    train_dataset = ImageDataSet(csv_path,images_path,train_indices,train_transform)

    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size = batchsize, shuffle = True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size = batchsize)

    print("Data loaded!")
    return trainloader, validloader

    

# Create Model

In [None]:
# Obtain the model, loss criterion and optimizer from `build_model`.
# NOTE(review): neither `build_model` nor `train_data` is defined in the visible
# part of this notebook, and this cell has no execution count — confirm where
# they come from before relying on Restart & Run All.
# The positional arguments are presumably architecture name, learning rate and
# hidden-layer sizes — verify against build_model's signature.
model, criterion, optimizer = build_model("alexnet",0.01,[1024,1024],train_data)

In [None]:
# Run training via `train_model` and keep the returned loss / accuracy histories.
# NOTE(review): `train_model`, `trainloader` and `validloader` are not assigned in
# the visible cells (`build_train_valid_loaders` is defined above but never
# called there) — confirm they exist on a fresh kernel.
# The trailing positional arguments (6, 5, False) are undocumented here;
# presumably epochs, a reporting interval and a device flag — TODO confirm.
model, optimizer, train_losses, valid_losses, accuracy_data = train_model(model,criterion,optimizer,trainloader,validloader,6,5,False)

In [None]:
# Plot the recorded training and validation losses.
# NOTE(review): `plot_stats` is not defined in the visible part of this notebook.
plot_stats(train_losses, valid_losses)

In [None]:
# Sanity check: both loss histories should contain the same number of entries
# before they are plotted against a common x-axis.
len(valid_losses) == len(train_losses)

In [None]:
# Plot both loss histories, skipping the first `n` recorded values so the
# initial (typically much larger) losses do not flatten the rest of the curve.
n = 2
fig, ax = plt.subplots()
# Each series gets an x-range derived from its own length, so a length mismatch
# between the two histories cannot raise a shape error.
ax.plot(range(n, len(train_losses)), train_losses[n:], label="train losses")
ax.plot(range(n, len(valid_losses)), valid_losses[n:], label="valid losses")
ax.set_xlabel("index of recorded loss")
ax.set_ylabel("loss")
ax.set_title("Training vs. validation loss")
# The labels passed above are only drawn once a legend is requested.
ax.legend()
plt.show()

In [None]:
# Persist the trained model and optimizer through `create_checkpoint`.
# NOTE(review): `create_checkpoint` is not defined in the visible part of this
# notebook. The literals "alexnet", 0.01 and 6 repeat values passed to the
# build/train calls in earlier cells — presumably architecture, learning rate
# and epoch count; keep them in sync or hoist them into shared constants.
create_checkpoint(model,optimizer,"alexnet",0.01,6,working_path)

In [None]:
# Score the model on every image returned in `images_in_data`: count how often
# the top predicted class equals the category derived from the `column` value
# of that image's row in `data`, and tally predicted vs. actual "1" categories.
column = "is_covid"
cat_to_name, name_to_cat = make_cat_dicts(images_path, data[column].unique())
images, images_in_data, data_not_in_images = list_images_in_data(images_path)

accuracy = 0
predicted_covid = 0
actual_covid = 0

for image in images_in_data:
    # First row of `data` whose filename matches this image.
    image_data = data.loc[data['filename'] == image]
    image_index = image_data.index[0]
    image_column = image_data.loc[image_index, column]
    image_cat = name_to_cat[image_column]

    image_file = images_path + "/" + image
    top_prob, top_class = predict(image_file, model, 1, False)
    predicted_cat = top_class[0]

    accuracy += (predicted_cat == image_cat)
    predicted_covid += int(predicted_cat == "1")
    actual_covid += int(image_cat == "1")

print("Accuracy: {}".format(accuracy / len(images_in_data)))
print("Predicted Covid: {}, Actual Covid: {}".format(predicted_covid,actual_covid))

In [None]:
# Reload the model from a saved checkpoint.
# NOTE(review): this reads `workingcheckpoint.pth` from the PARENT of
# `working_path`, whereas the checkpoint cell passes `working_path` itself to
# `create_checkpoint`. Presumably the saver concatenates the directory and
# "checkpoint.pth" without a separator — confirm and fix the path at its source.
# `load_checkpoint` is not defined in the visible part of this notebook.
model = load_checkpoint(working_path+"/../workingcheckpoint.pth")

In [None]:
# Inspect the prediction for the first image in `images_in_data`: look up its
# expected category, run `predict`, print the comparison, and finish by
# displaying the image's metadata row(s).
image = images_in_data[0]
image_data = data.loc[data['filename'] == image]
image_index = image_data.index[0]
image_column = image_data.loc[image_index, column]
image_cat = name_to_cat[image_column]

image_file = images_path + "/" + image
top_prob, top_class = predict(image_file, model, 1, False)

print(top_prob, top_class)
print(image_cat)
print(top_class[0] == image_cat)

cat_to_name_json = images_path + "/../cat_to_name.json"
print_predictions(top_prob, top_class, cat_to_name_json)

# Last expression: rich display of the matching metadata row(s).
data.loc[data['filename'] == image]

In [None]:
# Ratio of a hand-entered count (275) to the number of images in `images_in_data`.
# NOTE(review): 275 is a magic number whose origin is not shown in this
# notebook — presumably a count copied from an earlier run; replace it with a
# computed value so the cell stays correct when the data changes.
275/len(images_in_data)

In [None]:
# Run `predict` repeatedly on the same `image` and report the mean of the top
# category parsed as an integer. Assuming the categories are the strings
# "0"/"1", this is the fraction of runs predicting "1" — TODO confirm.
# The trial count is a single constant used for both the loop and the divisor;
# previously the loop ran 1000 times but the sum was divided by 100, which
# inflated the reported value tenfold.
n_trials = 1000
precision = 0
for i in range(n_trials):
    top_prob, top_cat = predict(images_path+"/"+image,model,1,False)
    precision += int(top_cat[0])
print(precision/n_trials)