# Preprocessing and Data Manipulation

In [2]:
import torchvision as tv
import shutil
import os 
import pandas as pd
import pydicom
from PIL import Image
import cv2
import re
import numpy as np
import torch.nn as nn
import torch.optim
import copy
from torchvision import transforms, utils, models

In [3]:
#Load info csv's 
train_info = pd.read_csv('./mass_case_description_train_set.csv')
test_info = pd.read_csv('./mass_case_description_test_set.csv')

In [4]:
#open training image directory
train_str = "./ImageData/CBIS-DDSM_TRAIN"
train_dir = os.fsencode(train_str)

#open testing image directory
test_str = "./ImageData/CBIS-DDSM_TESTING"
test_dir = os.fsencode(test_str)

In [5]:
#All Preprocessing steps involved from the first part of the project 
def load_image(filepath, height, width):
    img = pydicom.dcmread(filepath, force=True)
    try:
        img = img.pixel_array
    except:
        return((0,0))
    img = cv2.resize(img, (height, width))
    pattern_id = re.compile(r'P\_\d\d\d\d\d')
    match_id = re.search(pattern_id, filepath)
    breast_density = - 1 
    breast_density = [get_density(filepath, match_id)]
    return((img, breast_density))            

#Pairs the image with density value 
def get_density(filepath, match_id):
    patient_id = filepath[match_id.span()[0]:match_id.span()[1]]
    pattern_train = re.compile(r'Training')
    match_train = re.search(pattern_train, filepath)
    if match_train:
        des_file = train_info
    else:
        des_file = test_info
    density_val = des_file.loc[des_file['patient_id'] == patient_id, 'breast_density']
    density_val = density_val.iloc[0]
    return(density_val)

In [6]:
#Function for loading data given directory
def load_data(directory):
    X = []
    y = []
    count = 0
    for root, dirs, files in os.walk(directory):
        if count != 0 and count % 3 == 0:
            for file in files:
                p=os.path.join(root,file)
                abs_filepath = os.path.abspath(p)
                loaded_img = load_image(str(abs_filepath)[2:-1], 299, 299)
                X.append(loaded_img[0])
                y.append(loaded_img[1])
        count += 1
    return(X, y)

In [7]:
#Loading training and testing data 
X_train, y_train = load_data(train_dir)
X_test, y_test = load_data(test_dir)

In [8]:
#Preprocessing to remove failed loads
def reformat_data(X, y):
    temp_X = []
    temp_y = []
    for i in range(len(X)):
        if type(X[i]) != int:
            temp_X.append(X[i])
            temp_y.append(y[i][0])  

    #turning y into binary values 
    temp_y = [0 if i < 2 else 1 for i in temp_y]
    return(temp_X, temp_y)

In [9]:
#reformat the data 
X_train, y_train = reformat_data(X_train, y_train)
X_test, y_test = reformat_data(X_test, y_test)

In [32]:
training_loaded = np.load("img_training_data.npz")

# Creating the ResNet

In [9]:
#Define number of output classes 
output_classes = 2

#Define the Learning rate 
learning_rate = 0.03

#Create model with output_classes number of nodes on the last layer
resnet18 = models.resnet18(num_classes=output_classes)

In [10]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

In [11]:
from torch.utils import data

def preprocess_image(image):
    data_copy = copy.deepcopy(image)
    data_copy = np.asarray(data_copy)
    data_copy = data_copy.astype(dtype = 'float32')
    final_list = [data_copy, data_copy, data_copy]
    final_list = np.asarray(final_list)
    return(final_list)


def make_torch_dataset(numpy_images, numpy_labels):
    torch_images = [torch.from_numpy(preprocess_image(x)).float() for x in numpy_images]
    # TODO: figure out what the datatype of the labels should be
    torch_labels = torch.from_numpy(numpy_labels).long()
    return torch_images, torch_labels


class ImageDataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, numpy_images, numpy_labels):
        'Initialization'
        torch_images, torch_labels = make_torch_dataset(numpy_images, numpy_labels)
        self.images = torch_images
        self.labels = torch_labels

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.images)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample

        return self.images[index], self.labels[index]
    
torch_train_dataset = ImageDataset(X_train, y_train)
torch_test_dataset = ImageDataset(X_test, y_test)

batch_size = 8
shuffle = True

train_dataloader = data.DataLoader(torch_train_dataset, batch_size=batch_size, shuffle=shuffle)
test_dataloader = data.DataLoader(torch_train_dataset, batch_size=batch_size, shuffle=shuffle)

In [42]:
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(params=resnet18.parameters(), lr=learning_rate)

epochs = 50

def train(train_loader, model, criterion, optimizer, epoch):
    # switch to train mode
    model.train()

    for i, (inp, target) in enumerate(train_loader):

        # compute output
        output = model(inp)
        loss = criterion(output, target) # will barf if the label datatype is wrong
        print("Loss: ", loss)
        
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

for epoch in range(epochs):
    # train for one epoch
    train(train_dataloader, resnet18, criterion, optimizer, epoch)

In [13]:
resnet18.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_dataloader:
        outputs = resnet18(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))

Accuracy of the model on the test images: 76.28865979381443 %


# Visualisation and Evaluation of Model

In [15]:
final_predictions = []
final_labels = []

for images, labels in test_dataloader:
    outputs = resnet18(images)
    _, predicted = torch.max(outputs.data, 1)
    final_predictions.append(predicted.numpy())
    final_labels.append(labels.numpy())

preds = []
labels = []
for val in final_predictions:
    for v in val:
        preds.append(v)
        
for val in final_labels:
    for v in val:
        labels.append(v)

In [16]:
import sklearn.metrics 
import matplotlib.pyplot as plt

In [18]:
sklearn.metrics.confusion_matrix(labels, preds)

array([[  0,  46],
       [  0, 148]])

The above confusion matrix depicts the number of true negatives (top, left), false negatives (bottom, left), true positives (bottom, right) and false positives (top, right)

In [76]:
sklearn.metrics.roc_curve(labels, preds)
plt.show()

(array([0.        , 0.17391304, 1.        ]), array([0.        , 0.87162162, 1.        ]), array([2, 1, 0]))


In [19]:
preds

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]