# Prostate MRI Dataset

In [82]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from PIL import Image
import numpy as np

import cv2
import scipy.io
import os
import matplotlib.pyplot as plt
import scipy.ndimage
import pickle

%run -i LoadZoneData

In [4]:
def trim_zeros_2D(array):
    """Crop a 2-D array to the bounding box of its non-zero entries.

    All-zero rows and columns are removed from the outer edges only; zero
    rows/columns that lie inside the bounding box are kept.

    Parameters
    ----------
    array : array_like
        2-D image or mask to crop.

    Returns
    -------
    numpy.ndarray
        Slice of the input covering the first through last non-zero row and
        column. If the input contains no non-zero entry (including inputs
        with a zero-length axis) an empty array of shape (0, 0) is returned.
    """
    array = np.asarray(array)

    # Test for "any non-zero" instead of "sum > 0" so that rows/columns with
    # negative or mutually cancelling values are not mistaken for padding.
    nonzero = array != 0
    rows = np.flatnonzero(nonzero.any(axis=1))
    cols = np.flatnonzero(nonzero.any(axis=0))

    # rows and cols are empty together: nothing to keep.
    if rows.size == 0:
        return array[:0, :0]

    return array[rows[0]:rows[-1] + 1, cols[0]:cols[-1] + 1]

In [3]:
# Build one record (dict) per patient from the .mat files found in `dir_path`.
# NOTE: `dir_path` must already be defined when this cell runs.
data = []
# os.listdir returns entries in arbitrary order; sort so the order of `data`
# is reproducible between runs and machines.
for filename in sorted(os.listdir(dir_path)):

    mat_file = scipy.io.loadmat(os.path.join(dir_path, filename))

    # Every per-case field below is read from this one nested record.
    case = mat_file['casesTableArr'][0][0]

    pid = case[0][0][0][0][0]

    patient_dict = {
        'id': pid,
        # image volumes, one entry per modality stored in the file
        'T2': mat_file['T2'],
        'ADC': mat_file['ADC'],
        'CDI': mat_file['CDI'],
        'HBV': mat_file['HBV'],
        # scalar scores
        'PIRADS_score': case[1][0][0],
        'curGleason_score': case[2][0][0],
        'maxGleason_score': case[3][0][0],
        # score maps
        'PIRADS_map': case[4],
        'curGleason_map': case[5],
        'maxGleason_map': case[6],
        'mask': mat_file['PMask'],
        'zone_map': case[7],
    }

    data.append(patient_dict)

In [43]:
# Directory that holds the raw per-patient files.
dir_path = os.path.join('.', 'data', 'data', 'data')

# Restore the previously saved dataset instead of re-reading the raw files.
# load_obj is not defined in the visible part of this notebook; presumably it
# is provided by `%run -i LoadZoneData` -- confirm.
save_filename = 'raw_data'
data = load_obj(save_filename)

In [142]:
# Registry of candidate classifiers, looked up by short name
# (e.g. models['decision_tree']).
models = dict(
    knn=KNeighborsClassifier(n_neighbors=1),
    naive_bayes=GaussianNB(),
    logit=LogisticRegression(solver="lbfgs"),
    svm=SVC(kernel="linear"),
    decision_tree=DecisionTreeClassifier(),
    random_forest=RandomForestClassifier(n_estimators=100),
    mlp=MLPClassifier(),
)

In [44]:
# Map patient id -> cross-validation fold number, read from '5folds.txt'.
# Each non-blank line holds two whitespace-separated fields: "<pid> <fold>".
fold_dict = {}
with open('5folds.txt', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing the unpack
        pid, fold_no = line.split()
        fold_dict[pid] = int(fold_no)

In [45]:
# Build, per CV fold, the list of zone-masked image crops (`examples`) and the
# matching binary labels (`labels`).
num_folds = 5
num_zones = 10
bad_patient_ids = ('P00000015', 'P00000249', 'P00000429')  # remove bad data

examples = {}
# One label list per fold, created once so that adding further modalities
# does not append extra, never-indexed fold lists.
# NOTE(review): labels are shared by all modalities; with more than one
# modality each patient's labels would be appended once per modality.
labels = [[] for _ in range(num_folds)]

for modality in ['ADC']:  # TODO: add T2-weighted images (if we get the labels)

    if modality == 'ADC':
        label_map = 'maxGleason_map'
        zone_map = 'zone_map'

    examples[modality] = [[] for _ in range(num_folds)]

    for patient in data:

        pid = patient['id']
        if pid in bad_patient_ids:
            continue

        # fold numbers in fold_dict are 1-based; fold lists are 0-based
        fold_id = fold_dict[pid] - 1

        volume = patient[modality]
        zones = patient[zone_map]

        # check if segmentation map has same num slices as mri
        if zones.shape[-1] != volume.shape[-1]:
            continue

        patient_examples = []
        patient_labels = []

        for slice_index in range(volume.shape[-1]):

            # index the slice once instead of once per zone
            zone_slice = zones[:, :, slice_index]
            image_slice = volume[:, :, slice_index]

            for zone_index in range(num_zones):
                zone_number = zone_index + 1

                # binary mask of the zone; empty mask == zone not in this slice
                binary_mask = zone_slice == zone_number
                if not binary_mask.any():
                    continue

                example = image_slice * binary_mask  # apply the mask to the slice

                # trim the slice to the dimensions of the prostate zone.
                # The trim_zeros_2D defined in this notebook returns only the
                # array, while this call site used to unpack two values; accept
                # either a bare array or an (array, idx) tuple.
                trimmed_example = trim_zeros_2D(example)
                if isinstance(trimmed_example, tuple):
                    trimmed_example = trimmed_example[0]

                patient_examples.append(trimmed_example)
                patient_labels.append(
                    1 if patient[label_map][slice_index][zone_index] > 0 else 0
                )

        examples[modality][fold_id].extend(patient_examples)
        labels[fold_id].extend(patient_labels)

In [158]:
def _pixel_features(images, image_labels, size=(32, 32)):
    """Resize each image and stack its per-pixel features into one matrix.

    Returns ``(X, y)`` where ``X`` has one row per pixel sample and ``y``
    repeats the image label once for every sample of that image.
    """
    feature_blocks = []
    label_blocks = []
    for image, label in zip(images, image_labels):
        resized = cv2.resize(image, size)
        # feat_extraction_1d is defined outside this cell; it is expected to
        # return an (n_features, n_samples) array for one image.
        features = np.asarray(feat_extraction_1d(resized), dtype=float)
        feature_blocks.append(features)
        # One label per extracted sample (derived from the data, not a
        # hard-coded 32 * 32).
        label_blocks.append(np.full(features.shape[1], label, dtype=float))

    if not feature_blocks:
        raise ValueError('no examples to extract features from')

    # Concatenate once: float arrays built in a single pass avoid both the
    # object-dtype blow-up and the quadratic copying of growing an array
    # inside the loop.
    X = np.concatenate(feature_blocks, axis=1).T
    y = np.concatenate(label_blocks)
    return X, y


def predict_and_report(model):
    """Cross-validate one classifier on the ADC examples and print metrics.

    For every fold the remaining folds are used for training: the training
    set is balanced by random under-sampling, both sets are standardised with
    statistics from the balanced training set, the model is fitted, and the
    AUC, confusion matrix and classification report of the held-out fold are
    printed. The mean AUC over all folds is printed at the end.

    Parameters
    ----------
    model : str
        Key into the module-level ``models`` dict.

    Returns
    -------
    None
    """
    fold_images = examples['ADC']
    n_folds = len(fold_images)

    # Extract features once per fold; each fold is reused as test data once
    # and as training data in every other iteration.
    fold_data = [_pixel_features(fold_images[k], labels[k]) for k in range(n_folds)]

    rus = RandomUnderSampler()
    # Newer imbalanced-learn releases only provide fit_resample; older ones
    # only fit_sample. Support both.
    resample = getattr(rus, 'fit_resample', None) or rus.fit_sample

    aucs = []
    for i in range(n_folds):  # CV Loop

        X_test, Y_test = fold_data[i]
        X_train = np.concatenate([fold_data[j][0] for j in range(n_folds) if j != i])
        Y_train = np.concatenate([fold_data[j][1] for j in range(n_folds) if j != i])

        # Undersampling, balancing
        x_rus, y_rus = resample(X_train, Y_train)

        # Standardize
        print('Standardizing...')
        sc = StandardScaler().fit(x_rus)

        stand_X = sc.transform(x_rus)
        stand_X_test = sc.transform(X_test)

        # Train Model
        print('Training on model... ' + model + ' - Run ' + str(i+1))

        # Fit on the plain array (no pandas needed) so that fit and predict
        # receive the same kind of input.
        mod = models[model]
        mod.fit(stand_X, y_rus)

        print('Evaluating on model... ' + model + ' - Run ' + str(i+1))

        y_pred = mod.predict(stand_X_test)

        # NOTE(review): AUC is computed from hard class predictions, not from
        # scores/probabilities -- confirm this is intended.
        auc = roc_auc_score(Y_test, y_pred)
        aucs.append(auc)
        print(auc)
        print(confusion_matrix(Y_test, y_pred))
        print(classification_report(Y_test,y_pred))

    mean_auc = sum(aucs) / float(len(aucs))
    print(mean_auc)

In [157]:
# Run the cross-validated evaluation using the 'decision_tree' entry of `models`.
predict_and_report('decision_tree')

(2778112, 3)
2778112


MemoryError: 

In [148]:
# Sanity check: fold 0 should contain as many ADC examples as labels.
print(len(examples['ADC'][0]), len(labels[0]), sep='\n')

2713
2713
