In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Making sure that we got all the data
def folder_size(path='.'):
    """Return the total size in bytes of all files found under ``path``.

    Sub-directories are visited recursively and their file sizes are added to
    the total. Entries that are neither files nor directories are ignored.

    Args:
        path: Directory to measure. Defaults to the current directory.

    Returns:
        int: Sum of ``st_size`` over every file below ``path``.
    """
    total = 0
    # The context manager closes the scandir iterator (and its underlying
    # directory handle) as soon as the loop ends, instead of leaving that to
    # garbage collection while the recursion is still holding parents open.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                total += entry.stat().st_size
            elif entry.is_dir():
                total += folder_size(entry.path)
    return total


# Walk the input tree once and reuse the result for both lines; calling
# folder_size() twice would traverse the whole dataset a second time.
input_size_bytes = folder_size('/kaggle/input/')
print("Size in bytes: " + str(input_size_bytes)) # This is the size in bytes
print("Size in GB: " +str(input_size_bytes / (1024**3))) # This is the size in gigabytes

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import scipy.ndimage
import skimage.filters

In [None]:
#preprocessing steps

#laplacian filtering
# Call the public scipy.ndimage.laplace directly: the scipy.ndimage.filters
# namespace is a deprecated alias.
# NOTE(review): `img_array` is not defined in any cell visible in this notebook -
# confirm where it is supposed to be loaded before this cell runs.
filtered = scipy.ndimage.laplace(img_array[:, :10000, :])

In [None]:
# Display the Laplacian-filtered array produced by the preceding cell.
plt.imshow(filtered)

In [None]:
import skimage.io
import cv2

def load_and_resize_image(img_path):
    """Open a multi-image file and return its last image resized to 512x512.

    Edited from https://www.kaggle.com/xhlulu/panda-resize-and-save-train-data

    Args:
        img_path: Path of the image file handed to ``skimage.io.MultiImage``.

    Returns:
        The result of ``cv2.resize`` applied to the last image of the file.
    """
    target_size = (512, 512)
    pyramid = skimage.io.MultiImage(img_path)
    # Index -1 picks the last image stored in the file
    # (presumably the smallest pyramid level - TODO confirm).
    last_image = pyramid[-1]
    return cv2.resize(last_image, target_size)

In [None]:
# Load one slide through load_and_resize_image (512x512 output) and display it.
img = load_and_resize_image('/content/0005f7aaab2800f6170c399693a96917.tiff')
plt.imshow(img)

In [None]:
# Laplacian filter of the resized slide. scipy.ndimage.laplace is the public
# entry point; the scipy.ndimage.filters namespace is a deprecated alias.
filtered = scipy.ndimage.laplace(img)
plt.imshow(filtered)

In [None]:
from skimage.color import rgb2gray

# Convert the slide image to grayscale, then compute an Otsu threshold on it.
grayscale_img = rgb2gray(img)
threshold = skimage.filters.threshold_otsu(grayscale_img)

In [None]:
# Echo the Otsu threshold computed in the previous cell.
threshold

In [None]:
# Display the grayscale version of the slide.
plt.imshow(grayscale_img, cmap = 'gray')

In [None]:
# Display the binary image of pixels whose grayscale value exceeds the Otsu threshold.
plt.imshow(grayscale_img > threshold, cmap = 'gray')

In [None]:
def generate_patches(image, window_size=200, stride=200, background_value=255):
    """Collect the square windows of ``image`` that consist solely of background.

    A ``window_size`` x ``window_size`` window is slid over the first two axes
    of ``image`` in steps of ``stride``. Windows that would extend past an edge
    are not produced. A window is kept only when every one of its elements
    equals ``background_value``; this element-wise test also works for
    multi-channel images, which a comparison of the window sum against
    ``window_size * window_size * 255`` could never satisfy.

    NOTE(review): only all-background windows are retained, as in the original
    condition. If the intent is to keep tissue and drop background, the test
    must be negated - confirm with the author.

    Args:
        image: Array indexed as ``image[row, column, ...]``.
        window_size: Side length of each window, in pixels.
        stride: Step between consecutive window origins, in pixels.
        background_value: Value every element of a window must have for the
            window to be kept. Defaults to 255.

    Returns:
        list: The retained windows (slices of ``image``), ordered with the
        second axis varying slowest and the first axis varying fastest.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    regions_container = []

    # Outer loop over the second axis and inner loop over the first axis keeps
    # the same output order as the original nested while-loops.
    for col_start in range(0, n_cols - window_size + 1, stride):
        for row_start in range(0, n_rows - window_size + 1, stride):
            patch = image[
                row_start : row_start + window_size,
                col_start : col_start + window_size
            ]

            if np.all(patch == background_value):
                regions_container.append(patch)

    return regions_container

In [None]:
# Tile the grayscale image into 200x200 windows and show how many were kept.
# The list itself is not echoed, to avoid dumping whole arrays into the output.
# NOTE(review): grayscale_img comes from rgb2gray; if its values are floats in
# [0, 1], the default 255 background test in generate_patches never matches -
# confirm the intended intensity scale.
patches = generate_patches(grayscale_img, window_size=200, stride=200)
len(patches)

In [None]:
# load functions

import re

def Filter(string, substr):
    """Return the items of ``string`` that contain at least one entry of ``substr``.

    Args:
        string: Iterable of strings to search (despite the singular name).
        substr: Iterable of substrings; an item is kept as soon as any one of
            them occurs inside it.

    Returns:
        list: Matching items, in the iteration order of ``string``.
    """
    matches = []
    for candidate in string:
        for fragment in substr:
            if fragment in candidate:
                matches.append(candidate)
                break  # one hit is enough; do not add the item twice
    return matches

In [None]:
# unzip all image files

import zipfile, os

base_dir = "/kaggle/"

# os.listdir() yields bare file names, so each one is joined with base_dir
# before it is tested or opened; otherwise the name would be resolved against
# the current working directory instead of base_dir.
for file in os.listdir(base_dir):   # get the list of files
    file_path = os.path.join(base_dir, file)
    if zipfile.is_zipfile(file_path): # if it is a zipfile, extract it
        with zipfile.ZipFile(file_path) as item: # treat the file as a zip
            item.extractall()  # extract it in the working directory

In [None]:
# load packages, both original images and masks
import os, shutil
import tifffile as tiff
import os
#import pydicom as dcm
import matplotlib.pyplot as plt
from skimage.filters import threshold_otsu
from skimage.color import rgb2gray
import tifffile as tiff
import csv
import pandas as pd
import skimage.io
from skimage.transform import resize, rescale
import numpy as np
import cv2
from skimage import data
from skimage import filters
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from keras.utils import to_categorical

base_dir = "/content/"

# Keep the entries whose name contains 'tiff' but not 'zip', then separate the
# ones whose name contains 'mask' from the remaining images.
list_of_files = os.listdir(base_dir)
list_of_zips = Filter(list_of_files, ['zip'])
list_of_files = set(list_of_files) - set(list_of_zips)
list_of_files=Filter(list_of_files, ['tiff'])
# Sets have no defined order, so both lists are sorted to make the
# train/validation split identical on every run.
list_masks = sorted(Filter(list_of_files, ['mask']))
list_imgs = sorted(set(list_of_files) - set(list_masks))

# Fraction of each list held out for validation; the rest is used for training.
validation_proportion = 0.5
train_proportion = 1 - validation_proportion

num_masks_train = int(round(len(list_masks) * train_proportion, 0))
num_imgs_train = int(round(len(list_imgs) * train_proportion, 0))

list_masks_train = list_masks[:num_masks_train]
list_masks_val = list_masks[num_masks_train:]
list_imgs_train = list_imgs[:num_imgs_train]
list_imgs_val = list_imgs[num_imgs_train:]

print('total training images:', len(list_imgs_train))
print('total validation images:', len(list_imgs_val))

In [None]:
!pip install imagecodecs
!pip install Tifffile

In [None]:
# load images and labels
# 2 types of labels: 1) multiple: isup_grade 2) binary: benign/malignant

import os
import matplotlib.pyplot as plt
from skimage.filters import threshold_otsu
from skimage.color import rgb2gray
import tifffile as tiff
import csv
import pandas as pd
import skimage.io
from skimage.io import imread
from skimage.transform import resize, rescale
import numpy as np
import cv2
from skimage import data
from skimage import filters
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from keras.utils import to_categorical

img_height = 1024
img_width = 1024

base_dir = "/content/"
train_dir = os.path.join(base_dir, 'train/')

# read label file
labels = pd.read_csv(os.path.join(base_dir,'train.csv'), sep=',')
labels = labels.to_numpy() # labels[:,2] 0 image_id, 1 data_provider, 2 isup_grade, 3 gleason_score
benign_malignant = np.zeros((labels.shape[0],1))
labels = np.concatenate((labels, benign_malignant), axis=1)
labels[:,4] = (labels[:,2]>0)*1 # labels[:,4] convert to binary benign(0)/malignant(1) for binary classification

#--------------

# read images
list_of_imgs = list_imgs_train # list_masks_train
train_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
train_labels_binary = np.empty([len(list_of_imgs)])

# create 3D train array: (image index, row, column)
imgs_train = np.zeros((len(list_of_imgs),img_height,img_width))

print("start building input array")
for i, img_name in enumerate(list_of_imgs): # stack images into the 3D training array
    img_path = os.path.join(base_dir, img_name)
    print(img_path)
    img_mask = tiff.imread(img_path)
    # Only channel 0 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,0], (img_width, img_height))
    imgs_train[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once and reuse it for both label columns; an id missing
    # from the table raises IndexError here.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    train_labels_multiple[i] = labels[label_row,2]
    train_labels_binary[i] = labels[label_row,4]
    print(i)

train_labels_multiple = to_categorical(train_labels_multiple)
train_labels_binary = to_categorical(train_labels_binary)

In [None]:
# create validation set

base_dir = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/"
validation_dir = os.path.join(base_dir, 'validation/')

# read images
list_of_imgs = os.listdir(validation_dir)
val_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
val_labels_binary = np.empty([len(list_of_imgs)])

# create 3D validation array: (image index, row, column)
imgs_val = np.zeros((len(list_of_imgs),img_height,img_width))

# Visit every file. The previous bound of len(imgs_val)-1 skipped the last
# image, leaving an all-zero image and uninitialised np.empty label values.
for i, img_name in enumerate(list_of_imgs):
    img_path = os.path.join(validation_dir, img_name)
    img_mask = tiff.imread(img_path)
    # Only channel 0 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,0], (img_width, img_height))
    imgs_val[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    val_labels_multiple[i] = labels[label_row,2]
    val_labels_binary[i] = labels[label_row,4]
    print(img_name.split('.')[0]," ",labels[label_row,4])

val_labels_multiple = to_categorical(val_labels_multiple)
val_labels_binary = to_categorical(val_labels_binary)

In [None]:
# Set up a convnet model
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

# model: multiple classification; model1: binary classification
# Shared hyper-parameters; these names are also read by the binary model cell.
Num_neurous_layer1 = Num_neurous_layer2 = Num_neurous_layer3 = Num_neurous_layer4 = 32
Num_classifications = len(set(labels[:,2])) # number of distinct values in label column 2 (0-5, total of 6 categories per the original note)
Kernel_height = Kernel_width = 5
Pooling_height = Pooling_width = 5  # defined but not used by the layers below

conv_kernel = (Kernel_height, Kernel_width)

# multiple classification model: three conv layers with 2x2 max-pooling after
# the first two, then a regularised dense layer, dropout and a softmax head.
model = models.Sequential([
    layers.Conv2D(Num_neurous_layer1, conv_kernel, activation='relu', input_shape=(img_height, img_width, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(Num_neurous_layer2, conv_kernel, activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(Num_neurous_layer3, conv_kernel, activation='relu'),
    layers.Flatten(),
    # L2-regularised dense layer
    layers.Dense(Num_neurous_layer4, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
    # dropout
    layers.Dropout(0.1),
    layers.Dense(Num_classifications, activation='softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# binary classification model
# Binary (benign/malignant) model. It reuses the layer sizes and kernel shape
# defined in the multi-class model cell.
model1 = models.Sequential()
model1.add(layers.Conv2D(Num_neurous_layer1, (Kernel_height, Kernel_width), activation='relu', input_shape=(img_height, img_width, 1)))
model1.add(layers.MaxPooling2D((2, 2)))
model1.add(layers.Conv2D(Num_neurous_layer2, (Kernel_height, Kernel_width), activation='relu'))
model1.add(layers.MaxPooling2D((2, 2)))
model1.add(layers.Conv2D(Num_neurous_layer3, (Kernel_height, Kernel_width), activation='relu'))
model1.add(layers.Flatten())
# add regularizer
model1.add(layers.Dense(Num_neurous_layer4,kernel_regularizer=regularizers.l2(0.001),activation='relu'))
# add dropout
model1.add(layers.Dropout(0.1))
# The binary labels are one-hot encoded with to_categorical, so the two outputs
# are mutually exclusive classes: use a softmax head with categorical
# cross-entropy (as the multi-class model does) rather than two independent
# sigmoids scored with binary cross-entropy.
model1.add(layers.Dense(2, activation='softmax'))
# `learning_rate` is the current name of the argument formerly spelled `lr`.
model1.compile(optimizer=optimizers.RMSprop(learning_rate=1e-4),loss='categorical_crossentropy',metrics=['accuracy'])

model1.summary()

In [None]:
# train binary classification model
from keras.preprocessing.image import ImageDataGenerator

batch_size_input = 100
epochs_input = 20

# Append a trailing channel axis of length 1, as expected by the Conv2D input.
imgs_train = imgs_train.reshape(imgs_train.shape[:3] + (1,))
imgs_val = imgs_val.reshape(imgs_val.shape[:3] + (1,))

# augmentation: random rotation, shift, shear and zoom of the training images
augmentation_options = dict(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)
datagen.fit(imgs_train)

# regular model fit
#history = model1.fit(imgs_train, train_labels_binary, epochs=epochs_input, batch_size=batch_size_input, validation_data=(imgs_val,val_labels_binary))

# Train on augmented batches; validation uses the unaugmented validation arrays.
train_batches = datagen.flow(imgs_train, train_labels_binary, batch_size=batch_size_input)
history = model1.fit_generator(
    train_batches,
    epochs=epochs_input,
    validation_data=(imgs_val, val_labels_binary),
)

model1.save('PC_binary.h5')

In [None]:
# Displaying curves of loss and accuracy during training

import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the most recent fit call.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

# First figure: accuracy curves. Second figure: loss curves.
curve_sets = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for panel, (train_curve, val_curve, train_label, val_label, heading) in enumerate(curve_sets):
    if panel:
        plt.figure()  # start a new figure for every panel after the first
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:
# train multiple classification model

# Fit the multi-class model on augmented batches from the `datagen` created in
# the binary-training cell, then save it. This overwrites `history`.
history = model.fit_generator(datagen.flow(imgs_train, train_labels_multiple,batch_size=batch_size_input),epochs=epochs_input,validation_data=(imgs_val,val_labels_multiple))
model.save('PC_gleason_score.h5')

In [None]:
# Use HW3 data as an example to illustrate regular NN from extracted features rather than images
import numpy as np

# NOTE: this cell rebinds `labels`, which earlier cells use for the train.csv table.
features = pd.read_csv('features_matrix.csv', sep=',')
train_set = pd.read_csv('mass_case_description_train_set.csv', sep=',')
# Build the join key from columns 0, 2 and 3 of the case-description table.
id_side_view = train_set.iloc[:,0]+"_"+train_set.iloc[:,2]+"_"+train_set.iloc[:,3]
train_set['id_side_view'] = id_side_view
full_data = pd.merge(features,train_set,'inner',left_on=['id_side_view'],right_on=['id_side_view'])
full_data = full_data.to_numpy()
full_data = full_data[full_data[:,22]=='MLO',:] # 554 rows of features and labels

features = full_data[:,1:19] # margin, shape, columns 24/25
features = np.float32(features)

labels = full_data[:,26]
labels = to_categorical(labels)

cut_off = 400  # the first cut_off rows are the training split, the rest validation

# normalize features by column, using statistics of the training rows only so
# that no information from the validation rows leaks into preprocessing
mean = features[:cut_off].mean(axis=0)
std = features[:cut_off].std(axis=0)
std[std == 0] = 1.0  # a constant column would otherwise divide by zero
features = (features - mean) / std

features_train = features[0:cut_off,:]
features_val = features[cut_off:,:]
labels_train = labels[0:cut_off,:]
labels_val = labels[cut_off:,:]

In [None]:
# DNN model, takes extracted features as input
# Fully connected network on the extracted features. Every hidden layer is
# twice as wide as the input; the output has one unit per label column.
n_inputs = features_train.shape[1]
hidden_units = n_inputs * 2

model = models.Sequential([
    layers.Dense(hidden_units, activation='relu', input_shape=(n_inputs,)),
    layers.Dropout(0.2),
    layers.Dense(hidden_units, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(hidden_units, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
    layers.Dense(labels_train.shape[1], activation='softmax'),
])

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# train multiple classification model
# Training settings for the feature-based network (these rebind the names used by the CNN cells).
epochs_input = 30
batch_size_input = 50

# Fit on the training split, validate on the held-out rows, then save the model.
history = model.fit(features_train,labels_train,epochs=epochs_input,batch_size=batch_size_input,validation_data=(features_val, labels_val))
model.save('PC_features_gleason_score.h5')

In [None]:
# Displaying curves of loss and accuracy during training

import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the most recent fit call.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

# First figure: accuracy curves. Second figure: loss curves.
curve_sets = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for panel, (train_curve, val_curve, train_label, val_label, heading) in enumerate(curve_sets):
    if panel:
        plt.figure()  # start a new figure for every panel after the first
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

## Classification that takes original images as inputs:

In [None]:
# divide mask images into training, validation and test sets

import os, shutil

imgs_path = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/Orig_imgs/"
base_dir = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/Orig_imgs/"

# List the source files once. Only regular files are kept, because here the
# split folders are created inside imgs_path and must not be treated as images.
# The list is sorted so the split is the same on every run.
# NOTE(review): any non-image file stored in imgs_path (e.g. a label CSV) would
# be copied too - confirm the folder holds only images.
list_of_imgs = sorted(
    name for name in os.listdir(imgs_path)
    if os.path.isfile(os.path.join(imgs_path, name))
)

train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')
for split_dir in (train_dir, validation_dir, test_dir):
    # exist_ok lets the cell be re-run without failing on existing folders
    os.makedirs(split_dir, exist_ok=True)

# assign number of images in each set
num_training_imgs = 500
num_validation_imgs = 200
num_test_imgs = 200

validation_end = num_training_imgs + num_validation_imgs
test_end = validation_end + num_test_imgs
if len(list_of_imgs) < test_end:
    # Fail before copying anything rather than part-way through the copy.
    raise ValueError(
        'need %d files in %s but found %d' % (test_end, imgs_path, len(list_of_imgs))
    )

# Consecutive slices of the sorted list go to train, validation and test.
split_plan = (
    (train_dir, 0, num_training_imgs),
    (validation_dir, num_training_imgs, validation_end),
    (test_dir, validation_end, test_end),
)
for split_dir, start, stop in split_plan:
    for name in list_of_imgs[start:stop]:
        shutil.copyfile(os.path.join(imgs_path, name), os.path.join(split_dir, name))

print('total training images:', len(os.listdir(train_dir)))
print('total validation images:', len(os.listdir(validation_dir)))
print('total test images:', len(os.listdir(test_dir)))

In [None]:
# load images and labels
# create training array
# 2 types of labels: 1) multiple: isup_grade 2) binary: benign/malignant

import os
import pydicom as dcm
import matplotlib.pyplot as plt
from skimage.filters import threshold_otsu
from skimage.color import rgb2gray
import tifffile as tiff
import csv
import pandas as pd
import skimage.io
from skimage.transform import resize, rescale
import numpy as np
import cv2
from skimage import data
from skimage import filters
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from PIL import Image
import openslide

img_height = 1024
img_width = 1024

base_dir = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/Orig_imgs/"
train_dir = os.path.join(base_dir, 'train/')

# read label file
labels = pd.read_csv(os.path.join(base_dir,'train.csv'), sep=',')
labels = labels.to_numpy() # labels[:,2] 0 image_id, 1 data_provider, 2 isup_grade, 3 gleason_score
benign_malignant = np.zeros((labels.shape[0],1))
labels = np.concatenate((labels, benign_malignant), axis=1)
labels[:,4] = (labels[:,2]>0)*1 # labels[:,4] convert to binary benign(0)/malignant(1) for binary classification

# read images
list_of_imgs = os.listdir(train_dir)
train_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
train_labels_binary = np.empty([len(list_of_imgs)])

# create 3D training array: (image index, row, column)
imgs_train = np.zeros((len(list_of_imgs),img_height,img_width))

for i, img_name in enumerate(list_of_imgs): # stack images into the 3D training array
    img_path = os.path.join(train_dir, img_name)
    slide = openslide.open_slide(img_path)
    try:
        # Read the whole of pyramid level 1, starting at the top-left corner.
        level_width, level_height = slide.level_dimensions[1]
        img_mask = np.array(slide.read_region((0,0), 1, (level_width, level_height)))
    finally:
        # Release the slide handle; otherwise one stays open per image.
        slide.close()
    # Only channel 2 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,2], (img_width, img_height))
    imgs_train[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    train_labels_multiple[i] = labels[label_row,2]
    train_labels_binary[i] = labels[label_row,4]
    print(i)

train_labels_multiple = to_categorical(train_labels_multiple)
train_labels_binary = to_categorical(train_labels_binary)

In [None]:
# create validation set

validation_dir = os.path.join(base_dir, 'validation/')

# read images
list_of_imgs = os.listdir(validation_dir)
val_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
val_labels_binary = np.empty([len(list_of_imgs)])

# create 3D validation array: (image index, row, column)
imgs_val = np.zeros((len(list_of_imgs),img_height,img_width))

# Visit every file. The previous bound of len(imgs_val)-1 skipped the last
# image, leaving an all-zero image and uninitialised np.empty label values.
for i, img_name in enumerate(list_of_imgs):
    img_path = os.path.join(validation_dir, img_name)
    slide = openslide.open_slide(img_path)
    try:
        # Read the whole of pyramid level 1, starting at the top-left corner.
        level_width, level_height = slide.level_dimensions[1]
        img_mask = np.array(slide.read_region((0,0), 1, (level_width, level_height)))
    finally:
        # Release the slide handle; otherwise one stays open per image.
        slide.close()
    # Only channel 2 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,2], (img_width, img_height))
    imgs_val[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    val_labels_multiple[i] = labels[label_row,2]
    val_labels_binary[i] = labels[label_row,4]

val_labels_multiple = to_categorical(val_labels_multiple)
val_labels_binary = to_categorical(val_labels_binary)

In [None]:
# create test set

test_dir = os.path.join(base_dir, 'test/')

# read images
list_of_imgs = os.listdir(test_dir)
test_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
test_labels_binary = np.empty([len(list_of_imgs)])

# create 3D test array: (image index, row, column)
imgs_test = np.zeros((len(list_of_imgs),img_height,img_width))

# Visit every file. The previous bound of len(imgs_test)-1 skipped the last
# image, leaving an all-zero image and uninitialised np.empty label values.
# NOTE(review): the training and validation cells of this section read level 1
# through openslide and keep channel 2, whereas this cell uses tiff.imread and
# channel 0 - confirm which reader is intended for the test set.
for i, img_name in enumerate(list_of_imgs):
    img_path = os.path.join(test_dir, img_name)
    img_mask = tiff.imread(img_path)
    # Only channel 0 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,0], (img_width, img_height))
    imgs_test[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    test_labels_multiple[i] = labels[label_row,2]
    test_labels_binary[i] = labels[label_row,4]

test_labels_multiple = to_categorical(test_labels_multiple)
test_labels_binary = to_categorical(test_labels_binary)

In [None]:
# Report the shapes of the training arrays before writing anything to disk.
for training_array in (imgs_train, train_labels_multiple, train_labels_binary):
    print(training_array.shape)

# Persist the training and validation arrays as .npy files in the working
# directory, in the order listed.
arrays_to_save = [
    ('img_train_107.npy', imgs_train),
    ('img_train_labels_multiple_107.npy', train_labels_multiple),
    ('img_train_labels_binary_107.npy', train_labels_binary),
    ('img_val_27.npy', imgs_val),
    ('img_val_labels_multiple_27.npy', val_labels_multiple),
    ('img_val_labels_binary_27.npy', val_labels_binary),
]
for npy_name, array in arrays_to_save:
    np.save(npy_name, array)

#np.save('img_mask_test_200.npy', imgs_test)
#np.save('img_mask_test_labels_multiple_200.npy', test_labels_multiple)
#np.save('img_mask_test_labels_binary_200.npy', test_labels_binary)

In [None]:
# Display the three channels of `img_mask` side by side in grayscale.
# NOTE(review): when the cells run top to bottom, `img_mask` was last assigned
# from cv2.resize of a single channel, so it would be 2-D here and the
# three-index slices below would fail - confirm which image should be shown.

plt.figure(1)
plt.subplot(131)
plt.imshow(img_mask[:,:,0], cmap="gray")
plt.subplot(132)
plt.imshow(img_mask[:,:,1], cmap="gray")
plt.subplot(133)
plt.imshow(img_mask[:,:,2], cmap="gray")
plt.show()

In [None]:
# Set up a convnet model
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

# model: multiple classification; model1: binary classification
# Shared hyper-parameters; these names are also read by the binary model cell.
Num_neurous_layer1 = Num_neurous_layer2 = Num_neurous_layer3 = Num_neurous_layer4 = 32
Num_classifications = len(set(labels[:,2])) # number of distinct values in label column 2 (0-5, total of 6 categories per the original note)
Kernel_height = Kernel_width = 5
Pooling_height = Pooling_width = 5  # defined but not used by the layers below

conv_kernel = (Kernel_height, Kernel_width)

# multiple classification model: three conv layers with 2x2 max-pooling after
# the first two, then a regularised dense layer, dropout and a softmax head.
model = models.Sequential([
    layers.Conv2D(Num_neurous_layer1, conv_kernel, activation='relu', input_shape=(img_height, img_width, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(Num_neurous_layer2, conv_kernel, activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(Num_neurous_layer3, conv_kernel, activation='relu'),
    layers.Flatten(),
    # L2-regularised dense layer
    layers.Dense(Num_neurous_layer4, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
    # dropout
    layers.Dropout(0.1),
    layers.Dense(Num_classifications, activation='softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# binary classification model
# Binary (benign/malignant) model. It reuses the layer sizes and kernel shape
# defined in the multi-class model cell.
model1 = models.Sequential()
model1.add(layers.Conv2D(Num_neurous_layer1, (Kernel_height, Kernel_width), activation='relu', input_shape=(img_height, img_width, 1)))
model1.add(layers.MaxPooling2D((2, 2)))
model1.add(layers.Conv2D(Num_neurous_layer2, (Kernel_height, Kernel_width), activation='relu'))
model1.add(layers.MaxPooling2D((2, 2)))
model1.add(layers.Conv2D(Num_neurous_layer3, (Kernel_height, Kernel_width), activation='relu'))
model1.add(layers.Flatten())
# add regularizer
model1.add(layers.Dense(Num_neurous_layer4,kernel_regularizer=regularizers.l2(0.001),activation='relu'))
# add dropout
model1.add(layers.Dropout(0.1))
# The binary labels are one-hot encoded with to_categorical, so the two outputs
# are mutually exclusive classes: use a softmax head with categorical
# cross-entropy (as the multi-class model does) rather than two independent
# sigmoids scored with binary cross-entropy.
model1.add(layers.Dense(2, activation='softmax'))
# `learning_rate` is the current name of the argument formerly spelled `lr`.
model1.compile(optimizer=optimizers.RMSprop(learning_rate=1e-4),loss='categorical_crossentropy',metrics=['accuracy'])

model1.summary()

In [None]:
# train binary classification model
from keras.preprocessing.image import ImageDataGenerator

batch_size_input = 20
epochs_input = 10

# Append a trailing channel axis of length 1, as expected by the Conv2D input.
imgs_train = imgs_train.reshape(imgs_train.shape[:3] + (1,))
imgs_val = imgs_val.reshape(imgs_val.shape[:3] + (1,))

# augmentation: random rotation, shift, shear and zoom of the training images
augmentation_options = dict(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)
datagen.fit(imgs_train)

# regular model fit
#history = model1.fit(imgs_train, train_labels_binary, epochs=epochs_input, batch_size=batch_size_input, validation_data=(imgs_val,val_labels_binary))

# Train on augmented batches; validation uses the unaugmented validation arrays.
train_batches = datagen.flow(imgs_train, train_labels_binary, batch_size=batch_size_input)
history = model1.fit_generator(
    train_batches,
    epochs=epochs_input,
    validation_data=(imgs_val, val_labels_binary),
)

model1.save('PC_binary.h5')

In [None]:
# Displaying curves of loss and accuracy during training

import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the most recent fit call.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

# First figure: accuracy curves. Second figure: loss curves.
curve_sets = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for panel, (train_curve, val_curve, train_label, val_label, heading) in enumerate(curve_sets):
    if panel:
        plt.figure()  # start a new figure for every panel after the first
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:
# train multiple classification model

# Fit the multi-class model on augmented batches from the `datagen` created in
# the binary-training cell, then save it. This overwrites `history`.
history = model.fit_generator(datagen.flow(imgs_train, train_labels_multiple,batch_size=batch_size_input),epochs=epochs_input,validation_data=(imgs_val,val_labels_multiple))
model.save('PC_gleason_score.h5')

In [None]:
# Displaying curves of loss and accuracy during training

import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the most recent fit call.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

# First figure: accuracy curves. Second figure: loss curves.
curve_sets = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for panel, (train_curve, val_curve, train_label, val_label, heading) in enumerate(curve_sets):
    if panel:
        plt.figure()  # start a new figure for every panel after the first
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

## Classification that takes masks as inputs:

In [None]:
# divide mask images into training, validation and test sets

import os, shutil

imgs_path = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/Imgs_mask/"
base_dir = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/"

# List the source files once. Only regular files are kept, and the list is
# sorted so the split is the same on every run.
list_of_imgs = sorted(
    name for name in os.listdir(imgs_path)
    if os.path.isfile(os.path.join(imgs_path, name))
)

train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')
for split_dir in (train_dir, validation_dir, test_dir):
    # exist_ok lets the cell be re-run without failing on existing folders
    os.makedirs(split_dir, exist_ok=True)

# assign number of images in each set
num_training_imgs = 500
num_validation_imgs = 200
num_test_imgs = 200

validation_end = num_training_imgs + num_validation_imgs
test_end = validation_end + num_test_imgs
if len(list_of_imgs) < test_end:
    # Fail before copying anything rather than part-way through the copy.
    raise ValueError(
        'need %d files in %s but found %d' % (test_end, imgs_path, len(list_of_imgs))
    )

# Consecutive slices of the sorted list go to train, validation and test.
split_plan = (
    (train_dir, 0, num_training_imgs),
    (validation_dir, num_training_imgs, validation_end),
    (test_dir, validation_end, test_end),
)
for split_dir, start, stop in split_plan:
    for name in list_of_imgs[start:stop]:
        shutil.copyfile(os.path.join(imgs_path, name), os.path.join(split_dir, name))

print('total training images:', len(os.listdir(train_dir)))
print('total validation images:', len(os.listdir(validation_dir)))
print('total test images:', len(os.listdir(test_dir)))

In [None]:
# load images and labels
# create training array
# 2 types of labels: 1) multiple: isup_grade 2) binary: benign/malignant

import os
import pydicom as dcm
import matplotlib.pyplot as plt
from skimage.filters import threshold_otsu
from skimage.color import rgb2gray
import tifffile as tiff
import csv
import pandas as pd
import skimage.io
from skimage.transform import resize, rescale
import numpy as np
import cv2
from skimage import data
from skimage import filters
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from keras.utils import to_categorical

img_height = 1024
img_width = 1024

base_dir = "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/"
train_dir = os.path.join(base_dir, 'train/')

# read label file
labels = pd.read_csv(os.path.join(base_dir,'train.csv'), sep=',')
labels = labels.to_numpy() # labels[:,2] 0 image_id, 1 data_provider, 2 isup_grade, 3 gleason_score
benign_malignant = np.zeros((labels.shape[0],1))
labels = np.concatenate((labels, benign_malignant), axis=1)
labels[:,4] = (labels[:,2]>0)*1 # labels[:,4] convert to binary benign(0)/malignant(1) for binary classification

# read images
list_of_imgs = os.listdir(train_dir)
train_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
train_labels_binary = np.empty([len(list_of_imgs)])

# create 3D training array: (image index, row, column)
imgs_train = np.zeros((len(list_of_imgs),img_height,img_width))

for i, img_name in enumerate(list_of_imgs): # stack images into the 3D training array
    img_path = os.path.join(train_dir, img_name)
    img_mask = tiff.imread(img_path)
    # Only channel 0 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,0], (img_width, img_height))
    imgs_train[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    train_labels_multiple[i] = labels[label_row,2]
    train_labels_binary[i] = labels[label_row,4]
    print(i)

train_labels_multiple = to_categorical(train_labels_multiple)
train_labels_binary = to_categorical(train_labels_binary)

In [None]:
# create validation set

validation_dir = os.path.join(base_dir, 'validation/')

# read images
list_of_imgs = os.listdir(validation_dir)
val_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
val_labels_binary = np.empty([len(list_of_imgs)])

# create 3D validation array: (image index, row, column)
imgs_val = np.zeros((len(list_of_imgs),img_height,img_width))

# Visit every file. The previous bound of len(imgs_val)-1 skipped the last
# image, leaving an all-zero image and uninitialised np.empty label values.
for i, img_name in enumerate(list_of_imgs):
    img_path = os.path.join(validation_dir, img_name)
    img_mask = tiff.imread(img_path)
    # Only channel 0 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,0], (img_width, img_height))
    imgs_val[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    val_labels_multiple[i] = labels[label_row,2]
    val_labels_binary[i] = labels[label_row,4]

val_labels_multiple = to_categorical(val_labels_multiple)
val_labels_binary = to_categorical(val_labels_binary)

In [None]:
# create test set

test_dir = os.path.join(base_dir, 'test/')

# read images
list_of_imgs = os.listdir(test_dir)
test_labels_multiple = np.empty([len(list_of_imgs)]) # create label array
test_labels_binary = np.empty([len(list_of_imgs)])

# create 3D test array: (image index, row, column)
imgs_test = np.zeros((len(list_of_imgs),img_height,img_width))

# Visit every file. The previous bound of len(imgs_test)-1 skipped the last
# image, leaving an all-zero image and uninitialised np.empty label values.
for i, img_name in enumerate(list_of_imgs):
    img_path = os.path.join(test_dir, img_name)
    img_mask = tiff.imread(img_path)
    # Only channel 0 is used. cv2.resize takes dsize as (width, height).
    img_mask = cv2.resize(img_mask[:,:,0], (img_width, img_height))
    imgs_test[i,:,:] = img_mask

    # The image id is the file name up to the first '.' or '_'. Find its row in
    # the label table once; an id missing from the table raises IndexError.
    image_id = img_name.split('.')[0].split('_')[0]
    label_row = np.argwhere(labels == image_id)[0,0]
    test_labels_multiple[i] = labels[label_row,2]
    test_labels_binary[i] = labels[label_row,4]

test_labels_multiple = to_categorical(test_labels_multiple)
test_labels_binary = to_categorical(test_labels_binary)

In [None]:
#print(imgs_test.shape)
#print(test_labels_multiple.shape)
#print(test_labels_binary.shape)
import os
import pydicom as dcm
import matplotlib.pyplot as plt
from skimage.filters import threshold_otsu
from skimage.color import rgb2gray
import tifffile as tiff
import csv
import pandas as pd
import skimage.io
from skimage.transform import resize, rescale
import numpy as np
import cv2
from skimage import data
from skimage import filters
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from keras.utils import to_categorical

# Size (in pixels) that every mask is resized to before stacking.
img_height = 1024
img_width = 1024

# Root folder of the Kaggle data. Set the PANDA_BASE_DIR environment variable to point
# at another location; when it is unset or empty the original local path is used.
base_dir = os.environ.get("PANDA_BASE_DIR") or "C:/Users/Qi/Desktop/Education/Stanford MS/BIOMED 260 Biomedical Imaging/Final Project/Kaggle Data/"

labels = pd.read_csv(os.path.join(base_dir, 'train.csv'), sep=',')
labels = labels.to_numpy()  # columns: 0 image_id, 1 data_provider, 2 isup_grade, 3 gleason_score

# Append a fifth column (index 4) holding the binary target:
# 0 where isup_grade is 0 (benign), 1 where isup_grade > 0 (malignant).
benign_malignant = np.zeros((labels.shape[0], 1))
labels = np.concatenate((labels, benign_malignant), axis=1)
labels[:, 4] = (labels[:, 2] > 0) * 1

# Load the previously saved arrays; the .npy paths are relative to the working directory.
imgs_train = np.load('img_mask_train_500.npy')
train_labels_multiple = np.load('img_mask_train_labels_multiple_500.npy')
train_labels_binary = np.load('img_mask_train_labels_binary_500.npy')

imgs_val = np.load('img_mask_val_200.npy')
val_labels_multiple = np.load('img_mask_val_labels_multiple_200.npy')
val_labels_binary = np.load('img_mask_val_labels_binary_200.npy')

#np.save('img_mask_train_500.npy', imgs_train)
#np.save('img_mask_train_labels_multiple_500.npy', train_labels_multiple)
#np.save('img_mask_train_labels_binary_500.npy', train_labels_binary)

#np.save('img_mask_val_200.npy', imgs_val)
#np.save('img_mask_val_labels_multiple_200.npy', val_labels_multiple)
#np.save('img_mask_val_labels_binary_200.npy', val_labels_binary)

#np.save('img_mask_test_200.npy', imgs_test)
#np.save('img_mask_test_labels_multiple_200.npy', test_labels_multiple)
#np.save('img_mask_test_labels_binary_200.npy', test_labels_binary)

In [None]:
# display masked images: one grayscale panel per channel, side by side
# NOTE(review): the set-building loops replace img_mask with the 2D output of cv2.resize,
# which cannot be indexed with three subscripts -- confirm img_mask is a 3-channel array here.

plt.figure(1)
for channel in range(3):
    # subplot(1, 3, k) addresses the same panels as the 131 / 132 / 133 shorthand
    plt.subplot(1, 3, channel + 1)
    plt.imshow(img_mask[:, :, channel], cmap="gray")
plt.show()

In [None]:
# Set up a convnet model
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

# model: multiple classification; model1: binary classification

# Hyperparameters (these module-level names are also read by the binary model cell).
Num_neurous_layer1 = Num_neurous_layer2 = Num_neurous_layer3 = Num_neurous_layer4 = 32
# Number of distinct values in the isup_grade column (original note: 0-5, total of 6 categories).
Num_classifications = len(set(labels[:, 2]))
Kernel_height = Kernel_width = 5
# The pooling constants are defined here but the layers below use a literal (2, 2) pool size.
Pooling_height = Pooling_width = 5

kernel_size = (Kernel_height, Kernel_width)

# multiple classification model: three conv layers with two max-pools in between,
# then an L2-regularised dense layer, dropout and a softmax output.
model = models.Sequential([
    layers.Conv2D(Num_neurous_layer1, kernel_size, activation='relu',
                  input_shape=(img_height, img_width, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(Num_neurous_layer2, kernel_size, activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(Num_neurous_layer3, kernel_size, activation='relu'),
    layers.Flatten(),
    # add regularizer
    layers.Dense(Num_neurous_layer4, kernel_regularizer=regularizers.l2(0.001), activation='relu'),
    # add dropout
    layers.Dropout(0.1),
    layers.Dense(Num_classifications, activation='softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# binary classification model
# Same convolutional stack as the multi-class model, with a 2-unit output layer.
model1 = models.Sequential()
model1.add(layers.Conv2D(Num_neurous_layer1, (Kernel_height, Kernel_width), activation='relu', input_shape=(img_height, img_width, 1)))
model1.add(layers.MaxPooling2D((2, 2)))
model1.add(layers.Conv2D(Num_neurous_layer2, (Kernel_height, Kernel_width), activation='relu'))
model1.add(layers.MaxPooling2D((2, 2)))
model1.add(layers.Conv2D(Num_neurous_layer3, (Kernel_height, Kernel_width), activation='relu'))
model1.add(layers.Flatten())
# add regularizer
model1.add(layers.Dense(Num_neurous_layer4, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
# add dropout
model1.add(layers.Dropout(0.1))
# NOTE(review): two independent sigmoid units with binary_crossentropy are trained against
# one-hot labels; softmax + categorical_crossentropy (or a single sigmoid unit) is the usual
# formulation -- left unchanged here so training behaviour stays the same.
model1.add(layers.Dense(2, activation='sigmoid'))
# `learning_rate` is the current name of the optimizer argument; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model1.compile(optimizer=optimizers.RMSprop(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

model1.summary()

In [None]:
# train binary classification model
from keras.preprocessing.image import ImageDataGenerator

batch_size_input = 50
epochs_input = 5

# Give both image stacks a trailing channel axis of size 1, matching the models' input_shape.
imgs_train = imgs_train.reshape(imgs_train.shape[:3] + (1,))
imgs_val = imgs_val.reshape(imgs_val.shape[:3] + (1,))

# augmentation
augmentation_options = dict(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)
datagen.fit(imgs_train)

# regular model fit
#history = model1.fit(imgs_train, train_labels_binary, epochs=epochs_input, batch_size=batch_size_input, validation_data=(imgs_val,val_labels_binary))

# Train on augmented batches; the validation arrays are passed as-is.
binary_train_batches = datagen.flow(imgs_train, train_labels_binary, batch_size=batch_size_input)
history = model1.fit_generator(
    binary_train_batches,
    epochs=epochs_input,
    validation_data=(imgs_val, val_labels_binary),
)

#model1.save('PC_binary.h5')

In [None]:
# Displaying curves of loss and accuracy during training

import matplotlib.pyplot as plt

# Pull the per-epoch curves out of the training history.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']
epochs = range(1, len(acc) + 1)

# (training curve, validation curve, training label, validation label, title) per figure
panels = (
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
)
for panel_idx, (train_curve, val_curve, train_label, val_label, title) in enumerate(panels):
    if panel_idx > 0:
        # the first panel draws on the current figure; later panels open a new one
        plt.figure()
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(title)
    plt.legend()
plt.show()

In [None]:
# train multiple classification model
# Uses datagen, batch_size_input, epochs_input and the 4D image arrays prepared in the
# binary training cell.

multiclass_train_batches = datagen.flow(imgs_train, train_labels_multiple, batch_size=batch_size_input)
history = model.fit_generator(
    multiclass_train_batches,
    epochs=epochs_input,
    validation_data=(imgs_val, val_labels_multiple),
)
#model.save('PC_gleason_score.h5')

In [None]:
# Displaying curves of loss and accuracy during training

import matplotlib.pyplot as plt

# Pull the per-epoch curves out of the training history.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']
epochs = range(1, len(acc) + 1)

# (training curve, validation curve, training label, validation label, title) per figure
panels = (
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
)
for panel_idx, (train_curve, val_curve, train_label, val_label, title) in enumerate(panels):
    if panel_idx > 0:
        # the first panel draws on the current figure; later panels open a new one
        plt.figure()
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(title)
    plt.legend()
plt.show()