<a href="https://colab.research.google.com/github/nakib103/pancancer_classification/blob/master/conv_2D_with_modularity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import packages

import numpy as np
import pandas as pd
import pickle
import math

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Run this cell to mount your Google Drive.
# The cells below read and write files under '/content/drive/My Drive/',
# so this must succeed before running them.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# load data and label

# Single place to change when the pickles live somewhere else.
DATA_DIR = "/content/drive/My Drive/data"

# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load files that you created yourself or otherwise trust.
data = pd.read_pickle(DATA_DIR + "/data_df.pkl")
label = pd.read_pickle(DATA_DIR + "/label_df.pkl")

In [None]:
def preprocess_data(data):
  """Return a log-transformed copy of ``data``.

  Every column is passed through log2(value + 1); the input object itself
  is left untouched.
  """
  def log2_plus_one(column):
    # the +1 offset maps a raw value of 0 to 0 after the logarithm
    return np.log2(column + 1)

  return data.apply(log2_plus_one)

In [None]:
def shuffle_data(data, label, seed=None):
  """Shuffle samples and their labels using one shared random row order.

  Args:
    data: DataFrame of samples; its index identifies each sample.
    label: DataFrame of labels, expected to carry the same index as ``data``.
    seed: optional integer. When given, the permutation is drawn from a
      dedicated ``np.random.RandomState(seed)`` so the order is reproducible.
      When None (default) the global NumPy random state is used.

  Returns:
    Tuple ``(data_shuffled, label_shuffled)``: both inputs reindexed with the
    same permuted index. The inputs themselves are not modified.
  """
  ## same patient check
  # draw the permutation once so data and label stay aligned row by row
  rng = np.random if seed is None else np.random.RandomState(seed)
  shuffled_index = rng.permutation(data.index)

  # shuffle data and label
  data_shuffled = data.reindex(shuffled_index)
  label_shuffled = label.reindex(shuffled_index)

  return data_shuffled, label_shuffled

In [None]:
def data_pd_to_np(data, shape):
  """Turn a samples-by-features DataFrame into integer image-like arrays.

  Each row is zero-padded on the right to ``shape[0] * shape[1]`` values,
  reshaped to ``(1, shape[0], shape[1])`` (channels first) and scaled by the
  global maximum of the whole array so values fall on a 0..255 scale, then
  truncated to integers.

  Args:
    data: DataFrame with one sample per row.
    shape: (height, width) of the 2D layout for one sample.

  Returns:
    Integer array of shape ``(n_samples, 1, shape[0], shape[1])``.

  Raises:
    ValueError: if a row has more values than ``shape[0] * shape[1]``.
  """
  height, width = shape[0], shape[1]

  # convert dataframe to numpy array
  x = data.values

  # pad the trailing positions with zero
  n_missing = height * width - x.shape[1]
  if n_missing < 0:
    raise ValueError(
        "data has %d features but shape %r only holds %d values"
        % (x.shape[1], tuple(shape), height * width))
  x = np.pad(x, ((0, 0), (0, n_missing)), 'constant', constant_values=0)

  # reshape to 2D
  x = np.reshape(x, (x.shape[0], 1, height, width))

  # normalize data to [0, 255] range; skip the division when there is nothing
  # to scale (no elements, or a maximum of 0), which would otherwise raise or
  # produce NaN that turns into arbitrary integers in the cast below
  peak = np.max(x) if x.size else 0
  if peak != 0:
    x = (x / peak) * 255

  # convert data format to integer (fractions are truncated)
  return x.astype(int)

In [None]:
def label_pd_to_np(label, class_names=None):
  """One-hot encode the class names stored in the first column of ``label``.

  Args:
    label: DataFrame whose first column holds one class name per sample.
    class_names: ordered sequence of class names; a name's position is its
      class index. Defaults to the notebook-level ``classes`` list.

  Returns:
    Array of shape ``(n_samples, len(class_names))`` produced by
    ``to_categorical``. The width is fixed by ``class_names`` so it does not
    shrink when the highest-indexed classes are absent from ``label``.

  Raises:
    ValueError: if a label is not one of ``class_names``.
  """
  if class_names is None:
    # fall back to the notebook-level list of class names
    class_names = classes

  # name -> index lookup; the first occurrence wins if a name is repeated
  class_to_index = {}
  for position, name in enumerate(class_names):
    class_to_index.setdefault(name, position)

  # fill a fresh integer array instead of overwriting the values taken from
  # ``label``, so the caller's DataFrame is never modified
  names = label.iloc[:, 0].values
  indices = np.empty(len(names), dtype=int)
  for row, name in enumerate(names):
    if name not in class_to_index:
      raise ValueError("unknown class label %r at row %d" % (name, row))
    indices[row] = class_to_index[name]

  return to_categorical(indices, num_classes=len(class_names))

In [None]:
# store the 2D data and corresponding label

# np.save('/content/drive/My Drive/data/data_np_2D.npy', x)
# np.save('/content/drive/My Drive/data/label_np_2D.npy', y)

In [None]:
# custom loss function - weighted categorical-crossentropy
## need testing
"""
A weighted version of categorical_crossentropy for keras (2.0.6). This lets you apply a weight to unbalanced classes.
@url: https://gist.github.com/wassname/ce364fddfc8a025bfab4348cf5de852d
@author: wassname
"""
# Use the backend that belongs to tensorflow.keras, the package every model
# and layer in this notebook is built from, rather than standalone keras.
from tensorflow.keras import backend as K
def weighted_categorical_crossentropy(weights):
    """
    Build a categorical cross-entropy loss with a fixed weight per class.

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Returns:
        A function ``loss(y_true, y_pred)`` that yields one loss value per
        sample (summed over the last axis).

    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """

    # created once here and captured by the closure below
    weights = K.variable(weights)

    def loss(y_true, y_pred):
        # scale predictions so that the class probas of each sample sum to 1
        y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # clip to prevent NaN's and Inf's
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # per-class weighted log-likelihood, then negative sum over classes
        weighted_log_likelihood = y_true * K.log(y_pred) * weights
        return -K.sum(weighted_log_likelihood, -1)

    return loss

Using TensorFlow backend.


In [None]:
# model definition
def define_model(input_shape, num_of_class):
  """Build the (uncompiled) 2D convolutional classifier.

  Architecture: three Conv2D(3x3, relu) -> MaxPooling2D(2x2) -> Dropout(0.1)
  blocks with 64, 128 and 256 filters, all in channels-first layout, then
  Flatten, Dense layers of 1024, 1024 and 512 relu units, and a softmax
  output layer.

  Args:
    input_shape: (height, width) of one single-channel input sample.
    num_of_class: number of units in the softmax output layer.

  Returns:
    The Sequential model. A summary is printed as a side effect.
  """
  # both coefficients are 0.0 here; raise them to enable the penalty on the
  # hidden Dense kernels
  regularizer = regularizers.l1_l2(l1=0.000, l2=0.000)
  height = input_shape[0]
  width = input_shape[1]

  model = Sequential()

  for block_index, n_filters in enumerate((64, 128, 256)):
    conv_options = dict(filters=n_filters, kernel_size=(3, 3), strides=(1, 1),
                        activation='relu', data_format='channels_first')
    if block_index == 0:
      # only the first layer declares the input: 1 channel, height x width
      conv_options['input_shape'] = (1, height, width)
    model.add(Conv2D(**conv_options))
    model.add(MaxPooling2D(pool_size=(2, 2), data_format='channels_first'))
    model.add(Dropout(rate=0.1))

  model.add(Flatten())

  for n_units in (1024, 1024, 512):
    model.add(Dense(units=n_units, activation='relu', kernel_regularizer=regularizer))

  model.add(Dense(units=num_of_class, activation='softmax'))

  # summary() prints by itself and returns None, so it is not wrapped in print()
  model.summary()

  return model

In [None]:
# generate data and label
def generate_dataset(data, label, shape):
  """Run the full preparation pipeline and return model-ready arrays.

  Steps, in order: ``preprocess_data`` on ``data``, ``shuffle_data`` on the
  result together with ``label``, ``data_pd_to_np`` with ``shape`` on the
  shuffled data, and ``label_pd_to_np`` on the shuffled labels.

  Returns:
    Tuple ``(data_array, label_array)`` as produced by the last two steps.
  """
  transformed = preprocess_data(data)
  shuffled_data, shuffled_label = shuffle_data(transformed, label)

  data_array = data_pd_to_np(shuffled_data, shape)
  label_array = label_pd_to_np(shuffled_label)

  return data_array, label_array

In [None]:
# Generate the dataset and build the model.

# Position in this list is the class index used when encoding labels.
classes = [
    'GBM', 'OV', 'LUAD', 'LUSC', 'PRAD', 'UCEC', 'BLCA', 'TGCT', 'ESCA',
    'PAAD', 'KIRP', 'LIHC', 'CESC', 'SARC', 'BRCA', 'THYM', 'MESO', 'COAD',
    'STAD', 'SKCM', 'CHOL', 'KIRC', 'THCA', 'HNSC', 'LAML', 'READ', 'LGG',
    'DLBC', 'KICH', 'UCS', 'ACC', 'PCPG', 'UVM',
]

# (height, width) of the 2D layout each sample is padded and reshaped into
shape = (150, 150)

x, y = generate_dataset(data=data, label=label, shape=shape)

# the output layer gets one unit per one-hot column of y
model = define_model(input_shape=shape, num_of_class=y.shape[1])

In [None]:
# Define the learning process: optimizer, loss and metric.

# NOTE(review): nesterov=True is combined with momentum=0.0; presumably the
# flag has no effect without momentum - confirm this is intended.
sgd = optimizers.SGD(
    lr=0.0001,
    momentum=0.0,
    decay=0.0,
    nesterov=True,
)
#adam  = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

# Disabled class-weighted alternative. The 33 values below sum to 11069.
# NOTE(review): they look like per-class sample counts; dividing by the total
# would then give frequent classes the larger weight - confirm whether
# inverse frequencies were intended before enabling.
# weights = [174, 309, 576, 554, 550, 567, 427, 156, 196, 183, 323, 424, 310, 265, 1218, 122, 87, 495, 450, 474, 45, 606, 572, 566, 173, 171, 534, 48, 91, 57, 79, 187, 80]
# weights = [ weight/11069 for weight in weights]

# model.compile(loss = weighted_categorical_crossentropy(weights), optimizer = sgd, metrics=['accuracy'])
model.compile(
    loss='categorical_crossentropy',
    optimizer=sgd,
    metrics=['accuracy'],
)

In [None]:
# Train the model and store its training history.

history = model.fit(
    x,
    y,
    validation_split=0.1,
    batch_size=32,
    epochs=80,
    verbose=1,
    shuffle=True,
)

# history.history is the plain dictionary held by the History object
with open('/content/drive/My Drive/results/check', 'wb') as file_pi:
  pickle.dump(history.history, file_pi)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [None]:
# save the trained model

# JSON description of the model.
# NOTE(review): model_json is not written to disk in the visible cells - confirm whether it is used later.
model_json = model.to_json()
# write the model to an .h5 file in the Drive results folder
model.save("/content/drive/My Drive/results/model_without_featreduction_2D_1.h5")