<a href="https://colab.research.google.com/github/nakib103/pancancer_classification/blob/master/feed_forward.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# check GPU resources
# `!` hands the rest of the line to the shell, so this runs the `nvidia-smi` command
# and shows whatever it prints for the current runtime.

!nvidia-smi

In [None]:
# import packages

import numpy as np
import pandas as pd
import pickle
import math

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras import regularizers
from keras import optimizers
from keras.utils import to_categorical

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# run this cell to mount your Google Drive.
# The data, label and history files used further down all live under
# '/content/drive', so this cell has to run before those cells.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# load data and label
# Both objects are read from pickle files on the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# whose origin you trust.

data = pd.read_pickle("/content/drive/My Drive/main/data_df.pkl")
label = pd.read_pickle("/content/drive/My Drive/main/label_df.pkl")

In [None]:
# eyeball the loaded objects: the data is printed first, the label right after it

print(data, label, sep="\n")

In [None]:
# build the feature matrix from the leading rows of the loaded data

data_points = 11069  # number of leading rows kept; the label cell slices with the same value
x = data.head(data_points).values
x.shape

In [None]:
# create label variable from loaded label
# Every class code is replaced by its position in `classes`, then one-hot encoded.

classes = ['GBM', 'OV', 'LUAD', 'LUSC', 'PRAD', 'UCEC', 'BLCA', 'TGCT', 'ESCA', 'PAAD', 'KIRP', 'LIHC', 'CESC', 'SARC', 'BRCA', 'THYM', 'MESO', 'COAD', 'STAD', 'SKCM', 'CHOL', 'KIRC', 'THCA', 'HNSC', 'LAML', 'READ', 'LGG', 'DLBC', 'KICH', 'UCS', 'ACC', 'PCPG', 'UVM']
class_to_index = {name: index for index, name in enumerate(classes)}

# the class code of each sample is read from the first column of the label frame
raw_labels = label.iloc[:data_points].values[:, 0]

# fail early with a readable message instead of letting an unmapped label
# reach to_categorical
unknown = {name for name in raw_labels if name not in class_to_index}
if unknown:
  raise ValueError("labels not present in `classes`: %s" % sorted(map(str, unknown)))

# flat integer class ids, built in a new array so the loaded label frame is not modified
# (this replaces np.reshape(y, -1, 2), where the 2 landed in the `order` argument)
y = np.array([class_to_index[name] for name in raw_labels], dtype=np.int64)

# pin the width to len(classes) so there is always one column per class, even if
# the selected rows happen not to contain the last class
y = to_categorical(y, num_classes=len(classes))
print(y.shape)
print(y)

In [None]:
# preprocessing - replace every entry v of x by log2(v + 1)

x = np.log2(np.add(x, 1))
print(x)

In [None]:
# preprocessing - shrink the feature space with a variance threshold
# TODO: still needs checking

# per-column variance; columns whose variance is below 1.19 are removed
variance = x.var(axis=0)
print(variance.shape)

low_variance = variance < 1.19
ommit_index = np.where(low_variance)
print(ommit_index)

# keep every column that was not flagged above
x = x[:, ~low_variance]
print(x.shape)

In [None]:
# shuffle the data and label if necessary
# Open points noted by the author:
## same patient check
## concatenate x, y then shuffle
## check if diff generate each time
#
# Both shuffles happen in place. The generator state is captured before x is
# shuffled and restored before y is shuffled, so the second call starts from
# exactly the same generator state as the first.
# NOTE(review): this is meant to keep the rows of x and y paired; it relies on
# both arrays having the same length along the first axis - confirm.
# No seed is set in this cell.

rng_state = np.random.get_state()
np.random.shuffle(x)
np.random.set_state(rng_state)
np.random.shuffle(y)

In [None]:
# custom loss function - weighted categorical cross-entropy
## needs testing
"""
A weighted version of categorical_crossentropy for keras (2.0.6). This lets you apply a weight to unbalanced classes.
@url: https://gist.github.com/wassname/ce364fddfc8a025bfab4348cf5de852d
@author: wassname
"""
from keras import backend as K
def weighted_categorical_crossentropy(weights):
    """
    Build a categorical cross-entropy loss whose per-class terms are scaled by `weights`.

    Arguments:
        weights: one weight per class, e.g. a numpy array of shape (C,) for C classes

    Returns:
        A function loss(y_true, y_pred) that can be passed to model.compile.

    Example:
        weights = np.array([0.5,2,10]) # first class at 0.5, second at 2x, third at 10x
        model.compile(loss=weighted_categorical_crossentropy(weights), optimizer='adam')
    """
    class_weights = K.variable(weights)

    def loss(y_true, y_pred):
        # renormalise so the class probabilities of each sample sum to 1
        normalised = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # keep values inside [epsilon, 1 - epsilon] so the log stays finite
        clipped = K.clip(normalised, K.epsilon(), 1 - K.epsilon())
        # weighted log term per class, summed over the last axis and negated
        per_class = y_true * K.log(clipped) * class_weights
        return -K.sum(per_class, -1)

    return loss

In [None]:
# model definition
# Two identical hidden blocks (Dense + Dropout) followed by a softmax output
# with one unit per column of y.

hidden_units = 512
dropout_rate = 0.1
regularizer = regularizers.l1_l2(l1=0.0, l2=0.0)

model = Sequential()

# hidden block 1 - also declares the input width (number of columns of x)
model.add(Dense(hidden_units, input_shape=(x.shape[1],), activation='relu', kernel_regularizer=regularizer))
model.add(Dropout(dropout_rate))

# hidden block 2
model.add(Dense(hidden_units, activation='relu', kernel_regularizer=regularizer))
model.add(Dropout(dropout_rate))

# hidden block 3 - currently disabled
# model.add(Dense(hidden_units, activation='relu', kernel_regularizer=regularizer))
# model.add(Dropout(dropout_rate))

# output layer
model.add(Dense(y.shape[1], activation='softmax'))

In [None]:
# define learning process

# only `adam` is handed to compile below; `sgd` is created but not used in this cell
sgd = optimizers.SGD(lr=0.1, momentum=0.0, decay=0.0, nesterov=True)
adam = optimizers.Adam(lr=0.000001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

# one entry per class, each divided by 11069 (the entries add up to 11069)
# NOTE(review): presumably per-class sample counts in the order of `classes` - confirm.
# NOTE(review): if so, frequent classes receive larger loss weights than rare ones;
# confirm that this direction is intended.
class_counts = [174, 309, 576, 554, 550, 567, 427, 156, 196, 183, 323, 424, 310, 265, 1218, 122, 87, 495, 450, 474, 45, 606, 572, 566, 173, 171, 534, 48, 91, 57, 79, 187, 80]
weights = [count / 11069 for count in class_counts]

model.compile(
  loss=weighted_categorical_crossentropy(weights),
  optimizer=adam,
  metrics=['accuracy'],
)

In [None]:
# train model
# Fits on x / y with a validation split of 0.1, then stores the per-epoch
# history dictionary on the mounted Drive.

history = model.fit(
  x,
  y,
  validation_split=0.1,
  batch_size=256,
  epochs=300,
  verbose=1,
  shuffle=True,
)

with open('/content/drive/My Drive/main/2H002', 'wb') as history_file:
  pickle.dump(history.history, history_file)
print(history)

In [None]:
# K-fold validation
# Builds and trains a fresh network on every training split, evaluates it on the
# held-out split, and finally reports mean and standard deviation of the accuracy.

seed = 7
np.random.seed(seed)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# the loss weights are the same for every fold, so they are computed once up front
weights = [174, 309, 576, 554, 550, 567, 427, 156, 196, 183, 323, 424, 310, 265, 1218, 122, 87, 495, 450, 474, 45, 606, 572, 566, 173, 171, 534, 48, 91, 57, 79, 187, 80]
weights = [ weight/11069 for weight in weights]

cvscores = []
for train, test in kfold.split(x, y):

  # a new model per fold so that no weights carry over between folds
  regularizer = regularizers.l1_l2(l1=0.0000001, l2=0.00001)
  model = Sequential()
  model.add(Dense(units = 4096, input_shape = (x.shape[1],), activation = 'relu', kernel_regularizer = regularizer))
  model.add(Dropout(rate = 0.3, noise_shape=None, seed=None))
  # model.add(Dense(units = 1024, activation = 'relu', kernel_regularizer = regularizer))
  model.add(Dense(units = 2048, activation = 'relu', kernel_regularizer = regularizer))
  model.add(Dropout(rate = 0.3, noise_shape=None, seed=None))
  model.add(Dense(units = 512, activation = 'relu', kernel_regularizer = regularizer))
  model.add(Dropout(rate = 0.3, noise_shape=None, seed=None))
  # model.add(Dense(units = 4096, activation = 'relu'))
  model.add(Dense(units = y.shape[1], activation = 'softmax'))

  # a new optimizer per fold as well, so no optimizer state is shared
  adam  = optimizers.Adam(lr=0.00001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)
  model.compile(loss = weighted_categorical_crossentropy(weights), optimizer = adam, metrics=['accuracy'])

  history = model.fit(x[train], y[train], validation_split = 0.0, batch_size=256, epochs=100, verbose=1, shuffle=True)

  # NOTE(review): every fold writes to the same path (which is also the path the
  # training cell writes to), so only the history of the last fold survives.
  with open('/content/drive/My Drive/main/2H002', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

  # evaluation happens after the history file has been closed
  scores = model.evaluate(x[test], y[test], verbose=0)
  print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
  cvscores.append(scores[1] * 100)

# `numpy` is imported as `np` in this notebook
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))