In [1]:
#  Import libraries and define variables
import os
import shutil
import glob
from tqdm import tqdm
from PIL import Image
import nibabel as nib
import numpy as np
import pandas as pd
import pickle as pkl
import cv2
import random as rn
from multiprocessing import Pool,Process
import config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling2D, Dense, Flatten, Dropout, LeakyReLU, Activation, AveragePooling2D, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from keras.layers.convolutional import Conv2D
import tensorflow as tf
from tensorflow.keras.metrics import TruePositives,TrueNegatives,FalsePositives,FalseNegatives,AUC,Recall,Precision
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping
import time
import shutil
import sys
sys.path.append('..')


# Define the modalities and classifications
modalities = ['T1', 'T1GD', 'T2', 'FLAIR']
classifications = ['MGMT_positive', 'MGMT_negative']

# Define patch size and stride
block_h, block_w = config.PATCH_SIZE
stride = 2

# Interpolated image dimestions
inter_dim = (110, 90)

# Define epoch
epoch = 100
batch_size = 128

# Define paths to the BraTS dataset folders
path = config.MAIN_DIR

PATH = config.MAIN_DIR + 'Data_100_64x64/'
Org_Dir = PATH + 'Original_Data_Backup/'
Work_Dir = PATH + 'Working_Data/'

In [2]:
# Reading upenn_data.csv
pat_df=pd.read_csv('../upenn_data.csv')

# considering pateints id's where patches are True
pat_df=pat_df[pat_df['patches']==True]
pat_df.head()

Unnamed: 0.1,Unnamed: 0,id,mgmt,age,gender,survival,patches,patches64
0,0,UPENN-GBM-00022_11,0,53.88,F,1882,True,True
1,0,UPENN-GBM-00034_11,0,53.63,F,464,True,True
2,0,UPENN-GBM-00088_11,0,47.32,M,334,True,True
3,0,UPENN-GBM-00091_11,0,70.54,M,200,True,True
4,0,UPENN-GBM-00093_11,0,51.3,F,616,True,True


#### **Train Test Split**

In [3]:
# Splitting Data into train and test
train_data,test_data=train_test_split(pat_df[['id','mgmt']],
                                      stratify=pat_df['mgmt'],
                                      random_state=100,
                                      test_size=0.2
                                      )
print(f'Shape of train_data {train_data.shape}')
print(f'Shape of test_data {test_data.shape}')
train_data.head()

Shape of train_data (80, 2)
Shape of test_data (20, 2)


Unnamed: 0,id,mgmt
47,UPENN-GBM-00294_11,0
116,UPENN-GBM-00445_11,1
17,UPENN-GBM-00132_11,0
33,UPENN-GBM-00150_11,0
4,UPENN-GBM-00093_11,0


In [4]:
# Saving Train and test data
train_data.to_csv(config.MAIN_DIR+'results/train_data.csv',mode='w')
test_data.to_csv(config.MAIN_DIR+'results/test_data.csv',mode='w')

### **One Split Training**

In [2]:
# Loading the train npy
train_T2_pos_one =np.load('D:/MGMT research project/data for one split/Patch 64x64/T2/pos_one_split_train_data_grayscale.npy')
train_T2_neg_one =np.load('D:/MGMT research project/data for one split/Patch 64x64/T2/neg_one_split_train_data_grayscale.npy')
X_train = np.append(train_T2_pos_one,train_T2_neg_one,axis=0)/255.0             # Normalising the array
# y_train = [1]*40000 + [0]*40000
y_train = [1]*len(train_T2_pos_one) + [0]*len(train_T2_neg_one)
y_train = np.array(y_train)
del train_T2_neg_one,train_T2_pos_one
print("X_train and y_train was intialised")

# Loading the val npy
val_T2_pos_one =np.load('D:/MGMT research project/data for one split/Patch 64x64/T2/pos_one_split_val_data_grayscale.npy')
val_T2_neg_one =np.load('D:/MGMT research project/data for one split/Patch 64x64/T2/neg_one_split_val_data_grayscale.npy')
X_val = np.append(val_T2_pos_one,val_T2_neg_one,axis=0)/255.0                   # Normalising the array
# y_val = [1]*10000 + [0]*10000
y_val = [1]*len(val_T2_pos_one) + [0]*len(val_T2_neg_one)
y_val = np.array(y_val)
del val_T2_neg_one,val_T2_pos_one
print("X_val and y_val was intialised")

X_train and y_train was intialised
X_val and y_val was intialised


In [3]:
def model_training():
    # Model intialisation with GPU
    with tf.device('GPU:0'):
        model = Sequential()
        model.add(Conv2D(8, (3, 3), padding='same',
                         input_shape=(block_h,block_w,1)))
        # model.add(Conv2D(8, (3, 3), padding='same'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling2D(pool_size=(4, 4)))
        BatchNormalization()
        model.add(Dropout(0.5))

        model.add(Conv2D(8, (3, 3), padding='same'))
        # model.add(Conv2D(16, (3, 3),padding='same'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling2D(pool_size=(4,4)))
        BatchNormalization()
        model.add(Dropout(0.5))

        # model.add(Conv2D(48, (3, 3), padding='same'))
        # model.add(LeakyReLU(alpha=0.1))
        # model.add(MaxPooling2D(pool_size=(3, 3)))
        # model.add(Dropout(0.1))

        model.add(Flatten())  # Convert 3D feature map to 1D feature vector.

        model.add(Dense(64))
        model.add(LeakyReLU(alpha=0.1))
        BatchNormalization()
        model.add(Dropout(0.5))

        model.add(Dense(8))
        model.add(LeakyReLU(alpha=0.1))
        BatchNormalization()
        model.add(Dropout(0.3))
        
        model.add(Dense(1, activation='sigmoid'))

        # Compiling the model
        model.compile(loss='binary_crossentropy',
                    optimizer='adam', metrics=['accuracy',TruePositives(),TrueNegatives(),FalsePositives(),FalseNegatives(),AUC(),Recall(),
                                                Precision()])

    
    # print(y_train[:50])

    # Model Checkpoints
    # checkpoint_filepath = config.MAIN_DIR+f'results/model checkpoints/model(k={cv+1})'
    # model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath+'_epoch-{epoch:02d}_valloss-{val_loss:.4f}.ckpt',
    #                                             # save_format='h5',
    #                                             monitor='val_loss', 
    #                                             mode='min',
    #                                             save_best_only=True)
    # Reduce LRPlateau
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)

    # Early stoping
    early_stoping =EarlyStopping(monitor="val_loss",patience=5,mode="min") 

    # Model training
    history = model.fit(X_train, y_train, batch_size=64, epochs=100,
                        validation_data=(X_val, y_val), 
                        shuffle=True,
                        callbacks=[reduce_lr,early_stoping])
    
    return history

In [4]:
model = model_training()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


#### **Cross Validation**

In [17]:
# Cross Validation --> Splitting the data and saving the train and val indices of each cv in pickle in dictionary
def patient_val_split(df:pd.DataFrame,cv: int) -> dict:
    dict_cv={}
    skf=StratifiedKFold(n_splits=cv, random_state=100, shuffle=True)

    for i,(train_idx,val_idx) in enumerate(skf.split(df[['id']],df['mgmt'])):
        dict_cv[i+1]=[train_idx,val_idx]
    
    pkl.dump(dict_cv,open(config.MAIN_DIR+'results/cross_validation_indexes.pkl','wb'))
    print("Splitting indices is completed")
    return dict_cv
    

    


In [18]:
# Cross validating data and storing in pickle
cv_idx_dict = patient_val_split(df=train_data,cv=10)

Splitting indices is completed


In [19]:
x=91/8
print(int(x))

11


In [25]:
# Fubction definition --> All patients of training/validation were appended to one train list and val list 
# for x and y so that they be used in training
def making_all_arrays_to_list(idx: list,x: list,y: list):
    for row in tqdm(idx):
        if train_data['mgmt'].iloc[row]==0:
            label = 'MGMT_negative/'
        else: label = 'MGMT_positive/'
        pkl_file = pkl.load(open(config.MAIN_DIR+'preprocessed/64X64/'+label+train_data['id'].iloc[row],'rb'))
        for arr in pkl_file:
            x.append(arr)
            y.append(train_data['mgmt'].iloc[row])
    # x=np.array(x)
    # y=np.array(y)
def randomisation(list_large: list, list_small: list):
    indices = []
    temp, j = 0, 0
    for i in range(len(list_large)):
        indices.append(list_large[i])
        temp+=1

        if(temp == int(len(list_large)/len(list_small))) and j<len(list_small):
            # print(len(list_small))
            # print(j)
            indices.append(list_small[j])
            j+=1
            temp=0

        if len(indices) % batch_size == 0:
            seed_value = 1
            rn.seed(seed_value)
            iteration = int(len(indices) / batch_size)
            start_idx = (iteration-1)*batch_size
            rn.shuffle(indices[start_idx:start_idx+batch_size-1])
            # break

    return indices



def data_stratified_indices(l1:list):
    # Lenght list y
    total_samples = len(l1)
    idx_0, idx_1 = [], []

    for idx in range(total_samples):
        if l1[idx]==0: idx_0.append(idx)
        else: idx_1.append(idx)

    if len(idx_0) > len(idx_1): 
        return randomisation(list_large= idx_0,list_small= idx_1)
    else :
        return randomisation(list_large= idx_1,list_small= idx_0)

    


# Function definintion --> It calls making_all_arrays_to_list() for train and val
def data_arr(indexes: list):
    train_idx,val_idx = indexes
    train_x, train_y = [], []
    val_x, val_y = [], []

    # fetching Train Data 
    making_all_arrays_to_list(idx = train_idx,x = train_x,y = train_y)
    print("Training data was fetched")
    # fetching Val Data 
    making_all_arrays_to_list(idx = val_idx,x = val_x,y = val_y)
    print("Validation data was fetched")
    train_stratify_index = data_stratified_indices(l1= train_y)
    # val_stratify_index = data_stratified_indices(l1= val_y)
    print("stratify indices done")
    train_x = list(train_x[i] for i in train_stratify_index)
    train_y = list(train_y[i] for i in train_stratify_index)
    print('data_arr done')
    return train_x, train_y, val_x, val_y

In [21]:
gpu_device = tf.config.list_physical_devices('GPU')[0]
gpu_device

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

In [22]:
# Function Definition --> Make model checkpoint folder empty
def make_ckpt_folder_empty(folder_path):
    # Check if the folder exists
    if os.path.exists(folder_path):
        # List all files and directories in the folder
        contents = os.listdir(folder_path)
        
        # Check if the folder is empty
        if len(contents) == 0:
            print(f"The folder '{folder_path}' is already empty.")
        else:
            # Deletes the folder tree
            shutil.rmtree(folder_path)

            # Recreate the folder
            os.makedirs(folder_path)
            print(f"The folder '{folder_path}' has been emptied.")
    else:
        print(f"The folder '{folder_path}' does not exist.")




In [23]:
#  Funtion Defination --> train the model and stores the history
# import h5py
def model_training(data_idx: list):
    cv, cv_indexes = data_idx

    # Model intialisation with GPU
    with tf.device('GPU:0'):
        model = Sequential()

        model.add(Conv2D(32, (3, 3), padding='same',input_shape=(block_h,block_w,1)))
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling2D(pool_size=(3, 3)))
        model.add(Dropout(0.1))

        model.add(Conv2D(48, (3, 3), padding='same'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling2D(pool_size=(3, 3)))
        model.add(Dropout(0.1))

        model.add(Conv2D(48, (3, 3), padding='same'))
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling2D(pool_size=(3, 3)))
        model.add(Dropout(0.1))

        model.add(Flatten())  # Convert 3D feature map to 1D feature vector.

        model.add(Dense(1096))
        model.add(LeakyReLU(alpha=0.1))
        model.add(Dropout(0.1))
        model.add(Dense(1, activation='sigmoid'))

        # Compiling the model
        model.compile(loss='binary_crossentropy',
                    optimizer='adam', metrics=['accuracy',TruePositives(),TrueNegatives(),FalsePositives(),FalseNegatives(),AUC(),Recall(),
                                                Precision()])

    # Selecting the data from train_idx and test_idx
    X_train, y_train, X_val, y_val = data_arr(indexes = cv_indexes)
    print("all data fetched")
    X_train=np.array(X_train)
    y_train=np.array(y_train)
    X_val=np.array(X_val)
    y_val=np.array(y_val)
    # print(y_train[:50])

    # Model Checkpoints
    checkpoint_filepath = config.MAIN_DIR+f'results/model checkpoints/model(k={cv+1})'
    model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath+'_epoch-{epoch:02d}_valloss-{val_loss:.4f}.ckpt',
                                                # save_format='h5',
                                                monitor='val_loss', 
                                                mode='min',
                                                save_best_only=True)
    # Reduce LRPlateau
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=3, min_lr=0.0001)

    # Early stoping
    early_stoping =EarlyStopping(monitor="val_loss",patience=5,mode="min") 
    # Model training
    print(f"Model Training for {cv} was started...")
    
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epoch,
                        validation_data=(X_val, y_val,), shuffle=True,callbacks=[model_checkpoint_callback,reduce_lr,early_stoping])
    
    # Stores the history in pickle
    pkl.dump(history.history,open(config.MAIN_DIR+f'results/history/history_k={cv}.pkl','wb'))
    print(f"Model Training for cv-{cv} was completed....")



In [26]:
# Empting the model checkpoint folder before running new cross validation
make_ckpt_folder_empty(config.MAIN_DIR+'results/model checkpoints')

# Starts time
start=time.time()

# Cross validation starts here
for cv in cv_idx_dict:
    print(f"Cross Validation k={cv} was started....")
    model_training([cv,cv_idx_dict[cv]])

# End time
end=time.time()
print(f'Total Time taken by cross validation - {end-start}')


The folder 'D:/MGMT research project/results/model checkpoints' is already empty.
Cross Validation k=1 was started....


100%|██████████| 63/63 [02:21<00:00,  2.24s/it]


Training data was fetched


100%|██████████| 7/7 [00:17<00:00,  2.45s/it]


Validation data was fetched
stratify indices done
data_arr done
all data fetched
Model Training for 1 was started...
Epoch 1/100



INFO:tensorflow:Assets written to: D:/MGMT research project/results/model checkpoints\model(k=2)_epoch-01_valloss-1.9183.ckpt\assets


INFO:tensorflow:Assets written to: D:/MGMT research project/results/model checkpoints\model(k=2)_epoch-01_valloss-1.9183.ckpt\assets


Epoch 2/100



INFO:tensorflow:Assets written to: D:/MGMT research project/results/model checkpoints\model(k=2)_epoch-02_valloss-1.7778.ckpt\assets


INFO:tensorflow:Assets written to: D:/MGMT research project/results/model checkpoints\model(k=2)_epoch-02_valloss-1.7778.ckpt\assets


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Model Training for cv-1 was completed....
Cross Validation k=2 was started....


100%|██████████| 63/63 [02:28<00:00,  2.36s/it]


Training data was fetched


100%|██████████| 7/7 [06:42<00:00, 57.45s/it]


Validation data was fetched
stratify indices done
data_arr done
all data fetched
Model Training for 2 was started...
Epoch 1/100

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [None]:
# x=[1,2,3,4,53,2,1]
# rn.shuffle(x[0:5])
# x

In [None]:
# model = Sequential()

# model.add(Conv2D(32, (3, 3), padding='same',input_shape=(32,32,1)))
# model.add(LeakyReLU(alpha=0.1))
# model.add(MaxPooling2D(pool_size=(3, 3)))
# model.add(Dropout(0.1))

# model.add(Conv2D(48, (3, 3), padding='same'))
# model.add(LeakyReLU(alpha=0.1))
# model.add(MaxPooling2D(pool_size=(3, 3)))
# model.add(Dropout(0.1))

# model.add(Conv2D(48, (3, 3), padding='same'))
# model.add(LeakyReLU(alpha=0.1))
# model.add(MaxPooling2D(pool_size=(3, 3)))
# model.add(Dropout(0.1))

# model.add(Flatten())  # Convert 3D feature map to 1D feature vector.

# model.add(Dense(1096))
# model.add(LeakyReLU(alpha=0.1))
# model.add(Dropout(0.1))
# model.add(Dense(1, activation='sigmoid'))

In [None]:
# Load checkpoint
# checkpoint = tf.train.Checkpoint(model=model)
# ckpt_load_path = 'model(k=9)_epoch-06_valloss-0.0000.ckpt'
# checkpoint.restore(config.MAIN_DIR+'results/model checkpoints/'+ckpt_load_path)

# print(f"Model restored from {ckpt_load_path}")