In [1]:
#  Import libraries and define variables
import os
import shutil
import glob
from tqdm import tqdm
from PIL import Image
import nibabel as nib
import numpy as np
import pickle
import cv2
import pandas as pd
from multiprocessing import Process, current_process
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")  # Adds the parent directory to sys.path
import config 

# Define the modalities and classifications
# Full modality list kept for reference; only the active subset is assigned below.
# modalities = ['T1', 'T1GD', 'T2', 'FLAIR']
modalities = ['T1']
# Names of the two class folders (MGMT status) used throughout the notebook
classifications = ['MGMT_positive', 'MGMT_negative']

# Define patch size and stride
# config.PATCH_SIZE is unpacked as (height, width)
block_h, block_w = config.PATCH_SIZE
stride = 2

# Interpolated image dimensions
inter_dim = (110, 90)

# Define paths to the dataset folders
# NOTE: all paths below are built by plain string concatenation, so
# config.MAIN_DIR must already end with a path separator.
path = config.MAIN_DIR

Preprocess_Dir = path + 'Preprocessed/layers/'
PATH = path + 'Data/'
Org_Dir = PATH + 'Original_Data_Backup/'
Work_Dir = PATH + 'Working_Data/'


#### One Split

In [2]:
# Load the table of training patients; the 'id' and 'mgmt' columns are used for the split below.
train_data = pd.read_csv(f'{config.MAIN_DIR}results/train_data.csv')

In [3]:
# Splitting train into train and validation subsets for the single fixed split
# train1_data => train data fixed for one split
# val1_data   => validation data fixed for one split
# Stratifying on 'mgmt' keeps the class ratio the same in both subsets, and the
# fixed random_state makes the split reproducible across runs.
train1_data, val1_data = train_test_split(
    train_data[['id', 'mgmt']],
    stratify=train_data['mgmt'],
    random_state=100,
    test_size=0.2,
)
print(f'Shape of train_data {train1_data.shape}')
# The held-out 20% is the validation subset, so it is reported as such
# (the message previously called it "test_data").
print(f'Shape of val_data {val1_data.shape}')
train1_data.head()

Shape of train_data (64, 2)
Shape of test_data (16, 2)


Unnamed: 0,id,mgmt
41,UPENN-GBM-00494_11,1
10,UPENN-GBM-00034_11,0
11,UPENN-GBM-00124_11,1
12,UPENN-GBM-00312_11,0
71,UPENN-GBM-00442_11,1


In [None]:
# Function Definitions --> For reading images and storing all patches of a patient in a pickle file

# def patches_to_pickle(mgmt_type):
#     print('Reading Images')
#     workdir = os.listdir(Work_Dir)
#     # for mgmt_type in workdir:
#     for patient in os.listdir(Work_Dir + mgmt_type + '/'):
#         patient_patches = []
#         for patch in tqdm(os.listdir(Work_Dir + mgmt_type + '/' + patient + '/')):
#             try:
#                 img_array = cv2.imread(os.path.join(Work_Dir, mgmt_type+'/'+patient+'/'+patch), cv2.IMREAD_GRAYSCALE)
#                 patient_patches.append(img_array)
#             except Exception as e:
#                 print(e)
#         print(patient + " ✔")
#         pickle.dump(patient_patches, open(Preprocess_Dir + mgmt_type + '/' + patient, 'wb'))
#     print(mgmt_type + " ✔✔")
        


In [4]:
from multiprocessing import Pool
import preprocess_worker

# Collect the patch paths of the selected patients and convert them to .npy arrays
def img_to_npy(df: pd.DataFrame, selection: str, type: str,
               out_dir: str = 'D:/MGMT research project/data for one split/Patch 64x64',
               num_processors: int = 15):
    """Convert the selected patch files of the patients in ``df`` into two .npy files.

    For every row of ``df`` the patient folder ``<class>/<id>`` (relative to
    ``Work_Dir``) is listed and every file whose name contains ``selection`` is
    queued. Each queued path is handed to ``preprocess_worker.worker`` in a
    process pool; the results are consumed as ``(array, path)`` pairs and
    grouped by the class folder name found in the path.

    Args:
        df: Table with an ``id`` column (patient folder name) and an ``mgmt``
            column (``0`` selects ``MGMT_negative``, anything else
            ``MGMT_positive``).
        selection: Substring a file name must contain to be included.
        type: Tag inserted into the output file names (e.g. ``'train'`` or
            ``'val'``). The name shadows the builtin but is kept because
            callers pass it by keyword.
        out_dir: Root directory of the output; files are written to
            ``<out_dir>/<selection>/``. The default is the machine-specific
            location this notebook originally used.
        num_processors: Number of worker processes in the pool.

    Side effects:
        Changes the process working directory to ``Work_Dir`` (the queued paths
        are relative to it) and does not restore it. Writes
        ``pos_one_split_<type>_data.npy`` and ``neg_one_split_<type>_data.npy``.

    Note:
        A later cell of this notebook defines another function with the same
        name; running that cell replaces this definition.
    """
    os.chdir(Work_Dir)

    patient_paths = []
    for patient_id, mgmt in tqdm(zip(df['id'], df['mgmt']), total=len(df)):
        cls = 'MGMT_negative' if mgmt == 0 else 'MGMT_positive'
        patient_path = cls + '/' + patient_id
        patient_paths.extend(
            './' + patient_path + '/' + name
            for name in os.listdir('./' + patient_path)
            if selection in name
        )

    # Report the modality that was actually requested instead of a hard-coded "T2".
    print(f"{selection} patient paths are selected", len(patient_paths))

    # The pool is only started when this runs as the main module (as in a notebook kernel).
    if __name__ != '__main__':
        return

    # The context manager shuts the worker processes down once map() has returned,
    # so repeated calls no longer leave idle pools behind.
    # NOTE: the tqdm bar only tracks building the argument list, not worker progress.
    with Pool(processes=num_processors) as pool:
        res = pool.map(preprocess_worker.worker, list(tqdm(patient_paths)))

    pos_arr = []
    neg_arr = []
    for arr, path in res:
        if 'MGMT_positive' in path:
            pos_arr.append(arr)
        elif 'MGMT_neg' in path:
            neg_arr.append(arr)

    # Create the target folder up front so a long run cannot fail at the very last step.
    store_dir = f'{out_dir}/{selection}'
    os.makedirs(store_dir, exist_ok=True)
    np.save(f'{store_dir}/pos_one_split_{type}_data.npy', pos_arr)
    np.save(f'{store_dir}/neg_one_split_{type}_data.npy', neg_arr)


In [5]:
# Converting train patients in npy
img_to_npy(train1_data,selection='T2',type='train')

# Converting val patients in npy
img_to_npy(val1_data,selection='T2',type='val')

  0%|          | 0/64 [00:00<?, ?it/s]

100%|██████████| 64/64 [02:37<00:00,  2.47s/it]


T2 Pateints paths are selected 1176672


100%|██████████| 1176672/1176672 [00:00<00:00, 2463376.23it/s]
100%|██████████| 16/16 [00:42<00:00,  2.64s/it]


T2 Pateints paths are selected 326592


100%|██████████| 326592/326592 [00:00<00:00, 1899984.51it/s]


In [None]:
# Storing the selected patches of train1_data / val1_data as one .npy file per class
def img_to_npy(df: pd.DataFrame, selection: str, type: str,
               out_dir: str = 'D:/MGMT research project/data for one split/Patch 64x64'):
    """Read the selected patch images of every patient and save them per class.

    For each row of ``df`` the folder ``<class>/<id>`` (relative to ``Work_Dir``)
    is listed, every file whose name contains ``selection`` is read with
    ``cv2.imread`` and collected under the patient's class. After all patients
    are processed, one stacked array per class is written to
    ``<out_dir>/<selection>/<class>_one_split_<type>_data.npy``.

    Previously a single array accumulated images across all patients and was
    re-saved after every patient under that patient's class name, so each class
    file ended up containing images of both classes. Images are now kept
    separate per class and each file is written once.

    Args:
        df: Table with an ``id`` column (patient folder name) and an ``mgmt``
            column (``1`` selects ``MGMT_positive``, anything else
            ``MGMT_negative``).
        selection: Substring a file name must contain to be included.
        type: Tag inserted into the output file names. The name shadows the
            builtin but is kept because callers pass it by keyword.
        out_dir: Root directory of the output. The default is the
            machine-specific location this notebook originally used.

    Side effects:
        Changes the process working directory to ``Work_Dir`` and does not
        restore it. Writes one .npy file for every class present in ``df``.

    Note:
        This definition has the same name as the pool-based function in an
        earlier cell and replaces it when this cell is run.
    """
    os.chdir(Work_Dir)

    images_by_class = {}
    for patient_id, mgmt in tqdm(zip(df['id'], df['mgmt']), total=df.shape[0]):
        cls = 'MGMT_positive' if mgmt == 1 else 'MGMT_negative'
        class_images = images_by_class.setdefault(cls, [])
        patient_path = os.path.join(cls, patient_id)
        for img in os.listdir(patient_path):
            if selection not in img:
                continue
            img_path = f'./{cls}/{patient_id}/{img}'
            x = cv2.imread(img_path)
            # cv2.imread signals an unreadable file by returning None, not by raising.
            if x is None:
                print(f'Could not read image, skipping: {img_path}')
                continue
            class_images.append(x)

    # Storing the data in npy format, once per class. Collecting into lists and
    # stacking once avoids re-copying the whole array for every image.
    store_dir = f'{out_dir}/{selection}'
    os.makedirs(store_dir, exist_ok=True)
    for cls, class_images in images_by_class.items():
        storing_path = f'{store_dir}/{cls}_one_split_{type}_data.npy'
        res = np.stack(class_images) if class_images else np.array([])
        np.save(storing_path, res)

#### image to array pickle, split during train

In [None]:
from multiprocessing import Pool
import preprocess_worker

# Store the paths of patients whose patches have been made (flagged in the `patches64` column)
df = pd.read_csv(path + "Codes/upenn_data.csv")
patient_paths = []
for patient_id, mgmt, has_patches in tqdm(zip(df['id'], df['mgmt'], df['patches64']), total=len(df)):
    # Explicit comparison on purpose: a missing (NaN) flag is truthy but must not count as True.
    if has_patches == True:  # noqa: E712
        patient_path = ('MGMT_negative' if mgmt == 0 else 'MGMT_positive') + '/' + patient_id
        patient_paths.append(patient_path)

print(len(patient_paths))

if __name__ == '__main__':
    num_processors = 15
    # The context manager shuts the worker processes down once map() has returned,
    # so re-running the cell does not leave idle pools behind in the kernel.
    # NOTE: the tqdm bar only tracks building the argument list, not worker progress.
    with Pool(processes=num_processors) as p:
        p.map(preprocess_worker.worker, list(tqdm(patient_paths)))

In [None]:
# Function Definitions --> For reading images and appending them to a list

def read_image(data):
    """Read every image of every pool folder in ``Work_Dir`` and append it to ``data``.

    Entries of ``Work_Dir`` named exactly like one of ``classifications`` and a
    ``.DS_Store`` entry are skipped; every remaining entry is treated as a
    folder of images. Each image is read as grayscale and appended to ``data``
    as ``[image_array, class_num]``.

    The label is derived from the folder name (``0`` when it contains
    ``'MGMT_negative'``, otherwise ``1``), the same rule ``create_dataframe``
    uses. Previously the first folder returned by ``os.listdir`` got label 0 and
    every later one label 1, which depended on the arbitrary listing order.

    Args:
        data: List that receives the ``[image_array, class_num]`` entries; it
            is modified in place.
    """
    print('Reading Images')
    # Sorted so folders are processed in a reproducible order.
    workdir = sorted(os.listdir(Work_Dir))
    if '.DS_Store' in workdir:
        workdir.remove('.DS_Store')
        print('Removed .DS_Store')
    for classi in classifications:
        if classi in workdir:
            workdir.remove(classi)
    for pool in workdir:
        class_num = 0 if 'MGMT_negative' in pool else 1
        pool_dir = Work_Dir + pool + '/'
        pool_dir_list = os.listdir(pool_dir)
        if '.DS_Store' in pool_dir_list:
            pool_dir_list.remove('.DS_Store')
            print('Removed .DS_Store')
        for img in tqdm(pool_dir_list):
            img_path = os.path.join(pool_dir, img)
            try:
                img_array = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            except Exception as e:
                print(e)
                continue
            # cv2.imread reports an unreadable file by returning None rather than
            # raising, so it has to be checked explicitly to keep None out of the data.
            if img_array is None:
                print(f'Could not read image, skipping: {img_path}')
                continue
            # Saving images in the list
            data.append([img_array, class_num])
        


In [None]:
# Function Definitions --> Split the processed (image, label) pairs into the lists X & Y

def Initializing_feature_labels(data, X, Y):
    """Append the first element of every pair in ``data`` to ``X`` and the second to ``Y``.

    Args:
        data: Iterable of two-element entries ``(features, label)``.
        X: List extended in place with the features.
        Y: List extended in place with the labels.
    """
    print('Initializing Features & Labels')
    for sample, target in data:
        X.append(sample)
        Y.append(target)
    # Same text as print('List Size: ', a, b): the separator adds the second space.
    print(f'List Size:  {len(X)} {len(Y)}')

In [None]:
# Function Definition --> Reshape the list to numpy array

def Converting(block_h, block_w, X, Y):
    """Convert the feature and label lists to numpy arrays stored in globals ``x`` and ``y``.

    Args:
        block_h: Patch height used for the reshape.
        block_w: Patch width used for the reshape.
        X: Features; converted to an array and reshaped to
            ``(-1, block_h, block_w, 1)``.
        Y: Labels; converted to an array as-is.

    Side effects:
        Assigns the module-level names ``x`` (reshaped features) and ``y``
        (labels). Nothing is returned.
    """
    print('Converting to Array')
    global x, y

    # -1 lets numpy infer the number of patches, which avoids a dimension
    # mismatch while converting the list to an array.
    x = np.array(X).reshape((-1, block_h, block_w, 1))
    y = np.array(Y)

    # Report the size of the reshaped array `x`, not of the input list `X`.
    print('Array Size with Reshape: ', len(x), len(y))
    print('Array Shape with Reshape: ', x.shape, y.shape)


In [None]:
# Main cell to execute all the functions

# # Creating list for storing processed data
# data = []

# # Reading Images
# read_image(data) 

# #  Printing the length of the data
# print('Size of the data: ', len(data))

# # Initializing all features & labels of the processed image in the list X & Y
# X = []
# Y = []

# # Initializing the features and labels
# Initializing_feature_labels(data, X, Y)

# # Converting the list into numpy array
# Converting(block_h, block_w, X, Y)

# # Storing the numpy array in a pickle file
# Storing_Preprocessed_Data = open(Work_Dir + 'X.pickle', 'wb')
# pickle.dump(X, Storing_Preprocessed_Data)
# Storing_Preprocessed_Data.close()

# Storing_Preprocessed_Data = open(Work_Dir + 'y.pickle', 'wb')
# pickle.dump(y, Storing_Preprocessed_Data)
# Storing_Preprocessed_Data.close()


# `patches_to_pickle` only exists when its (currently commented-out) definition in an
# earlier cell is restored, and that definition takes the class folder name as
# `mgmt_type`. The bare call raised NameError on a fresh kernel, so run it once per
# class when it is available and skip it otherwise.
if 'patches_to_pickle' in globals():
    for mgmt_type in classifications:
        patches_to_pickle(mgmt_type)
else:
    print('patches_to_pickle is not defined (its cell is commented out); skipping.')


In [None]:
# Trying dataframe in place of imgarray

def create_dataframe():
    """Build a table of image paths and labels and write it to ``step2_file_paths.csv``.

    Every folder in ``Work_Dir`` that is not named exactly like one of
    ``classifications`` is treated as a pool of images. Each file in a pool is
    recorded with label ``0`` when the pool name contains ``'MGMT_negative'``
    and ``1`` otherwise. Both columns are stored as strings.

    Non-directory entries of ``Work_Dir`` and ``.DS_Store`` files inside a pool
    are skipped, matching how ``read_image`` treats ``.DS_Store``.

    Returns:
        The DataFrame with the columns ``images`` and ``labels``. It is also
        written to ``step2_file_paths.csv`` in the current working directory.
    """
    pools = [
        name for name in sorted(os.listdir(Work_Dir))
        if name not in classifications and os.path.isdir(Work_Dir + name)
    ]

    image = []
    label = []
    for pool_modality in pools:
        pool_label = 0 if 'MGMT_negative' in pool_modality else 1
        for img in tqdm(os.listdir(Work_Dir + pool_modality + '/')):
            if img == '.DS_Store':
                continue
            image.append(Work_Dir + pool_modality + '/' + img)
            label.append(pool_label)

    df = pd.DataFrame()
    df['images'] = [str(x) for x in image]
    df['labels'] = [str(x) for x in label]
    # df = df.sample(frac=1, random_state=1).reset_index(drop=True)
    df.to_csv('step2_file_paths.csv')
    # Return the frame so the caller can inspect it; the former trailing
    # `df.head()` was discarded and had no effect inside the function.
    return df

In [None]:
# create_dataframe()

#### Layer wise interpolated images Preprocessing/layers

In [2]:
from multiprocessing import Pool
import preprocess_worker

# Collect every <class folder>/<modality>/<patient> path below Work_Dir
patient_paths = []

# The loop variable is named `mgmt_type` rather than `type`: at cell level `type`
# would rebind the builtin for the rest of the notebook session.
for mgmt_type in tqdm(os.listdir(Work_Dir)):
    for mod in modalities:
        for patient in os.listdir(Work_Dir + mgmt_type + '/' + mod + '/'):
            patient_path = mgmt_type + '/' + mod + '/' + patient
            patient_paths.append(patient_path)

print(len(patient_paths))

if __name__ == '__main__':
    num_processors = 15
    # The context manager shuts the worker processes down once map() has returned,
    # so re-running the cell does not leave idle pools behind in the kernel.
    # NOTE: the tqdm bar only tracks building the argument list, not worker progress.
    with Pool(processes=num_processors) as p:
        p.map(preprocess_worker.worker, list(tqdm(patient_paths)))

100%|██████████| 2/2 [00:00<00:00, 2009.73it/s]




149


100%|██████████| 149/149 [00:00<?, ?it/s]
