
1.	ADI: adipose tissue, consists of adipocytes
2.	BACK: background of histopathological images
3.	DEB: debris, usefull for diagnosis of cancer
4.	LYM: lymphocytes, cells of lymphatic system
5.	MUC: mucus, protective layer on tissue
6.	MUS: smooth muscle
7.	NORM: normal tissue of colon
8.	STR: stroma tissue associated with cancer
9.	TUM: epithelium tissues of adenocarcinoma


credit: based on the [nct/crc notebook](https://www.kaggle.com/code/hosseindaqiqi/diagnosing-colon-cancer-using-transfer-learning?scriptVersionId=92363738) by [mayson](https://www.kaggle.com/hosseindaqiqi)

In [1]:
# Imports
import os, warnings, cv2
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import tensorflow as tf
import pandas as pd

from os import listdir
from os.path import isfile, join
from shutil import rmtree

from tensorflow import keras
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.callbacks import ModelCheckpoint,  ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import make_classification



In [2]:
#sanity check tf
print(tf.__version__)
#sanity check gpus on system
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print(gpu)

2.12.0
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
#Load the directory
main_path = '../input/nct-crc-he-100k/NCT-CRC-HE-100K'
sub_dir =os.listdir(main_path)

In [4]:
#Creat The DataFrame
Data_df=pd.DataFrame({'ID': [], 'Class':[]})

for subdir in sub_dir:
    mypath = join(main_path,subdir)
    files=[f for f in listdir(mypath)]
    classes=[subdir for c in files]
    files_df = pd.DataFrame({'ID':files, 'Class':classes})
    Data_df = pd.concat([Data_df, files_df])

Data_df['Old_class'] = Data_df['Class']
#merge classes for our setting
Data_df['Class'] = Data_df['Class'].apply(lambda x: 1 if x in ['TUM'] else 0)
Data_df

Unnamed: 0,ID,Class,Old_class
0,MUC-GQLYEALK.tif,0,MUC
1,MUC-FYTGWFGD.tif,0,MUC
2,MUC-AVNDTFWS.tif,0,MUC
3,MUC-DQPFGFLS.tif,0,MUC
4,MUC-FNEDYLHG.tif,0,MUC
...,...,...,...
10441,STR-NQFCFNPH.tif,0,STR
10442,STR-KPWLGGCI.tif,0,STR
10443,STR-FFCLCFPE.tif,0,STR
10444,STR-HGPELQKY.tif,0,STR


In [5]:
# Distribution of the Classes
class_of_patch = Data_df.Class.value_counts()
print('Distribution of classes is as followed:\n{}'.format(class_of_patch[::-1]),'\n','\n')

Distribution of classes is as followed:
Class
1    14317
0    85683
Name: count, dtype: int64 
 



# split data 

In [6]:
# split validation and training data
y_1 = Data_df['Class']

train_test_df, val_df = train_test_split(Data_df,train_size = 0.85,random_state=101,shuffle=True,stratify=y_1 ) 

y_2 = train_test_df['Class']

train_df, test_df = train_test_split(train_test_df,train_size=0.823529,random_state=101,shuffle=True,stratify=y_2)

In [7]:
#check the distribution of data in training and validation sets
class_of_train_patch = train_df.Class.value_counts()
class_of_val_patch = val_df.Class.value_counts()
class_of_test_patch = test_df.Class.value_counts()

print('Class Distribution of Training Examples:\n{}'.format(class_of_train_patch),'\n','\n','\n','\n')
print('Class Distribution of Validation Examples:\n{}'.format(class_of_val_patch),'\n','\n','\n','\n')
print('Class Distribution of Test Examples:\n{}'.format(class_of_test_patch),'\n','\n','\n','\n')

Class Distribution of Training Examples:
Class
0    59978
1    10021
Name: count, dtype: int64 
 
 
 

Class Distribution of Validation Examples:
Class
0    12852
1     2148
Name: count, dtype: int64 
 
 
 

Class Distribution of Test Examples:
Class
0    12853
1     2148
Name: count, dtype: int64 
 
 
 



## write splits to local filesystem

In [8]:
"""
main_dir = 'SplittedData'
os.mkdir(main_dir)

train_dir = join(main_dir,'Training')
os.mkdir(train_dir)

val_dir = join(main_dir,'Validation')
os.mkdir(val_dir)

test_dir = join(main_dir,'Test')
os.mkdir(test_dir)


for subdir in ["0", "1"]:
    train_sub_dir = join(train_dir,subdir)
    os.mkdir(train_sub_dir)
    
    val_sub_dir = join(val_dir,subdir)
    os.mkdir(val_sub_dir)
    
    test_sub_dir = join(test_dir,subdir)
    os.mkdir(test_sub_dir)
"""

'\nmain_dir = \'SplittedData\'\nos.mkdir(main_dir)\n\ntrain_dir = join(main_dir,\'Training\')\nos.mkdir(train_dir)\n\nval_dir = join(main_dir,\'Validation\')\nos.mkdir(val_dir)\n\ntest_dir = join(main_dir,\'Test\')\nos.mkdir(test_dir)\n\n\nfor subdir in ["0", "1"]:\n    train_sub_dir = join(train_dir,subdir)\n    os.mkdir(train_sub_dir)\n    \n    val_sub_dir = join(val_dir,subdir)\n    os.mkdir(val_sub_dir)\n    \n    test_sub_dir = join(test_dir,subdir)\n    os.mkdir(test_sub_dir)\n'

In [9]:
main_dir = 'SplittedData'

train_dir = join(main_dir,'Training')

val_dir = join(main_dir,'Validation')

test_dir = join(main_dir,'Test')


In [10]:
"""
Data_df.set_index('ID', inplace=True)

for img in train_df['ID']:
    label_old = Data_df.loc[img,'Old_class']
    label_new = Data_df.loc[img,'Class']
    source = join(main_path,label_old,img)
    dest = join(train_dir,str(label_new),img)
    
    cv2_img = cv2.imread(source)
    cv2.imwrite(dest,cv2_img)

    
for img in val_df['ID']:
    label_old = Data_df.loc[img,'Old_class']
    label_new = Data_df.loc[img,'Class']
    source = join(main_path,label_old,img)
    dest = join(val_dir,str(label_new),img)
    
    cv2_img = cv2.imread(source)
    cv2.imwrite(dest,cv2_img)

    
for img in test_df['ID']:
    label_old = Data_df.loc[img,'Old_class']
    label_new = Data_df.loc[img,'Class']
    source = join(main_path,label_old,img)
    dest = join(test_dir,str(label_new),img)
    
    cv2_img = cv2.imread(source)
    cv2.imwrite(dest,cv2_img)
"""

"\nData_df.set_index('ID', inplace=True)\n\nfor img in train_df['ID']:\n    label_old = Data_df.loc[img,'Old_class']\n    label_new = Data_df.loc[img,'Class']\n    source = join(main_path,label_old,img)\n    dest = join(train_dir,str(label_new),img)\n    \n    cv2_img = cv2.imread(source)\n    cv2.imwrite(dest,cv2_img)\n\n    \nfor img in val_df['ID']:\n    label_old = Data_df.loc[img,'Old_class']\n    label_new = Data_df.loc[img,'Class']\n    source = join(main_path,label_old,img)\n    dest = join(val_dir,str(label_new),img)\n    \n    cv2_img = cv2.imread(source)\n    cv2.imwrite(dest,cv2_img)\n\n    \nfor img in test_df['ID']:\n    label_old = Data_df.loc[img,'Old_class']\n    label_new = Data_df.loc[img,'Class']\n    source = join(main_path,label_old,img)\n    dest = join(test_dir,str(label_new),img)\n    \n    cv2_img = cv2.imread(source)\n    cv2.imwrite(dest,cv2_img)\n"

# data loaders using keras flow from dir

In [11]:
img_size = 224
img_channel=3
input_shape=(img_size,img_size,img_channel)
BATCH_size =1024
learning_rate = 3E-4
train_step = (len(train_df)/BATCH_size)
val_step = (len(val_df)/BATCH_size)

#datagen = ImageDataGenerator(rescale=1.0/255)
datagen = ImageDataGenerator()

#turn shuffle off for train
train_ds= datagen.flow_from_directory(train_dir,
                                     target_size=(img_size,img_size),
                                     batch_size=BATCH_size,
                                     shuffle = False,
                                     seed = 101,
                                     interpolation = 'nearest',
                                     class_mode='binary')
val_ds = datagen.flow_from_directory(val_dir,
                                     target_size=(img_size,img_size),
                                     batch_size=BATCH_size,
                                     shuffle = False,
                                     interpolation = 'nearest',
                                     class_mode='binary')

test_ds = datagen.flow_from_directory(test_dir,
                                     target_size=(img_size,img_size),
                                     batch_size=BATCH_size,
                                     shuffle = False,
                                     interpolation = 'nearest',
                                     class_mode='binary')

print(train_ds.class_indices)

FileNotFoundError: [Errno 2] No such file or directory: 'SplittedData/Training'

# model eval

In [None]:
#get kai's model from gh
#!git clone https://github.com/luisoala/kai-class.git

In [None]:
#load model
real_model1 = tf.saved_model.load('kai-class/real/real/seed1/network')
real_model2 = tf.saved_model.load('kai-class/real/real/seed2/network')
real_model3 = tf.saved_model.load('kai-class/real/real/seed0/network')
syn_model1 = tf.saved_model.load('kai-class/fake/fake/seed1/network')
syn_model2 = tf.saved_model.load('kai-class/fake/fake/seed2/network')
syn_model3 = tf.saved_model.load('kai-class/fake/fake/seed0/network')
aug_model1 = tf.saved_model.load('kai-class/augmented/augmented/seed1/network')
aug_model2 = tf.saved_model.load('kai-class/augmented/augmented/seed2/network')
aug_model3 = tf.saved_model.load('kai-class/augmented/augmented/seed0/network')

rm_pred1 = real_model1.signatures["pred_fn"]
rm_pred2 = real_model2.signatures["pred_fn"]
rm_pred3 = real_model3.signatures["pred_fn"]
syn_pred1 = syn_model1.signatures["pred_fn"]
syn_pred2 = syn_model2.signatures["pred_fn"]
syn_pred3 = syn_model3.signatures["pred_fn"]
aug_pred1 = aug_model1.signatures["pred_fn"]
aug_pred2 = aug_model2.signatures["pred_fn"]
aug_pred3 = aug_model3.signatures["pred_fn"]

In [None]:
#get keys for pred
#print(list(real_model.signatures.keys()))
#print(list(syn_model.signatures.keys()))

In [None]:
"""
rm_pred = real_model.signatures["pred_fn"]
syn_pred = syn_model.signatures["pred_fn"]
aug_pred = aug_model.signatures["pred_fn"]
"""

In [None]:
def get_preds(model, ds, steps):
    all_predictions = []

    # iterate over all batches in val_ds
    for i in range(int(steps)+1):
        images, labels = ds.next()
        batch_predictions = model(tf.constant(images))['output_0'].numpy()
        all_predictions.append(batch_predictions)

    # concatenate all batch predictions
    all_predictions = np.concatenate(all_predictions, axis=0)
    
    return all_predictions


In [None]:
import sklearn

In [None]:
models = [rm_pred1, rm_pred2, rm_pred3, syn_pred1, syn_pred2, syn_pred3, aug_pred1, aug_pred2, aug_pred3]
#models = [aug_pred2, aug_pred3]
splits = [train_ds, val_ds, test_ds]
stepss = [train_step, val_step, val_step]
#model loop
for model in models:
    #print(model)
    #data loop
    for data, steps in zip(splits, stepss):
        #print(data)
        y_true = data.classes
        #print(y_true)
        preds = get_preds(model, data, steps)
        y_pred=np.argmax(preds,axis=1)
        #print(y_pred)   
        
        ba = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
        print(ba)

# crc data

In [None]:
#Load the directory
main_path = '../input/crc-val-he-7k/CRC-VAL-HE-7K'
sub_dir =os.listdir(main_path)

In [None]:
#Creat The DataFrame
Data_df=pd.DataFrame({'ID': [], 'Class':[]})

for subdir in sub_dir:
    mypath = join(main_path,subdir)
    files=[f for f in listdir(mypath)]
    classes=[subdir for c in files]
    files_df = pd.DataFrame({'ID':files, 'Class':classes})
    Data_df = pd.concat([Data_df, files_df])

Data_df['Old_class'] = Data_df['Class']
#merge classes for our setting
Data_df['Class'] = Data_df['Class'].apply(lambda x: 1 if x in ['TUM'] else 0)
Data_df

In [None]:
print(Data_df.columns.tolist())

In [None]:
# Distribution of the Classes
class_of_patch = Data_df.Class.value_counts()
print('Distribution of classes is as followed:\n{}'.format(class_of_patch[::-1]),'\n','\n')

In [None]:
"""
main_dir = 'crc'
os.mkdir(main_dir)

test_dir = join(main_dir,'Test')
os.mkdir(test_dir)


for subdir in ["0", "1"]:
    test_sub_dir = join(test_dir,subdir)
    os.mkdir(test_sub_dir)
"""

In [None]:
main_dir = 'crc'

test_dir = join(main_dir,'Test')

In [None]:

test_df = Data_df.copy(deep=True)
print(test_df)

Data_df.set_index('ID', inplace=True)

for img in test_df['ID']:
    label_old = Data_df.loc[img,'Old_class']
    #print(label_old)
    label_new = Data_df.loc[img,'Class']
    source = join(main_path,label_old,img)
    dest = join(test_dir,str(label_new),img)
    
    cv2_img = cv2.imread(source)
    cv2.imwrite(dest,cv2_img)


In [None]:
crc_ds = datagen.flow_from_directory(test_dir, target_size= (224,224),batch_size = BATCH_size, 
                                       shuffle = False , class_mode='binary' )

In [None]:
import sklearn

In [None]:
models = models = [rm_pred1, rm_pred2, rm_pred3, syn_pred1, syn_pred2, syn_pred3, aug_pred1, aug_pred2, aug_pred3]
splits = [crc_ds]
stepss = [7180/BATCH_size]
#model loop
for model in models:
    #print(model)
    #data loop
    for data, steps in zip(splits, stepss):
        #print(data)
        y_true = data.classes
        #print(len(y_true))
        preds = get_preds(model, data, steps)
        y_pred=np.argmax(preds,axis=1)
        #print(len(y_pred))
        ba = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
        print(ba)