# Breast cancer detection using `Keras Applications` 

*    on the [BreaKHis](https://web.inf.ufpr.br/vri/databases/breast-cancer-histopathological-database-breakhis) dataset

## import libraries and connect to drive

In [None]:
!pip install catboost

import numpy as np
import tensorflow as tf
import time
import glob
import os
import random
import matplotlib.pyplot as plt
from IPython.display import HTML, display, clear_output
from tensorflow.keras.preprocessing import image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_curve
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras.applications as k_apps
# Show the installed TensorFlow version string. `tf.version` is a module,
# so printing it only shows the module repr instead of the version number.
print(tf.__version__)

# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


# Target image size, used both as `target_size` when loading images and as
# the spatial part of the model `input_shape`.
size = (460, 700)

# Folder (relative to the current directory) that holds the images and the
# cached feature files.
working_dir = 'drive/MyDrive/article-97'

# Registry of the supported Keras applications, keyed by display name.
# Every entry maps to a dict with:
#   'model'        : the model constructor
#   'preprocessor' : the matching `preprocess_input` function
#   'activation'   : a layer name, or '' when none is given
#                    (NOTE(review): not read anywhere in the visible code)
keras_apps = {
    app_name: {'model': builder, 'preprocessor': preprocessor, 'activation': layer_name}
    for app_name, builder, preprocessor, layer_name in (
        ('VGG16', k_apps.vgg16.VGG16, k_apps.vgg16.preprocess_input, ''),
        ('VGG19', k_apps.vgg19.VGG19, k_apps.vgg19.preprocess_input, ''),

        ('ResNet50', k_apps.resnet50.ResNet50, k_apps.resnet.preprocess_input, ''),
        ('ResNet101', k_apps.resnet.ResNet101, k_apps.resnet.preprocess_input, ''),
        ('ResNet152', k_apps.resnet.ResNet152, k_apps.resnet.preprocess_input, ''),
        ('ResNet50V2', k_apps.resnet_v2.ResNet50V2, k_apps.resnet_v2.preprocess_input, ''),
        ('ResNet101V2', k_apps.resnet_v2.ResNet101V2, k_apps.resnet_v2.preprocess_input, ''),
        ('ResNet152V2', k_apps.resnet_v2.ResNet152V2, k_apps.resnet_v2.preprocess_input, ''),

        ('InceptionV3', k_apps.inception_v3.InceptionV3, k_apps.inception_v3.preprocess_input, ''),
        ('InceptionResNetV2', k_apps.inception_resnet_v2.InceptionResNetV2, k_apps.inception_resnet_v2.preprocess_input, 'conv_7b_ac'),

        ('DenseNet121', k_apps.densenet.DenseNet121, k_apps.densenet.preprocess_input, ''),
        ('DenseNet169', k_apps.densenet.DenseNet169, k_apps.densenet.preprocess_input, ''),
        ('DenseNet201', k_apps.densenet.DenseNet201, k_apps.densenet.preprocess_input, ''),

        ('Xception', k_apps.xception.Xception, k_apps.xception.preprocess_input, 'block14_sepconv2_act'),
        ('NASNetLarge', k_apps.nasnet.NASNetLarge, k_apps.nasnet.preprocess_input, ''),

        ('EfficientNetV2M', k_apps.efficientnet_v2.EfficientNetV2M, k_apps.efficientnet_v2.preprocess_input, ''),
        ('EfficientNetV2L', k_apps.efficientnet_v2.EfficientNetV2L, k_apps.efficientnet_v2.preprocess_input, ''),
        ('EfficientNetB6', k_apps.efficientnet.EfficientNetB6, k_apps.efficientnet.preprocess_input, ''),
    )
}


## Reading images and extracting features 

In [None]:
# Seed NumPy, the stdlib `random` module and TensorFlow with one shared value.
random_seed = 0
np.random.seed(random_seed)
random.seed(random_seed)
tf.random.set_seed(random_seed)

# Colab form fields: the trailing `#@param [...]` comment on each line lists
# the values that can be picked for that setting.
# featureExt      : key into `keras_apps` selecting the feature extractor
# magnification   : name of the image sub-folder under `tf_files`
# extracting_mode : whether a cached feature file may be reused
featureExt =  "EfficientNetB6" #@param ["VGG16", "VGG19", "ResNet50", "ResNet101", "ResNet152", "ResNet50V2", "ResNet101V2", "ResNet152V2", "InceptionV3", "InceptionResNetV2", "DenseNet121","DenseNet169","DenseNet201", "Xception", "NASNetLarge","EfficientNetV2M", "EfficientNetV2L", "EfficientNetB6"]
magnification =  "breast_400" #@param ["breast_40", "breast_100", "breast_200", "breast_400"]
extracting_mode = "read pre-extracted features if exist" #@param ["read pre-extracted features if exist", "force to extract features now"]

# X : feature vectors, one per image
# y1: Benign or Malignant (two-class classification)
# y2: TA, PT, F, A, DC, PC, LC, MC (eight-class classification)
# y3: Slide id (patient)
# All four start as empty lists and are filled later in this cell.
X, y1, y2, y3 = [], [], [], []

# Cache file for the features of the selected extractor and magnification.
pre_extracted_file = f'{working_dir}/extracted_features/{featureExt}-{magnification}.txt'

def read__tc_tt_id(path: 'str'):
  '''
  Parse an image file path and return (tumor_class, tumor_type, slide_id).

  Only the last '/'-separated component of `path` is inspected. It is
  split on '-': the third piece is the slide id, and the first piece is
  split on '_' to obtain the tumor class (second part) and the tumor
  type (third part).
  '''
  file_name = path.split('/')[-1]
  dash_parts = file_name.split('-')
  slide_id = dash_parts[2]
  head_parts = dash_parts[0].split('_')
  tumor_type = head_parts[2]
  tumor_class = head_parts[1]
  return tumor_class, tumor_type, slide_id


def extract_features():
  '''
  Run every image of the selected magnification through the selected
  Keras application and cache the resulting feature vectors on disk.

  Reads the module-level settings `featureExt`, `magnification`, `size`,
  `working_dir`, `keras_apps` and `pre_extracted_file`, and rebinds the
  globals:
    X  : feature matrix, one row per image
    y1 : label-encoded tumor class
    y2 : label-encoded tumor type
    y3 : label-encoded slide id

  Each line of the cache file holds the features of one image followed
  by its y1, y2 and y3 values, all separated by single spaces.

  Raises FileNotFoundError when the glob pattern matches no image, so
  that no empty cache file is left behind.
  '''
  global X, y1, y2, y3
  # include_top=False removes the classification head; Keras only applies
  # the `pooling` argument in that mode, so without it the "features"
  # would be the 1000-way classifier output instead of pooled activations.
  # NOTE(review): weights=None builds a randomly initialised network; if
  # pretrained features are intended this should be weights='imagenet'.
  model = keras_apps[featureExt]['model'](input_shape=size+(3,), include_top=False, weights=None, pooling='max')
  preprocess_input = keras_apps[featureExt]['preprocessor']
  print(f'[start reading images and extracting features] \nfeture extractor: {featureExt}, {model} \npreprocessor    : {preprocess_input}\n')

  pattern = f'{working_dir}/tf_files/{magnification}/*/*'
  # glob order depends on the filesystem; sorting keeps the row order (and
  # therefore the seeded train/test split) the same from run to run.
  paths = sorted(glob.glob(pattern))
  if not paths:
    raise FileNotFoundError(f'no images found for pattern: {pattern}')

  features, tumor_classes, tumor_types, slide_ids = [], [], [], []
  out = display(HTML("<progress style='width: 50%'/>"), display_id=True)
  n = len(paths)
  for i, path in enumerate(paths, start=1):
      img = image.load_img(str(path), target_size=size)
      img = image.img_to_array(img)
      img = preprocess_input(img)
      img = np.expand_dims(img, axis=0)  # the model expects a batch axis
      # verbose=0 stops Keras printing its own progress bar per image
      features.append(model.predict(img, verbose=0)[0])
      tc, tt, slide_id = read__tc_tt_id(path)
      tumor_classes.append(tc)
      tumor_types.append(tt)
      slide_ids.append(slide_id)
      out.update(HTML(f"[{i}/{n} images are processed] <br><progress value={i} max={n} , style='width: 60%'/></progress>  [{i/n*100:.2f}]%"))

  X = np.array(features)
  y1 = np.array(LabelEncoder().fit_transform(tumor_classes))
  y2 = np.array(LabelEncoder().fit_transform(tumor_types))
  y3 = np.array(LabelEncoder().fit_transform(slide_ids))

  # save extracted features (create the cache folder on first use)
  os.makedirs(os.path.dirname(pre_extracted_file), exist_ok=True)
  with open(pre_extracted_file, 'w') as f:
    for row, tc, tt, slide_id in zip(X, y1, y2, y3):
        f.write(' '.join(str(v) for v in row) + f' {tc} {tt} {slide_id}\n')


# Load the cached features when that is allowed and the cache is usable;
# otherwise extract them now.
cached_rows = None
if extracting_mode == "read pre-extracted features if exist":
  if os.path.isfile(pre_extracted_file):
    print('[pre-extracted features file was found] : ' + pre_extracted_file)
    with open(pre_extracted_file, 'r') as f:
      # one image per line; blank lines are skipped so the matrix stays rectangular
      cached_rows = [[float(v) for v in line.split()] for line in f if line.strip()]
    if not cached_rows:
      print('[pre-extracted features file is empty]')
  else:
    print('[pre-extracted features file NOT found]')

if cached_rows:
  cached = np.array(cached_rows)
  # The last three columns are the encoded labels. They are cast back to
  # int so they have the same dtype as freshly extracted labels.
  X = cached[:, :-3]
  y1 = cached[:, -3].astype(int)
  y2 = cached[:, -2].astype(int)
  y3 = cached[:, -1].astype(int)
else:
  extract_features()



print(f'''
[available data]:
    - X : features matrix
          shape {X.shape}
    - y1: Binign/Malignant identifier 
          shape {y1.shape}
    - y2: Tumour type (A, DC, F, LC, MC, PC, PT, TA) identifier 
          shape {y2.shape}
    - y3: image slid id (patient id)
          shape {y3.shape} 
''')

## Classification

In [None]:
def accuracy_proba(y_test, p):
  '''
  Score class probabilities against the true labels.

  `p` holds one row of per-class scores per sample (for example the
  output of classifier.predict_proba, or a sum of several such outputs).
  The predicted class of a row is the index of its largest score; on a
  tie the lowest index wins, and an empty row predicts class 0. The
  predictions are compared with `y_test` using accuracy_score.
  '''
  y_pred = [
      max(range(len(scores)), key=lambda k: scores[k], default=0)
      for scores in p
  ]
  return accuracy_score(y_test, y_pred)


def eval(X, y, ctype:'str', clfs, random_state=None):
  '''
  Evaluate the extracted features with three classifiers and print the
  accuracy of each classifier and of every combination of them.

  Parameters
    X            : feature matrix
    y            : labels to predict
    ctype        : label printed as the classification type
    clfs         : three classifiers exposing fit and predict_proba, in the
                   order XGBClassifier, LGBMClassifier, CatBoostClassifier
                   (the printed report names them in that order)
    random_state : seed for the train/test split; when None the notebook's
                   global `random_seed` is used

  The data is split 70/30 into train and test parts. Combinations are
  scored on the sum of the classifiers' predicted probabilities.

  Note: this function shadows the builtin `eval` inside the notebook.
  '''
  if random_state is None:
    # fall back to the notebook-wide seed so the split is repeatable
    random_state = random_seed
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = random_state)
  probas = []
  for c in clfs:
    c.fit(X_train, y_train)
    probas.append(c.predict_proba(X_test))

  print(f'''
  Feature extractor  : {featureExt}
  Magnification      : {magnification}
  Calssification type: {ctype}
          {accuracy_proba(y_test, probas[0]) *100 :.4f}%  by XGBClassifier
          {accuracy_proba(y_test, probas[1]) *100 :.4f}%  by LGBMClassifier
          {accuracy_proba(y_test, probas[2]) *100 :.4f}%  by CatBoostClassifier
          {accuracy_proba(y_test, probas[0]+probas[1]) *100 :.4f}%  by XGBClassifier  and LGBMClassifier
          {accuracy_proba(y_test, probas[0]+probas[2]) *100 :.4f}%  by XGBClassifier  and CatBoostClassifier
          {accuracy_proba(y_test, probas[1]+probas[2]) *100 :.4f}%  by LGBMClassifier and CatBoostClassifier
          {accuracy_proba(y_test, probas[0]+probas[1]+probas[2]) *100 :.4f}%  by XGBClassifier  and LGBMClassifier and CatBoostClassifier
  ''')

#### Two Class Classification (Benign or Malignant)

In [None]:
# eval takes (X, y, ctype, clfs); the former third argument `rs` was never
# defined and did not match that signature, so it is dropped here.
# `verbosity=0` / `verbose=-1` replace the `silent` flag, which current
# XGBoost and LightGBM releases no longer honour.
eval(X, y1, 'Tow-Class Classification',
     [XGBClassifier(learning_rate = 0.34, verbosity = 0),
      LGBMClassifier(learning_rate = 0.11, verbose = -1),
      CatBoostClassifier(l2_leaf_reg=3, border_count=128, iterations=1000, depth=7, logging_level='Silent')])