In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !pip install pycm
# !pip install livelossplot

In [3]:
%matplotlib inline
import pathlib
import pandas as pd
import numpy as np
import os

import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

from livelossplot import PlotLossesKerasTF
from pycm import ConfusionMatrix

os.environ["CUDA_VISIBLE_DEVICES"]="1"

SEED = 199510
np.random.seed(SEED)
tf.random.set_seed(SEED)

ROOT_DIR = pathlib.Path().resolve().parent

# Descomentar para correr en Google Colab usando Drive
# ROOT_DIR = pathlib.Path("drive/MyDrive/curso_cisc_2022")

DATA_DIR = ROOT_DIR / "Data" / "smear2005"
print(ROOT_DIR)

import utils

C:\Users\marco\Documents\Trabajo\Academia\curso_cisc_2022


In [4]:
dataset = pd.read_csv(f"{DATA_DIR}/dataset.csv")
CLASS_COLUMN = "Class_cat_7"

In [5]:
class_names = dataset[CLASS_COLUMN].unique()
num_classes = len(class_names)
print(f"Número de clases: {num_classes}")
print(f"Clases: {class_names}")

Número de clases: 7
Clases: ['normal_superficiel' 'normal_intermediate' 'normal_columnar'
 'light_dysplastic' 'moderate_dysplastic' 'severe_dysplastic'
 'carcinoma_in_situ']


In [6]:
IMG_HEIGHT = 100
IMG_WIDTH = 100
IMG_SHAPE = (IMG_HEIGHT, IMG_WIDTH, 3)
BATCH_SIZE = 256
EPOCHS = 100
LEARNING_RATE = 0.001
OPT = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
METRICS = [
    "accuracy", 
    tf.keras.metrics.Precision(name="precision"), 
    tf.keras.metrics.Recall(name="recall"),
    tfa.metrics.F1Score()
    ]
LOSS = "binary_crossentropy" if num_classes == 2 else "categorical_crossentropy"
OUTPUT_NEURONS = 1 if num_classes == 2 else num_classes
OUTPUT_ACTIVATION = "sigmoid" if num_classes == 2 else "softmax"
SPLIT = 0.2

In [58]:
train, test = train_test_split(dataset, test_size=SPLIT, random_state=SEED, stratify=dataset[CLASS_COLUMN])
train = train.reset_index(drop=True)

test_datagen =tf.keras.preprocessing.image.ImageDataGenerator()

test_generator = test_datagen.flow_from_dataframe(
                                        test,
                                        None,
                                        x_col='filename',
                                        target_size=(IMG_WIDTH, IMG_HEIGHT),
                                        y_col=CLASS_COLUMN,
                                        batch_size=BATCH_SIZE,
                                        seed=SEED,
                                        class_mode='binary' if num_classes == 2 else 'categorical'
                                        )

Found 184 validated image filenames belonging to 7 classes.


In [8]:
data_augmentation = tf.keras.Sequential(
  [
    tf.keras.layers.RandomFlip(input_shape=IMG_SHAPE),
    tf.keras.layers.RandomRotation(0.3, fill_mode="constant"),
    tf.keras.layers.RandomZoom(0.2, fill_mode="constant"),
  ]
)

In [9]:
def create_model():
    base_model = tf.keras.applications.EfficientNetB0(weights='imagenet', include_top=False)
    preprocess_input = tf.keras.applications.efficientnet.preprocess_input
    base_model.trainable = False
    model = tf.keras.Sequential([
    tf.keras.layers.Lambda(preprocess_input, input_shape=IMG_SHAPE),
    data_augmentation,
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(OUTPUT_NEURONS, activation=OUTPUT_ACTIVATION)
    ])

    model.compile(optimizer=OPT, loss=LOSS, metrics=METRICS)
    return model

In [59]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

for i, (train_indices, val_indices) in enumerate(skf.split(train, train[CLASS_COLUMN])):
    print(f'Iniciando Fold: {i}')
    print(f'Datos de entrenamiento: {len(train_indices)}')
    print(f'Datos de validacion: {len(val_indices)}')
    print('Dividiendo datos')
    print(len(train.index))
    train = train.iloc[train_indices]
    val = train.iloc[val_indices]
    model = create_model()
    train_datagen = tf.keras.preprocessing.image.ImageDataGenerator()

    val_datagen =tf.keras.preprocessing.image.ImageDataGenerator()

    train_generator = train_datagen.flow_from_dataframe(
                                            train,
                                            None,
                                            x_col='filename',
                                            target_size=(IMG_WIDTH, IMG_HEIGHT),
                                            y_col=CLASS_COLUMN,
                                            batch_size=BATCH_SIZE,
                                            seed=SEED,
                                            class_mode='binary' if num_classes == 2 else 'categorical'
                                            )

    val_generator = val_datagen.flow_from_dataframe(
                                            val,
                                            None,
                                            x_col='filename',
                                            target_size=(IMG_WIDTH, IMG_HEIGHT),
                                            y_col=CLASS_COLUMN,
                                            batch_size=1,
                                            seed=SEED,
                                            class_mode='binary' if num_classes == 2 else 'categorical',
                                            shuffle=False
                                            )
    
    history = model.fit(
        train_generator,
        validation_data=val_generator,
        callbacks=[PlotLossesKerasTF()],
        epochs=EPOCHS,
        steps_per_epoch=train_generator.n // train_generator.batch_size,
        validation_steps=val_generator.n // val_generator.batch_size
    )
    
    evaluations = model.evaluate(test_generator, verbose=0)

    for evaluation, metric_name in zip(evaluations, model.metrics_names):
        print(f"{metric_name}: {evaluation}")

Iniciando Fold: 0
Datos de entrenamiento: 586
Datos de validacion: 147
Dividiendo datos
733


IndexError: positional indexers are out-of-bounds

In [None]:
raw_pred = model.predict(val_generator)

In [None]:
pred = np.argmax(raw_pred, axis=1)

In [None]:
i = 2
utils.plot_softmax(i, raw_pred[i], val_generator.labels, val_generator.filenames, class_names)

In [None]:
df_pred = pd.DataFrame({"Real": val_generator.labels, "Pred": pred})
df_pred = df_pred.replace(utils.map_7_classes)
df_pred.head()

In [None]:
df_error = df_pred[df_pred["Real"] != df_pred["Pred"]]
print(len(df_error.index))

In [None]:
df_error.groupby("Pred").size()

In [None]:
cm = ConfusionMatrix(actual_vector=df_pred["Real"].values, predict_vector=df_pred["Pred"].values)
print(cm)

In [None]:
plt.figure(figsize=(15,15))
cm.plot(cmap=plt.cm.Blues, number_label=True);

In [None]:
plt.figure(figsize=(15,15))
cm.plot(cmap=plt.cm.Blues, number_label=True, normalized=True);

In [None]:
utils.plot_roc_multiclass(df_pred, class_list = dataset["Class_cat_7"].unique())

In [None]:
utils.plot_precision_recall_curve_multiclass(df_pred, class_list = dataset["Class_cat_7"].unique())