In [1]:
import os
import random
from math import ceil
from contextlib import redirect_stdout

import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from IPython.display import display
from pytictoc import TicToc
from utils import PlotLosses, grafica_kfold

NUM_CLASSES = 2

# NOTE: this cell is meant to run once per kernel. It changes the working
# directory, so running it a second time would resolve `ruta` to the
# experiment folder instead of the project root.
ruta = os.path.abspath(r'./')
directorio_experimento = os.path.join(ruta, f'{NUM_CLASSES}_class')
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(directorio_experimento, exist_ok=True)
# Every relative path used below (fold folders, CSV logs, plots) is written
# inside the experiment directory.
os.chdir(directorio_experimento)

# Index of the physical GPU that TensorFlow is allowed to use.
GPU_INDEX = 1
gpus = tf.config.experimental.list_physical_devices('GPU')

# Restrict TensorFlow to the selected GPU only when it actually exists;
# indexing blindly raises IndexError on machines with fewer GPUs.
if len(gpus) > GPU_INDEX:
    tf.config.experimental.set_visible_devices(gpus[GPU_INDEX], 'GPU')
else:
    print(f'GPU {GPU_INDEX} no disponible ({len(gpus)} detectadas); '
          'se usan los dispositivos por defecto')
# The rest of the notebook runs Keras in TF1-style graph mode.
tf.compat.v1.disable_eager_execution()

In [2]:
def build_model(dropout, fc_layers, num_classes, opt, metrics=['accuracy'], input_shape=None):
    """Build and compile a VGG19-based classifier with a new dense head.

    The VGG19 convolutional base is loaded with ImageNet weights, without its
    top classifier, and all of its layers are frozen. Its output is flattened
    and followed by one ``Dense(relu) -> Dropout`` pair per entry of
    ``fc_layers``/``dropout``, then a softmax layer with ``num_classes`` units.

    Args:
        dropout: Dropout rates, one per fully connected layer.
        fc_layers: Number of units of each fully connected layer.
        num_classes: Number of units of the softmax output layer.
        opt: Optimizer passed to ``Model.compile``.
        metrics: Metrics passed to ``Model.compile``.
        input_shape: ``(height, width, channels)`` of the input images. When
            ``None`` the module-level ``HEIGHT`` and ``WIDTH`` are used with
            3 channels.

    Returns:
        The compiled ``tf.keras.Model`` (loss: categorical cross-entropy).

    Raises:
        ValueError: If ``fc_layers`` and ``dropout`` differ in length.
    """
    fc_layers = list(fc_layers)
    dropout = list(dropout)
    # zip() would silently drop the unmatched entries and build a smaller
    # head than the one requested, so reject the mismatch explicitly.
    if len(fc_layers) != len(dropout):
        raise ValueError(
            f'fc_layers ({len(fc_layers)}) and dropout ({len(dropout)}) '
            'must have the same length'
        )

    if input_shape is None:
        # Resolved at call time from the notebook configuration.
        input_shape = (HEIGHT, WIDTH, 3)

    base_model = tf.keras.applications.vgg19.VGG19(
        weights='imagenet', include_top=False, input_shape=input_shape)
    # Freeze the convolutional base: only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    x = tf.keras.layers.Flatten()(x)
    for fc, drop in zip(fc_layers, dropout):
        x = tf.keras.layers.Dense(fc, activation='relu')(x)
        x = tf.keras.layers.Dropout(drop)(x)

    predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    finetune_model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

    # Pass a copy so the shared default list can never be altered downstream.
    compile_metrics = list(metrics) if metrics is not None else None
    finetune_model.compile(opt, loss='categorical_crossentropy', metrics=compile_metrics)

    return finetune_model

In [3]:
BATCH_SIZE = 64
WIDTH = 256  # 256
HEIGHT = 256  # 256
FC_LAYERS = [1024 * 1, 1024 * 1]  # 1024
LR = 1e-5
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
OPT = tf.keras.optimizers.Adam(learning_rate=LR)
SEED = 111091
np.random.seed(SEED)
random.seed(SEED)
EPOCHS = 30  # 60
# Experiment note: show the experiment with and without the dropout layers.
# With two dropout layers the model underfits.
# Dropout is fine.
DROPOUT = [0.5, 0.5]
EVALUACIONES = []
KFOLD_SPLITS = 10
# Per-fold validation metrics, filled in by the training loop.
avg_acc = []
avg_loss = []

# os.path.join keeps the path valid on any OS (the dataset lives under the
# project root, not under the experiment directory).
datos = pd.read_csv(os.path.join(ruta, 'database', 'bd_aumentada', 'database_augment.csv'))

# Explicit random_state so the shuffle does not depend on whatever consumed
# the global NumPy random state before this line.
datos_random = datos.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Single source of truth for the hyperparameter table: names and values are
# kept together so they cannot drift out of order.
hiperparametros = {
    'BATCH_SIZE': BATCH_SIZE,
    'WIDTH': WIDTH,
    'HEIGHT': HEIGHT,
    'LR': LR,
    'OPT': OPT,
    'SEED': SEED,
    'EPOCHS': EPOCHS,
    'DROPOUT': DROPOUT,
    'KFOLD_SPLITS': KFOLD_SPLITS,
    'FC_LAYERS': FC_LAYERS,
    'NUM_CLASSES': NUM_CLASSES,
}
HYP = {'Valor': list(hiperparametros.values())}
df_hyp = pd.DataFrame(HYP, index=list(hiperparametros.keys()))
display(df_hyp)
df_hyp.to_csv('hiperparametros.csv')

kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=SEED)

In [4]:
t = TicToc()
inicio = t.tic()

# Start from empty accumulators so that re-running this cell does not append
# a second set of fold results to the existing lists.
avg_acc.clear()
avg_loss.clear()

# Hyperparameters of the configured optimizer. Each fold trains with its own
# optimizer built from this config, so no optimizer state (step counter,
# moment estimates) is carried over from one fold's model to the next.
opt_config = OPT.get_config()

# KFold only uses the number of rows, and the positional indices it yields
# are applied to the same frame that is split.
for i, (train_indices, val_indices) in enumerate(kf.split(datos)):

    fold_dir = f'fold_{i}'
    print(f'Iniciando Fold: {i}')
    os.makedirs(fold_dir, exist_ok=True)
    print(f'Datos de entrenamiento: {len(train_indices)}')
    print(f'Datos de validacion: {len(val_indices)}')
    print('Dividiendo datos')
    train = datos.iloc[train_indices]
    val = datos.iloc[val_indices]
    # Persist the exact split used by this fold.
    train.to_csv(os.path.join(fold_dir, "datos_train.csv"))
    val.to_csv(os.path.join(fold_dir, "datos_val.csv"))

    print('Construyendo modelo')
    # Drop the graph/session state left by the previous fold before building
    # a new model; otherwise every fold's model stays allocated.
    tf.keras.backend.clear_session()
    # Seed TensorFlow for the fresh graph so weight initialisation and
    # dropout are tied to the recorded SEED.
    tf.random.set_seed(SEED)
    metrics = ['accuracy']
    opt = type(OPT).from_config(opt_config)
    model = build_model(DROPOUT, FC_LAYERS, num_classes=NUM_CLASSES, opt=opt, metrics=metrics)

    print('Creando generadores')

    # Training images get VGG19 preprocessing plus random flips.
    train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=tf.keras.applications.vgg19.preprocess_input,
        horizontal_flip=True,
        vertical_flip=True,
        # rotation_range=25,
        fill_mode='constant'
    )

    # Validation images only get VGG19 preprocessing (no augmentation).
    test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=tf.keras.applications.vgg19.preprocess_input,
    )

    # Keras expects target_size as (height, width), matching the model's
    # (HEIGHT, WIDTH, 3) input shape.
    train_generator = train_datagen.flow_from_dataframe(
        train,
        None,
        x_col='file',
        target_size=(HEIGHT, WIDTH),
        y_col=f'Class_cat_{NUM_CLASSES}',
        batch_size=BATCH_SIZE,
        seed=SEED,
        has_ext=True,
        class_mode='categorical')

    test_generator = test_datagen.flow_from_dataframe(
        val,
        None,
        x_col='file',
        target_size=(HEIGHT, WIDTH),
        y_col=f'Class_cat_{NUM_CLASSES}',
        batch_size=BATCH_SIZE,
        seed=SEED,
        has_ext=True,
        class_mode='categorical',
        shuffle=True)
    class_list = list(test_generator.class_indices.keys())

    # The architecture is built from the same settings on every fold, so the
    # summary and the diagram are written once.
    if i == 0:
        with open('summary.txt', 'w') as f:
            with redirect_stdout(f):
                model.summary()

        tf.keras.utils.plot_model(
            model,
            to_file='model.png',
            show_shapes=True,
            show_layer_names=True,
            rankdir='TB'
        )

    # Keep the weights of the epoch with the highest validation accuracy.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(fold_dir, 'model.best.h5'),
        monitor="val_accuracy",
        verbose=1,
        mode='max',
        save_best_only=True)

    plot_losses = PlotLosses(figsize=(10, 6))
    callbacks_list = [
        plot_losses,
        checkpoint,
        # tf.keras.callbacks.TensorBoard(log_dir=os.path.join(fold_dir, 'log/'), histogram_freq=1),
        tf.keras.callbacks.CSVLogger(os.path.join(fold_dir, 'log.csv'))
    ]
    print('Iniciando entrenamiento')
    history = model.fit(train_generator,
                        epochs=EPOCHS,
                        workers=16,
                        shuffle=True,
                        callbacks=callbacks_list,
                        verbose=1,
                        steps_per_epoch=ceil(len(train_indices) / BATCH_SIZE),
                        validation_data=test_generator,
                        validation_steps=ceil(len(val_indices) / BATCH_SIZE)
                        )

    acc = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']

    loss = history.history['loss']
    val_loss = history.history['val_loss']

    # The fold is scored with the metrics of its last epoch.
    avg_loss.append(val_loss[-1])
    avg_acc.append(val_accuracy[-1] * 100)

    print('Salvando modelo')
    tf.saved_model.save(model, os.path.join(fold_dir, "vgg19_multiclass"))
    model.save(os.path.join(fold_dir, 'model.h5'))

print(f'Resultados de la validación cruzada (K = {KFOLD_SPLITS})')
# The loss is not a percentage, so its deviation is reported without '%'.
print(f'Pérdida {np.mean(avg_loss)} (+/- {np.std(avg_loss)})')
print(f'Precisión {np.mean(avg_acc)}% (+/- {np.std(avg_acc)}%)')

final_segundos = t.tocvalue()
final_minutos = final_segundos / 60
final_horas = final_minutos / 60

print('Tiempo transcurrido')
print(f'Segundos: {float(final_segundos)} sec')
print(f'Minutos: {float(final_minutos)} min')
print(f'Horas: {float(final_horas)} horas')

# Store the elapsed time of the whole cross-validation in three units.
tiempos = {'tiempo_transcurrido': [final_segundos, final_minutos, final_horas]}
tiempo_df = pd.DataFrame(tiempos, index=['segundos', 'minutos', 'horas'])
tiempo_df.to_csv('tiempo_experimento.csv')

In [5]:
# Per-fold validation accuracy and loss, one row per fold.
rendimiento = {'acc': avg_acc, 'loss': avg_loss}
indices_fold = list(range(KFOLD_SPLITS))
df_rendimiento = pd.DataFrame(rendimiento, index=indices_fold)
df_rendimiento.index.name = 'fold'

# Show the raw table and its summary statistics, then persist the table.
display(df_rendimiento)
display(df_rendimiento.describe())
df_rendimiento.to_csv('rendimiento_experimento.csv')

# Rows sorted by numeric class label before being handed to the k-fold plot
# helper (imported from utils; presumably it draws the fold assignment).
columna_clase = f'Class_num_{NUM_CLASSES}'
datos_kfold = datos.sort_values(by=[columna_clase])
grafica_kfold(
    kf,
    datos_kfold['file'],
    datos_kfold[columna_clase],
    n_splits=KFOLD_SPLITS,
    num_classes=NUM_CLASSES,
    nom_archivo=f'kfold_{KFOLD_SPLITS}_{NUM_CLASSES}',
    lw=20,
)