# Project 08: Epidemium - Model training
## Bloc n°6 - Jedha - dsmft - Paris14
### Joseph Abitbol & Nicolas Hegerle

## Library imports and function definition

### <ins>Import the necessary libraries</ins>

In [None]:

import pandas as pd
from IPython.display import clear_output
from train_func import *


### <ins>Define functions</ins>

* Because of how the dataframe is built, it must be shuffled before training the model; otherwise the train/val<br>
split is biased and not all target outcomes are represented in both train and val

In [None]:
def shuffle_dataframe(df, nb_shuffle):
    """Return a row-shuffled copy of ``df`` with a fresh 0..n-1 index.

    Args:
        df: pandas DataFrame to shuffle. It is never modified in place.
        nb_shuffle: number of successive shuffling passes to apply. With 0
            the row order is kept and only the index is reset.

    Returns:
        A new DataFrame holding the same rows, re-indexed from 0.
    """
    for _ in range(nb_shuffle):
        # every pass uses the same fixed seed, so the result is reproducible
        # from one run to the next
        df = df.sample(frac=1, random_state=123)

    # no inplace=True here: when nb_shuffle == 0, `df` is still the caller's
    # object and an in-place reset would silently discard its index
    return df.reset_index(drop=True)

## Load the data

In [None]:
# read the training image data (first CSV column becomes the index),
# then run 10 shuffling passes over the rows before any train/val split
df = shuffle_dataframe(
    pd.read_csv("src/train/train_img_data.csv", index_col=[0]),
    10,
)
print(f"Shape of the df: {df.shape}")
df.head()

## Model training

In [None]:
# build the ConvNet model object used by the training cells below
# NOTE(review): convnet is not defined in this notebook (presumably star-imported from train_func);
# the arguments look like (height, width, channels) — confirm against its signature
ConvNet = convnet(224, 224, 3)

## Let's train some models

### <ins>Train base models on all mixed cell types and split raw, red, blue and all sub-figures</ins>

In [None]:
# tag used in the checkpoint / tensorboard / prediction paths for the ConvNet runs
app_dir = 'CN'

# arguments passed to convnet() to build a fresh model for each run
# (same values as the notebook-level `ConvNet = convnet(224,224,3)`)
convnet_args = (224, 224, 3)

# transfer-learning level: tl_dir = name used to save files; tl_pct is not used in this cell
tl_dir, tl_pct = ('base', None)

# cell-type tag: cell_dir = name used to save files; cell_type is not used in this cell
# because the data is never filtered by cell type here
cell_dir, cell_type = ('all', None)

# image formats to train on: None => no filtering, i.e. raw, red and blue images together
# (saved under the 'rrb' tag); 'raw' / 'red' / 'blue' => only rows with that img_format value
img_format = [None, 'raw', 'red', 'blue']

# train one model per image format
for img_type in img_format:

    if img_type:  # keep only the rows of the requested image format
        data = df.loc[df['img_format'] == img_type]
    else:  # no filter: use the whole dataframe and tag the outputs 'rrb'
        img_type = 'rrb'
        data = df

    # build the train / validation image generators from the selected rows
    train_gen, val_gen = create_imgen(data)

    print(f"====Generated the img generators for {cell_dir} cell types and {img_type} sub-images====")

    # common stem of every file written for this run
    run_tag = f"{cell_dir}_{img_type}_{app_dir}_{tl_dir}"

    # set paths to save checkpoints and tensorboard data
    check_path = f"model/{app_dir}_checkpoint/{run_tag}.hdf5"
    tb_path = f"src/tf_logs/{app_dir}_tl_logs/{run_tag}"
    checkpoint, tensorboard = create_callbacks(check_path, tb_path)

    # build a NEW model for every image format: reusing the single notebook-level ConvNet
    # object would start each run from the weights left behind by the previous run
    model = convnet(*convnet_args)
    # summary() is called bare: assuming a Keras-style model, it prints the table itself
    # and returns None, so wrapping it in print() only added a stray "None" line
    model.summary()

    print(f"\n====Started training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

    best_model = train_model(model, train_gen, val_gen, checkpoint, tensorboard, loss = 'mae', learning_rate=0.05, epochs = 30)
    clear_output(wait = True)

    print(f"\n====Finished training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

    # generate predictions on train and val
    path = "src/pred_df/"
    pred_dir = f"{app_dir}_{tl_dir}_preds"  # not named `dir`, which would shadow the builtin for the rest of the notebook
    file_names = [f"{run_tag}_train.csv", f"{run_tag}_test.csv"]

    print(f"\n====Started generating predictions for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

    # NOTE(review): meaning of the positional 30 is defined by train_func.generate_predictions — confirm
    # the returned values are not needed here; predictions are written to disk because save=True
    generate_predictions(data, best_model, 30, train_gen, val_gen, save = True, path=path, dir = pred_dir, file_names=file_names)

    print(f"\n====Loop finished for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")




In [None]:
# models used for transfer learning: keys = tags used in the output paths,
# values = what is handed to create_model to build each network
applications = {'Iv3' : InceptionV3, 'IRNv2' : InceptionResNetV2, 'DN201' : DenseNet201}

# transfer-learning level: tl_dir = name used to save files, tl_pct = value passed as `free` to create_model
# (per the original note, None means the base model is used and only the prediction layer is trained)
tl_dir, tl_pct = ('base', None)

# cell-type tag: cell_dir = name used to save files; cell_type is not used in this cell
# because the data is never filtered by cell type here
cell_dir, cell_type = ('all', None)

# image formats to train on: each entry keeps only the rows with that img_format value
img_format = ['raw', 'red', 'blue']

# for every image format, train every application on the matching rows
for img_type in img_format:

    if img_type:  # keep only the rows of the requested image format
        data = df.loc[df['img_format'] == img_type]
    else:
        # not reached with the current img_format list; kept so that adding None
        # to the list trains on all images under the 'rrb' tag
        img_type = 'rrb'
        data = df

    # build the train / validation image generators from the selected rows
    train_gen, val_gen = create_imgen(data)

    print(f"====Generated the img generators for {cell_dir} cell types and {img_type} sub-images====")

    # loop through the applications to train each model on this image format
    for app_dir, app_instance in applications.items():

        # build a new model for this run
        model = create_model(app_instance, free = tl_pct, activation = 'linear')
        print(f"\n====Generated model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")
        # summary() is called bare: assuming a Keras-style model, it prints the table itself
        # and returns None, so wrapping it in print() only added a stray "None" line
        model.summary()

        # common stem of every file written for this run
        run_tag = f"{cell_dir}_{img_type}_{app_dir}_{tl_dir}"

        # set paths to save checkpoints and tensorboard data
        check_path = f"model/{app_dir}_checkpoint/{run_tag}.hdf5"
        tb_path = f"src/tf_logs/{app_dir}_tl_logs/{run_tag}"
        checkpoint, tensorboard = create_callbacks(check_path, tb_path)

        print(f"\n====Started training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

        best_model = train_model(model, train_gen, val_gen, checkpoint, tensorboard, loss = 'mae', learning_rate=0.05, epochs = 30)
        clear_output(wait = True)

        print(f"\n====Finished training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

        # generate predictions on train and val
        path = "src/pred_df/"
        pred_dir = f"{app_dir}_{tl_dir}_preds"  # not named `dir`, which would shadow the builtin for the rest of the notebook
        file_names = [f"{run_tag}_train.csv", f"{run_tag}_test.csv"]

        print(f"\n====Started generating predictions for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

        # NOTE(review): meaning of the positional 30 is defined by train_func.generate_predictions — confirm
        # the returned values are not needed here; predictions are written to disk because save=True
        generate_predictions(data, best_model, 30, train_gen, val_gen, save = True, path=path, dir = pred_dir, file_names=file_names)

        print(f"\n====Loop finished for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")
        




In [None]:
# models used for transfer learning: keys = tags used in the output paths,
# values = what is handed to create_model to build each network
applications = {'Iv3' : InceptionV3, 'IRNv2' : InceptionResNetV2, 'DN201' : DenseNet201}

# transfer-learning level: tl_dir = name used to save files, tl_pct = value passed as `free` to create_model
# (per the original note, 20 means 20% of the model is freed for training)
tl_dir, tl_pct = ('20pct', 20)

# cell types for which we have imgs available: keys = tags used to save files,
# values = raw_img_dir value used to sub-select the data (False => no cell-type filter)
cell_types = {
        'all' : False,
        'ci1' : 'cell_infla_1',
        'ci2' : 'cell_infla_2',
        'ci4' : 'cell_infla_4',
        'cti5' : 'cell_tum_infla_3',  # NOTE(review): tag says 5 but the directory name ends in 3 — confirm
        'cti6' : 'cell_tum_infla_6',
        'st' : 'seg_tissu'
            }

# image formats to train on: None => no format filter, i.e. raw, red and blue images together
# (saved under the 'rrb' tag); 'raw' / 'red' / 'blue' => only rows with that img_format value
img_format = [None, 'raw', 'red', 'blue']

# outer loop: one pass per cell type ('all' keeps every row)
for cell_dir, cell_type in cell_types.items():
    clear_output(wait = True)

    if cell_type:  # select the rows belonging to this cell type
        mask_1 = df['raw_img_dir'] == cell_type
    else:
        # keep every row; the mask is built on df.index so it always aligns with df,
        # even if df does not carry a plain 0..n-1 index
        mask_1 = pd.Series(True, index=df.index)

    # inner loop: one pass per image format, combined with the cell-type mask
    for img_type in img_format:

        if cell_type == 'seg_tissu' and img_type:
            # per the original note there are no filtered images for seg_tissu,
            # so only the unfiltered (None => 'rrb') pass is run for it
            continue

        if img_type:  # filter on both cell type and image format
            data = df.loc[mask_1 & (df['img_format'] == img_type)]
        else:  # cell-type filter only; outputs are tagged 'rrb'
            data = df.loc[mask_1]
            img_type = 'rrb'

        # build the train / validation image generators from the selected rows
        train_gen, val_gen = create_imgen(data)

        print(f"====Generated the img generators for {img_type}, {cell_dir}====")

        # loop through the applications to train each model on this selection
        for app_dir, app_instance in applications.items():

            # build a new model for this run
            model = create_model(app_instance, free = tl_pct, activation = 'linear')
            print(f"\n====Generated model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")
            # summary() is called bare: assuming a Keras-style model, it prints the table itself
            # and returns None, so wrapping it in print() only added a stray "None" line
            model.summary()

            # common stem of every file written for this run
            run_tag = f"{cell_dir}_{img_type}_{app_dir}_{tl_dir}"

            # set paths to save checkpoints and tensorboard data
            check_path = f"model/{app_dir}_checkpoint/{run_tag}.hdf5"
            tb_path = f"src/tf_logs/{app_dir}_tl_logs/{run_tag}"
            checkpoint, tensorboard = create_callbacks(check_path, tb_path)

            print(f"\n====Started training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

            best_model = train_model(model, train_gen, val_gen, checkpoint, tensorboard, loss = 'mae', epochs = 70, learning_rate=0.05)
            clear_output(wait = True)

            print(f"\n====Finished training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

            # generate predictions on train and val
            path = "src/pred_df/"
            pred_dir = f"{app_dir}_{tl_dir}_preds"  # not named `dir`, which would shadow the builtin for the rest of the notebook
            file_names = [f"{run_tag}_train.csv", f"{run_tag}_test.csv"]

            print(f"\n====Started generating predictions for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

            # NOTE(review): meaning of the positional 30 is defined by train_func.generate_predictions — confirm
            # the returned values are not needed here; predictions are written to disk because save=True
            generate_predictions(data, best_model, 30, train_gen, val_gen, save = True, path=path, dir = pred_dir, file_names=file_names)

            print(f"\n====Loop finshed for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")






In [None]:
# models used for transfer learning: keys = tags used in the output paths,
# values = what is handed to create_model to build each network
applications = {'Iv3' : InceptionV3, 'IRNv2' : InceptionResNetV2, 'DN201' : DenseNet201}

# transfer-learning level: tl_dir = name used to save files, tl_pct = value passed as `free` to create_model
# (following the tag, 30 means 30% of the model is freed for training)
tl_dir, tl_pct = ('30pct', 30)

# cell types for which we have imgs available: keys = tags used to save files,
# values = raw_img_dir value used to sub-select the data (False => no cell-type filter)
cell_types = {
        'all' : False,
        'ci1' : 'cell_infla_1',
        'ci2' : 'cell_infla_2',
        'ci4' : 'cell_infla_4',
        'cti5' : 'cell_tum_infla_3',  # NOTE(review): tag says 5 but the directory name ends in 3 — confirm
        'cti6' : 'cell_tum_infla_6',
        'st' : 'seg_tissu'
            }

# image formats to train on: None => no format filter, i.e. raw, red and blue images together
# (saved under the 'rrb' tag); 'raw' / 'red' / 'blue' => only rows with that img_format value
img_format = [None, 'raw', 'red', 'blue']

# outer loop: one pass per cell type ('all' keeps every row)
for cell_dir, cell_type in cell_types.items():
    clear_output(wait = True)

    if cell_type:  # select the rows belonging to this cell type
        mask_1 = df['raw_img_dir'] == cell_type
    else:
        # keep every row; the mask is built on df.index so it always aligns with df,
        # even if df does not carry a plain 0..n-1 index
        mask_1 = pd.Series(True, index=df.index)

    # inner loop: one pass per image format, combined with the cell-type mask
    for img_type in img_format:

        if cell_type == 'seg_tissu' and img_type:
            # per the original note there are no filtered images for seg_tissu,
            # so only the unfiltered (None => 'rrb') pass is run for it
            continue

        if img_type:  # filter on both cell type and image format
            data = df.loc[mask_1 & (df['img_format'] == img_type)]
        else:  # cell-type filter only; outputs are tagged 'rrb'
            data = df.loc[mask_1]
            img_type = 'rrb'

        # build the train / validation image generators from the selected rows
        train_gen, val_gen = create_imgen(data)

        print(f"====Generated the img generators for {img_type}, {cell_dir}====")

        # loop through the applications to train each model on this selection
        for app_dir, app_instance in applications.items():

            # build a new model for this run
            model = create_model(app_instance, free = tl_pct, activation = 'linear')
            print(f"\n====Generated model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")
            # summary() is called bare: assuming a Keras-style model, it prints the table itself
            # and returns None, so wrapping it in print() only added a stray "None" line
            model.summary()

            # common stem of every file written for this run
            run_tag = f"{cell_dir}_{img_type}_{app_dir}_{tl_dir}"

            # set paths to save checkpoints and tensorboard data
            check_path = f"model/{app_dir}_checkpoint/{run_tag}.hdf5"
            tb_path = f"src/tf_logs/{app_dir}_tl_logs/{run_tag}"
            checkpoint, tensorboard = create_callbacks(check_path, tb_path)

            print(f"\n====Started training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

            # NOTE(review): learning_rate=1 here, whereas the 20pct cell uses 0.05 — confirm this is intentional
            best_model = train_model(model, train_gen, val_gen, checkpoint, tensorboard, loss = 'mae', epochs = 70, learning_rate=1)
            clear_output(wait = True)

            print(f"\n====Finished training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

            # generate predictions on train and val
            path = "src/pred_df/"
            pred_dir = f"{app_dir}_{tl_dir}_preds"  # not named `dir`, which would shadow the builtin for the rest of the notebook
            file_names = [f"{run_tag}_train.csv", f"{run_tag}_test.csv"]

            print(f"\n====Started generating predictions for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

            # NOTE(review): meaning of the positional 30 is defined by train_func.generate_predictions — confirm
            # the returned values are not needed here; predictions are written to disk because save=True
            generate_predictions(data, best_model, 30, train_gen, val_gen, save = True, path=path, dir = pred_dir, file_names=file_names)

            print(f"\n====Loop finshed for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")






In [None]:
# lookup tables driving the full training grid; in each dict the key is the tag
# used to build output file / directory names

# models to train: value = object handed to create_model
# NOTE(review): ConvNet is the already-built result of convnet(224,224,3), and the ConvNet-only cell
# uses it directly as the model instead of calling create_model — confirm create_model accepts it
applications = {'Iv3': InceptionV3, 'IRNv2': InceptionResNetV2, 'DN201': DenseNet201, 'CN': ConvNet}

# transfer-learning levels: value = `free` argument of create_model
# (per the original note, the % of layers freed; None => base model)
tl_free_layers = {'base': None, '20pct': 20, '30pct': 30}

# cell types: value = raw_img_dir value used to sub-select the data (False => no cell-type filter)
cell_types = {
    'all': False,
    'ci1': 'cell_infla_1',
    'ci2': 'cell_infla_2',
    'ci4': 'cell_infla_4',
    'cti5': 'cell_tum_infla_3',  # NOTE(review): tag says 5 but the directory name ends in 3 — confirm
    'cti6': 'cell_tum_infla_6',
    'st': 'seg_tissu',
}

# image formats: False => no format filter (raw, red and blue images together);
# 'raw' / 'red' / 'blue' => only rows with that img_format value
img_format = [False, 'raw', 'red', 'blue']

In [None]:
# loop through the different cell types to train on all images (False) or on each individual cell_type
# generate a mask to filter the dataframe

# outer loop: one pass per cell type ('all' => False keeps every row)
for cell_dir, cell_type in cell_types.items():
    clear_output(wait = True)

    if cell_type:  # select the rows belonging to this cell type
        mask_1 = df['raw_img_dir'] == cell_type
    else:
        # keep every row; the mask is built on df.index so it always aligns with df,
        # even if df does not carry a plain 0..n-1 index
        mask_1 = pd.Series(True, index=df.index)

    # second level: one pass per image format, combined with the cell-type mask
    for img_type in img_format:

        if cell_type == 'seg_tissu' and img_type:
            # per the original note there are no filtered images for seg_tissu,
            # so only the unfiltered (falsy => 'rrb') pass is run for it
            continue

        if img_type:  # filter on both cell type and image format
            data = df.loc[mask_1 & (df['img_format'] == img_type)]
        else:  # cell-type filter only; outputs are tagged 'rrb'
            data = df.loc[mask_1]
            img_type = 'rrb'

        # build the train / validation image generators from the selected rows
        train_gen, val_gen = create_imgen(data)

        print(f"====Generated the img generators for {img_type}, {cell_dir}====")

        # third level: every application in `applications`
        # NOTE(review): if `applications` contains an already-built model (e.g. 'CN' : ConvNet),
        # confirm that create_model accepts it
        for app_dir, app_instance in applications.items():

            # fourth level: every transfer-learning setting in `tl_free_layers`
            for tl_dir, tl_pct in tl_free_layers.items():
                # build a new model for this run
                model = create_model(app_instance, free = tl_pct, activation = 'linear')
                print(f"\n====Generated model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")
                # summary() is called bare: assuming a Keras-style model, it prints the table itself
                # and returns None, so wrapping it in print() only added a stray "None" line
                model.summary()

                # common stem of every file written for this run
                run_tag = f"{cell_dir}_{img_type}_{app_dir}_{tl_dir}"

                # set paths to save checkpoints and tensorboard data
                check_path = f"model/{app_dir}_checkpoint/{run_tag}.hdf5"
                tb_path = f"src/tf_logs/{app_dir}_tl_logs/{run_tag}"
                checkpoint, tensorboard = create_callbacks(check_path, tb_path)

                print(f"\n====Started training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

                # no learning_rate is passed here, so train_model's own default applies
                best_model = train_model(model, train_gen, val_gen, checkpoint, tensorboard, loss = 'mae', epochs = 70)
                clear_output(wait = True)

                print(f"\n====Finished training model for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

                # generate predictions on train and val
                path = "src/pred_df/"
                pred_dir = f"{app_dir}_{tl_dir}_preds"  # not named `dir`, which would shadow the builtin for the rest of the notebook
                file_names = [f"{run_tag}_train.csv", f"{run_tag}_test.csv"]

                print(f"\n====Started generating predictions for {app_dir}, {tl_dir}, {img_type}, {cell_dir}====")

                # NOTE(review): meaning of the positional 30 is defined by train_func.generate_predictions — confirm
                # the returned values are not needed here; predictions are written to disk because save=True
                generate_predictions(data, best_model, 30, train_gen, val_gen, save = True, path=path, dir = pred_dir, file_names=file_names)

