# Summary
This script uses the output of the bottleneck (pooled last convolutional) layer of one or more pretrained models as the training input for various machine learning classifiers.

In [None]:
import os
from time import time

import numpy as np
import pandas as pd

import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.imagenet_utils import preprocess_input

# CROSS VALIDATION
from sklearn.model_selection import StratifiedKFold

# Evaluation metrics
from sklearn.metrics import accuracy_score, recall_score

### Changeable section

In [None]:
# Import additional scikit learn model here as necessary
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Extra base model
import sys
sys.path.append("../ResNet152")
from resnet152 import ResNet152

In [None]:
# Parameters to be changed
# Mostly similar to Util/T_SNE_plot, since it uses the same ideas

"""
Model Parameter
Since this script does not require anything beyond convolutional layer, we can just use the pre-trained one
TODO: In the future if we are going to test fine-tuned model, I will modify this script
"""
# Base model constructors; list as many as necessary
BaseModels = [
    keras.applications.inception_resnet_v2.InceptionResNetV2,
    keras.applications.xception.Xception,
    keras.applications.inception_v3.InceptionV3,
    ResNet152,
]

# Preprocessing function for each base model, in the same order as BaseModels
# (entry i here is applied to the images fed into BaseModels[i])
BaseModelPreprocessings = [
    keras.applications.inception_resnet_v2.preprocess_input,
    keras.applications.xception.preprocess_input,
    keras.applications.inception_v3.preprocess_input,
    preprocess_input,
]

# No custom preprocessing is needed, use the one that comes with the model itself
# Make sure this matches the one in BaseModel
# TODO: Does not work well with jpeg? Resulted in an error when used with the generator
# For now just use a pre-determined rescale value instead
# model_preprocess_func = keras.applications.inception_resnet_v2.preprocess_input

# Shape passed to the base models; its first two entries are also the generator target size
INPUT_SHAPE = (480, 480, 3)
# Keep this low when INPUT_SHAPE is large
BATCH_SIZE = 32
# Pooling mode passed to the base models: either 'avg' or 'max'
POOLING = "avg"
# Seed handed to the image generators and to the classifiers
SEED = 5703

# Subset name, embedded in the cache and CSV filenames
# If YOLO is used, just call it something like SS2-YOLO (anything without space or underscore)
# SS_NAME = r'SS1-CV0-Augment10DifferentClass'
SS_NAME = r"SS1-CV0"

# True: use the pre-split train/validation directories; False: stratified k-fold split at runtime
IS_PRE_SPLIT = False

"""
Images Parameter
The one that is used to generate conv output array
"""
IMAGE_DIR = r"D:\Resources\Inat_Partial\Aves_Small_SS1"
# Number of folds the evaluation metrics are averaged over
K_FOLD = 5

"""
Set this parameter instead in case that pre-split data is preferred over on-the run stratified split
For example, If we want to use augmented training data to predict the normal validation
"""
TRAIN_DIR = r"D:\Resources\Inat_Partial\Aves_Small_SS1_Augmented10DifferentClass\CV_0"
VAL_DIR = r"D:\Resources\Inat_Partial\Aves_Small_SS1_Validation\CV_0"

"""
Directory of the Convolutional output file, 
better to have so the pre-computed numpy array we don't have to rerun the predicition every single time
if the file not exist in the directory, this script will simply save the conv output in the pre-determined path
Name of the file will be generated based on the model name, subset name, and input shape
"""
CONV_OUTPUT_DIR = r"D:\Workspace\Jupyter\COMP5703\rpur7902_local\Resources"

In [None]:
# CHANGE this
# Classifiers under test; add entries here and always set random_state
classifiers = [
    LogisticRegression(C=0.1, solver='lbfgs', multi_class='multinomial', random_state=SEED),
    # SVC(C=0.1, random_state=SEED),
    # RandomForestClassifier(n_estimators=16, max_depth=16, random_state=SEED),
]

### Unchanged territory

In [None]:
# Build the cache file paths (features and labels) for every base model,
# plus the path of the CSV report.
# Cache filenames follow <model>_<subset>[_training|_validation]_<shape0>_<shape1>[_label].npy

# Whole-set caches (used when IS_PRE_SPLIT is false)
conv_output_paths = []
conv_label_paths = []
# Pre-split caches (used when IS_PRE_SPLIT is true)
conv_output_training_paths = []
conv_output_validation_paths = []
conv_label_training_paths = []
conv_label_validation_paths = []
# Model names, joined later into the CSV filename
base_model_names = []

# Trailing filename tokens shared by every cache file
shape_tokens = [str(INPUT_SHAPE[0]), str(INPUT_SHAPE[1])]

for BaseModel in BaseModels:
    model_name = BaseModel.__name__
    if IS_PRE_SPLIT:
        training_stem = '_'.join([model_name, SS_NAME, 'training'] + shape_tokens)
        validation_stem = '_'.join([model_name, SS_NAME, 'validation'] + shape_tokens)
        conv_output_training_paths.append(os.path.join(CONV_OUTPUT_DIR, training_stem + '.npy'))
        conv_output_validation_paths.append(os.path.join(CONV_OUTPUT_DIR, validation_stem + '.npy'))
        conv_label_training_paths.append(os.path.join(CONV_OUTPUT_DIR, training_stem + '_label.npy'))
        conv_label_validation_paths.append(os.path.join(CONV_OUTPUT_DIR, validation_stem + '_label.npy'))
    else:
        whole_set_stem = '_'.join([model_name, SS_NAME] + shape_tokens)
        conv_output_paths.append(os.path.join(CONV_OUTPUT_DIR, whole_set_stem + '.npy'))
        conv_label_paths.append(os.path.join(CONV_OUTPUT_DIR, whole_set_stem + '_label.npy'))
    base_model_names.append(str(model_name))

# Report name: <subset>_<shape0>_<sorted model names>_output.csv in the working directory
output_csv_path = './{}_{}_{}_output.csv'.format(
    SS_NAME, INPUT_SHAPE[0], '_'.join(sorted(base_model_names)))

In [None]:
# Give every classifier a unique display name of the form <ClassName>_<n>,
# where n is the smallest non-negative integer not already taken for that class.
clf_names = []
for clf in classifiers:
    class_name = clf.__class__.__name__
    suffix = 0
    candidate = class_name + '_' + str(suffix)
    while candidate in clf_names:
        suffix += 1
        candidate = class_name + '_' + str(suffix)
    clf_names.append(candidate)

# Hyper-parameters of each classifier, in the same order as clf_names
clf_params = [clf.get_params() for clf in classifiers]

### Mode 1, whole image set, cross validated during runtime

In [None]:
# Mode 1 feature extraction: build one feature matrix X (and label vector y) for the whole image set.
# Each base model yields a prediction array of size (num_images, last_convolutional_layer_depth);
# the per-model arrays are stacked horizontally into X.
# No prediction is needed when the output is already saved: features and labels are cached as .npy files.

if not IS_PRE_SPLIT:
    X = None
    y = None
    for BaseModel, preprocess_function, conv_output_path, conv_label_path in zip(
            BaseModels, BaseModelPreprocessings, conv_output_paths, conv_label_paths):

        print('Generating features from model',BaseModel.__name__)
        # Trust the cache only when BOTH the feature and the label file exist;
        # otherwise regenerate the pair instead of failing on a missing label file.
        if os.path.isfile(conv_output_path) and os.path.isfile(conv_label_path):
            X_cur_model = np.load(conv_output_path)
            y_cur_model = np.load(conv_label_path)
        else:
            # Loading model, the slow process
            model_notop = BaseModel(include_top=False, weights='imagenet', input_shape=INPUT_SHAPE, pooling=POOLING)

            # Generator preparation
            datagen = ImageDataGenerator(preprocessing_function=preprocess_function)

            generator = datagen.flow_from_directory(IMAGE_DIR,
                                                    target_size=INPUT_SHAPE[:2],
                                                    class_mode='categorical',
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False,
                                                    seed=SEED)
            # Predicting 3k images is slow, don't run this without GPU
            X_cur_model = model_notop.predict_generator(generator)
            # Only works if shuffle is false, because generator.classes just lists the class labels in order
            y_cur_model = generator.classes
            np.save(conv_output_path, X_cur_model)
            np.save(conv_label_path, y_cur_model)

        # Append this model's features as extra columns
        X = X_cur_model if X is None else np.hstack([X, X_cur_model])
        # Labels are expected to be the same for every model, so the latest copy is kept as is
        y = y_cur_model

In [None]:
# Mode 1 evaluation: score every classifier on (X, y) with stratified K-fold cross validation
# and record the mean / standard deviation of accuracy and macro recall over the folds.
if not IS_PRE_SPLIT:
    clf_trainingtime = []
    clf_val_accuracy_mean = []
    clf_val_accuracy_std = []
    clf_val_recallmacro_mean = []
    clf_val_recallmacro_std = []

    for position, clf in enumerate(classifiers):
        print('Testing', clf_names[position])
        cur_clf_trainingtime = 0
        fold_accuracies = []
        fold_recalls = []

        # Better not shuffle the data, to match it with the transfer learning
        splitter = StratifiedKFold(n_splits=K_FOLD, shuffle=False)
        for train_rows, val_rows in splitter.split(X, y):
            X_train, y_train = X[train_rows], y[train_rows]
            X_val, y_val = X[val_rows], y[val_rows]

            fit_started = time()
            clf.fit(X_train, y_train)
            fit_seconds = time() - fit_started
            # Only the first training time is kept
            if cur_clf_trainingtime == 0:
                cur_clf_trainingtime = fit_seconds

            y_pred = clf.predict(X_val)
            fold_accuracies.append(accuracy_score(y_val, y_pred))
            fold_recalls.append(recall_score(y_val, y_pred, average='macro'))

        clf_trainingtime.append(cur_clf_trainingtime)
        clf_val_accuracy_mean.append(np.mean(fold_accuracies))
        clf_val_accuracy_std.append(np.std(fold_accuracies))
        clf_val_recallmacro_mean.append(np.mean(fold_recalls))
        clf_val_recallmacro_std.append(np.std(fold_recalls))

### Mode 2, set manually split into train and validation
Required when the training and validation data are prepared differently (e.g. the training set is augmented but the validation set is not).

In [None]:
# Mode 2 feature extraction: build separate feature matrices and label vectors for the
# pre-split training (TRAIN_DIR) and validation (VAL_DIR) image sets.
# Per-model features are stacked horizontally; features and labels are cached as .npy files.
if IS_PRE_SPLIT:
    X_train = None
    X_val = None
    y_train = None
    y_val = None
    for (BaseModel, preprocess_function,
         conv_output_training_path, conv_output_validation_path,
         conv_label_training_path, conv_label_validation_path) in zip(
            BaseModels, BaseModelPreprocessings,
            conv_output_training_paths, conv_output_validation_paths,
            conv_label_training_paths, conv_label_validation_paths):

        print('Generating features from model',BaseModel.__name__)
        # Trust the cache only when all four files (features and labels, train and validation)
        # exist; otherwise regenerate them instead of failing on a missing label file.
        cache_files = (conv_output_training_path, conv_output_validation_path,
                       conv_label_training_path, conv_label_validation_path)
        if all(os.path.isfile(cache_file) for cache_file in cache_files):
            X_train_cur_model = np.load(conv_output_training_path)
            X_val_cur_model = np.load(conv_output_validation_path)
            y_train_cur_model = np.load(conv_label_training_path)
            y_val_cur_model = np.load(conv_label_validation_path)
        else:
            # Loading model, the slow process
            model_notop = BaseModel(include_top=False, weights='imagenet', input_shape=INPUT_SHAPE, pooling=POOLING)

            # Generator preparation
            datagen = ImageDataGenerator(preprocessing_function=preprocess_function)

            train_generator = datagen.flow_from_directory(TRAIN_DIR,
                                                    target_size=INPUT_SHAPE[:2],
                                                    class_mode='categorical',
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False,
                                                    seed=SEED)
            val_generator = datagen.flow_from_directory(VAL_DIR,
                                                    target_size=INPUT_SHAPE[:2],
                                                    class_mode='categorical',
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False,
                                                    seed=SEED)
            X_train_cur_model = model_notop.predict_generator(train_generator)
            X_val_cur_model = model_notop.predict_generator(val_generator)
            # Only works if shuffle is false, because generator.classes just lists the class labels in order
            y_train_cur_model = train_generator.classes
            y_val_cur_model = val_generator.classes
            np.save(conv_output_training_path, X_train_cur_model)
            np.save(conv_output_validation_path, X_val_cur_model)
            np.save(conv_label_training_path, y_train_cur_model)
            np.save(conv_label_validation_path, y_val_cur_model)

        # Append this model's features as extra columns
        X_train = X_train_cur_model if X_train is None else np.hstack([X_train, X_train_cur_model])
        X_val = X_val_cur_model if X_val is None else np.hstack([X_val, X_val_cur_model])
        # Labels are expected to be the same for every model, so the latest copy is kept as is
        y_train = y_train_cur_model
        y_val = y_val_cur_model

In [None]:
# Mode 2 evaluation: fit every classifier once on the pre-split training features and
# score it on the validation features. There is a single split, so each "mean" is the
# one score itself and each "std" is computed over that single value.
if IS_PRE_SPLIT:
    clf_trainingtime = []
    clf_val_accuracy_mean = []
    clf_val_accuracy_std = []
    clf_val_recallmacro_mean = []
    clf_val_recallmacro_std = []

    for position, clf in enumerate(classifiers):
        print('Testing', clf_names[position])

        # In mode 2, train and val are already precomputed
        fit_started = time()
        clf.fit(X_train, y_train)
        cur_clf_trainingtime = time() - fit_started

        y_pred = clf.predict(X_val)
        split_accuracies = [accuracy_score(y_val, y_pred)]
        split_recalls = [recall_score(y_val, y_pred, average='macro')]

        clf_trainingtime.append(cur_clf_trainingtime)
        clf_val_accuracy_mean.append(np.mean(split_accuracies))
        clf_val_accuracy_std.append(np.std(split_accuracies))
        clf_val_recallmacro_mean.append(np.mean(split_recalls))
        clf_val_recallmacro_std.append(np.std(split_recalls))

## Saving the record

In [None]:
# Dataframe to be saved as csv
result_df = pd.DataFrame(columns=['clf_name','clf_params','clf_trainingtime',
                                  'clf_val_accuracy_mean','clf_val_accuracy_std',
                                  'clf_val_recallmacro_mean','clf_val_recallmacro_std'])
result_df.set_index('clf_name', inplace=True)

In [None]:
# Add one row per classifier: stringified parameters followed by the recorded metrics,
# in the same order as the result_df columns.
metric_lists = (clf_trainingtime,
                clf_val_accuracy_mean, clf_val_accuracy_std,
                clf_val_recallmacro_mean, clf_val_recallmacro_std)
for position, row_name in enumerate(clf_names):
    row_values = [str(clf_params[position])]
    row_values.extend(metric[position] for metric in metric_lists)
    result_df.loc[row_name] = row_values

In [None]:
# Write the results table to the CSV report at output_csv_path
result_df.to_csv(path_or_buf=output_csv_path)