
Originally created by Chelsey McGowan-Yallop, SAMS-UHI (sa06cm@sams.ac.uk)

Modified by Muriel Dunn for fish mix analysis

This script uses model-predicted TS(f) spectra to train a machine learning
classifier, performs nested cross-validation, applies the classifier to
measured TS(f) spectra and outputs results files.

To use a different classifier, see the list of supported classifiers at:
https://github.com/hyperopt/hyperopt-sklearn and set as clf.

Sometimes the initial hyperparameter configuration selected by HyperOpt in each
split in the outer loop will be unsuccessful and all trials will fail. The
retry decorator forces it to try again until retry_limit is reached.

OUTPUT FILES:
    _NestedCV.pkl contains results of nested cross-validation procedure
    _Predictions.pkl contains measured TS(f) spectra with predicted labels
    _BestParams.pkl contains the optimal hyperparameters for the model
"""

In [217]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import scipy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
#from sklearn.neighbors import KNeighborsClassifier
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, k_neighbors_classifier, svc, lightgbm_classification
import lightgbm
from datetime import timedelta, date
from tenacity import retry, stop_after_attempt

import sys, errno  


# Read the dataframes

In [235]:
# Data directories for each experiment, built from the base directory `path`
# (original note: set up to open the data on Stokes).
# NOTE(review): `path` is assigned in the user-parameters cell further down
# the file, so that cell has to be run before this one.
ppath = f'{path}Pcod_190123/select/'
p2path = f'{path}Pcod2_240123/select/'
apath = f'{path}Acod_200123/select/'
pbpath = f'{path}Pandalus_260123/select/'
# NOTE(review): kpath is not referenced by any of the visible cells.
kpath = f'{path}Krill_270123/'

In [236]:
# Load one `_120` and one `_200` dataframe per experiment from feather files.
a_df_120 = pd.read_feather(apath+'a_df_120.feather')
a_df_200 = pd.read_feather(apath+'a_df_200.feather')

p_df_120 = pd.read_feather(ppath+'p_df_120.feather')
p_df_200 = pd.read_feather(ppath+'p_df_200.feather')

# NOTE(review): the p2 frames are loaded here but are not part of the
# pd.concat calls that build df_120 / df_200 below - confirm this is intended.
p2_df_120 = pd.read_feather(p2path+'p2_df_120.feather')
p2_df_200 = pd.read_feather(p2path+'p2_df_200.feather')

pb_df_120 = pd.read_feather(pbpath+'pb_df_120.feather')
pb_df_200 = pd.read_feather(pbpath+'pb_df_200.feather')

## Organize the data into a single df

In [237]:
# Stack the a, p and pb `_120` frames into one dataframe (p2 is not included).
df_120 = pd.concat([a_df_120,p_df_120,pb_df_120])
#df_120 = df_120.reset_index(drop=True)
# Snapshot as a NumPy array BEFORE the 'Species_le' column is added below, so
# the array only holds the original columns.
fish_120 = df_120.to_numpy()

# -- WRANGLE DATA ---------------------------------------------------------

le = LabelEncoder() # Maps labels -> int (e.g. Atlantic cod -> 0, Polar cod -> 1)
df_120['Species_le'] = le.fit_transform(df_120.Species)
# Every column except the last one of the snapshot.
# NOTE(review): presumably the last column is 'Species' - confirm.
X_120 = fish_120[:,:-1] # Features, TS(f) only
y_120 = df_120['Species_le'].to_numpy() # Labels



# Same steps for the `_200` frames.
df_200 = pd.concat([a_df_200,p_df_200,pb_df_200])
#df_200 = df_200.reset_index(drop=True)
fish_200 = df_200.to_numpy()

# -- WRANGLE DATA ---------------------------------------------------------

# This rebinds the module-level `le`. nested_cv looks `le` up when it runs, so
# it uses this second encoder for the `_120` data as well.
# NOTE(review): that is only safe if both frames contain the same set of
# Species values - confirm.
le = LabelEncoder() # Maps labels -> int (e.g. Atlantic cod -> 0, Polar cod -> 1)
df_200['Species_le'] = le.fit_transform(df_200.Species)
X_200 = fish_200[:,:-1] # Features, TS(f) only
y_200 = df_200['Species_le'].to_numpy() # Labels



# Classification functions

In [232]:
#@retry(stop=stop_after_attempt(retry_limit))
def nested_cv(X, y, model, n_splits, n_folds, unique_id):
    """
    Run the outer loop of a nested cross-validation and save the results.

    The data are split with a shuffled StratifiedKFold (random_state=42).
    For each outer fold the model is fitted on the training part (passing
    n_folds and cv_shuffle=True to model.fit), used to predict the test part,
    and scored with the weighted F1 score, the per-class F1 scores and a
    confusion matrix. One row per entry in model.trials.trials is appended to
    the results and the whole table is pickled after every fold.

    If the results file already exists, the run resumes after the highest
    fold number stored in it. If that file already covers every fold, the
    function reports this and returns without fitting anything.

    Relies on two module-level names that are looked up at call time:
    classifypath (output directory prefix) and le (the fitted LabelEncoder
    used to turn the integer labels back into their original values).

    PARAMETERS:
        X: data minus labels (indexable by integer index arrays)
        y: labels (indexable by integer index arrays)
        model: HyperoptEstimator object
        n_splits: # of splits to use in outer K-fold cross-validation
        n_folds: # of folds to use in inner K-fold cross-validation
        unique_id: Unique name string for file output path

    RETURNS:
        None. Results are written to classifypath + unique_id + '_NestedCV.pkl'.
    """

    columns = ['Outer_fold',
               'Outer_score',
               'Outer_unweighted_scores',
               'Outer_unweighted_score_labels',
               'Outer_confusion_matrix',
               'Outer_training_duration',
               'Trial',
               'Trial_loss',
               'Trial_duration']

    outer_cv = StratifiedKFold(n_splits=n_splits,
                               shuffle=True,
                               random_state=42) # Outer CV

    # Fixed random_state, so a resumed run gets the same folds as before.
    folds = list(outer_cv.split(X, y))

    cv_path = classifypath + unique_id + '_NestedCV.pkl'
    results_df = None
    i_start = 0

    if os.path.isfile(cv_path): # If CV is incomplete, resume
        results_df = pd.read_pickle(cv_path)
        i_start = results_df.Outer_fold.max() + 1
        if i_start >= len(folds):
            # Nothing left to do; avoid announcing a non-existent fold.
            print('Cross-validation already complete: ' + cv_path)
            return
        print('Resuming cross-validation from fold ' + str(i_start + 1))

    # For each remaining fold...
    for i in range(i_start, len(folds)):
        print('Beginning fold ' + str(i+1) + ' of ' + str(len(folds)))

        # Split data into training and test sets
        train_idx, test_idx = folds[i]
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        start = time.time()

        # Fit the HyperoptEstimator to training data (optimise model)
        model.fit(X_train,
                  y_train,
                  n_folds=n_folds, # Inner stratified k-fold CV
                  cv_shuffle=True)

        duration = time.time() - start

        # Use optimised model to predict labels for test data
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='weighted') # Evaluate

        # Everything below: formats and/or calculates results for output file
        sorted_labels = np.unique(y_test) # np.unique already returns sorted values
        unweighted_score = f1_score(y_test, y_pred,
                                    average=None,
                                    labels=sorted_labels)
        c_matrix = confusion_matrix(y_test, y_pred,
                                    labels=sorted_labels)
        label_names = le.inverse_transform(sorted_labels)

        # One output row per trial; failed trials get NaN loss/duration.
        results_list = []
        for trial_no, trial in enumerate(model.trials.trials):
            result = trial.get('result')
            if result.get('status') == 'ok':
                trial_loss = result.get('loss')
                trial_duration = result.get('duration')
            else:
                trial_loss = np.nan
                trial_duration = np.nan

            results_list.append([i,
                                 score,
                                 unweighted_score,
                                 label_names,
                                 c_matrix,
                                 duration,
                                 trial_no,
                                 trial_loss,
                                 trial_duration])

        append_df = pd.DataFrame(results_list, columns=columns)

        # Keep the accumulated table in memory and rewrite the file after each
        # fold, so an interrupted run can be resumed from the last saved fold.
        if results_df is None:
            results_df = append_df
        else:
            results_df = pd.concat([results_df, append_df],
                                   ignore_index=True)
        results_df.to_pickle(cv_path)
     

In [227]:
def f1_loss(y_true, y_pred):
    """
    Loss function intended for HyperOpt: one minus the weighted F1 score.

    A perfect prediction gives 0.0; lower is better. In the visible code it
    is only referenced by the commented-out loss_fn arguments.

    PARAMETERS:
        y_true: true labels
        y_pred: predicted labels
    """
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return 1.0 - weighted_f1
    

# Script the classifier

In [238]:
def main_classify(X, y, clf, unique_id, preprocessing, path=None):
    """
    Cross-validate a classifier, refit it on all data and save the best model.

    Steps:
        1. Build a HyperoptEstimator and run nested_cv on (X, y).
        2. Build a fresh HyperoptEstimator and fit it on the full dataset.
        3. Pickle model.best_model() to path + unique_id + '_BestParams.pkl'.

    Reads these module-level settings at call time: ex_preprocessing, timeout,
    max_evals, n_jobs, n_splits, n_folds and (when path is None) classifypath.

    PARAMETERS:
        X: data minus labels
        y: labels
        clf: hpsklearn classifier component passed to HyperoptEstimator
        unique_id: Unique name string for file output paths
        preprocessing: list passed to HyperoptEstimator as `preprocessing`
        path: output directory prefix for the _BestParams.pkl file; defaults
            to the current value of the module-level classifypath. Note that
            nested_cv always writes under classifypath.
    """
    # Resolve the default here rather than in the signature, so the function
    # can be defined before classifypath exists and always sees its current value.
    if path is None:
        path = classifypath

    def build_estimator():
        # Both estimators are configured identically.
        # loss_fn = f1_loss is currently not passed.
        return HyperoptEstimator(classifier = clf,
                                 preprocessing = preprocessing,
                                 ex_preprocs = ex_preprocessing,
                                 algo = tpe.suggest,
                                 trial_timeout = timeout,
                                 #loss_fn = f1_loss,
                                 max_evals = max_evals,
                                 n_jobs = n_jobs)

    # -- NESTED CROSS-VALIDATION ----------------------------------------------

    nested_cv(X, y, build_estimator(), n_splits, n_folds, unique_id)

    # -- RETRAIN MODEL --------------------------------------------------------

    print('Retraining model on full dataset')

    model = build_estimator()
    model.fit(X, y, n_folds=n_folds, cv_shuffle=True)

    # -- PREDICT CLASSES FOR NEW DATA -----------------------------------------
    # Currently disabled: no measured spectra are classified and no
    # _Predictions.pkl file is written.

    #print('Classifying new data')

    #y_pred = model.predict(measured_X) # Predict classes for measured TS(f)
    #y_pred = le.inverse_transform(y_pred) # Transform labels back to species

    # -- OUTPUT RESULTS -------------------------------------------------------

    #df_120['Prediction'] = y_pred
    #df_120.to_pickle(path + unique_id + '_Predictions.pkl')

    with open(path + unique_id + '_BestParams.pkl', 'wb') as handle:
        pickle.dump(model.best_model(), handle)




# Run different iterations
### No processing

In [248]:
# -- USER-DETERMINED PARAMETERS -----------------------------------------------
# These module-level names are read by the data-path cell (path), nested_cv
# (classifypath) and main_classify (the cross-validation settings below).
path = 'F:/AFKABAN/'
classifypath = 'F:/AFKABAN/Classify/'
# Today's date (dd-mm-YYYY); it is embedded in every unique_id below, so the
# output file names change from day to day.
d1 = date.today().strftime("%d-%m-%Y")

# NESTED CROSS-VALIDATION
preprocessing = [] # List of sklearn pre-processing modules
ex_preprocessing = [] # As above, see help(HyperoptEstimator) for info
n_splits = 10 # Value of k for k-fold cross-validation in outer loop
n_folds = 10 # Value of k for k-fold cross-validation in inner loop
max_evals = 50 # No. of HyperOpt trials
timeout = 300 # HyperOpt trial timeout (seconds)
n_jobs = -1 # No. of jobs to run in parallel; -1 uses all processors
# NOTE(review): only referenced by the commented-out @retry decorator on nested_cv.
retry_limit = 3 # No. of times to retry before failing

In [None]:
# CLASSIFIER kNN 120
# The unique_id contains today's date, so nested_cv only resumes a previous
# run if its results file was written under the same date string.
unique_id = 'kNN_'+ d1 +'_120' # Unique ID for output file paths
clf = k_neighbors_classifier(unique_id)  # Classifier
main_classify(X_120, y_120, clf, unique_id, [])

Resuming cross-validation from fold 4
Beginning fold 4 of 10
100%|█████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.21s/trial, best loss: 0.09470752089136492]
100%|█████████████████████████████████████████████████| 2/2 [00:05<00:00,  5.23s/trial, best loss: 0.09470752089136492]
100%|█████████████████████████████████████████████████| 3/3 [00:04<00:00,  4.94s/trial, best loss: 0.07381615598885793]
 75%|█████████████████████████████████████████████████████████                   | 3/4 [00:00<?, ?trial/s, best loss=?]

In [None]:
# CLASSIFIER kNN 200
unique_id = 'kNN_'+ d1 +'_200' # Unique ID for output file paths
clf = k_neighbors_classifier(unique_id)  # Classifier

# Features and labels must both come from the `_200` dataset
# (the original passed the undefined name X_20).
main_classify(X_200, y_200, clf, unique_id, [])

In [None]:
# CLASSIFIER LightGBM 120
unique_id = 'lightGBM_'+ d1 +'_120' # Unique ID for output file paths
clf = lightgbm_classification(unique_id)  # Classifier
main_classify(X_120, y_120, clf, unique_id, [])

In [None]:
# CLASSIFIER LightGBM 200
unique_id = 'lightGBM_'+ d1 +'_200' # Unique ID for output file paths
clf = lightgbm_classification(unique_id)  # Classifier

# Features and labels must both come from the `_200` dataset
# (the original passed the undefined name X_20).
main_classify(X_200, y_200, clf, unique_id, [])