
Originally created by Chelsey McGowan-Yallop, SAMS-UHI (sa06cm@sams.ac.uk)

Modified by Muriel Dunn for fish mix analysis

This script uses model-predicted TS(f) spectra to train a machine learning
classifier, performs nested cross-validation, applies the classifier to
measured TS(f) spectra and outputs results files.

To use a different classifier, see the list of supported classifiers at:
https://github.com/hyperopt/hyperopt-sklearn and set as clf.

Sometimes the initial hyperparameter configuration selected by HyperOpt in each
split in the outer loop will be unsuccessful and all trials will fail. The
retry decorator forces it to try again until retry_limit is reached.

OUTPUT FILES:
    _NestedCV.pkl contains results of nested cross-validation procedure
    _Predictions.pkl contains measured TS(f) spectra with predicted labels
    _BestParams.pkl contains the optimal hyperparameters for the model
"""

In [2]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tenacity import retry, stop_after_attempt

import sys, errno  


In [9]:
# -- USER-DETERMINED PARAMETERS -----------------------------------------------
# Everything in this section is read as a module-level name by the functions
# further down, so edit values here rather than inside the functions.
path = 'C:/Users/mbd/OneDrive - Akvaplan-niva AS/PhD-APN/ChaptersandExperiments/AZKABAN-light/ZoopMix_paper/'
# .CSV FILES FROM ECHOVIEW
ts_SED_path = path+"SED_ZoopMix_FTwindow33pl.csv" # Path to Echoview TS(f) file
# The two track exports previously shared one variable name, so the first path
# was silently overwritten; they now have distinct names.
ts_tracks_path = path+"SED_ZoopMix_FTwindow33pl_tracks.csv" # Path to Echoview TS(f) file (tracks)
ts_trackavg_path = path+"SED_ZoopMix_FTwindow33pl_tracks-averaged.csv" # Path to Echoview TS(f) file (tracks, averaged)


# SCATTERING MODEL
# NOTE: main() reads model_path and n_models_per_species; uncomment and set
# them before calling main().
#model_path = path+"AZKABAN_ZoopMix_data_newcopepod.feather" # Path to model outputs
#n_models_per_species = 1000 # No. of models per species

# CLASSIFIER
unique_id = '02-01-2023_kNN_AZKABAN' # Unique ID for output file paths
# HyperoptEstimator needs a hyperopt-sklearn search space, not a bare sklearn
# estimator: the first argument of the hpsklearn constructors is a label for
# the search space. Passing unique_id to sklearn's KNeighborsClassifier would
# set n_neighbors to a string and every trial would fail.
try:
    from hpsklearn import k_neighbors_classifier as _knn_search_space  # hpsklearn >= 1.0
except ImportError:
    from hpsklearn import knn as _knn_search_space  # older hpsklearn releases
clf = _knn_search_space(unique_id)  # Classifier

# NESTED CROSS-VALIDATION
preprocessing = [] # List of sklearn pre-processing modules
ex_preprocessing = [] # As above, see help(HyperoptEstimator) for info
n_splits = 10 # Value of k for k-fold cross-validation in outer loop
n_folds = 10 # Value of k for k-fold cross-validation in inner loop
max_evals = 50 # No. of HyperOpt trials
timeout = 300 # HyperOpt trial timeout (seconds)
n_jobs = -1 # No. of jobs to run in parallel; -1 uses all processors
retry_limit = 10 # No. of times to retry before failing

In [6]:
# -- PUT THE FUN IN FUNCTIONS -------------------------------------------------

def tsf_import(file_path):

    """
    This function imports wideband frequency response .csv files exported from
    Echoview and performs some basic housekeeping.

    The export is transposed once and cached next to the original file as
    '<name>_transposed.csv'; later calls read the cached file directly.
    The Ping_date, Ping_time and Ping_milliseconds columns are merged into a
    single 'Datetime' column (appended as the last column) and then dropped.

    PARAMETERS:
        file_path: Path to wideband frequency response .csv file from Echoview.

    RETURNS:
        df: Pandas dataframe indexed by Target_index.
    """

    def read_transposed(csv_path):
        """
        Read a transposed TS(f) file. The first line (column numbers written
        by the transpose) and the last two rows are skipped.
        """
        # skipfooter is only supported by the python parsing engine.
        return pd.read_csv(csv_path,
                           index_col='Target_index',
                           skiprows=1,
                           skipfooter=2,
                           engine='python')

    def get_datetime(df):
        """
        Convert Echoview timestamps in imported .csv to datetime.
        """
        # Date and time are combined explicitly rather than through
        # read_csv(parse_dates=[[...]]), which recent pandas deprecates.
        stamp = (df['Ping_date'].astype(str).str.strip()
                 + ' '
                 + df['Ping_time'].astype(str).str.strip())
        # Vectorised millisecond offset; missing values become NaT instead of
        # raising as datetime.timedelta would.
        offset = pd.to_timedelta(df['Ping_milliseconds'], unit='ms')
        df['Datetime'] = pd.to_datetime(stamp) + offset
        df.drop(columns=['Ping_date',
                         'Ping_time',
                         'Ping_milliseconds'],
                inplace=True)

        return df

    # splitext copes with extensions that are not exactly four characters.
    transposed_path = os.path.splitext(file_path)[0] + '_transposed.csv'

    if not os.path.isfile(transposed_path):
        raw = pd.read_csv(file_path, low_memory=False).T
        raw.to_csv(path_or_buf=transposed_path)
        print('A transposed TS(f) file was created at: \n' + str(transposed_path))

    return get_datetime(read_transposed(transposed_path))

In [4]:
@retry(stop=stop_after_attempt(retry_limit))
def nested_cv(X, y, model, n_splits, n_folds, unique_id, label_encoder=None):

    """
    This function performs nested cross-validation with Bayesian hyperparameter
    optimisation. It uses stratified k-fold cross-validation in both the inner
    and outer loops. After each outer loop, it outputs the results to a .pkl
    file. As there is an element of randomness to the optimisation procedure,
    sometimes all trials will fail. The retry decorator then calls the
    function again (up to retry_limit attempts); each attempt, and any later
    re-run of the script, imports the incomplete .pkl file and resumes from
    the first outer fold that has not been saved yet.

    Note that this is a modified version that uses F1 score as the evaluation
    metric. It also calculates class-specific F1 scores and confusion matrices,
    which are added to the output dataframe.

    PARAMETERS:
        X: data minus labels (pandas DataFrame)
        y: labels (pandas Series)
        model: HyperoptEstimator object
        n_splits: # of splits to use in outer K-fold cross-validation
        n_folds: # of folds to use in inner K-fold cross-validation
        unique_id: Unique name string for file output path
        label_encoder: Optional fitted encoder with an inverse_transform()
            method, used to store readable class names next to the per-class
            scores. If omitted, a module-level object named `le` is used when
            one exists; otherwise the encoded labels are stored as they are.

    RETURNS:
        None. Results are written to '<unique_id>_NestedCV.pkl', one row per
        HyperOpt trial per outer fold.
    """

    cv = StratifiedKFold(n_splits=n_splits,
                         shuffle=True,
                         random_state=42) # Outer CV

    cv_path = unique_id + '_NestedCV.pkl'

    # The encoder used to be reached through a bare global name, which raises
    # NameError when it only exists inside another function. Resolve it once,
    # explicitly, and degrade gracefully when none is available.
    encoder = label_encoder
    if encoder is None:
        encoder = globals().get('le')

    i_start = 0
    results_df = None

    if os.path.isfile(cv_path): # If CV is incomplete, resume
        # Checkpoint written by this function itself; do not point cv_path at
        # untrusted pickle files.
        results_df = pd.read_pickle(cv_path)
        i_start = results_df.Outer_fold.max() + 1
        print('Resuming cross-validation from fold ' + str(i_start + 1))

    # Generate the (train, test) positions for every outer fold up front.
    # random_state is fixed, so a resumed run sees the same splits.
    i_list = list(cv.split(X, y))

    # For each fold...
    for i in range(i_start, len(i_list)):
        train_pos, test_pos = i_list[i]
        results_list = []
        print('Beginning fold ' + str(i+1) + ' of ' + str(len(i_list)))

        # Split data into training and test sets. cv.split() yields integer
        # positions, so select by position; label-based lookup is only
        # correct when the index happens to be 0..n-1.
        X_train = X.iloc[train_pos]
        y_train = y.iloc[train_pos]
        X_test = X.iloc[test_pos]
        y_test = y.iloc[test_pos]

        start = time.time()

        # Fit the HyperoptEstimator to training data (optimise model)
        model.fit(X_train,
                  y_train,
                  n_folds=n_folds, # Inner stratified k-fold CV
                  cv_shuffle=True)

        end = time.time()
        duration = end - start

        # Use optimised model to predict labels for test data
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='weighted') # Evaluate

        # Everything below: formats and/or calculates results for output file
        sorted_labels = np.sort(y_test.unique())
        unweighted_score = f1_score(y_test, y_pred,
                                    average=None,
                                    labels=sorted_labels)
        c_matrix = confusion_matrix(y_test, y_pred,
                                    labels=sorted_labels)

        if encoder is not None:
            score_labels = encoder.inverse_transform(sorted_labels)
        else:
            score_labels = sorted_labels

        # One output row per HyperOpt trial; the outer-fold values are
        # repeated on every row of the fold.
        for trial, trial_info in enumerate(model.trials.trials):
            result = trial_info.get('result')
            if result.get('status') == 'ok':
                trial_loss = result.get('loss')
                trial_duration = result.get('duration')
            else:
                trial_loss = np.nan
                trial_duration = np.nan

            results_list.append([i,
                                 score,
                                 unweighted_score,
                                 score_labels,
                                 c_matrix,
                                 duration,
                                 trial,
                                 trial_loss,
                                 trial_duration])

        append_df = pd.DataFrame(results_list,
                                 columns=['Outer_fold',
                                          'Outer_score',
                                          'Outer_unweighted_scores',
                                          'Outer_unweighted_score_labels',
                                          'Outer_confusion_matrix',
                                          'Outer_training_duration',
                                          'Trial',
                                          'Trial_loss',
                                          'Trial_duration'])

        # Keep the accumulated results in memory and rewrite the checkpoint
        # after every fold, so an interrupted run loses one fold at most.
        if results_df is None:
            results_df = append_df
        else:
            results_df = pd.concat([results_df,
                                    append_df],
                                   ignore_index=True)
        results_df.to_pickle(cv_path)
     

In [5]:
def f1_loss(y_true, y_pred):
    """
    Loss function handed to HyperOpt, which minimises it.

    Returns one minus the weighted-average F1 score of the predictions, so
    that a perfect classification gives a loss of 0. F1 is used here in
    place of the accuracy score.

    PARAMETERS:
        y_true: true labels
        y_pred: predicted labels

    RETURNS:
        Loss value (1.0 - weighted F1).
    """
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return 1.0 - weighted_f1
    

In [6]:

def main():
    """
    Run the full workflow: import the measured and modelled TS(f) spectra,
    run nested cross-validation on the modelled spectra, retrain on the full
    modelled dataset, classify the measured spectra and write the outputs.

    All settings are read from module-level names: tsf_path, model_path,
    n_models_per_species, min_range, max_range, clf, preprocessing,
    ex_preprocessing, timeout, max_evals, n_jobs, n_splits, n_folds and
    unique_id.
    NOTE(review): tsf_path, model_path, n_models_per_species, min_range and
    max_range are not assigned in the parameter section of this file
    (model_path and n_models_per_species are commented out there); define
    them before calling main(). Set min_range / max_range to None to skip
    the corresponding range filter.

    OUTPUT FILES:
        <unique_id>_NestedCV.pkl (written by nested_cv)
        <unique_id>_Predictions.pkl
        <unique_id>_BestParams.pkl
    """
    # nested_cv() looks the label encoder up under the module-level name
    # `le`; binding it globally here keeps that lookup from failing.
    global le

    def build_estimator():
        """Create a fresh HyperoptEstimator from the module-level settings."""
        return HyperoptEstimator(classifier = clf,
                                 preprocessing = preprocessing,
                                 ex_preprocs = ex_preprocessing,
                                 algo = tpe.suggest,
                                 trial_timeout = timeout,
                                 loss = f1_loss,
                                 max_evals = max_evals,
                                 n_jobs = n_jobs)

    # -- IMPORT FILES ---------------------------------------------------------

    measured_df = tsf_import(tsf_path) # Measured TS(f)
    model_df = pd.read_feather(model_path) # Modelled TS(f)

    # -- RESTRUCTURE MODEL DATA -----------------------------------------------

    # Columns 3..-2 of the measured data are the frequency bins; the first
    # three columns and the trailing column are skipped.
    measured_frequency = [float(i) for i in measured_df.columns.values[3:-1]]
    n_model_f_bins = len(model_df.freq.unique()) # No. freq bins in model data
    n_species = len(model_df.spec.unique()) # No. species in model data

    # The model table is long-format: each consecutive run of n_model_f_bins
    # rows is one modelled spectrum. Reshape it to one row per spectrum and
    # take the species label from the first row of each run.
    n_spectra = n_species * n_models_per_species
    n_rows_needed = n_spectra * n_model_f_bins
    if len(model_df) < n_rows_needed:
        raise ValueError('Model data has ' + str(len(model_df)) + ' rows but '
                         + str(n_rows_needed) + ' are needed for '
                         + str(n_spectra) + ' spectra of '
                         + str(n_model_f_bins) + ' frequency bins')

    X_array = model_df.TS.values[:n_rows_needed].reshape(n_spectra,
                                                         n_model_f_bins)
    y_list = model_df.spec.values[:n_rows_needed:n_model_f_bins]

    model_df = pd.DataFrame(X_array, columns=measured_frequency)
    model_df['Species'] = y_list

    # -- WRANGLE DATA ---------------------------------------------------------

    le = LabelEncoder() # Maps labels -> int (e.g. Copepods -> 0, Krill -> 1)
    model_df['Species'] = le.fit_transform(model_df.Species)
    X = model_df.iloc[:, :-1] # Features, TS(f) only
    y = model_df.Species # Labels

    if min_range is not None:
        measured_df = measured_df[measured_df.Range > min_range]

    if max_range is not None:
        measured_df = measured_df[measured_df.Range < max_range]

    measured_X = measured_df.iloc[:, 3:-1] # Features, TS(f) only

    # -- NESTED CROSS-VALIDATION ----------------------------------------------

    model = build_estimator()

    nested_cv(X, y, model, n_splits, n_folds, unique_id)

    # -- RETRAIN MODEL --------------------------------------------------------

    print('Retraining model on full dataset')

    # Start from a fresh estimator so the final model is not tied to the
    # state left behind by the last cross-validation fold.
    model = build_estimator()

    model.fit(X, y, n_folds=n_folds, cv_shuffle=True)

    # -- PREDICT CLASSES FOR NEW DATA -----------------------------------------

    print('Classifying new data')

    y_pred = model.predict(measured_X) # Predict classes for measured TS(f)
    y_pred = le.inverse_transform(y_pred) # Transform labels back to species

    # -- OUTPUT RESULTS -------------------------------------------------------

    # assign() returns a new frame, which avoids writing into a filtered
    # slice of the imported data (SettingWithCopyWarning).
    measured_df = measured_df.assign(Prediction=y_pred)
    measured_df.to_pickle(unique_id + '_Predictions.pkl')

    with open(unique_id + '_BestParams.pkl', 'wb') as handle:
        pickle.dump(model.best_model(), handle)

    
   

In [10]:
# Import the Echoview TS(f) export at ts_SED_path. On the first run this also
# writes the '_transposed.csv' copy reported in the cell output.
measured_df = tsf_import(ts_SED_path)

A transposed TS(f) file was created at: 
C:/Users/mbd/OneDrive - Akvaplan-niva AS/PhD-APN/ChaptersandExperiments/AZKABAN-light/ZoopMix_paper/SED_ZoopMix_FTwindow33pl_transposed.csv


In [11]:
# Display the imported dataframe (notebook cell output).
measured_df

Unnamed: 0_level_0,Ping_index,Range,Depth,185.000,185.500,186.000,186.500,187.000,187.500,188.000,...,251.000,251.500,252.000,252.500,253.000,253.500,254.000,254.500,255.000,Datetime
Target_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1.715900,1.715748,-69.453964,-68.521743,-67.986585,-67.733404,-67.754191,-68.130185,-68.597221,...,-71.595734,-71.547922,-71.147465,-70.542152,-69.911074,-69.383364,-69.019195,-68.878777,-68.918106,2022-01-17 09:45:17.944
1,0,2.191251,2.191001,-77.883264,-78.892482,-78.395912,-77.422381,-76.849307,-77.131158,-78.237995,...,-92.170344,-95.451009,-93.544346,-89.910568,-87.520267,-86.109659,-85.248782,-84.550531,-83.491767,2022-01-17 09:45:17.944
2,1,1.715900,1.715689,-70.599819,-71.358021,-71.912162,-72.177343,-72.210387,-72.189034,-71.994081,...,-69.456054,-69.767954,-70.247306,-70.808972,-71.315128,-71.605663,-71.531900,-71.057052,-70.153700,2022-01-17 09:45:18.478
3,2,1.710103,1.709843,-71.856611,-71.593160,-71.263903,-70.905700,-70.630484,-70.556040,-70.416283,...,-70.926331,-70.803783,-70.527596,-70.179028,-69.844091,-69.592659,-69.455403,-69.465011,-69.534129,2022-01-17 09:45:18.881
4,3,1.692712,1.692449,-70.293702,-69.615039,-69.264823,-69.141130,-69.228822,-69.573658,-69.870514,...,-70.792217,-70.863150,-70.716718,-70.376147,-69.915707,-69.430619,-68.994180,-68.689533,-68.493175,2022-01-17 09:45:19.286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37972,28033,1.762276,1.759939,-113.132523,-112.803816,-114.224551,-114.594160,-112.252564,-110.110741,-109.217116,...,-123.207650,-120.559402,-114.553570,-112.631176,-113.483047,-118.347945,-129.190662,-113.787562,-108.544500,2022-01-17 12:54:21.103
37973,28034,1.078235,1.077697,-118.312640,-123.990796,-123.327124,-119.759730,-118.139591,-118.087910,-118.933456,...,-114.110881,-114.346130,-115.713000,-118.186010,-120.146196,-118.397924,-115.607104,-113.822064,-113.064563,2022-01-17 12:54:21.505
37974,28076,1.078235,1.077310,-110.922932,-109.982501,-110.333450,-111.489347,-113.320570,-115.734220,-118.376056,...,-117.182811,-115.589224,-113.592313,-112.956288,-114.368750,-118.768406,-118.086530,-110.936258,-106.597035,2022-01-17 12:54:38.405
37975,28077,1.049250,1.048023,-113.313526,-110.980939,-110.684654,-111.727588,-114.326816,-119.098485,-120.863880,...,-119.942313,-112.628975,-108.383693,-106.570394,-106.370381,-107.497191,-109.581642,-110.497115,-108.405185,2022-01-17 12:54:38.797
