
Originally created by Chelsey McGowan-Yallop, SAMS-UHI (sa06cm@sams.ac.uk)

Modified by Muriel Dunn for fish mix analysis

This script uses model-predicted TS(f) spectra to train a machine learning
classifier, performs nested cross-validation, applies the classifier to
measured TS(f) spectra and outputs results files.

To use a different classifier, see the list of supported classifiers at:
https://github.com/hyperopt/hyperopt-sklearn and set as clf.

Sometimes the initial hyperparameter configuration selected by HyperOpt in each
split in the outer loop will be unsuccessful and all trials will fail. The
retry decorator forces it to try again until retry_limit is reached.

OUTPUT FILES:
    _NestedCV.pkl contains results of nested cross-validation procedure
    _Predictions.pkl contains measured TS(f) spectra with predicted labels
    _BestParams.pkl contains the optimal hyperparameters for the model
"""

In [1]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import scipy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
#from sklearn.neighbors import KNeighborsClassifier
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, k_neighbors_classifier, svc, lightgbm_classification
import lightgbm
from datetime import timedelta
from tenacity import retry, stop_after_attempt

import sys, errno  


In [2]:
# -- USER-DETERMINED PARAMETERS -----------------------------------------------
path = 'F:/AFKABAN/'

# CLASSIFIER
unique_id = '28-04-2023_kNN_AZKABAN' # Unique ID for output file paths
clf = k_neighbors_classifier(unique_id)  # Classifier

# NESTED CROSS-VALIDATION
preprocessing = [] # List of sklearn pre-processing modules
ex_preprocessing = [] # As above, see help(HyperoptEstimator) for info
n_splits = 2 # Value of k for k-fold cross-validation in outer loop
n_folds = 2 # Value of k for k-fold cross-validation in inner loop
max_evals = 10 # No. of HyperOpt trials
timeout = 120 # HyperOpt trial timeout (seconds)
n_jobs = -1 # No. of jobs to run in parallel; -1 uses all processors
retry_limit = 3 # No. of times to retry before failing

# Read the dataframes

In [3]:
# Read pickle to open on Stokes
ppath = f'{path}/Pcod_190123/select/'
p2path = f'{path}/Pcod2_240123/select/'
apath = f'{path}/Acod_200123/select/'
pbpath = f'{path}/Pandalus_260123/select/'
kpath = f'{path}/Krill_270123/'

In [4]:
a_df = pd.read_feather(apath+'/a_tilt_df.feather')
p_df = pd.read_feather(ppath+'/p_tilt_df.feather')
p2_df = pd.read_feather(p2path+'/p2_tilt_df.feather')
pb_df = pd.read_feather(pbpath+'/pb_tilt_df.feather')

In [5]:
def select_ts_bandwidth(df):
    'Function to seperated target spectra from 120 kHz and 200 kHz echosounders'
    ind_120 = np.where(np.isnan(df['201.129']))[0]
    ts_s_120 = np.where(df.columns.values=='94.032')[0][0] #remove 5% of spectra on either side b/c ramping
    ts_e_120 = np.where(df.columns.values=='160.565')[0][0]
    
    ind_200 = np.where(np.isnan(df['120.242']))[0]
    ts_s_200 = np.where(df.columns.values=='189.032')[0][0]
    ts_e_200 = np.where(df.columns.values=='251.532')[0][0]

    df_120 = df.iloc[ind_120,ts_s_120:ts_e_120]
    df_200 = df.iloc[ind_200,ts_s_200:ts_e_200]
    
    return df_120, df_200

In [6]:
a_df_120, a_df_200 = select_ts_bandwidth(a_df)
p_df_120, p_df_200 = select_ts_bandwidth(p_df)
p2_df_120, p2_df_200 = select_ts_bandwidth(p2_df)
pb_df_120, pb_df_200 = select_ts_bandwidth(pb_df)

In [7]:
# Add species label
a_df_120['Species'] = 'Atlantic cod'
a_df_200['Species'] = 'Atlantic cod'
p_df_120['Species'] = 'Polar cod'
p_df_200['Species'] = 'Polar cod'
p2_df_120['Species'] = 'Polar cod'
p2_df_200['Species'] = 'Polar cod'
pb_df_120['Species'] = 'Northern shrimp'
pb_df_200['Species'] = 'Northern shrimp'

# Classification functions

In [8]:
@retry(stop=stop_after_attempt(retry_limit))
def nested_cv(X, y, model, n_splits, n_folds, unique_id):
    
    """
    This function performs nested cross-validation with Bayesian hyperparameter
    optimisation. It uses stratified k-fold cross-validation in both the inner
    and outer loops. After each outer loop, it outputs the results to a .pkl
    file. As there is an element of randomness to the optimisation procedure,
    sometimes all trials will fail. If you re-run the script, it will import
    the incomplete .pkl file and try again.
    
    Note that this is a modified version that uses F1 score as the evaluation
    metric. It also calculates class-specific F1 scores and confusion matrices,
    which are added to the output dataframe.
    
    PARAMETERS:
        X: data minus labels
        y: labels
        model: HyperoptEstimator object
        n_splits: # of splits to use in outer K-fold cross-validation
        n_folds: # of folds to use in inner K-fold cross-validation
        unique_id: Unique name string for file output path
    """
    
    cv = StratifiedKFold(n_splits=n_splits,
                         shuffle=True,
                         random_state=42) # Outer CV
    
    i_start = 0
    i_list = []
    results_df = None
    cv_path = unique_id + '_NestedCV.pkl'
        
    if os.path.isfile(cv_path) == True: # If CV is incomplete, resume
        results_df = pd.read_pickle(cv_path)
        i_start = results_df.Outer_fold.max() + 1
        print('Resuming cross-validation from fold ' + str(i_start + 1))
        
    # Generate indices to split data by StratifiedKFold
    # Append indices for each fold to list    
    for tr_i, te_i in cv.split(X,y):
        i_list.append([tr_i, te_i])
    
    # For each fold...
    for i in range(i_start, len(i_list)):
        results_list = []
        print('Beginning fold ' + str(i+1) + ' of ' + str(len(i_list)))
        
        # Split data into training and test tests
        X_train = X.loc[X.index.intersection(i_list[i][0])]
        y_train = y.loc[y.index.intersection(i_list[i][0])]
        X_test = X.loc[X.index.intersection(i_list[i][1])]
        y_test = y.loc[y.index.intersection(i_list[i][1])]

        start = time.time()
        
        # Fit the HyperoptEstimator to training data (optimise model)
        model.fit(X_train,
                  y_train,
                  n_folds=n_folds, # Inner stratified k-fold CV
                  cv_shuffle=True)
        
        end = time.time()
        duration = end - start

        # Use optimised model to predict labels for test data
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='weighted') # Evaluate
        
        # Everything below: formats and/or calculates results for output file
        sorted_labels = np.sort(y_test.unique())
        unweighted_score = f1_score(y_test, y_pred,
                                    average=None,
                                    labels=sorted_labels)
        c_matrix = confusion_matrix(y_test, y_pred,
                                    labels=sorted_labels)

        for trial in range(len(model.trials.trials)):
                if model.trials.trials[trial].get('result').get('status') == 'ok':
                    trial_loss = model.trials.trials[trial].get('result').get('loss')
                    trial_duration = model.trials.trials[trial].get('result').get('duration')
                else:
                    trial_loss = np.nan
                    trial_duration = np.nan
            
                results_list.append([i,
                                     score,
                                     unweighted_score,
                                     le.inverse_transform(sorted_labels),
                                     c_matrix,
                                     duration,
                                     trial,
                                     trial_loss,
                                     trial_duration])
        
        append_df = pd.DataFrame(results_list,
                                 columns=['Outer_fold',
                                          'Outer_score',
                                          'Outer_unweighted_scores',
                                          'Outer_unweighted_score_labels',
                                          'Outer_confusion_matrix',
                                          'Outer_training_duration',
                                          'Trial',
                                          'Trial_loss',
                                          'Trial_duration'])
        if i == i_start:
            if results_df is not None:
                final_df = pd.concat([results_df,
                                      append_df],
                                      ignore_index=True)
            else:
                final_df = append_df
            final_df.to_pickle(cv_path)
        
        else:
            results_df = pd.read_pickle(cv_path)
            final_df = pd.concat([results_df,
                                  append_df],
                                  ignore_index=True)
            final_df.to_pickle(cv_path)
     

In [9]:
def f1_loss(y_true, y_pred):
    """
    Custom loss function for HyperOpt.
    Uses F1 score instead of accuracy score, as the latter is inappropriate
    for multi-class classification.
    """
    return 1.0 - f1_score(y_true, y_pred, average='weighted')
    

# Script the classifier

In [10]:
df_120 = a_df_120.append([p_df_120,pb_df_120])
df_120 = df_120.reset_index(drop=True)

df_200 = a_df_200.append([p_df_200,pb_df_200])
df_200 = df_200.reset_index(drop=True)

  df_120 = a_df_120.append([p_df_120,pb_df_120])
  df_200 = a_df_200.append([p_df_200,pb_df_200])


In [11]:
measured_frequency = [float(i) for i in df_120.columns[:-1].values]
n_model_f_bins = len(measured_frequency) # No. freq bins in model data
n_species = len(df_120.Species.unique()) # No. species in model data

# -- WRANGLE DATA ---------------------------------------------------------

le = LabelEncoder() # Maps labels -> int (e.g. Atlantic cod -> 0, Polar cod -> 1)
df_120['Species_le'] = le.fit_transform(df_120.Species)
X = df_120.iloc[:,:-2] # Features, TS(f) only
y = df_120.Species_le # Labels

model = HyperoptEstimator(classifier = clf,
                          preprocessing = preprocessing,
                          ex_preprocs = ex_preprocessing,
                          algo = tpe.suggest,
                          trial_timeout = timeout,
                          loss_fn = f1_loss,
                          max_evals = max_evals,
                          n_jobs = n_jobs)

In [12]:
model

In [13]:
i_list = []
i_start = 0
cv = StratifiedKFold(n_splits=n_splits,
                     shuffle=True,
                     random_state=42) # Outer CV
for tr_i, te_i in cv.split(X,y):
    i_list.append([tr_i, te_i])

# For each fold...
for i in range(i_start, len(i_list)):
    results_list = []
    print('Beginning fold ' + str(i+1) + ' of ' + str(len(i_list)))

    # Split data into training and test tests
    X_train = X.loc[X.index.intersection(i_list[i][0])]
    y_train = y.loc[y.index.intersection(i_list[i][0])]
    X_test = X.loc[X.index.intersection(i_list[i][1])]
    y_test = y.loc[y.index.intersection(i_list[i][1])]

Beginning fold 1 of 2
Beginning fold 2 of 2


In [14]:
X_train

Unnamed: 0,94.032,96.048,98.065,100.081,102.097,104.113,106.129,108.145,110.161,112.177,...,140.403,142.419,144.435,146.452,148.468,150.484,152.500,154.516,156.532,158.548
1,-32.759212,-33.361819,-35.976894,-39.245275,-44.600144,-41.406010,-37.398772,-35.470479,-35.223482,-35.929762,...,-37.264697,-35.857228,-35.125267,-33.725870,-34.305943,-36.027231,-37.488966,-40.002543,-43.195869,-45.250476
8,-33.822134,-33.621933,-34.616306,-35.740195,-36.227325,-36.622396,-36.995091,-36.618035,-36.095784,-35.926300,...,-41.477473,-43.542081,-47.137967,-43.627156,-38.618315,-36.595477,-34.481837,-33.473396,-33.032423,-32.842451
11,-36.643168,-37.110629,-36.026382,-37.250057,-40.297676,-41.859592,-40.820743,-38.128141,-34.475209,-34.114523,...,-33.247636,-40.300604,-47.474937,-36.747720,-36.495781,-39.804075,-47.176179,-45.899463,-42.068993,-39.368045
12,-33.716678,-32.501961,-30.857425,-30.304596,-30.530759,-30.713751,-31.648192,-32.837626,-36.695260,-53.501766,...,-38.794333,-52.972370,-33.666394,-27.530635,-26.744521,-28.838104,-33.902402,-44.675524,-32.408229,-29.331859
13,-28.757566,-27.670486,-28.555722,-30.618238,-31.389908,-30.417303,-29.604160,-28.685877,-28.885619,-31.044293,...,-29.193527,-28.743165,-29.040907,-29.146344,-31.015450,-34.544972,-39.013593,-41.765718,-38.070586,-35.430066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789,-73.968783,-74.456208,-74.707695,-73.887607,-74.112531,-74.288807,-74.150499,-73.925434,-73.661063,-73.639634,...,-73.381109,-73.566646,-74.026504,-73.373368,-73.538058,-74.382180,-74.467824,-74.072061,-73.701335,-73.019154
791,-74.001606,-72.226926,-72.907131,-73.678806,-74.261660,-74.291738,-74.292904,-74.026277,-73.676867,-73.956511,...,-74.007059,-73.718738,-73.958278,-73.530778,-73.751918,-74.796193,-74.922151,-73.994373,-73.919818,-73.590965
792,-74.947152,-75.035132,-74.069807,-73.518575,-74.231326,-74.225663,-73.977599,-74.132786,-73.639528,-73.422443,...,-73.974321,-73.415480,-73.744944,-73.692674,-73.604643,-74.179045,-74.481108,-73.994924,-73.615851,-72.999563
795,-73.634061,-71.845338,-73.610606,-74.453365,-74.013068,-73.734278,-74.157818,-74.220182,-73.520540,-73.414778,...,-73.648826,-73.103201,-73.701438,-73.785508,-73.710624,-74.371016,-74.559889,-73.998519,-73.947284,-73.250508


In [15]:
nested_cv(X, y, model, n_splits, n_folds, unique_id)

Beginning fold 1 of 2
100%|███████████████████████████████████████████████████████████████████| 1/1 [02:04<00:00, 124.03s/trial, best loss=?]
Beginning fold 1 of 2
100%|███████████████████████████████████████████████████████████████████| 1/1 [02:04<00:00, 124.29s/trial, best loss=?]
Beginning fold 1 of 2
100%|███████████████████████████████████████████████████████████████████| 1/1 [02:03<00:00, 123.78s/trial, best loss=?]


RetryError: RetryError[<Future at 0x24e6ff77a30 state=finished raised AllTrialsFailed>]

In [None]:
def main_classify():    

    ""
    #path: srt. path to dataframes
    #detections: str. Type of detections, SED, trackSED or trackavg.
    
    detections = 'SED'
    path = './'

    # -- IMPORT FILES ---------------------------------------------------------
    if detections == 'SED':
        labelled_df = pd.read_pickle(path+'single_SED_df.pkl')
        mix_df = pd.read_pickle(path+'fm_SED_df.pkl')

    elif detections == 'trackSED':
        labelled_df = pd.read_pickle(path+'single_trackSED_df.pkl')
        mix_df = pd.read_pickle(path+'fm_trackSED_df.pkl')

    else:
        labelled_df = pd.read_pickle(path+'single_trackavg_df.pkl')
        mix_df = pd.read_pickle(path+'fm_trackavg_df.pkl')

    # -- RESTRUCTURE MODEL DATA -----------------------------------------------

    measured_frequency = [float(i) for i in mix_df.columns.values]
    n_model_f_bins = len(measured_frequency) # No. freq bins in model data
    n_species = len(labelled_df.Species.unique()) # No. species in model data

    # -- WRANGLE DATA ---------------------------------------------------------

    le = LabelEncoder() # Maps labels -> int (e.g. Copepods -> 0, Krill -> 1)
    labelled_df['Species_le'] = le.fit_transform(labelled_df.Species)
    X = labelled_df.iloc[:,14+1:-15-2] # Features, TS(f) only
    y = labelled_df.Species_le # Labels


    measured_X = mix_df # Features, TS(f) only

    # -- NESTED CROSS-VALIDATION ----------------------------------------------

    model = HyperoptEstimator(classifier = clf,
                              preprocessing = preprocessing,
                              ex_preprocs = ex_preprocessing,
                              algo = tpe.suggest,
                              trial_timeout = timeout,
                              loss_fn = f1_loss,
                              max_evals = max_evals,
                              n_jobs = n_jobs)
    model

    nested_cv(X, y, model, n_splits, n_folds, unique_id)

    # -- RETRAIN MODEL --------------------------------------------------------

    print('Retraining model on full dataset')

    model = HyperoptEstimator(classifier = clf,
                              preprocessing = preprocessing,
                              ex_preprocs = ex_preprocessing,
                              algo = tpe.suggest,
                              trial_timeout = timeout,
                              loss_fn = f1_loss,
                              max_evals = max_evals,
                              n_jobs = n_jobs)

    model.fit(X, y, n_folds=n_folds, cv_shuffle=True)

    # -- PREDICT CLASSES FOR NEW DATA -----------------------------------------

    print('Classifying new data')

    y_pred = model.predict(measured_X) # Predict classes for measured TS(f)
    y_pred = le.inverse_transform(y_pred) # Transform labels back to species

    # -- OUTPUT RESULTS -------------------------------------------------------

    mix_df['Prediction'] = y_pred
    mix_df.to_pickle(unique_id + '_Predictions.pkl')

    with open(unique_id + '_BestParams.pkl', 'wb') as handle:
        pickle.dump(model.best_model(), handle)




In [None]:
main_classify(path, 'SED')