
Originally created by Chelsey McGowan-Yallop, SAMS-UHI (sa06cm@sams.ac.uk)

Modified by Muriel Dunn for fish mix analysis

This script uses model-predicted TS(f) spectra to train a machine learning
classifier, performs nested cross-validation, applies the classifier to
measured TS(f) spectra and outputs results files.

To use a different classifier, see the list of supported classifiers at:
https://github.com/hyperopt/hyperopt-sklearn and set as clf.

Sometimes the initial hyperparameter configuration selected by HyperOpt in each
split in the outer loop will be unsuccessful and all trials will fail. The
retry decorator forces it to try again until retry_limit is reached.

OUTPUT FILES:
    _NestedCV.pkl contains results of nested cross-validation procedure
    _Predictions.pkl contains measured TS(f) spectra with predicted labels
    _BestParams.pkl contains the optimal hyperparameters for the model
"""

In [1]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
import hyperopt
from hyperopt import tpe
from scipy.stats import sem

from hpsklearn import HyperoptEstimator, k_neighbors_classifier, svc, lightgbm_classification, gaussian_nb, any_preprocessing,min_max_scaler, normalizer
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tenacity import retry, stop_after_attempt

import AZKABANML

import sys, errno  


In [2]:
# -- USER-DETERMINED PARAMETERS -----------------------------------------------
path = 'C:/Users/mbd/OneDrive - Akvaplan-niva AS/PhD-APN/ChaptersandExperiments/AZKABAN-light/ZoopMix_paper/'
# .CSV FILES FROM ECHOVIEW
# renamed from 2 pinggap files
ts_SED_path = path+"SED_ZoopMix_FTwindow33pl.csv" # Path to Echoview TS(f) file
ts_track_path = path+"SED_ZoopMix_FTwindow33pl_tracks.csv" # Path to Echoview TS(f) file
ts_trackavg_path = path+"SED_ZoopMix_FTwindow33pl_trackavg.csv" # Path to Echoview TS(f) file


In [3]:
measured_df_SED = AZKABANML.tsf_targets_import(ts_SED_path)
measured_df_track = AZKABANML.tsf_targets_import(ts_track_path)
measured_df_trackavg = AZKABANML.tsf_targets_import(ts_trackavg_path)

In [50]:
len(measured_df_SED)

13149

In [43]:
# Count number of detections per track
detec_per_track  = np.mean(measured_df_track.groupby('Region_name').count()['Ping_index'])
detec_per_track_sd = sem(measured_df_track.groupby('Region_name').count()['Ping_index'])
detec_per_track_min = np.min(measured_df_track.groupby('Region_name').count()['Ping_index'])
detec_per_track_max = np.max(measured_df_track.groupby('Region_name').count()['Ping_index'])

print(f'There are {detec_per_track:.3} +- {detec_per_track_sd:.3} detections per track, with a minimum of {detec_per_track_min:1} detections and a maximum of {detec_per_track_max:1} detections')

There are 9.94 +- 0.392 detections per track, with a minimum of 4 detections and a maximum of 178 detections


In [4]:
# Trim
df_SED = measured_df_SED.iloc[:,3:-1]
df_track = measured_df_track.iloc[:,4:-1]
df_trackavg = measured_df_trackavg.iloc[:,12:-1]

In [6]:
df_SED = df_SED.reset_index(drop=True)
df_track = df_track.reset_index(drop=True)
df_trackavg = df_trackavg.reset_index(drop=True)

In [7]:
df_SED.to_pickle(path+'df_SED.pkl')
df_track.to_pickle(path+'df_track.pkl')
df_trackavg.to_pickle(path+'df_trackavg.pkl')

# Prep model results

In [8]:
fname_sim = '../ZooScatStuff/AZKABAN_ZoopMix_data_shapesmooth_fullbandwidth.feather'
simulations = pd.read_feather(fname_sim)

In [9]:
n_freq = len(simulations.freq.unique())
n_sim = len(simulations)

In [10]:
sim_np = simulations['TS'].to_numpy()
sim_np_spec = simulations['spec'].to_numpy()

In [11]:
TS = sim_np.reshape((int(n_sim/n_freq),n_freq))
spec = sim_np_spec.reshape((int(n_sim/n_freq),n_freq))

In [12]:
df_sim = pd.DataFrame(TS, columns=simulations.freq.unique())
df_sim['spec'] = spec[:,0]

In [13]:
# check for the chaetognath nan and remove
i_nan = np.where(df_sim.isnull().values)[0][0]
df_sim = df_sim.drop(axis=0,labels=i_nan)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
df_sim.to_pickle(path+'df_sim_smooth.pkl')

Unnamed: 0_level_0,Ping_index,Range,Depth,Region_name,185.000,185.500,186.000,186.500,187.000,187.500,...,251.000,251.500,252.000,252.500,253.000,253.500,254.000,254.500,255.000,Datetime
Target_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1.715900,1.715748,Region 1,-69.453964,-68.521743,-67.986585,-67.733404,-67.754191,-68.130185,...,-71.595734,-71.547922,-71.147465,-70.542152,-69.911074,-69.383364,-69.019195,-68.878777,-68.918106,2022-01-17 09:45:17.944
1,1,1.715900,1.715689,Region 1,-70.599819,-71.358021,-71.912162,-72.177343,-72.210387,-72.189034,...,-69.456054,-69.767954,-70.247306,-70.808972,-71.315128,-71.605663,-71.531900,-71.057052,-70.153700,2022-01-17 09:45:18.478
2,2,1.710103,1.709843,Region 1,-71.856611,-71.593160,-71.263903,-70.905700,-70.630484,-70.556040,...,-70.926331,-70.803783,-70.527596,-70.179028,-69.844091,-69.592659,-69.455403,-69.465011,-69.534129,2022-01-17 09:45:18.881
3,3,1.692712,1.692449,Region 1,-70.293702,-69.615039,-69.264823,-69.141130,-69.228822,-69.573658,...,-70.792217,-70.863150,-70.716718,-70.376147,-69.915707,-69.430619,-68.994180,-68.689533,-68.493175,2022-01-17 09:45:19.286
4,4,1.675321,1.675046,Region 1,-70.751471,-69.888469,-68.954974,-68.099867,-67.462875,-67.221682,...,-69.086218,-69.129377,-69.216366,-69.326730,-69.434331,-69.522459,-69.567731,-69.571901,-69.436181,2022-01-17 09:45:19.685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7717,27656,1.912997,1.912942,Region 782,-99.034302,-106.297606,-106.266039,-99.270877,-95.873425,-94.406686,...,-104.504560,-105.806997,-102.091240,-98.645742,-96.311438,-94.761352,-93.737226,-93.122784,-92.769297,2022-01-17 12:51:47.625
7718,27689,1.159392,1.158532,Region 783,-96.063808,-102.835533,-115.104164,-103.016003,-98.711747,-96.555251,...,-82.542800,-82.990978,-83.795413,-84.870780,-86.071126,-87.215220,-88.068163,-88.262104,-87.447262,2022-01-17 12:52:01.018
7719,27690,1.182580,1.182321,Region 783,-104.828141,-104.342768,-99.434993,-96.396264,-94.679829,-94.018032,...,-87.974473,-86.584174,-85.406449,-84.642937,-84.367469,-84.606328,-85.378224,-86.732273,-88.608961,2022-01-17 12:52:01.416
7720,27691,1.194174,1.194065,Region 783,-88.791879,-87.686150,-87.688305,-89.066373,-92.456765,-100.767745,...,-86.297288,-85.443296,-84.492389,-83.684937,-83.149182,-82.941372,-83.065294,-83.528300,-84.221820,2022-01-17 12:52:01.824
