Notebook to create a spectral response dataframe for each frequency band (120 kHz and 200 kHz) and a corresponding track dataframe

In [1]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import scipy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score, confusion_matrix
#from sklearn.neighbors import KNeighborsClassifier
import hyperopt
from hyperopt import hp
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, k_neighbors_classifier, svc, lightgbm_classification, random_forest_classifier
from sklearn.metrics import ConfusionMatrixDisplay
import lightgbm
from datetime import timedelta
from tenacity import retry, stop_after_attempt
import matplotlib.pyplot as plt

import sys, errno

In [2]:
# -- USER-DETERMINED PARAMETERS -----------------------------------------------
# All locations below are absolute paths built from a drive letter, so they are
# specific to the machine the notebook was written on; change letter_drive (or
# the paths) before running elsewhere.
# NOTE: every path ends with '/', and the cells that use them prepend another
# '/' to the file name, so the resulting paths contain '//'.
letter_drive = 'F'
path = f'{letter_drive}:/AFKABAN/review2024/' # Output directory for the combined SED_*/track_* feather files
ppath = f'{letter_drive}:/AFKABAN/Pcod_190123/select/' # Input directory of p_tilt_df.feather (labelled 'Polar cod' below)
p2path = f'{letter_drive}:/AFKABAN/Pcod2_240123/select/' # Input directory of p2_tilt_df.feather (labelled 'Polar cod' below)
apath = f'{letter_drive}:/AFKABAN/Acod_200123/select/' # Input directory of a_tilt_df.feather (labelled 'Atlantic cod' below)
pbpath = f'{letter_drive}:/AFKABAN/Pandalus_260123/select/' # Input directory of pb_tilt_df.feather (labelled 'Northern shrimp' below)

# CLASSIFIER
# NOTE(review): clf and the cross-validation settings below are not referenced
# by the data-preparation cells visible in this notebook; presumably they are
# left over from (or shared with) a classification notebook -- confirm.
unique_id = '13-03-2023_SVC_AFKABAN' # Unique ID for output file paths
clf = svc(unique_id)  # hpsklearn SVC component, named with unique_id

# NESTED CROSS-VALIDATION
#preprocessing = [] # List of sklearn pre-processing modules
#ex_preprocessing = [] # As above, see help(HyperoptEstimator) for info
n_splits = 2 # Value of k for k-fold cross-validation in outer loop
n_folds = 2 # Value of k for k-fold cross-validation in inner loop
max_evals = 10 # No. of HyperOpt trials
timeout = 600 # HyperOpt trial timeout (seconds)
n_jobs = -1 # No. of jobs to run in parallel; -1 uses all processors
retry_limit = 3 # No. of times to retry before failing

# Read the dataframes

In [3]:
# Load the four per-experiment tilt dataframes from their feather files
# (one file per input directory configured above).
a_df = pd.read_feather(f'{apath}/a_tilt_df.feather')
p_df = pd.read_feather(f'{ppath}/p_tilt_df.feather')
p2_df = pd.read_feather(f'{p2path}/p2_tilt_df.feather')
pb_df = pd.read_feather(f'{pbpath}/pb_tilt_df.feather')

In [4]:
def select_ts_bandwidth(df):
    """Separate target spectra from the 120 kHz and 200 kHz echosounders.

    A row is assigned to a band by which frequency columns are missing:
    rows with NaN in column '201.129' are taken as 120 kHz measurements,
    rows with NaN in column '120.242' as 200 kHz measurements.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns '90.000', '120.242', '162.581', '185.000',
        '201.129', '255.565' and 'Region_name'.

    Returns
    -------
    df_120 : pandas.DataFrame
        120 kHz rows, columns from '90.000' up to (not including) '162.581'.
    df_200 : pandas.DataFrame
        200 kHz rows, columns from '185.000' up to (not including) '255.565'.
    track_120 : pandas.Series
        'Region_name' values of the 120 kHz rows.
    track_200 : pandas.Series
        'Region_name' values of the 200 kHz rows.

    Raises
    ------
    KeyError
        If '201.129' or '120.242' is not a column of ``df``.
    IndexError
        If any of the other required columns is missing.
    """
    column_names = df.columns.values

    def _position(name):
        # Positional index of the first column called `name`.
        return np.where(column_names == name)[0][0]

    # Row positions belonging to each echosounder band.
    ind_120 = np.where(np.isnan(df['201.129']))[0]
    ind_200 = np.where(np.isnan(df['120.242']))[0]

    # Column span of each band.
    ts_s_120 = _position('90.000')
    ts_e_120 = _position('162.581')
    ts_s_200 = _position('185.000')
    ts_e_200 = _position('255.565')

    # Look the track column up on the frame that was passed in. The previous
    # version read it from the global `a_df`, which fails outside the notebook
    # and is wrong whenever `df` has a different column layout.
    track_col = _position('Region_name')

    # NOTE(review): iloc slices are end-exclusive, so the '162.581' and
    # '255.565' columns themselves are dropped. Kept as-is because downstream
    # data depends on it -- confirm whether the end column should be included.
    df_120 = df.iloc[ind_120, ts_s_120:ts_e_120]
    df_200 = df.iloc[ind_200, ts_s_200:ts_e_200]

    track_120 = df.iloc[ind_120, track_col]
    track_200 = df.iloc[ind_200, track_col]

    return df_120, df_200, track_120, track_200

In [5]:
# Split each experiment's dataframe into its 120 kHz / 200 kHz spectra and the
# matching track labels (same order as loaded: a, p, p2, pb).
(
    (a_df_120, a_df_200, a_track_120, a_track_200),
    (p_df_120, p_df_200, p_track_120, p_track_200),
    (p2_df_120, p2_df_200, p2_track_120, p2_track_200),
    (pb_df_120, pb_df_200, pb_track_120, pb_track_200),
) = [select_ts_bandwidth(experiment_df) for experiment_df in (a_df, p_df, p2_df, pb_df)]

# Combine, label and save the data for classification

In [6]:
# Stack the 120 kHz spectra and track labels of all experiments (a, p, p2, pb).
SED_120_df = pd.concat([a_df_120, p_df_120, p2_df_120, pb_df_120])
track_120_df = pd.concat(
    [a_track_120, p_track_120, p2_track_120, pb_track_120]
).to_frame()

# One species label per row, repeated in the same order as the stacking above;
# both Polar cod experiments (p and p2) share a label.
Species = np.repeat(
    ['Atlantic cod', 'Polar cod', 'Polar cod', 'Northern shrimp'],
    [len(a_df_120), len(p_df_120), len(p2_df_120), len(pb_df_120)],
)
SED_120_df['Species'] = Species

In [7]:
# Stack the 200 kHz spectra and track labels of all experiments (a, p, p2, pb).
SED_200_df = pd.concat([a_df_200, p_df_200, p2_df_200, pb_df_200])
track_200_df = pd.concat(
    [a_track_200, p_track_200, p2_track_200, pb_track_200]
).to_frame()

# One species label per row, repeated in the same order as the stacking above;
# both Polar cod experiments (p and p2) share a label.
Species = np.repeat(
    ['Atlantic cod', 'Polar cod', 'Polar cod', 'Northern shrimp'],
    [len(a_df_200), len(p_df_200), len(p2_df_200), len(pb_df_200)],
)
SED_200_df['Species'] = Species

In [9]:
# Replace the duplicated per-experiment indices left by pd.concat with a fresh
# 0..n-1 index; DataFrame.to_feather requires a default index.
SED_120_df, SED_200_df = (
    spectra.reset_index(drop=True) for spectra in (SED_120_df, SED_200_df)
)

track_120_df, track_200_df = (
    tracks.reset_index(drop=True) for tracks in (track_120_df, track_200_df)
)

In [10]:
# Write the combined spectra and track dataframes to the output directory,
# one feather file per dataframe, named after the variable.
for stem, frame in (
    ('SED_120_df', SED_120_df),
    ('SED_200_df', SED_200_df),
    ('track_120_df', track_120_df),
    ('track_200_df', track_200_df),
):
    frame.to_feather(f'{path}/{stem}.feather')
