In this notebook we address the class imbalance in the AFKABAN datasets.

In [1]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score, confusion_matrix
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tenacity import retry, stop_after_attempt
import sys, errno
import AZKABANML

In [2]:
# Data locations. `path` holds the feather inputs read below;
# `classifypath` is only defined here (not used in this cell).
path = 'F:/AFKABAN/review2024'
classifypath = 'F:/AFKABAN/Classify/'

# Read the two SED tables (120 and 200), in that order.
df_120, df_200 = map(
    pd.read_feather,
    (f'{path}/SED_120_df.feather', f'{path}/SED_200_df.feather'),
)

# Read the two track tables (120 and 200), in that order.
track_120, track_200 = map(
    pd.read_feather,
    (f'{path}/track_120_df.feather', f'{path}/track_200_df.feather'),
)

# Alternative "_w_h" inputs, currently disabled.
#df_120_w_h = pd.read_feather(f'{path}/df_120_w_h.feather')
#df_200_w_h = pd.read_feather(f'{path}/df_200_w_h.feather')

In [3]:
# Work on an independent copy so that adding the encoded-label column
# below does not modify df_120.
df_train = df_120.copy()

# Integer-encode the species names (e.g. Atlantic cod -> 0, Polar cod -> 1).
le = LabelEncoder()
df_train['Species_le'] = le.fit_transform(df_train['Species'])

# Feature matrix: every column except the last two (TS(f) only).
# NOTE(review): this is positional. It presumably relies on 'Species' being
# the last column of df_120, so that the final two columns here are
# 'Species' and 'Species_le' — confirm against the feather schema.
X = df_train.iloc[:, :-2].to_numpy()

# Label vector: the integer-encoded species.
y = df_train['Species_le'].to_numpy()

In [4]:
# Group-aware 5-fold cross-validation: all samples sharing a group label are
# kept on the same side of every train/test split.
cv = GroupKFold(n_splits=5)

# GroupKFold.split documents `groups` as a 1-D array-like of shape
# (n_samples,). track_120 is a single-column DataFrame (column 'Region_name'),
# so pass that column explicitly instead of the 2-D frame.
# Assumes the rows of track_120 align positionally with the rows of X — TODO confirm.
groups = track_120['Region_name'].to_numpy()

i_list = []   # one [train_indices, test_indices] pair per fold
i_start = 0   # index of the first fold to process in the fold loop
for tr_i, te_i in cv.split(X, y, groups):
    i_list.append([tr_i, te_i])

In [6]:
# Walk over the precomputed folds, starting at i_start.
for i in range(i_start, len(i_list)):
    results_list = []
    print(f'Beginning fold {i+1} of {len(i_list)}')

    # Split data into training and test sets
    train_idx, test_idx = i_list[i]
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Show which track rows landed in the training and test partitions.
    print(track_120.iloc[train_idx])
    print(track_120.iloc[test_idx])

    # Wall-clock start time for this fold.
    start = time.time()

Beginning fold 1 of 5
     Region_name
0      Region 10
1      Region 10
2      Region 10
3     Region 100
4     Region 100
...          ...
1119  Region 326
1120  Region 326
1121  Region 326
1122  Region 326
1123  Region 326

[918 rows x 1 columns]
     Region_name
48    Region 112
49    Region 112
50    Region 112
51    Region 112
52    Region 112
...          ...
1143  Region 327
1144  Region 327
1145  Region 327
1146  Region 327
1147  Region 327

[230 rows x 1 columns]
Beginning fold 2 of 5
     Region_name
3     Region 100
4     Region 100
5     Region 100
6     Region 100
7     Region 100
...          ...
1143  Region 327
1144  Region 327
1145  Region 327
1146  Region 327
1147  Region 327

[918 rows x 1 columns]
     Region_name
0      Region 10
1      Region 10
2      Region 10
37    Region 108
38    Region 108
...          ...
1084  Region 311
1085  Region 311
1086  Region 311
1087  Region 311
1104  Region 323

[230 rows x 1 columns]
Beginning fold 3 of 5
     Region_name
0    