In this notebook we address the class imbalance in the AFKABAN datasets.

In [1]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tenacity import retry, stop_after_attempt
import sys, errno
import AZKABANML

In [66]:
# Set paths
path = 'F:/AFKABAN/review2024'
classifypath = 'F:/AFKABAN/Classify/'

# SED tables, one feather file per frequency label (120, 200).
df_120, df_200 = (
    pd.read_feather(f'{path}/{fname}')
    for fname in ('SED_120_df.feather', 'SED_200_df.feather')
)

# Matching track tables, read in the same 120 -> 200 order.
track_120, track_200 = (
    pd.read_feather(f'{path}/{fname}')
    for fname in ('track_120_df.feather', 'track_200_df.feather')
)

#df_120_w_h = pd.read_feather(f'{path}/df_120_w_h.feather')
#df_200_w_h = pd.read_feather(f'{path}/df_200_w_h.feather')

Class imbalance occurs when the classes are not represented equally. Here, we have ~600 polar cod, ~300 Atlantic cod and ~100 Pandalus (northern shrimp). The imbalance may be even larger when looking at each frequency bandwidth separately.

https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

Methods to resolve the problem of imbalanced data:
    
    - over sampling (duplicate classes with fewer samples)
    - under sampling (delete from classes with many samples)
    - Use better metrics (the F1 score addresses this and accounts for false positives)
    - SMOTE?

# Over sampling
The easiest option. We do not have many samples, so oversampling is preferred to undersampling. Try duplicating shrimp and Atlantic cod samples until they match the number of polar cod.

In [140]:
# NOTE(review): df_200_balanced is first assigned further down, by the
# balance_classes(df_200, track_200) call; on a fresh kernel this cell raises NameError.
df_200_balanced

Unnamed: 0,185.000,187.016,189.032,191.048,193.065,195.081,197.097,199.113,201.129,203.145,...,237.419,239.435,241.452,243.468,245.484,247.500,249.516,251.532,253.548,Species
0,,,-50.660561,-51.046710,-51.358699,-51.612825,-51.424584,-51.155831,-51.359114,-51.578282,...,-53.307989,-53.441241,-53.893967,-53.970738,-54.179025,-54.591815,-55.182587,,,Polar cod
1,,,-45.435610,-45.343172,-45.761126,-46.529985,-47.079285,-47.330443,-47.795485,-48.493341,...,-50.096258,-50.503126,-50.446278,-50.228928,-50.280159,-50.085306,-49.704672,,,Polar cod
2,,,-44.001082,-43.809126,-44.026256,-44.617600,-45.004993,-45.196625,-45.670032,-46.455143,...,-49.299591,-49.727897,-49.752013,-49.487170,-49.335863,-49.040599,-48.692744,,,Polar cod
3,,,-45.200966,-45.055727,-45.513760,-46.180745,-46.536828,-46.766413,-47.296982,-48.103326,...,-50.211715,-49.868304,-49.460239,-49.072646,-48.682511,-48.475872,-48.399914,,,Polar cod
4,,,-45.297240,-45.240176,-45.500930,-46.186268,-46.530776,-46.631649,-47.052407,-47.674976,...,-49.965125,-50.183953,-50.060258,-49.864733,-49.692293,-49.325106,-48.975325,,,Polar cod
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,,,-85.877120,-86.119891,-86.365457,-87.056101,-87.550118,-87.501930,-87.546797,-88.489784,...,-91.430888,-91.529106,-91.169089,-90.730235,-91.098540,-92.038710,-92.418063,,,Northern shrimp
2093,,,-85.823680,-85.562690,-85.937955,-86.702789,-87.094934,-87.450567,-87.932062,-88.218960,...,-91.507919,-91.999601,-91.581074,-91.724821,-92.172717,-92.699356,-93.048940,,,Northern shrimp
2094,,,-81.189643,-82.285973,-82.379533,-82.887209,-83.460710,-83.373625,-83.386254,-84.173560,...,-84.603309,-84.896019,-84.888220,-84.591877,-84.835739,-85.098990,-84.837387,,,Northern shrimp
2095,,,-85.267142,-85.178375,-85.634574,-87.114060,-87.960723,-87.529294,-87.183056,-87.624040,...,-88.851925,-87.888104,-88.201598,-89.465323,-89.899548,-89.368236,-88.965686,,,Northern shrimp


In [141]:
def balance_classes(df, track, majority='Polar cod',
                    minority=('Atlantic cod', 'Northern shrimp'),
                    random_state=None):
    """Balance class representation by oversampling the minority species.

    Every row of the ``majority`` species is kept once, in its original order.
    For each species in ``minority``, rows are drawn with replacement until
    that species has as many rows as the majority species. The same row
    positions are taken from ``track`` so both outputs stay row-aligned.

    Note: because minority rows are duplicated, a later random train/test
    split of the returned frames can put copies of the same row on both sides.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with a 'Species' column.
    track : pandas.DataFrame
        Table with one row per row of ``df`` (matched by position, not index).
    majority : str, default 'Polar cod'
        Species whose row count defines the target size of every class.
    minority : iterable of str, default ('Atlantic cod', 'Northern shrimp')
        Species to oversample up to the majority count.
    random_state : int or None, default None
        Seed for the draw. ``None`` uses the module-level ``random`` generator
        (so ``random.seed`` still controls it); an int gives a private,
        reproducible generator.

    Returns
    -------
    (df_balanced, track_balanced) : tuple of pandas.DataFrame
        Both with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` and ``track`` differ in length, or if the majority or any
        minority species has no rows in ``df``.
    """
    if len(df) != len(track):
        raise ValueError(
            f'df and track must have the same number of rows '
            f'({len(df)} != {len(track)})'
        )

    species = df['Species']

    # Row positions (not index labels) so that the same selection can be
    # applied to `track` with .iloc.
    majority_pos = np.flatnonzero((species == majority).to_numpy())
    n_target = len(majority_pos)
    if n_target == 0:
        raise ValueError(f'no rows found for majority species {majority!r}')

    rng = random if random_state is None else random.Random(random_state)

    selected = [majority_pos]
    for spec in minority:
        candidates = np.flatnonzero((species == spec).to_numpy())
        if len(candidates) == 0:
            raise ValueError(f'no rows found for species {spec!r}')
        # Sampling with replacement: a class smaller than the target is
        # necessarily filled with duplicated rows.
        selected.append(np.asarray(rng.choices(candidates, k=n_target)))

    # Single positional take instead of growing the frames with concat in a loop.
    positions = np.concatenate(selected)
    df_balanced = df.iloc[positions].reset_index(drop=True)
    track_balanced = track.iloc[positions].reset_index(drop=True)

    return df_balanced, track_balanced

In [142]:
# Seed the module-level `random` generator used for the oversampling draw so
# that the balanced frames are identical on every Restart & Run All.
random.seed(42)
df_120_balanced, track_120_balanced = balance_classes(df_120, track_120)
df_200_balanced, track_200_balanced = balance_classes(df_200, track_200)

In [110]:
# Persist the balanced feature and track tables next to the input files.
for _stem, _frame in (
    ('df_120_balanced', df_120_balanced),
    ('df_200_balanced', df_200_balanced),
    ('track_120_balanced', track_120_balanced),
    ('track_200_balanced', track_200_balanced),
):
    _frame.to_feather(f'{path}/{_stem}.feather')

Now the hyperoptimizer splits into 90% training and 10% testing with equal (+/- 1) distribution across the classes.

### linear balanced

In [145]:
# 10 ** (x / 10) on every column except the last one (the last column is left
# out here; 'Species' is re-attached in the next cell).
# NOTE(review): presumably a dB -> linear conversion - confirm units.
df_120_sigbs_balanced = df_120_balanced.iloc[:, :-1].div(10).rpow(10)
df_200_sigbs_balanced = df_200_balanced.iloc[:, :-1].div(10).rpow(10)

In [146]:
# Copy the 'Species' column from each balanced frame onto its converted counterpart.
for _linear, _source in (
    (df_120_sigbs_balanced, df_120_balanced),
    (df_200_sigbs_balanced, df_200_balanced),
):
    _linear['Species'] = _source['Species']

In [147]:
# Persist the converted balanced tables.
for _stem, _frame in (
    ('df_120_sigbs_balanced', df_120_sigbs_balanced),
    ('df_200_sigbs_balanced', df_200_sigbs_balanced),
):
    _frame.to_feather(f'{path}/{_stem}.feather')

In [148]:
# Row count of the balanced 120 frame divided by 30.
# NOTE(review): the meaning of 30 is not stated in this notebook - confirm.
df_120_balanced.shape[0] / 30

69.5

In [149]:
# Row count of the balanced 200 frame divided by 30.
# NOTE(review): the meaning of 30 is not stated in this notebook - confirm.
df_200_balanced.shape[0] / 30

69.9