In this notebook we address the class imbalance in the AFKABAN datasets.

In [2]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tenacity import retry, stop_after_attempt

import sys, errno  
import AZKABANML

In [3]:
# Set paths
# NOTE(review): absolute, machine-specific locations — adjust for your environment.
path = 'F:/AFKABAN/'
classifypath = 'F:/AFKABAN/Classify/'

# `path` already ends with a separator, so join file names with os.path.join;
# f'{path}/...' produced a doubled slash ('F:/AFKABAN//df_120.feather').
df_120 = pd.read_feather(os.path.join(path, 'df_120.feather'))
df_200 = pd.read_feather(os.path.join(path, 'df_200.feather'))

df_120_w_h = pd.read_feather(os.path.join(path, 'df_120_w_h.feather'))
df_200_w_h = pd.read_feather(os.path.join(path, 'df_200_w_h.feather'))

In [8]:
# Number of distinct 'Species' groups in df_120
df_120.groupby('Species').count().iloc[:, 0].size

3

Class imbalance occurs when the classes are not represented equally. Here we have ~600 polar cod, ~300 Atlantic cod and ~100 Pandalus (northern shrimp) samples. The imbalance may be even larger when looking at each frequency bandwidth separately.

https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

Methods to resolve the problem of imbalanced data:
    
    - over sampling (duplicate classes with fewer samples)
    - under sampling (delete from classes with many samples)
    - Use better metrics (the F1 score addresses this because it also accounts for false positives)
    - SMOTE?

# Over sampling
Oversampling is the easiest option. Because we do not have many samples, we prefer oversampling to undersampling: replicate northern shrimp and Atlantic cod samples until each class matches the number of polar cod samples.

In [9]:
def balance_classes(df, target='Species', reference='Polar cod', seed=None):
    """Balance the classes of ``df`` by randomly oversampling the other species.

    Every row of the ``reference`` species is kept unchanged. For each other
    species that occurs in ``df[target]``, as many rows as there are reference
    rows are drawn *with replacement*, so all classes end up the same size.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, with the class label in column ``target``.
    target : str, default 'Species'
        Name of the label column.
    reference : str, default 'Polar cod'
        Label whose row count sets the size of every class.
    seed : int or None, default None
        If given, a private ``random.Random(seed)`` makes the draw
        reproducible. If None, the module-level ``random`` generator is used,
        so an earlier ``random.seed(...)`` call still controls the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index: the reference rows first, then
        one resampled block per other species in order of first appearance.
        ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``reference`` does not occur in ``df[target]``.

    Notes
    -----
    Resampled rows are exact duplicates of existing rows. If the balanced
    frame is split into train/test afterwards, copies of one sample can end up
    on both sides of the split — keep that in mind when reading test scores.
    """
    labels = df[target]

    # Size every class from the number of reference *rows*; counting non-null
    # values of a feature column would under-count when that column has NaNs.
    n_reference = int((labels == reference).sum())
    if n_reference == 0:
        raise KeyError(reference)

    rng = random if seed is None else random.Random(seed)

    parts = [df[labels == reference]]
    # Iterate over the species actually present rather than a fixed list:
    # an absent species would leave nothing to sample from, and an unlisted
    # one would be silently dropped from the result.
    for spec in labels.dropna().unique():
        if spec == reference:
            continue
        positions = np.flatnonzero((labels == spec).to_numpy()).tolist()
        picked = rng.choices(positions, k=n_reference)
        parts.append(df.iloc[picked, :])

    # Single concat at the end instead of growing the frame inside the loop.
    return pd.concat(parts, ignore_index=True)

In [10]:
# balance_classes draws rows with Python's `random` module; seed it first so a
# fresh "Restart & Run All" resamples exactly the same rows.
random.seed(42)
df_120_balanced = balance_classes(df_120)
df_200_balanced = balance_classes(df_200)

In [11]:
# Oversample the two `_w_h` frames in the same way (120 first, then 200).
df_120_w_h_balanced, df_200_w_h_balanced = (
    balance_classes(frame) for frame in (df_120_w_h, df_200_w_h)
)

In [186]:
# Write the balanced frames to feather files under `path`.
df_120_balanced.to_feather(path + '/df_120_balanced.feather')
df_200_balanced.to_feather(path + '/df_200_balanced.feather')

In [13]:
# Write the balanced `_w_h` frames to feather files under `path`.
df_120_w_h_balanced.to_feather(path + '/df_120_w_h_balanced.feather')
df_200_w_h_balanced.to_feather(path + '/df_200_w_h_balanced.feather')

Now the hyperoptimizer splits into 90% training and 10% testing with equal (+/- 1) distribution across the classes.

### linear balanced

In [188]:
# Apply 10 ** (x / 10) to every column except the last
# (presumably a dB -> linear conversion; TODO confirm).
# NOTE(review): assumes the last column is the 'Species' label — confirm.
df_120_sigbs_balanced = df_120_balanced.iloc[:, :-1].div(10).rpow(10)
df_200_sigbs_balanced = df_200_balanced.iloc[:, :-1].div(10).rpow(10)

In [189]:
# Re-attach the class labels to the transformed frames.
df_120_sigbs_balanced = df_120_sigbs_balanced.assign(Species=df_120_balanced['Species'])
df_200_sigbs_balanced = df_200_sigbs_balanced.assign(Species=df_200_balanced['Species'])

In [190]:
# Write the transformed balanced frames to feather files under `path`.
df_120_sigbs_balanced.to_feather(path + '/df_120_sigbs_balanced.feather')
df_200_sigbs_balanced.to_feather(path + '/df_200_sigbs_balanced.feather')

In [198]:
# Row count of the balanced 120 frame divided by 30.
# NOTE(review): the meaning of 30 is not stated in this notebook — confirm.
df_120_balanced.shape[0] / 30

69.5

In [199]:
# Row count of the balanced 200 frame divided by 30.
# NOTE(review): the meaning of 30 is not stated in this notebook — confirm.
df_200_balanced.shape[0] / 30

69.9