In this notebook we address the class imbalance in the AFKABAN datasets.

In [182]:
import time
import os.path
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix
import hyperopt
from hyperopt import tpe
from hpsklearn import HyperoptEstimator
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tenacity import retry, stop_after_attempt

import sys, errno  
import AZKABANML

In [183]:
# Set paths
path = 'F:/AFKABAN/'
classifypath = 'F:/AFKABAN/Classify/'

# Load the two input frames. `path` already ends with a separator, so the file
# names are joined with os.path.join instead of f'{path}/...', which would
# produce a doubled '//' in the resulting path.
df_120 = pd.read_feather(os.path.join(path, 'df_120.feather'))
df_200 = pd.read_feather(os.path.join(path, 'df_200.feather'))

Class imbalance occurs when the classes are not represented equally. Here we have ~600 polar cod, ~300 Atlantic cod and ~100 Pandalus (northern shrimp) samples, and the imbalance may be even larger when looking at each frequency bandwidth separately.

https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

Methods to resolve the problem of imbalanced data:
    
    - over sampling (duplicate classes with fewer samples)
    - under sampling (delete from classes with many samples)
    - Use better metrics (the F1 score addresses this because it also accounts for false positives)
    - SMOTE?

# Over sampling
This is the easiest option. Because there are not many samples, over sampling is preferred to under sampling. Try replicating the shrimp and Atlantic cod samples to match the number of polar cod samples.

In [184]:
def balance_classes(df, majority='Polar cod',
                    minority=('Atlantic cod', 'Northern shrimp'),
                    random_state=None):
    """Balance class representation by over-sampling the minority species.

    Every row of the ``majority`` species is kept as-is. For each species in
    ``minority``, rows are drawn at random *with replacement* until that
    species has exactly as many rows as ``majority``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'Species' column.
    majority : str, default 'Polar cod'
        Species whose row count is the target size for every class.
    minority : iterable of str, default ('Atlantic cod', 'Northern shrimp')
        Species to over-sample up to the size of ``majority``.
    random_state : int or None, default None
        Seed for a private ``random.Random`` generator. With ``None`` the
        global ``random`` module state is used, so ``random.seed(...)`` called
        beforehand still controls the draw.

    Returns
    -------
    pandas.DataFrame
        The majority rows followed by the re-sampled rows of each minority
        species (in the order given), with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``majority`` or any species in ``minority`` has no rows in ``df``.
    """
    species = df['Species']

    # Count rows directly. groupby().count() counts the non-null cells of the
    # first column, which undercounts the target as soon as that column has NaN.
    is_majority = (species == majority).to_numpy()
    n_target = int(is_majority.sum())
    if n_target == 0:
        raise ValueError(f"no rows with Species == {majority!r} to balance against")

    # Fall back to the module-level generator when no seed is given so that
    # existing callers relying on random.seed() keep working.
    rng = random if random_state is None else random.Random(random_state)

    parts = [df[is_majority]]
    for spec in minority:
        candidates = np.flatnonzero((species == spec).to_numpy())
        if len(candidates) == 0:
            raise ValueError(f"no rows with Species == {spec!r} to over-sample")
        # Positional row numbers, sampled with replacement.
        picked = rng.choices(candidates, k=n_target)
        parts.append(df.iloc[picked, :])

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(parts).reset_index(drop=True)

In [185]:
# balance_classes draws its samples from the global `random` module, so seed
# it first: otherwise the over-sampled frames differ on every run.
random.seed(42)
df_120_balanced = balance_classes(df_120)
df_200_balanced = balance_classes(df_200)

In [186]:
# Persist the balanced frames. `path` already ends with '/', so build the file
# names with os.path.join rather than f'{path}/...' to avoid a doubled '//'.
df_120_balanced.to_feather(os.path.join(path, 'df_120_balanced.feather'))
df_200_balanced.to_feather(os.path.join(path, 'df_200_balanced.feather'))

Now the hyperoptimizer splits into 90% training and 10% testing with equal (+/- 1) distribution across the classes.

### linear balanced

In [187]:
# Display the balanced frame built from df_120 (the rendered table is truncated
# to its first and last rows).
df_120_balanced


Unnamed: 0,94.032,96.048,98.065,100.081,102.097,104.113,106.129,108.145,110.161,112.177,...,142.419,144.435,146.452,148.468,150.484,152.500,154.516,156.532,158.548,Species
0,-45.758686,-44.868659,-44.958895,-45.010072,-44.868090,-44.689645,-44.893148,-44.791251,-44.563163,-44.800194,...,-44.905437,-45.229583,-45.015549,-45.228847,-46.060031,-45.970267,-45.318964,-45.073978,-44.697276,Polar cod
1,-43.113701,-42.492137,-42.774746,-42.820333,-42.948931,-42.894308,-42.939604,-42.819493,-42.421160,-42.278958,...,-40.426212,-40.779054,-40.446211,-40.573818,-41.380525,-41.401501,-41.013047,-40.738137,-40.227906,Polar cod
2,-45.105354,-44.151921,-44.324668,-44.222121,-44.298563,-44.283248,-44.282728,-44.112967,-43.804455,-43.756822,...,-40.446052,-40.692118,-40.185383,-40.247821,-41.024861,-41.067247,-40.621290,-40.399201,-39.874827,Polar cod
3,-44.709585,-44.039217,-44.419825,-44.356420,-44.579693,-44.594233,-44.567755,-44.519842,-44.284258,-44.238251,...,-40.187181,-40.373366,-39.740903,-39.799439,-40.517186,-40.409879,-39.862945,-39.547541,-38.853531,Polar cod
4,-43.706161,-42.983250,-43.324557,-43.476165,-43.682228,-43.697687,-43.737267,-43.576087,-43.194127,-43.186469,...,-42.056466,-42.253645,-41.771565,-41.865642,-42.568785,-42.650897,-42.207498,-42.072210,-41.571769,Polar cod
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,-72.194815,-77.215413,-81.705330,-79.384231,-80.015609,-78.698068,-78.679593,-81.075281,-79.738311,-79.235078,...,-80.444738,-81.262198,-80.464428,-79.964879,-80.877562,-81.458018,-81.681595,-80.405833,-79.077285,Northern shrimp
2081,-82.580268,-78.340196,-76.197126,-77.141408,-79.183821,-81.210972,-83.887205,-82.113385,-78.158805,-78.757105,...,-79.975641,-81.380872,-81.834356,-81.162140,-81.575552,-81.610534,-81.801946,-81.617410,-80.453903,Northern shrimp
2082,-72.247166,-76.597375,-75.108006,-75.620721,-78.825751,-77.556788,-76.174477,-77.026415,-78.079112,-77.943368,...,-81.620952,-83.401369,-82.421586,-82.012346,-84.580736,-83.243481,-81.836195,-85.160574,-83.880140,Northern shrimp
2083,-78.874025,-80.856951,-78.814295,-79.757592,-83.120390,-84.037262,-81.370861,-79.432733,-79.553565,-80.820616,...,-81.406121,-81.933317,-82.791393,-82.720792,-82.753657,-83.086020,-83.735196,-83.919692,-84.483031,Northern shrimp


In [188]:
# Undo the base-10 log scaling, x -> 10^(x / 10), on every column except the
# last one (the 'Species' label), giving linear-domain values.
df_120_sigbs_balanced = np.power(10, df_120_balanced.iloc[:, :-1].div(10))
df_200_sigbs_balanced = np.power(10, df_200_balanced.iloc[:, :-1].div(10))

In [189]:
# Re-attach the species labels: the last column ('Species') was left out when
# the linear-domain frames were computed from the balanced frames.
df_120_sigbs_balanced['Species'] = df_120_balanced['Species']
df_200_sigbs_balanced['Species'] = df_200_balanced['Species']

In [190]:
# Persist the linear-domain balanced frames. `path` already ends with '/', so
# build the file names with os.path.join rather than f'{path}/...' to avoid a
# doubled '//'.
df_120_sigbs_balanced.to_feather(os.path.join(path, 'df_120_sigbs_balanced.feather'))
df_200_sigbs_balanced.to_feather(os.path.join(path, 'df_200_sigbs_balanced.feather'))

In [192]:
# List the column labels of df_200.
df_200.columns

Index(['189.032', '191.048', '193.065', '195.081', '197.097', '199.113',
       '201.129', '203.145', '205.161', '207.177', '209.194', '211.210',
       '213.226', '215.242', '217.258', '219.274', '221.290', '223.306',
       '225.323', '227.339', '229.355', '231.371', '233.387', '235.403',
       '237.419', '239.435', '241.452', '243.468', '245.484', '247.500',
       '249.516', 'Species'],
      dtype='object')