# Functions for up- and downsampling of the data to account for class imbalances


Source: https://www.kaggle.com/code/aikhmelnytskyy/birdclef24-pretraining-is-all-you-need#Filter-&-Upsample-Data-%E2%AC%86%EF%B8%8F

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Load the raw dataset; the cells below read its `en` (class label) and
# `length_seconds` columns.
df = pd.read_csv("../../data/dataset_raw.csv")

In [44]:
class cfg:
    # Seed passed as `random_state` to `DataFrame.sample` by the sampling
    # helpers, so repeated runs draw the same rows.
    seed = 42

### The issue: class imbalance (row counts and total length differ per class)

In [23]:
# Number of rows per `en` value, largest classes first.
counts = df.groupby("en")["en"].count().reset_index(name='count').sort_values("count", ascending = False)
counts

Unnamed: 0,en,count
0,Barn Swallow,500
14,Eurasian Blackcap,500
44,Willow Warbler,500
42,Tawny Owl,500
41,Song Thrush,500
32,Great Tit,500
30,European Robin,500
29,European Greenfinch,500
28,European Goldfinch,500
26,Eurasian Wren,500


In [28]:
# Sum of `length_seconds` per `en` value, largest totals first; only the
# first 60 rows are displayed.
df.groupby("en")["length_seconds"].sum(
).reset_index(name='total length (seconds)').sort_values("total length (seconds)", ascending = False)[:60]

Unnamed: 0,en,total length (seconds)
2,Common Blackbird,75144
41,Song Thrush,66877
6,Common Nightingale,65856
30,European Robin,61216
14,Eurasian Blackcap,60421
42,Tawny Owl,53970
45,Yellowhammer,47329
3,Common Chaffinch,43666
32,Great Tit,42801
26,Eurasian Wren,42640


## Upsampling

In [45]:
def upsample_data(df, thr=200, col="en", seed=None):
    """Pad every under-represented class up to ``thr`` rows by resampling.

    For each class with fewer than ``thr`` rows, ``thr - count`` rows of that
    class are drawn with replacement and appended after the original rows, so
    the class ends up with exactly ``thr`` rows. Classes that already have at
    least ``thr`` rows are left untouched. The input frame is not modified.

    Args:
        df: DataFrame with one row per sample.
        thr: Minimum number of rows every class should have afterwards.
        col: Name of the column holding the class label.
        seed: Random state handed to ``DataFrame.sample``. When ``None`` the
            notebook-wide ``cfg.seed`` is used.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index: all original rows
        first, followed by the duplicated rows of each padded class.
    """
    class_dist = df[col].value_counts()
    # Classes that need padding, mapped to their current row count.
    minority = class_dist[class_dist < thr]

    if minority.empty:
        # Nothing to pad; still return a re-indexed copy like the general path.
        return pd.concat([df], axis=0, ignore_index=True)

    # Resolve the seed only when sampling actually happens.
    random_state = cfg.seed if seed is None else seed

    up_dfs = []
    for label, count in minority.items():
        class_df = df[df[col] == label]
        # The same random state is deliberately reused for every class so the
        # result is reproducible regardless of which classes are present.
        up_dfs.append(
            class_df.sample(
                n=int(thr - count), replace=True, random_state=random_state
            )
        )

    return pd.concat([df] + up_dfs, axis=0, ignore_index=True)

In [39]:
# Row count before any resampling.
len(df)

17382

In [40]:
# Upsample with the default threshold (thr=200).
df_up = upsample_data(df)

In [41]:
# Row count after upsampling (original rows plus the appended duplicates).
len(df_up)

17864

In [42]:
# Rows per `en` value after upsampling, largest classes first.
df_up.groupby("en")["en"].count().reset_index(name='count').sort_values("count", ascending = False)

Unnamed: 0,en,count
0,Barn Swallow,500
14,Eurasian Blackcap,500
44,Willow Warbler,500
42,Tawny Owl,500
41,Song Thrush,500
32,Great Tit,500
30,European Robin,500
29,European Greenfinch,500
28,European Goldfinch,500
26,Eurasian Wren,500


In [43]:
# Sum of `length_seconds` per `en` value after upsampling, largest first.
df_up.groupby("en")["length_seconds"].sum(
).reset_index(name='total length (seconds)').sort_values("total length (seconds)", ascending = False)

Unnamed: 0,en,total length (seconds)
2,Common Blackbird,75144
41,Song Thrush,66877
6,Common Nightingale,65856
30,European Robin,61216
14,Eurasian Blackcap,60421
42,Tawny Owl,53970
45,Yellowhammer,47329
3,Common Chaffinch,43666
32,Great Tit,42801
26,Eurasian Wren,42640


## Downsampling

In [48]:
def downsample_data(df, thr=400, col="en", seed=None):
    """Cap every over-represented class at ``thr`` rows by subsampling.

    For each class with more than ``thr`` rows, exactly ``thr`` of its rows
    are drawn without replacement and the rest are dropped. Classes with at
    most ``thr`` rows keep all their rows. The input frame is not modified.

    Args:
        df: DataFrame with one row per sample.
        thr: Maximum number of rows any class may have afterwards.
        col: Name of the column holding the class label.
        seed: Random state handed to ``DataFrame.sample``. When ``None`` the
            notebook-wide ``cfg.seed`` is used.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index: rows of the untouched
        classes first (in their original order), followed by the sampled rows
        of each capped class.
    """
    class_dist = df[col].value_counts()
    # Classes that exceed the cap and therefore have to be thinned out.
    majority = class_dist[class_dist > thr].index.tolist()

    if not majority:
        # Nothing to cap; still return a re-indexed copy like the general path.
        return pd.concat([df], axis=0, ignore_index=True)

    # Resolve the seed only when sampling actually happens.
    random_state = cfg.seed if seed is None else seed

    # Rows of classes that are kept in full; filtered once instead of
    # re-filtering the whole frame for every capped class.
    untouched = df[~df[col].isin(majority)]

    down_dfs = [
        df[df[col] == label].sample(
            n=thr, replace=False, random_state=random_state
        )
        for label in majority
    ]

    return pd.concat([untouched] + down_dfs, axis=0, ignore_index=True)

In [49]:
# Downsample the already-upsampled frame with the default cap (thr=400).
df_down = downsample_data(df_up)

In [50]:
# Rows per `en` value after downsampling, largest classes first.
df_down.groupby("en")["en"].count().reset_index(name='count').sort_values("count", ascending = False)

Unnamed: 0,en,count
0,Barn Swallow,400
20,Eurasian Golden Oriole,400
44,Willow Warbler,400
42,Tawny Owl,400
41,Song Thrush,400
40,Redwing,400
33,House Sparrow,400
32,Great Tit,400
31,Goldcrest,400
30,European Robin,400


In [51]:
# Sum of `length_seconds` per `en` value after downsampling, largest first.
df_down.groupby("en")["length_seconds"].sum(
).reset_index(name='total length (seconds)').sort_values("total length (seconds)", ascending = False)

Unnamed: 0,en,total length (seconds)
2,Common Blackbird,60529
41,Song Thrush,53897
6,Common Nightingale,53276
30,European Robin,49362
14,Eurasian Blackcap,48762
42,Tawny Owl,43575
45,Yellowhammer,37974
3,Common Chaffinch,35273
32,Great Tit,34586
26,Eurasian Wren,34483
