In [None]:
from glob import glob
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

## Make stratified datasets
- Each class is split into train and validation sets with the same ratio.
- Classes with more than `TO_TAKE` instances are undersampled to reduce class imbalance.

In [None]:
# Root folder that is globbed for "*.jpg" files one directory level down;
# the name of that intermediate directory is used as the class label.
SRC_DIR = "../data/train_images/train_images"
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably the destination for a copied/stratified image tree; confirm or remove.
TARGET_DIR = "../data/train_images/stratified"

In [None]:
# Collect every image path together with its class label.
#
# The label is the name of the directory that directly contains the image
# (SRC_DIR/<label>/<file>.jpg). It is derived with pathlib instead of splitting
# on "\\", so the cell works on POSIX as well as Windows and no longer depends
# on the path having exactly three backslash-separated parts.
#
# The paths are sorted because glob() returns them in an OS-dependent order;
# a stable row order is required for the seeded steps later in the notebook
# to produce the same rows on every machine.
image_paths = sorted(glob(f"{SRC_DIR}/**/*.jpg"))

df_dict = {
    "label": [Path(image_path).parent.name for image_path in image_paths],
    "image": image_paths,
}

In [None]:
# Turn the collected "label" / "image" lists into a two-column table.
df = pd.DataFrame(data=df_dict)

In [None]:
# Class-distribution overview of the full (unsampled) dataset.
# The per-label counts are computed once and reused for the bin count and the
# min/max summary instead of re-running unique()/value_counts() for each line.
label_counts = df["label"].value_counts()
n_labels = len(label_counts)

df["label"].hist(bins=n_labels)
print(f"Number of labels: {n_labels}")
print(f"Min amount of instances: {label_counts.min()}")
print(f"Max amount of instances: {label_counts.max()}")

In [None]:
# Per-class cap used by the undersampling step: a label with more than
# TO_TAKE rows is randomly reduced to TO_TAKE rows; smaller labels are kept whole.
TO_TAKE = 40  # number of instances to take for the stratified fold
# Starts `stratified_df` as an empty frame with "label" and "image" columns;
# the per-label sampling step produces its real contents.
stratified_df = pd.DataFrame({
    "label": [],
    "image": [],
})

In [None]:
# Undersample the large classes: keep at most TO_TAKE rows per label.
#
# - The sample is seeded so the selected rows are the same on every run
#   (same value as the random_state used for the train/val split).
# - The per-label pieces are gathered in a list and concatenated once, and
#   `stratified_df` is assigned rather than appended to, so re-running this
#   cell does not duplicate rows.
# - groupby(sort=False) yields the labels in order of first appearance, which
#   is the order df["label"].unique() would give.
SAMPLE_SEED = 42

sampled_parts = [
    class_df.sample(n=TO_TAKE, random_state=SAMPLE_SEED)
    if len(class_df) > TO_TAKE
    else class_df
    for _, class_df in df.groupby("label", sort=False)
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no images were found.
stratified_df = pd.concat(sampled_parts) if sampled_parts else df.iloc[:0]

In [None]:
# Label histogram after undersampling, using one bin per label of the full dataset.
n_label_bins = len(df["label"].unique())
stratified_df["label"].hist(bins=n_label_bins)

In [None]:
# Persist the undersampled table (the row index is written as the first column).
stratified_csv_path = "../data/train_images_stratified.csv"
stratified_df.to_csv(stratified_csv_path)

In [None]:
# Split into 70 % train / 30 % validation, stratified on the label column so
# both parts keep the same class proportions; the seed makes the split repeatable.
split_labels = stratified_df["label"]
stratified_train_df, stratified_val_df = train_test_split(
    stratified_df,
    test_size=0.3,
    random_state=42,
    stratify=split_labels,
)

In [None]:
# Write each split to its own CSV file.
for split_df, csv_path in (
    (stratified_train_df, "../data/train_ds_images_stratified.csv"),
    (stratified_val_df, "../data/val_ds_images_stratified.csv"),
):
    split_df.to_csv(csv_path)