In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image

# Seed NumPy's global RNG so the random sampling in the cells below is
# repeatable when the notebook is re-run from a fresh kernel.
# NOTE(review): this relies on pandas' DataFrame.sample falling back to
# NumPy's global state when no random_state is passed — confirm.
np.random.seed(42)

In [2]:
# Load the image index. Per the displayed output it has one row per image,
# with the image's relative path, its class name and a numeric class_id.
df = pd.read_csv("food_cls/food_cls.csv")
df

Unnamed: 0,path,class,class_id
0,apple_pie/23893.jpg,apple_pie,0
1,apple_pie/1348788.jpg,apple_pie,0
2,apple_pie/420768.jpg,apple_pie,0
3,apple_pie/1969851.jpg,apple_pie,0
4,apple_pie/1580276.jpg,apple_pie,0
...,...,...,...
23868,taquito/Taquito-Train (635).jpeg,taquito,33
23869,taquito/Taquito-Train (1235).jpeg,taquito,33
23870,taquito/Taquito-Train (664).jpeg,taquito,33
23871,taquito/Taquito-Train (7).jpeg,taquito,33


In [3]:
# Count images per class (non-null "path" entries), then keep only the
# classes that have at least 1000 images.
freq = df.groupby("class")["path"].count()
selected_freq = freq.loc[freq.ge(1000)]
selected_freq

class
apple_pie         1000
baked_potato      1500
cheesecake        1000
chicken_curry     1000
crispy_chicken    1500
donut             1500
fries             1500
hot_dog           1548
ice_cream         1000
omelette          1000
sandwich          1500
sushi             1000
taco              1500
taquito           1500
Name: path, dtype: int64

In [4]:
# Plain Python list of the class names that passed the frequency filter.
selected_classes = list(selected_freq.index)
selected_classes

['apple_pie',
 'baked_potato',
 'cheesecake',
 'chicken_curry',
 'crispy_chicken',
 'donut',
 'fries',
 'hot_dog',
 'ice_cream',
 'omelette',
 'sandwich',
 'sushi',
 'taco',
 'taquito']

# Split for training and validation

Every selected class has at least 1,000 images. For each class we randomly sample 1,000 of them and split the sample into 800 images for training and 200 for validation.

In [5]:
# Config: number of rows drawn per class for each split
train_size = 800
val_size = 200

list_df_train, list_df_val = [], []
for cls in selected_classes:
    # Draw train_size + val_size distinct rows of this class, in random order.
    shuffled = df.loc[df["class"] == cls].sample(n=train_size + val_size)
    # The draw is already shuffled, so cut it by position (iloc) rather than
    # by index label: the first train_size rows train, the rest validate.
    list_df_train.append(shuffled.iloc[:train_size])
    list_df_val.append(shuffled.iloc[train_size:])

df_train = pd.concat(list_df_train)
df_val = pd.concat(list_df_val)

df_train.to_csv("food_cls/train.csv")
df_val.to_csv("food_cls/val.csv")

# Creating imbalanced dataset

In [6]:
# The first 10 selected classes keep their full per-class training sample;
# the next 3 are the ones that get down-sampled in the cell below.
# NOTE(review): selected_classes has 14 entries, so the [10:13] slice leaves
# the last class ('taquito') out of both lists — confirm this is intentional.
balanced_classes = selected_classes[:10]
imbalanced_classes = selected_classes[10:13]
print("Balanced classes: ", balanced_classes)
print("Imbalanced classes: ", imbalanced_classes)

Balanced classes:  ['apple_pie', 'baked_potato', 'cheesecake', 'chicken_curry', 'crispy_chicken', 'donut', 'fries', 'hot_dog', 'ice_cream', 'omelette']
Imbalanced classes:  ['sandwich', 'sushi', 'taco']


In [7]:
# Config
# Rows kept per class in the imbalanced dataset. balanced_size equals the
# per-class training size (800), so balanced classes keep every training row;
# imbalanced classes are cut down to half of that.
balanced_size = 800
imbalanced_size = 400

In [8]:
# Build the imbalanced training set: balanced classes first, then the
# down-sampled ones, each drawn at random from the training split.
list_df = []
sampling_plan = (
    (balanced_classes, balanced_size),
    (imbalanced_classes, imbalanced_size),
)
for class_group, rows_per_class in sampling_plan:
    for cls in class_group:
        class_rows = df_train.loc[df_train["class"] == cls]
        list_df.append(class_rows.sample(n=rows_per_class))

imbalanced_df = pd.concat(list_df)
imbalanced_df.to_csv("food_cls/imbalanced.csv")
# Sanity check: number of rows per class in the result.
imbalanced_df.groupby("class")["path"].count()

class
apple_pie         800
baked_potato      800
cheesecake        800
chicken_curry     800
crispy_chicken    800
donut             800
fries             800
hot_dog           800
ice_cream         800
omelette          800
sandwich          400
sushi             400
taco              400
Name: path, dtype: int64

# Creating augmented dataset

In [9]:
# Utility to generate and save horizontal flip
def augment(input_path, output_path):
    """Write a horizontally flipped copy of an image.

    Args:
        input_path: Path of the image file to read.
        output_path: Path the flipped image is saved to. Its parent
            directory must already exist.

    Any exception raised while opening, flipping or saving propagates to the
    caller; both image objects are still closed in that case.
    """
    # The context manager releases the source file even when transpose() or
    # save() raises, so a bad image in a long batch does not leak a handle.
    with Image.open(input_path) as image:
        # Flip horizontally
        flipped_image = image.transpose(Image.FLIP_LEFT_RIGHT)
        try:
            flipped_image.save(output_path)
        finally:
            # Free the flipped copy's pixel data whether or not save() worked.
            flipped_image.close()

In [10]:
# Config
# Number of original training images sampled per class; each one also gets a
# flipped copy, so every class contributes 2 * original_size rows.
original_size = 400


def augment_row(row):
    """Create the flipped version of one image and return its relative path.

    Args:
        row: Image path relative to the "food_cls" directory (the value of
            the "path" column for one row).

    Returns:
        The flipped image's path relative to "food_cls", i.e. the input path
        placed under a "[flipped]" directory.
    """
    data_root = "food_cls"
    new_path = os.path.join("[flipped]", row)
    output_path = os.path.join(data_root, new_path)
    # Mirror the class sub-directory under "[flipped]" before writing into it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    augment(os.path.join(data_root, row), output_path)
    return new_path


# For every class, sample original_size training rows and pair each with a
# flipped twin whose "path" points at the newly written flipped image.
list_df = []
for cls in selected_classes:
    df_sample = df_train.loc[df_train["class"] == cls].sample(n=original_size)
    # assign() returns a new frame, so df_sample keeps the original paths.
    flipped_df_sample = df_sample.assign(path=df_sample["path"].apply(augment_row))
    list_df.extend((df_sample, flipped_df_sample))

df_augmented = pd.concat(list_df)
df_augmented.to_csv("food_cls/augmented.csv")
df_augmented

Unnamed: 0,path,class,class_id
923,apple_pie/750073.jpg,apple_pie,0
676,apple_pie/3333030.jpg,apple_pie,0
250,apple_pie/1617515.jpg,apple_pie,0
755,apple_pie/603113.jpg,apple_pie,0
495,apple_pie/2451702.jpg,apple_pie,0
...,...,...,...
22921,[flipped]/taquito/Taquito-Train (946).jpeg,taquito,33
22777,[flipped]/taquito/Taquito-Train (1189).jpeg,taquito,33
22782,[flipped]/taquito/Taquito-Train (480).jpeg,taquito,33
23734,[flipped]/taquito/Taquito-Train (15).jpeg,taquito,33
