In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



In [2]:
# Load the cleaned product table and keep only the rows usable for image
# classification: both an image path and a nutriscore_letter label must be
# present. The label column is then cast to plain integers.
df = (
    pd.read_csv("FoodFactsCleaned.csv")
    .dropna(subset=["image_160_path", "nutriscore_letter"])
    .astype({"nutriscore_letter": int})
)

print("Rows with image + label:", len(df))

Rows with image + label: 3183


In [3]:
# Stratified 60/20/20 train/val/test split of the image subset.
# (Skip this cell if a previously saved global split is being reused.)
# Only index labels are split; the class labels are passed solely so that
# each partition keeps the same class proportions.
X_idx = df.index.to_numpy()
y = df["nutriscore_letter"].to_numpy()

# Stage 1: hold out 20% of all rows as the test set.
train_val_idx, test_idx = train_test_split(
    X_idx, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: split the remaining 80% into train and validation.
# A quarter of the remaining 80% is 20% of the full data.
y_train_val = df.loc[train_val_idx, "nutriscore_letter"].to_numpy()
train_idx, val_idx = train_test_split(
    train_val_idx, test_size=0.25, random_state=42, stratify=y_train_val
)

# Stage 3: record the assignment on the frame. Every row starts as "train";
# the validation and test rows are then overwritten by index label.
df["split"] = "train"
df.loc[val_idx, "split"] = "val"
df.loc[test_idx, "split"] = "test"

In [4]:
import tensorflow as tf

In [5]:
# Sanity check: rows per split, followed by the overall row count.
print(df.loc[:, "split"].value_counts())
print("Total rows:", df.shape[0])


split
train    1909
test      637
val       637
Name: count, dtype: int64
Total rows: 3183


In [6]:
# Image side length (pixels) used when resizing, and the number of label classes.
IMG_SIZE = 160
NUM_CLASSES = 5

# One independent copy of the frame per split, selected by the "split" column.
df_train, df_val, df_test = (
    df.loc[df["split"] == part].copy() for part in ("train", "val", "test")
)

print("Train/Val/Test sizes:", *map(len, (df_train, df_val, df_test)))



Train/Val/Test sizes: 1909 637 637


In [8]:
def make_dataset(df_subset, batch_size=32, shuffle=False, augment=False):
    """Build a batched, prefetched ``tf.data.Dataset`` of (image, label) pairs.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Rows to serve. Must contain an "image_160_path" column (image file
        paths) and a "nutriscore_letter" column (integer labels).
    batch_size : int, default 32
        Number of examples per batch.
    shuffle : bool, default False
        If True, shuffle the examples and reshuffle on every iteration.
    augment : bool, default False
        If True, apply a random horizontal flip, a random brightness shift
        (max_delta=0.1) and a random contrast change (factor in [0.9, 1.1]).

    Returns
    -------
    tf.data.Dataset
        Batches of ``(images, labels)``. Images are float32, resized to
        ``(IMG_SIZE, IMG_SIZE)`` with 3 channels and values in [0, 1].
        Labels are ``nutriscore_letter - 1``.
    """
    paths = df_subset["image_160_path"].values
    # Shift labels down by one so they start at 0 when the raw labels start at 1.
    labels = df_subset["nutriscore_letter"].values - 1

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    if shuffle:
        # Shuffle BEFORE decoding: the buffer then holds only (path, label)
        # pairs instead of decoded float32 images, and an epoch can start
        # emitting batches without first decoding the whole subset.
        ds = ds.shuffle(buffer_size=len(df_subset), reshuffle_each_iteration=True)

    def _load_image(path, label):
        img_bytes = tf.io.read_file(path)
        # decode_image handles both JPEG and PNG; expand_animations=False
        # keeps the result a single 3-D image so it can be resized.
        img = tf.image.decode_image(img_bytes, channels=3, expand_animations=False)
        img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
        img = tf.cast(img, tf.float32) / 255.0  # scale to [0,1]

        if augment:
            img = tf.image.random_flip_left_right(img)
            img = tf.image.random_brightness(img, max_delta=0.1)
            img = tf.image.random_contrast(img, 0.9, 1.1)
            # Brightness/contrast jitter can leave [0, 1]; clip so augmented
            # images stay in the same range as non-augmented ones.
            img = tf.clip_by_value(img, 0.0, 1.0)

        # Optionally normalize roughly like ImageNet (not required for simple CNN)
        # img = (img - tf.constant([0.485, 0.456, 0.406])) / tf.constant([0.229, 0.224, 0.225])

        return img, label

    ds = ds.map(_load_image, num_parallel_calls=tf.data.AUTOTUNE)

    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [9]:
BATCH_SIZE = 32

# Training data is shuffled and augmented; validation and test data are
# served in a fixed order with no augmentation.
train_ds = make_dataset(df_train, batch_size=BATCH_SIZE, shuffle=True, augment=True)
val_ds, test_ds = (
    make_dataset(frame, batch_size=BATCH_SIZE, shuffle=False, augment=False)
    for frame in (df_val, df_test)
)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode the training labels as consecutive integers starting at 0; these
# are the class indices used to compute per-class weights.
le = LabelEncoder()
train_labels = le.fit_transform(df_train["nutriscore_letter"])

# make_dataset encodes labels as `nutriscore_letter - 1`. LabelEncoder only
# produces the same indices when the training labels are exactly 1..K with
# no gaps. Fail loudly if the two encodings differ, because class weights
# keyed by these indices would otherwise be applied to the wrong classes.
assert np.array_equal(
    train_labels, df_train["nutriscore_letter"].to_numpy() - 1
), "LabelEncoder indices differ from the `nutriscore_letter - 1` encoding used by make_dataset"


In [11]:
# Confirm the encoded training labels are an integer array, as required by
# np.bincount, which expects non-negative integers.
print(train_labels.dtype)  # int64


int64


In [12]:
# Inverse-frequency class weights: total / count per class, then rescaled so
# the weights average to 1. The 1e-6 guards against division by zero for a
# class with no training examples.
class_counts = np.bincount(train_labels, minlength=NUM_CLASSES)
class_weights = class_counts.sum() / (class_counts + 1e-6)
class_weights /= class_weights.mean()

print("Class counts:", class_counts)
print("Class weights:", class_weights)

# Plain {class_index: weight} mapping with built-in Python floats.
class_weight_dict = dict(enumerate(class_weights.tolist()))
class_weight_dict


Class counts: [525  60 208 423 693]
Class weights: [0.35031794 3.06528195 0.88421596 0.43479177 0.26539238]


{0: 0.3503179421211768,
 1: 3.065281948310897,
 2: 0.8842159571140376,
 3: 0.4347917719365107,
 4: 0.2653923805173786}

In [13]:
def check_ds_global(ds, name):
    """Print the label range and the distinct labels found in a dataset.

    Parameters
    ----------
    ds
        Dataset exposing ``unbatch()``, which yields ``(features, label)``
        pairs whose label has a ``.numpy()`` method.
    name
        Text printed at the start of the summary line.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    labels = np.array([label.numpy() for _, label in ds.unbatch()])
    lowest, highest = labels.min(), labels.max()
    print(name, "min:", lowest, "max:", highest, "unique:", np.unique(labels))

# Run the label sanity check over every split, in train/val/test order.
for _split_name, _split_ds in (("train", train_ds), ("val", val_ds), ("test", test_ds)):
    check_ds_global(_split_ds, _split_name)


train min: 0 max: 4 unique: [0 1 2 3 4]
val min: 0 max: 4 unique: [0 1 2 3 4]
test min: 0 max: 4 unique: [0 1 2 3 4]
