In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



In [2]:
# Load the cleaned table and drop every row that is missing either an image
# path or a nutriscore_letter label, then store the label as an integer.
df = (
    pd.read_csv("FoodFactsCleaned.csv")
    .dropna(subset=["image_160_path", "nutriscore_letter"])
    .copy()
)
df["nutriscore_letter"] = df["nutriscore_letter"].astype(int)

print("Rows with image + label:", len(df))

Rows with image + label: 5138


In [3]:
# Reuse a previously saved global split if one exists; otherwise build a
# fresh stratified 60/20/20 train/val/test partition for the image subset.
# Only row-index labels are split; the class column serves purely as the
# stratification target.
X_idx = df.index.values
y = df["nutriscore_letter"].values

# Stage 1: hold out 20% of all rows as the test set.
train_val_idx, test_idx = train_test_split(
    X_idx, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: carve the validation set out of the remaining 80%
# (a quarter of 80% is 20% of the total).
y_train_val = df.loc[train_val_idx, "nutriscore_letter"].values
train_idx, val_idx = train_test_split(
    train_val_idx, test_size=0.25, random_state=42, stratify=y_train_val
)

# Stage 3: every row starts as "train"; held-out rows are then relabelled.
df["split"] = "train"
df.loc[test_idx, "split"] = "test"
df.loc[val_idx, "split"] = "val"

In [4]:
import tensorflow as tf

In [5]:
# Sanity check: rows per split, followed by the overall row count.
print(df.loc[:, "split"].value_counts())
print("Total rows:", df.shape[0])


split
train    3082
test     1028
val      1028
Name: count, dtype: int64
Total rows: 5138


In [6]:
IMG_SIZE = 160      # side length (pixels) images are resized to
NUM_CLASSES = 5     # width of the classifier's output layer

# One independent frame per partition, selected through the "split" column.
df_train, df_val, df_test = (
    df.loc[df["split"] == part].copy() for part in ("train", "val", "test")
)

print("Train/Val/Test sizes:", len(df_train), len(df_val), len(df_test))



Train/Val/Test sizes: 3082 1028 1028


In [7]:
def make_dataset(df_subset, batch_size=32, shuffle=False, augment=False):
    """Build a batched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        df_subset: DataFrame with an ``image_160_path`` column (image file
            paths) and a ``nutriscore_letter`` column (integer labels; they
            are shifted down by one before use).
        batch_size: Number of examples per batch.
        shuffle: If True, the examples are reshuffled on every iteration.
        augment: If True, apply a random horizontal flip plus brightness and
            contrast jitter to each image.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches. Images
        are float32, resized to ``IMG_SIZE`` x ``IMG_SIZE`` and kept in the
        [0, 1] range.
    """
    paths = df_subset["image_160_path"].values
    labels = df_subset["nutriscore_letter"].values - 1

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    # Shuffle the lightweight (path, label) records *before* decoding, so the
    # full-size shuffle buffer holds strings rather than decoded images.
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df_subset), reshuffle_each_iteration=True)

    def _load_image(path, label):
        img_bytes = tf.io.read_file(path)
        # decode_image handles both PNG and JPEG; expand_animations=False
        # keeps the result a single 3-D image.
        img = tf.image.decode_image(img_bytes, channels=3, expand_animations=False)
        img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
        img = tf.cast(img, tf.float32) / 255.0  # scale to [0,1]

        if augment:
            img = tf.image.random_flip_left_right(img)
            img = tf.image.random_brightness(img, max_delta=0.1)
            img = tf.image.random_contrast(img, 0.9, 1.1)
            # Brightness/contrast jitter can push pixels outside [0, 1];
            # clip so augmented and plain images share the same value range.
            img = tf.clip_by_value(img, 0.0, 1.0)

        # Optionally normalize roughly like ImageNet (not required for simple CNN)
        # img = (img - tf.constant([0.485, 0.456, 0.406])) / tf.constant([0.229, 0.224, 0.225])

        return img, label

    ds = ds.map(_load_image, num_parallel_calls=tf.data.AUTOTUNE)

    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


In [8]:
BATCH_SIZE = 32

# Only the training pipeline is shuffled and augmented; validation and test
# use make_dataset's defaults (shuffle=False, augment=False).
train_ds = make_dataset(df_train, BATCH_SIZE, shuffle=True, augment=True)
val_ds = make_dataset(df_val, BATCH_SIZE)
test_ds = make_dataset(df_test, BATCH_SIZE)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the training labels and keep the encoded vector.
# NOTE(review): make_dataset derives its labels with a plain "- 1" offset
# instead of this encoder — confirm `le` / `train_labels` are still needed.
le = LabelEncoder().fit(df_train["nutriscore_letter"])
train_labels = le.transform(df_train["nutriscore_letter"])


In [12]:
def check_ds_global(ds, name):
    """Print the min, max and distinct label values found in a dataset.

    Args:
        ds: Batched dataset of (features, label) pairs; it is un-batched and
            every label is read through its ``.numpy()`` method.
        name: Tag printed at the start of the report line.
    """
    labels = np.array([label.numpy() for _, label in ds.unbatch()])
    print(
        name,
        "min:", labels.min(),
        "max:", labels.max(),
        "unique:", np.unique(labels),
    )

# Report the label range of every split (expected: class ids 0..4 each).
for split_name, split_ds in (("train", train_ds), ("val", val_ds), ("test", test_ds)):
    check_ds_global(split_ds, split_name)


train min: 0 max: 4 unique: [0 1 2 3 4]
val min: 0 max: 4 unique: [0 1 2 3 4]
test min: 0 max: 4 unique: [0 1 2 3 4]


### Simple CNN

In [15]:
from tensorflow.keras import layers, models

In [13]:
def build_simple_cnn(input_shape=(IMG_SIZE, IMG_SIZE, 3), num_classes=5):
    """Assemble the baseline convolutional classifier.

    Three stages of Conv2D (3x3, ReLU, "same" padding) followed by 2x2 max
    pooling, with 32, 64 and 128 filters, feed a
    Flatten -> Dense(256, ReLU) -> Dropout(0.5) head that ends in a softmax
    over ``num_classes`` outputs.

    Args:
        input_shape: Shape of a single input image (height, width, channels).
        num_classes: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras ``Model`` named "SimpleCNN".
    """
    inputs = layers.Input(shape=input_shape)

    # "same" padding keeps height/width in the conv; each pool halves them
    # (160 -> 80 -> 40 -> 20 when IMG_SIZE is 160).
    features = inputs
    for n_filters in (32, 64, 128):
        features = layers.Conv2D(
            n_filters, (3, 3), padding="same", activation="relu"
        )(features)
        features = layers.MaxPooling2D((2, 2))(features)

    # Classification head.
    features = layers.Flatten()(features)
    features = layers.Dense(256, activation="relu")(features)
    features = layers.Dropout(0.5)(features)
    outputs = layers.Dense(num_classes, activation="softmax")(features)

    return models.Model(inputs, outputs, name="SimpleCNN")



In [16]:
EPOCHS = 10

# Instantiate the baseline network and print its layer table.
simple_cnn = build_simple_cnn(num_classes=NUM_CLASSES)
simple_cnn.summary()

# Labels are integer class ids and the network already ends in a softmax,
# so the sparse cross-entropy is fed probabilities (from_logits=False).
simple_cnn.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=["accuracy"],
)

history_simple = simple_cnn.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)


Epoch 1/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 103ms/step - accuracy: 0.2641 - loss: 1.5909 - val_accuracy: 0.2986 - val_loss: 1.5563
Epoch 2/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 99ms/step - accuracy: 0.3047 - loss: 1.5519 - val_accuracy: 0.3181 - val_loss: 1.5416
Epoch 3/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 101ms/step - accuracy: 0.3219 - loss: 1.5277 - val_accuracy: 0.3035 - val_loss: 1.5382
Epoch 4/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 104ms/step - accuracy: 0.3271 - loss: 1.5179 - val_accuracy: 0.3200 - val_loss: 1.5380
Epoch 5/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 107ms/step - accuracy: 0.3514 - loss: 1.4927 - val_accuracy: 0.3288 - val_loss: 1.5249
Epoch 6/10
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 113ms/step - accuracy: 0.3579 - loss: 1.4779 - val_accuracy: 0.3161 - val_loss: 1.5261
Epoch 7/10
[1m97/97[0

In [None]:
# NOTE(review): this cell redefines build_simple_cnn with the same layers as
# the definition in an earlier cell; if executed it silently shadows that one.
# Consider deleting this copy so the notebook keeps a single definition.
def build_simple_cnn(input_shape=(IMG_SIZE, IMG_SIZE, 3), num_classes=5):
    """Build the baseline CNN: three conv/pool stages plus a dense softmax head.

    Args:
        input_shape: Shape of a single input image (height, width, channels).
        num_classes: Number of units in the final softmax layer.

    Returns:
        An uncompiled Keras ``Model`` named "SimpleCNN".
    """
    inputs = layers.Input(shape=input_shape)

    x = layers.Conv2D(32, (3, 3), padding="same", activation="relu")(inputs)
    x = layers.MaxPooling2D((2, 2))(x)      # halves H and W: 80x80 for a 160x160 input

    x = layers.Conv2D(64, (3, 3), padding="same", activation="relu")(x)
    x = layers.MaxPooling2D((2, 2))(x)      # 40x40 for a 160x160 input

    x = layers.Conv2D(128, (3, 3), padding="same", activation="relu")(x)
    x = layers.MaxPooling2D((2, 2))(x)      # 20x20 for a 160x160 input

    # Classification head: flatten, one hidden layer, dropout, softmax.
    x = layers.Flatten()(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    model = models.Model(inputs, outputs, name="SimpleCNN")
    return model



In [17]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input

In [None]:

def make_dataset_efficientnet(df_subset, batch_size=32, shuffle=False, augment=False):
    """Build a batched ``tf.data`` pipeline preprocessed for EfficientNet.

    Unlike ``make_dataset``, pixel values are left on the 0-255 scale and
    passed through the EfficientNet ``preprocess_input`` function.

    Args:
        df_subset: DataFrame with an ``image_160_path`` column (image file
            paths) and a ``nutriscore_letter`` column (integer labels; they
            are shifted down by one before use).
        batch_size: Number of examples per batch.
        shuffle: If True, the examples are reshuffled on every iteration.
        augment: If True, apply a random horizontal flip plus brightness and
            contrast jitter to each image.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of
        float32 images resized to ``IMG_SIZE`` x ``IMG_SIZE``.
    """
    paths = df_subset["image_160_path"].values
    labels = df_subset["nutriscore_letter"].values - 1

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    # Shuffle the lightweight (path, label) records *before* decoding, so the
    # full-size shuffle buffer holds strings rather than decoded images.
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df_subset), reshuffle_each_iteration=True)

    def _load_image(path, label):
        img_bytes = tf.io.read_file(path)
        img = tf.image.decode_image(img_bytes, channels=3, expand_animations=False)
        img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))  # float32, still 0-255

        if augment:
            img = tf.image.random_flip_left_right(img)
            # The brightness delta is added directly to float pixels. On a
            # 0-255 scale a delta of 0.1 would be invisible, so express the
            # intended 10% jitter in pixel units.
            img = tf.image.random_brightness(img, max_delta=0.1 * 255.0)
            img = tf.image.random_contrast(img, 0.9, 1.1)
            # Keep augmented pixels inside the valid 0-255 range.
            img = tf.clip_by_value(img, 0.0, 255.0)

        img = tf.cast(img, tf.float32)
        img = preprocess_input(img)  # EfficientNet preprocessing

        return img, label

    ds = ds.map(_load_image, num_parallel_calls=tf.data.AUTOTUNE)

    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

