In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import numpy as np
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('../FoodFactsCleaned.csv')
print(df.shape)

(5138, 53)


In [6]:
feature_cols = [
    'nova_group', 'fat_100g',
    'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
    'proteins_100g', 'contains_palm_oil', 'vegetarian_status', 'vegan_status',
    'nutrient_level_fat', 'nutrient_level_saturated_fat',
    'nutrient_level_sugars', 'nutrient_level_salt', 'ecoscore_grade', 'ecoscore_score',
    'carbon_footprint_100g', 'additives_count', 'sugar_ratio',
    'energy_density', 'protein_ratio', 'macro_balance', 'healthy_score',
    'log_energy_kcal_100g', 'log_salt_100g'
]
cat_features = [
    'nova_group', 'contains_palm_oil', 'vegetarian_status', 'vegan_status',
    'nutrient_level_fat', 'nutrient_level_saturated_fat',
    'nutrient_level_sugars', 'nutrient_level_salt', 'ecoscore_grade'
]

In [7]:
X = df[feature_cols].values
y = df["nutriscore_letter"].values

In [8]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,  
    random_state=42,
    stratify=y_train_val
)

In [9]:
print("Full distribution:\n", pd.Series(y).value_counts(normalize=True))
print("\nTrain distribution:\n", pd.Series(y_train).value_counts(normalize=True))
print("\nVal distribution:\n", pd.Series(y_val).value_counts(normalize=True))
print("\nTest distribution:\n", pd.Series(y_test).value_counts(normalize=True))

Full distribution:
 5.0    0.224212
3.0    0.204360
4.0    0.192098
2.0    0.191709
1.0    0.187622
Name: proportion, dtype: float64

Train distribution:
 5.0    0.224205
3.0    0.204413
4.0    0.192083
2.0    0.191759
1.0    0.187541
Name: proportion, dtype: float64

Val distribution:
 5.0    0.223735
3.0    0.204280
4.0    0.192607
2.0    0.191634
1.0    0.187743
Name: proportion, dtype: float64

Test distribution:
 5.0    0.224708
3.0    0.204280
2.0    0.191634
4.0    0.191634
1.0    0.187743
Name: proportion, dtype: float64


In [10]:
print("Train size:", X_train.shape[0])
print("Val size:  ", X_val.shape[0])
print("Test size: ", X_test.shape[0])

Train size: 3082
Val size:   1028
Test size:  1028


In [11]:
# ========= Scale features =========
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)


In [12]:
# ========= Logistic Regression baseline =========
log_reg = LogisticRegression(
    max_iter=2000,
    multi_class="multinomial",
    n_jobs=-1
)


In [13]:
df.isna().sum()


url                                0
product_name                       0
barcode                            0
brand                            163
quantity                         887
serving_size                    1908
nutriscore_letter                  0
nova_group                         0
ingredients_text                 377
allergens                       1792
traces                          2720
fat_100g                           0
saturated_fat_100g                 0
carbohydrates_100g                 0
sugars_100g                        0
fiber_100g                         0
proteins_100g                      0
main_image_url                     0
categories                         0
contains_palm_oil                  0
vegetarian_status                  0
vegan_status                       0
nutrient_level_fat                 0
nutrient_level_saturated_fat       0
nutrient_level_sugars              0
nutrient_level_salt                0
additives                          0
p

In [14]:
log_reg.fit(X_train_scaled, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [77]:
def evaluate_model(model, X_tr, y_tr, X_v, y_v, X_te, y_te, name="model"):
    print(f"\n===== {name} - TRAIN =====")
    y_tr_pred = model.predict(X_tr)
    print("Accuracy:", accuracy_score(y_tr, y_tr_pred))
    print("Macro F1:", f1_score(y_tr, y_tr_pred, average="macro"))

    print(f"\n===== {name} - VAL =====")
    y_v_pred = model.predict(X_v)
    print("Accuracy:", accuracy_score(y_v, y_v_pred))
    print("Macro F1:", f1_score(y_v, y_v_pred, average="macro"))
    print("\nClassification report (VAL):")
    print(classification_report(y_v, y_v_pred, digits=3))

    print(f"\n===== {name} - TEST =====")
    y_te_pred = model.predict(X_te)
    print("Accuracy:", accuracy_score(y_te, y_te_pred))
    print("Macro F1:", f1_score(y_te, y_te_pred, average="macro"))


In [78]:
evaluate_model(
    log_reg,
    X_train_scaled, y_train,
    X_val_scaled,   y_val,
    X_test_scaled,  y_test,
    name="Logistic Regression (tabular)"
)


===== Logistic Regression (tabular) - TRAIN =====
Accuracy: 0.7608695652173914
Macro F1: 0.7568481939096788

===== Logistic Regression (tabular) - VAL =====
Accuracy: 0.7441634241245136
Macro F1: 0.7407492912201546

Classification report (VAL):
              precision    recall  f1-score   support

         1.0      0.768     0.808     0.788       193
         2.0      0.738     0.629     0.679       197
         3.0      0.624     0.695     0.658       210
         4.0      0.694     0.687     0.690       198
         5.0      0.894     0.883     0.888       230

    accuracy                          0.744      1028
   macro avg      0.744     0.740     0.741      1028
weighted avg      0.747     0.744     0.744      1028


===== Logistic Regression (tabular) - TEST =====
Accuracy: 0.7431906614785992
Macro F1: 0.7397225953272629


In [None]:
# ========= Random Forest baseline (no scaling needed) =========
rf = RandomForestClassifier(
    n_estimators=60,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)


0,1,2
,n_estimators,60
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [80]:

evaluate_model(
    rf,
    X_train, y_train,
    X_val,   y_val,
    X_test,  y_test,
    name="Random Forest (tabular)"
)


===== Random Forest (tabular) - TRAIN =====
Accuracy: 1.0
Macro F1: 1.0

===== Random Forest (tabular) - VAL =====
Accuracy: 0.8326848249027238
Macro F1: 0.8305067225909462

Classification report (VAL):
              precision    recall  f1-score   support

         1.0      0.822     0.860     0.841       193
         2.0      0.788     0.756     0.772       197
         3.0      0.780     0.810     0.794       210
         4.0      0.811     0.823     0.817       198
         5.0      0.954     0.904     0.929       230

    accuracy                          0.833      1028
   macro avg      0.831     0.831     0.831      1028
weighted avg      0.834     0.833     0.833      1028


===== Random Forest (tabular) - TEST =====
Accuracy: 0.8482490272373541
Macro F1: 0.8461672676705009


In [None]:
# ========= XGBoost baseline  =========

# Convert y labels to be 0-indexed for XGBoost
y_train_xgb = y_train - 1
y_val_xgb = y_val - 1
y_test_xgb = y_test - 1

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1
)

xgb.fit(
    X_train, y_train_xgb,
    eval_set=[(X_val, y_val_xgb)],
    verbose=False
)

evaluate_model(
    xgb,
    X_train, y_train_xgb,
    X_val,   y_val_xgb,
    X_test,  y_test_xgb,
    name="XGBoost (tabular)"
)


===== XGBoost (tabular) - TRAIN =====
Accuracy: 0.9386761842959117
Macro F1: 0.9374628374009738

===== XGBoost (tabular) - VAL =====
Accuracy: 0.816147859922179
Macro F1: 0.8142068098217731

Classification report (VAL):
              precision    recall  f1-score   support

         0.0      0.833     0.829     0.831       193
         1.0      0.734     0.772     0.752       197
         2.0      0.774     0.767     0.770       210
         3.0      0.815     0.803     0.809       198
         4.0      0.916     0.900     0.908       230

    accuracy                          0.816      1028
   macro avg      0.815     0.814     0.814      1028
weighted avg      0.817     0.816     0.817      1028


===== XGBoost (tabular) - TEST =====
Accuracy: 0.8336575875486382
Macro F1: 0.8318269402184646


### MLP

In [20]:
# ========= MLP =========
# Encode labels to integers
le_mlp = LabelEncoder()
y_train_enc = le_mlp.fit_transform(y_train)
y_val_enc = le_mlp.transform(y_val)
y_test_enc = le_mlp.transform(y_test)
num_classes = len(le_mlp.classes_)

# Build MLP model
def build_mlp(input_dim, num_classes):
    model = Sequential([
        Dense(256, activation="relu", input_shape=(input_dim,)),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(num_classes, activation="softmax")
    ])
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


In [43]:
# Create and train model
mlp = build_mlp(X_train_scaled.shape[1], num_classes)

es = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history_mlp = mlp.fit(
    X_train_scaled, y_train_enc,
    validation_data=(X_val_scaled, y_val_enc),
    epochs=30,
    batch_size=32,
    callbacks=[es],
    verbose=1
)

# Predict
y_pred_mlp_train_enc = mlp.predict(X_train_scaled).argmax(axis=1)
y_pred_mlp_val_enc = mlp.predict(X_val_scaled).argmax(axis=1)
y_pred_mlp_test_enc = mlp.predict(X_test_scaled).argmax(axis=1)

# Decode predictions back to original labels
y_pred_mlp_train = le_mlp.inverse_transform(y_pred_mlp_train_enc)
y_pred_mlp_val = le_mlp.inverse_transform(y_pred_mlp_val_enc)
y_pred_mlp_test = le_mlp.inverse_transform(y_pred_mlp_test_enc)

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5396 - loss: 1.0356 - val_accuracy: 0.6741 - val_loss: 0.7257
Epoch 2/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6768 - loss: 0.7586 - val_accuracy: 0.7121 - val_loss: 0.6363
Epoch 3/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7158 - loss: 0.6830 - val_accuracy: 0.7636 - val_loss: 0.6037
Epoch 4/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7313 - loss: 0.6475 - val_accuracy: 0.7558 - val_loss: 0.6163
Epoch 5/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7411 - loss: 0.6135 - val_accuracy: 0.7704 - val_loss: 0.5984
Epoch 6/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7544 - loss: 0.5999 - val_accuracy: 0.7558 - val_loss: 0.6114
Epoch 7/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━

# CatBoost

In [50]:
X = df[feature_cols].copy()
y = df["nutriscore_letter"].copy()

In [53]:
#--CatBoost--
for col in cat_features:
    X[col] = X[col].fillna(-1).astype(int)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)

cat_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    use_best_model=True,
    verbose=100
)

0:	learn: 0.6495782	test: 0.6439689	best: 0.6439689 (0)	total: 29.9ms	remaining: 29.9s
100:	learn: 0.8238157	test: 0.7966926	best: 0.7966926 (92)	total: 3.73s	remaining: 33.2s
200:	learn: 0.8500973	test: 0.8044747	best: 0.8064202 (181)	total: 7s	remaining: 27.8s
300:	learn: 0.8757300	test: 0.8151751	best: 0.8151751 (295)	total: 10.5s	remaining: 24.4s
400:	learn: 0.8932511	test: 0.8210117	best: 0.8239300 (365)	total: 13.8s	remaining: 20.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8239299611
bestIteration = 365

Shrink model to first 366 iterations.


<catboost.core.CatBoostClassifier at 0x21b95db08e0>

In [55]:
# Predict (CatBoost outputs class labels directly)
y_pred_cat_train = cat_model.predict(X_train)
y_pred_cat_val   = cat_model.predict(X_val)
y_pred_cat_test  = cat_model.predict(X_test)

# CatBoost sometimes returns shape (n,1) → flatten
y_pred_cat_train = y_pred_cat_train.ravel()
y_pred_cat_val   = y_pred_cat_val.ravel()
y_pred_cat_test  = y_pred_cat_test.ravel()


In [75]:
# Evaluate
print("\n===== CatBoost - TRAIN =====")
print("Accuracy:", accuracy_score(y_train, y_pred_cat_train))
print("Macro F1:", f1_score(y_train, y_pred_cat_train, average="macro"))

print("\n===== CatBoost - VAL =====")
print("Accuracy:", accuracy_score(y_val, y_pred_cat_val))
print("Macro F1:", f1_score(y_val, y_pred_cat_val, average="macro"))
print("\nClassification report (VAL):")
print(classification_report(y_val, y_pred_cat_val, digits=3))

print("\n===== CatBoost - TEST =====")
print("Accuracy:", accuracy_score(y_test, y_pred_cat_test))
print("Macro F1:", f1_score(y_test, y_pred_cat_test, average="macro"))




===== CatBoost - TRAIN =====
Accuracy: 0.8828682673588579
Macro F1: 0.880951960815708

===== CatBoost - VAL =====
Accuracy: 0.8239299610894941
Macro F1: 0.8221515440743327

Classification report (VAL):
              precision    recall  f1-score   support

         1.0      0.849     0.819     0.834       193
         2.0      0.757     0.777     0.767       197
         3.0      0.766     0.810     0.787       210
         4.0      0.803     0.803     0.803       198
         5.0      0.941     0.900     0.920       230

    accuracy                          0.824      1028
   macro avg      0.823     0.822     0.822      1028
weighted avg      0.826     0.824     0.825      1028


===== CatBoost - TEST =====
Accuracy: 0.8190661478599222
Macro F1: 0.8171208446557543


# LightGBM

In [68]:
#---LightGBM--
lgbm_numeric = lgb.LGBMClassifier(
    n_estimators=20,
    learning_rate=0.05,
    num_leaves=31,
    boosting_type='gbdt',
    objective='multiclass',
    num_class=5,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
# Train with Early Stopping
lgbm_numeric.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],

    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[20]	valid_0's multi_logloss: 0.705289


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,20
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [69]:
# Predict
y_pred_lgbm_train = lgbm_numeric.predict(X_train)
y_pred_lgbm_val   = lgbm_numeric.predict(X_val)
y_pred_lgbm_test  = lgbm_numeric.predict(X_test)


In [74]:
# Evaluate
print("\n===== LightGBM - TRAIN =====")
print("Accuracy:", accuracy_score(y_train, y_pred_lgbm_train))
print("Macro F1:", f1_score(y_train, y_pred_lgbm_train, average="macro"))

print("\n===== LightGBM - VAL =====")
print("Accuracy:", accuracy_score(y_val, y_pred_lgbm_val))
print("Macro F1:", f1_score(y_val, y_pred_lgbm_val, average="macro"))
print("\nClassification report (VAL):")
print(classification_report(y_val, y_pred_lgbm_val, digits=3))

print("\n===== LightGBM - TEST =====")
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm_test))
print("Macro F1:", f1_score(y_test, y_pred_lgbm_test, average="macro"))




===== LightGBM - TRAIN =====
Accuracy: 0.9033095392602206
Macro F1: 0.9016506126482897

===== LightGBM - VAL =====
Accuracy: 0.8103112840466926
Macro F1: 0.8079059149309348

Classification report (VAL):
              precision    recall  f1-score   support

         1.0      0.821     0.855     0.838       193
         2.0      0.744     0.766     0.755       197
         3.0      0.772     0.743     0.757       210
         4.0      0.798     0.778     0.788       198
         5.0      0.904     0.900     0.902       230

    accuracy                          0.810      1028
   macro avg      0.808     0.808     0.808      1028
weighted avg      0.810     0.810     0.810      1028


===== LightGBM - TEST =====
Accuracy: 0.811284046692607
Macro F1: 0.8088530898143494
