In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import numpy as np
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [None]:
# Location of the cleaned dataset. The default is the Colab upload path this
# notebook was written against; set the FOODFACTS_CSV environment variable to
# run it elsewhere without editing the cell.
DATA_PATH = os.environ.get("FOODFACTS_CSV", '/content/FoodFactsCleaned.csv')
df = pd.read_csv(DATA_PATH)
print(df.shape)

In [None]:
# Columns fed to every model. The order here is the column order of the
# feature matrix built from df[feature_cols].
feature_cols = [
    'nova_group',
    'fat_100g',
    'saturated_fat_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'contains_palm_oil',
    'vegetarian_status',
    'vegan_status',
    'nutrient_level_fat',
    'nutrient_level_saturated_fat',
    'nutrient_level_sugars',
    'nutrient_level_salt',
    'ecoscore_grade',
    'ecoscore_score',
    'carbon_footprint_100g',
    'additives_count',
    'sugar_ratio',
    'energy_density',
    'protein_ratio',
    'macro_balance',
    'healthy_score',
    'log_energy_kcal_100g',
    'log_salt_100g',
]

# Subset of feature_cols that the CatBoost cell treats as categorical.
cat_features = [
    'nova_group',
    'contains_palm_oil',
    'vegetarian_status',
    'vegan_status',
    'nutrient_level_fat',
    'nutrient_level_saturated_fat',
    'nutrient_level_sugars',
    'nutrient_level_salt',
    'ecoscore_grade',
]

In [None]:
# Pull the model inputs and the target out of the frame as NumPy arrays.
y = df["nutriscore_letter"].to_numpy()
X = df.loc[:, feature_cols].to_numpy()

In [None]:
# Two-stage stratified split. The first call holds out 20% of the rows as the
# test set; the second takes 25% of the remaining 80% (i.e. 20% of all rows)
# as validation, leaving 60% for training.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, stratify=y_train_val, test_size=0.25, random_state=42
)

In [None]:
# Show the normalised class frequencies of the full data and of each split so
# the effect of stratification can be checked by eye.
for split_header, split_labels in (
    ("Full distribution:\n", y),
    ("\nTrain distribution:\n", y_train),
    ("\nVal distribution:\n", y_val),
    ("\nTest distribution:\n", y_test),
):
    print(split_header, pd.Series(split_labels).value_counts(normalize=True))

In [None]:
# Row counts of the three splits.
for size_caption, split_matrix in (
    ("Train size:", X_train),
    ("Val size:  ", X_val),
    ("Test size: ", X_test),
):
    print(size_caption, split_matrix.shape[0])

In [None]:
# ========= Scale features =========
# The scaler is fitted on the training rows only, then the same transform is
# applied to every split so that validation/test statistics never leak in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [None]:
# ========= Logistic Regression baseline =========
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and the default lbfgs solver already fits a
# multinomial (softmax) model whenever the target has more than two classes.
log_reg = LogisticRegression(
    max_iter=2000,
    n_jobs=-1
)


In [None]:
# Per-column count of missing values in the loaded frame.
df.isnull().sum()


In [None]:
# Fit the baseline on the standardised training split.
log_reg.fit(X=X_train_scaled, y=y_train)

In [None]:
def evaluate_model(model, X_tr, y_tr, X_v, y_v, X_te, y_te, name="model"):
    """Print accuracy and macro-F1 of a fitted model on train, val and test.

    For each split the model's ``predict`` is called once and the two scores
    are printed under a ``===== <name> - <SPLIT> =====`` banner. The
    validation split additionally gets a classification report and the test
    split a confusion matrix. Nothing is returned.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_tr, y_tr : training features and labels.
    X_v, y_v : validation features and labels.
    X_te, y_te : test features and labels.
    name : str, label used in the printed banners.
    """
    splits = (
        ("TRAIN", X_tr, y_tr),
        ("VAL", X_v, y_v),
        ("TEST", X_te, y_te),
    )
    for tag, features, targets in splits:
        print(f"\n===== {name} - {tag} =====")
        predicted = model.predict(features)
        print("Accuracy:", accuracy_score(targets, predicted))
        print("Macro F1:", f1_score(targets, predicted, average="macro"))

        # Split-specific extras: per-class report on validation, confusion
        # matrix on test, nothing more on train.
        if tag == "VAL":
            print("\nClassification report (VAL):")
            print(classification_report(targets, predicted, digits=3))
        elif tag == "TEST":
            print("\nConfusion matrix (TEST):")
            print(confusion_matrix(targets, predicted))

In [None]:
# Logistic regression was trained on standardised features, so it is scored
# on the standardised copies of every split.
evaluate_model(
    model=log_reg,
    X_tr=X_train_scaled,
    y_tr=y_train,
    X_v=X_val_scaled,
    y_v=y_val,
    X_te=X_test_scaled,
    y_te=y_test,
    name="Logistic Regression (tabular)",
)

In [None]:
# ========= Random Forest baseline (no scaling needed) =========
# 60 unrestricted-depth trees, built in parallel on all cores, with a fixed
# seed so the forest is reproducible. Trained on the unscaled features.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=60,
    max_depth=None,
)
rf.fit(X=X_train, y=y_train)


In [None]:

# The forest was trained on unscaled features, so it is scored on the
# unscaled splits as well.
evaluate_model(
    model=rf,
    X_tr=X_train,
    y_tr=y_train,
    X_v=X_val,
    y_v=y_val,
    X_te=X_test,
    y_te=y_test,
    name="Random Forest (tabular)",
)

In [None]:
# ========= XGBoost baseline  =========

# XGBoost expects class ids that start at 0, so shift every label down by one.
# assumes nutriscore_letter is stored as integers 1..5 — TODO confirm
y_train_xgb = y_train - 1
y_val_xgb = y_val - 1
y_test_xgb = y_test - 1

# The estimator is bound to `xgb_model` rather than `xgb` so that it does not
# overwrite the `import xgboost as xgb` module alias from the imports cell.
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

# The validation split is passed as eval_set for monitoring only; no early
# stopping is configured, so all 500 boosting rounds are trained.
xgb_model.fit(
    X_train, y_train_xgb,
    eval_set=[(X_val, y_val_xgb)],
    verbose=False
)

# Scores are reported in the shifted (0-based) label space used for training.
evaluate_model(
    xgb_model,
    X_train, y_train_xgb,
    X_val,   y_val_xgb,
    X_test,  y_test_xgb,
    name="XGBoost (tabular)"
)

### MLP

In [None]:
# ========= MLP =========
# Map the class labels to consecutive integer ids. The encoder learns the
# label set from the training split only and is then applied to all splits.
le_mlp = LabelEncoder().fit(y_train)
y_train_enc = le_mlp.transform(y_train)
y_val_enc = le_mlp.transform(y_val)
y_test_enc = le_mlp.transform(y_test)

# Number of distinct classes seen in training; sizes the softmax layer.
num_classes = len(le_mlp.classes_)

# Build MLP model
def build_mlp(input_dim, num_classes):
    """Build and compile a three-hidden-layer MLP classifier.

    The network is 256 -> 128 -> 64 ReLU units with dropout after each hidden
    layer (0.3, 0.3, 0.2) and a softmax output of width ``num_classes``. It is
    compiled with Adam and sparse categorical cross-entropy, so targets must
    be integer class ids rather than one-hot vectors.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    num_classes : int
        Number of output classes.

    Returns
    -------
    tensorflow.keras.Model
        The compiled, untrained model.
    """
    model = Sequential([
        # An explicit Input layer replaces `input_shape=` on the first Dense
        # layer, which Keras 3 warns against for Sequential models.
        layers.Input(shape=(input_dim,)),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(num_classes, activation="softmax")
    ])
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


In [None]:
# Train the MLP and score it on the held-out test split.
# Previously this cell read `y_true` / `y_pred` without any cell defining them
# (the model was never built or fitted), so it failed on a fresh kernel.
tf.random.set_seed(42)  # make weight init / dropout reproducible

mlp_model = build_mlp(X_train_scaled.shape[1], num_classes)

# Stop when validation loss stalls and roll back to the best epoch.
# NOTE(review): epochs / batch_size / patience are starting values — tune.
mlp_early_stop = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)
mlp_history = mlp_model.fit(
    X_train_scaled, y_train_enc,
    validation_data=(X_val_scaled, y_val_enc),
    epochs=100,
    batch_size=256,
    callbacks=[mlp_early_stop],
    verbose=0
)

# Predicted class = index of the largest softmax probability. Both arrays are
# in the integer-encoded label space produced by `le_mlp`.
y_true = y_test_enc
y_pred = np.argmax(mlp_model.predict(X_test_scaled, verbose=0), axis=1)

print("\n" + "="*30)
print("      MLP MODEL EVALUATION")
print("="*30)
print(f"Test Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(f"Test Macro F1: {f1_score(y_true, y_pred, average='macro'):.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred))

# CatBoost

In [None]:
#--CatBoost--
# CatBoost needs categorical columns as integers or strings, and needs column
# names to resolve the names listed in `cat_features`. So it is trained on a
# DataFrame copy of the features, not on the NumPy arrays used by the other
# models.
X_catboost = df[feature_cols].copy()
y_catboost = df["nutriscore_letter"].copy()

# Missing categories become the sentinel "-1"; values are cast through int so
# that e.g. 1.0 and 1 map to the same category string.
for col in cat_features:
    X_catboost[col] = X_catboost[col].fillna(-1).astype(int).astype(str)

# Re-create the 60/20/20 stratified split on the prepared frame. The sizes,
# seed and stratification labels match the earlier split of X / y, so the
# same rows should end up in each partition.
# Previously the prepared frame was never used: fit() received the NumPy
# X_train, on which the column names in `cat_features` cannot be resolved.
X_cb_train_val, X_cb_test, y_cb_train_val, y_cb_test = train_test_split(
    X_catboost, y_catboost,
    test_size=0.2,
    random_state=42,
    stratify=y_catboost
)
X_cb_train, X_cb_val, y_cb_train, y_cb_val = train_test_split(
    X_cb_train_val, y_cb_train_val,
    test_size=0.25,
    random_state=42,
    stratify=y_cb_train_val
)

cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)

# Early stopping watches accuracy on the validation split; use_best_model
# keeps the iteration with the best validation score.
cat_model.fit(
    X_cb_train, y_cb_train,
    cat_features=cat_features,
    eval_set=(X_cb_val, y_cb_val),
    early_stopping_rounds=50,
    use_best_model=True
)