In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    Embedding, GlobalAveragePooling1D, Dense,
    Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, SimpleRNN, GRU
)


In [None]:
# Load the cleaned food-facts table (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer="../FoodFactsCleaned.csv")

In [None]:
# Text columns that are joined into one document per product further down.
TEXT_COLS = [
    "brand_cleaned", "allergens_cleaned", "ingredients_text_cleaned",
    "countries_cleaned", "additives_cleaned",
]

In [None]:

# Concatenate the text columns into a single space-separated field per product.
# Missing values become empty strings, and every cell is cast to str so that a
# non-string value (e.g. a numeric code) cannot make " ".join raise TypeError.
df["text_concat"] = (
    df[TEXT_COLS]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
)

In [None]:
# Column of `df` used as the classification label.
TARGET_COL: str = "nutriscore_letter"

In [None]:
# Model input is the concatenated text; the label is the target column.
X_text, y = df["text_concat"], df[TARGET_COL]

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train), "Test size:", len(X_test))


In [None]:

def evaluate_model(name, y_true, y_pred):
    """Print accuracy, macro-averaged F1 and the per-class report for a model.

    Args:
        name: Label printed as the heading of the report.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. Everything is written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    # Heading (underlined to the length of the name) followed by the two
    # summary metrics, emitted one per line.
    header_lines = [
        f"\n{name}",
        "-" * len(name),
        f"Accuracy:  {accuracy:.4f}",
        f"Macro F1:  {macro_f1:.4f}",
        "\nClassification report:",
    ]
    print(*header_lines, sep="\n")
    print(classification_report(y_true, y_pred))
    print("=" * 80)

In [None]:
# One summary row (dict) per evaluated model; filled by log_result().
results = []

def log_result(name, y_true, y_pred):
    """Print an evaluation report for a model and record its summary metrics.

    Prints accuracy, macro-averaged F1 and the per-class classification
    report, then stores ``{"model", "accuracy", "macro_f1"}`` in the
    module-level ``results`` list.

    Args:
        name: Model label; printed as the heading and used as the key that
            identifies the model's row in ``results``.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None.
    """
    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    print(f"\n{name}")
    print("-" * len(name))
    print(f"Accuracy:  {acc:.4f}")
    print(f"Macro F1:  {f1_macro:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred))
    print("=" * 80)

    entry = {
        "model": name,
        "accuracy": acc,
        "macro_f1": f1_macro
    }
    # Re-running a model's cell must not leave two rows for the same model:
    # overwrite the existing row in place (keeping its position), otherwise
    # append a new one.
    for idx, row in enumerate(results):
        if row["model"] == name:
            results[idx] = entry
            break
    else:
        results.append(entry)

# PATH A: TF-IDF features + linear classifiers

In [None]:
# Unigram + bigram TF-IDF features. Terms that occur in fewer than 5 documents
# are dropped and the vocabulary is capped at 30,000 entries.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=30_000)

In [None]:
# Learn the vocabulary and IDF weights from the training texts only, then
# apply that fitted vectorizer to the test texts, so no test-set statistics
# influence the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Logistic-regression baseline on the TF-IDF features.
# `multi_class="multinomial"` is intentionally not passed: the parameter is
# deprecated since scikit-learn 1.5 and scheduled for removal, and with the
# default lbfgs solver a problem with more than two classes is already fitted
# with the multinomial loss.
# NOTE(review): assumes the target has more than two classes — confirm.
log_reg = LogisticRegression(
    max_iter=2000,
    n_jobs=-1,
    class_weight="balanced"  # reweight classes inversely to their frequency
)

In [None]:
# Fit on the training features (fit() returns the estimator itself), predict
# the held-out set, then print and record the metrics.
y_pred_lr = log_reg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
log_result("Path A1: TF-IDF + LogisticRegression", y_test, y_pred_lr)

In [None]:

# Linear SVM baseline on the same TF-IDF features as the logistic regression.
svm_clf = LinearSVC(
    C=1.0,
    class_weight="balanced",
    # liblinear uses a random number generator while fitting; seeding it (same
    # seed as the train/test split) makes the reported scores repeatable.
    random_state=42,
)
svm_clf.fit(X_train_tfidf, y_train)
y_pred_svm = svm_clf.predict(X_test_tfidf)
log_result("Path A2: TF-IDF + LinearSVC", y_test, y_pred_svm)