In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Embedding, GlobalAveragePooling1D, Dense,
    Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, SimpleRNN, GRU
)
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Load the dataset from FoodFactsCleaned.csv; the path is relative, so it is
# resolved against the directory the notebook is run from.
df = pd.read_csv("../FoodFactsCleaned.csv")

In [None]:
# Columns of `df` that are joined, in this order, into one text document per row.
TEXT_COLS = [
    "brand_cleaned",
    "allergens_cleaned",
    "ingredients_text_cleaned",
    "countries_cleaned",
    "additives_cleaned",
]

In [None]:

# Concatenate into a single text field per product.
# Missing cells are replaced by "" first so they do not break the join; the
# TEXT_COLS values of each row are then joined with single spaces.
# NOTE(review): assumes every TEXT_COLS column holds strings — confirm.
df["text_concat"] = df[TEXT_COLS].fillna("").agg(" ".join, axis=1)

In [None]:
# Name of the column in `df` that holds the label to predict.
TARGET_COL = "nutriscore_letter"  

In [None]:
# Model input is the concatenated text; the target is the TARGET_COL column.
X_text = df["text_concat"]
y = df[TARGET_COL]

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on `y` keeps the
# label proportions of both parts in line with the full data, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Quick sanity check of the resulting split sizes.
print("Train size:", len(X_train), "Test size:", len(X_test))


In [None]:

def evaluate_model(name, y_true, y_pred):
    """Print an evaluation summary for one model.

    The summary consists of a heading built from ``name``, the accuracy and
    macro-averaged F1 (both to four decimals), the scikit-learn
    classification report and a closing rule of 80 ``=`` characters.
    Nothing is returned or stored.
    """
    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro")

    # Heading and headline metrics, emitted one line at a time.
    summary_lines = (
        f"\n{name}",
        "-" * len(name),
        f"Accuracy:  {acc:.4f}",
        f"Macro F1:  {f1_macro:.4f}",
        "\nClassification report:",
    )
    for line in summary_lines:
        print(line)

    # Per-class breakdown, then the closing rule.
    print(classification_report(y_true, y_pred))
    print("=" * 80)

In [None]:
# Store results: one {"model", "accuracy", "macro_f1"} record per evaluated model.
results = []

def log_result(name, y_true, y_pred):
    """Print an evaluation summary for one model and record it in ``results``.

    Parameters
    ----------
    name : str
        Label of the model; used as the printed heading and as the
        ``"model"`` value of the stored record.
    y_true, y_pred
        True and predicted labels, passed unchanged to the scikit-learn
        metric functions.

    Side effects
    ------------
    Prints accuracy, macro-averaged F1 and the classification report, then
    stores ``{"model", "accuracy", "macro_f1"}`` in the module-level
    ``results`` list. A record already stored under the same ``name`` is
    replaced, so re-running a model cell does not accumulate duplicate rows.
    """
    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    print(f"\n{name}")
    print("-" * len(name))
    print(f"Accuracy:  {acc:.4f}")
    print(f"Macro F1:  {f1_macro:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred))
    print("=" * 80)

    # Drop any earlier record for this model before appending the fresh one.
    # The slice assignment mutates the list in place, so every existing
    # reference to `results` sees the update.
    results[:] = [row for row in results if row["model"] != name]
    results.append({
        "model": name,
        "accuracy": acc,
        "macro_f1": f1_macro
    })

# PATH A: TF-IDF 

In [None]:
# TF-IDF over unigrams and bigrams. Terms that occur in fewer than 5 documents
# are dropped and the vocabulary is capped at 30,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=30000  
)

In [None]:
# Fit the vectorizer on the training texts only, then apply that same fitted
# vectorizer to the test texts so no test-set statistics enter the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Class-weighted logistic regression on the TF-IDF features.
# `multi_class="multinomial"` is no longer passed: the parameter has been
# deprecated since scikit-learn 1.5 (it triggers a FutureWarning and is
# scheduled for removal). With the default solver the estimator already fits
# a multinomial model whenever the target has more than two classes.
# NOTE(review): assumes the target has more than two classes — confirm.
log_reg = LogisticRegression(
    max_iter=2000,
    n_jobs=-1,
    class_weight="balanced"
)

In [None]:
# Train on the TF-IDF training matrix, predict the held-out set, then print
# and record the metrics under the "Path A1" label.
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)
log_result("Path A1: TF-IDF + LogisticRegression", y_test, y_pred_lr)

In [None]:

# Class-weighted linear SVM on the same TF-IDF features.
# random_state is pinned to the seed already used for the train/test split,
# so that any randomness inside the solver is repeatable between runs.
svm_clf = LinearSVC(
    C=1.0,
    class_weight="balanced",
    random_state=42
)
svm_clf.fit(X_train_tfidf, y_train)
y_pred_svm = svm_clf.predict(X_test_tfidf)
log_result("Path A2: TF-IDF + LinearSVC", y_test, y_pred_svm)

In [None]:
# ---- TF-IDF + (Neural Network) ----

# Convert sparse to dense for Keras (be careful with very high dim).
# The sparse matrices are cast to float32 *before* densifying: the dense
# arrays have n_samples x n_features entries, so this halves their memory
# footprint compared with densifying at the original precision.
# NOTE(review): relies on Keras using float32 as its default float type, so
# the values that reach the model should be unchanged — confirm.
X_train_dense = X_train_tfidf.astype(np.float32).toarray()
X_test_dense = X_test_tfidf.astype(np.float32).toarray()

# Encode labels as integers for Keras. The encoder is fitted on the training
# labels only and reused for the test labels, so both share one mapping.
le_tfidf = LabelEncoder()
y_train_enc = le_tfidf.fit_transform(y_train)
y_test_enc = le_tfidf.transform(y_test)
num_classes = len(le_tfidf.classes_)

# Number of TF-IDF features = width of the MLP's input layer.
input_dim = X_train_dense.shape[1]

def build_tfidf_mlp(input_dim, num_classes):
    """Build and compile a two-hidden-layer MLP for dense feature vectors.

    Layers: Dense(256, relu) taking ``input_dim`` features, Dropout(0.4),
    Dense(128, relu), Dropout(0.4), then a softmax layer with ``num_classes``
    units. The model is compiled with the Adam optimizer, sparse categorical
    cross-entropy loss and accuracy as the metric, and returned untrained.
    """
    stack = (
        Dense(256, activation="relu", input_shape=(input_dim,)),
        Dropout(0.4),
        Dense(128, activation="relu"),
        Dropout(0.4),
        Dense(num_classes, activation="softmax"),
    )

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return net

In [None]:
# Build a fresh MLP sized to the TF-IDF feature count and the number of labels.
mlp_tfidf = build_tfidf_mlp(input_dim, num_classes)

# Stop once the validation loss has gone 3 epochs without improving, and roll
# the model back to the weights of its best epoch.
es = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# validation_split=0.2 reserves part of the training data for validation, so
# the test set is never used while fitting. The returned history is plotted
# in a later cell.
history_mlp = mlp_tfidf.fit(
    X_train_dense,
    y_train_enc,
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    callbacks=[es],
    verbose=1
)

In [None]:
# Class probabilities for the test set -> index of the most likely class per
# row -> original label values via the fitted encoder.
y_proba_mlp = mlp_tfidf.predict(X_test_dense)
y_pred_mlp_enc = np.argmax(y_proba_mlp, axis=1)
y_pred_mlp = le_tfidf.inverse_transform(y_pred_mlp_enc)

# Compare against the un-encoded test labels and record under "Path A3".
log_result("Path A3: TF-IDF + Custom MLP", y_test, y_pred_mlp)


In [None]:
# Learning curves for the TF-IDF MLP: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs. validation accuracy per epoch
ax_acc.plot(history_mlp.history['accuracy'])
ax_acc.plot(history_mlp.history['val_accuracy'])
ax_acc.set_title('TF-IDF MLP Model accuracy')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(['Train', 'Validation'], loc='upper left')

# Right panel: training vs. validation loss per epoch
ax_loss.plot(history_mlp.history['loss'])
ax_loss.plot(history_mlp.history['val_loss'])
ax_loss.set_title('TF-IDF MLP Model loss')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(['Train', 'Validation'], loc='upper left')

# Render both panels
plt.show()

# PATH B: Neural Text Model

In [None]:
# ---- Encode labels to integers (for neural net) ----
# A separate encoder for the sequence models, fitted on the training labels
# only and reused for the test labels so both share one label -> index mapping.
le_seq = LabelEncoder()
y_train_seq_enc = le_seq.fit_transform(y_train)
y_test_seq_enc = le_seq.transform(y_test)
# Size of the softmax output layer of the sequence models.
num_classes_seq = len(le_seq.classes_)


In [None]:
# Disabled alternative label encoding, kept for reference only. It is
# superseded by `le_seq`; re-enabling it would overwrite `y_train_enc`,
# `y_test_enc` and `num_classes` from the TF-IDF section.
# (Written as comments rather than a bare string literal so that the cell no
# longer echoes the text as its output.)
# le = LabelEncoder()
# y_train_enc = le.fit_transform(y_train)
# y_test_enc = le.transform(y_test)
# num_classes = len(le.classes_)
# print("Number of classes:", num_classes)

In [None]:
# ---- 2) Tokenize & pad sequences ----
MAX_WORDS = 30000   # vocab size: upper bound on the token indices the tokenizer emits
MAX_LEN = 200       # max tokens per sample: every sequence is padded/cut to this length

# The vocabulary is built from the training texts only; words not kept in it
# are mapped to the "<OOV>" token instead of being dropped.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert each text into a list of integer token ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to exactly MAX_LEN ids: shorter ones are padded at the
# end, longer ones lose their tail ("post" for both padding and truncating).
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post", truncating="post")

In [None]:
# Input size of the embedding layers: the number of distinct words seen plus
# one, capped at MAX_WORDS.
# NOTE(review): the +1 presumably accounts for index 0, which the tokenizer
# does not assign to any word — confirm.
vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)

In [None]:
# ---- 3) Build the model ----
def build_baseline_text_model(vocab_size, max_len, num_classes):
    """Build and compile the bag-of-embeddings baseline classifier.

    Layers: Embedding(``vocab_size`` -> 128) over inputs of length
    ``max_len``, global average pooling, Dropout(0.3), Dense(64, relu),
    Dropout(0.3), then a softmax layer with ``num_classes`` units. The model
    is compiled with Adam, sparse categorical cross-entropy and accuracy, and
    returned untrained.
    """
    stack = (
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
        GlobalAveragePooling1D(),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(num_classes, activation="softmax"),
    )

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return net

In [None]:
def build_rnn_text_model(vocab_size, max_len, num_classes):
    """Build and compile a simple recurrent text classifier.

    Layers: Embedding(``vocab_size`` -> 128) over inputs of length
    ``max_len``, SimpleRNN(64) returning only its final output, Dropout(0.4),
    Dense(64, relu), Dropout(0.4), then a softmax layer with ``num_classes``
    units. The model is compiled with Adam, sparse categorical cross-entropy
    and accuracy, and returned untrained.
    """
    stack = (
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
        SimpleRNN(64, return_sequences=False),
        Dropout(0.4),
        Dense(64, activation="relu"),
        Dropout(0.4),
        Dense(num_classes, activation="softmax"),
    )

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return net


In [None]:
def build_cnn_text_model(vocab_size, max_len, num_classes):
    """Build and compile a 1-D convolutional text classifier.

    Layers: Embedding(``vocab_size`` -> 128) over inputs of length
    ``max_len``, Conv1D with 128 filters of width 5 and relu activation,
    global max pooling, Dropout(0.4), Dense(64, relu), Dropout(0.4), then a
    softmax layer with ``num_classes`` units. The model is compiled with
    Adam, sparse categorical cross-entropy and accuracy, and returned
    untrained.
    """
    stack = (
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
        Conv1D(filters=128, kernel_size=5, activation="relu"),
        GlobalMaxPooling1D(),
        Dropout(0.4),
        Dense(64, activation="relu"),
        Dropout(0.4),
        Dense(num_classes, activation="softmax"),
    )

    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return net

In [None]:
# Early stopping for the sequence models: stop after 3 epochs without a better
# validation loss and restore the weights of the best epoch.
es_seq = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

In [None]:
# ---- Baseline Embedding + GlobalAveragePooling ----
model_b1 = build_baseline_text_model(vocab_size, MAX_LEN, num_classes_seq)

# Train with the same early-stopping callback as the CNN model. Without it
# this model always ran the full 30 epochs and was evaluated with its
# last-epoch weights, while the other Keras models are evaluated with their
# best-validation-loss weights, which made the comparison uneven.
history_b1 = model_b1.fit(
    X_train_pad,
    y_train_seq_enc,
    validation_split=0.2,
    epochs=30,
    batch_size=64,
    callbacks=[es_seq],
    verbose=1
)

# Class probabilities -> most likely class index -> original label values.
y_proba_b1 = model_b1.predict(X_test_pad)
y_pred_b1_enc = np.argmax(y_proba_b1, axis=1)
y_pred_b1 = le_seq.inverse_transform(y_pred_b1_enc)

log_result("Embedding + GlobalAveragePooling", y_test, y_pred_b1)

In [None]:
# Learning curves for the baseline model: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs. validation accuracy per epoch
ax_acc.plot(history_b1.history['accuracy'])
ax_acc.plot(history_b1.history['val_accuracy'])
ax_acc.set_title('Embedding + GlobalAveragePooling accuracy')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(['Train', 'Validation'], loc='upper left')

# Right panel: training vs. validation loss per epoch
ax_loss.plot(history_b1.history['loss'])
ax_loss.plot(history_b1.history['val_loss'])
ax_loss.set_title('Embedding + GlobalAveragePooling loss')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(['Train', 'Validation'], loc='upper left')

plt.show()

In [None]:
# ---- CNN-based text model (TextCNN-style) ----

model_b3 = build_cnn_text_model(vocab_size, MAX_LEN, num_classes_seq)

# Train for up to 15 epochs; `es_seq` stops earlier when the validation loss
# stalls and restores the best weights.
history_b3 = model_b3.fit(
    X_train_pad,
    y_train_seq_enc,
    validation_split=0.2,
    epochs=15,
    batch_size=64,
    callbacks=[es_seq],
    verbose=1
)

# Class probabilities -> most likely class index -> original label values.
y_proba_b3 = model_b3.predict(X_test_pad)
y_pred_b3_enc = np.argmax(y_proba_b3, axis=1)
y_pred_b3 = le_seq.inverse_transform(y_pred_b3_enc)

log_result("Path B3: Embedding + Conv1D", y_test, y_pred_b3)

In [None]:
# Learning curves for the CNN model: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs. validation accuracy per epoch
ax_acc.plot(history_b3.history['accuracy'])
ax_acc.plot(history_b3.history['val_accuracy'])
ax_acc.set_title('Embedding + Conv1D accuracy')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(['Train', 'Validation'], loc='upper left')

# Right panel: training vs. validation loss per epoch
ax_loss.plot(history_b3.history['loss'])
ax_loss.plot(history_b3.history['val_loss'])
ax_loss.set_title('Embedding + Conv1D loss')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(['Train', 'Validation'], loc='upper left')

plt.show()