In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [26]:
TARGET_COL = "nutriscore_letter"  

# 2) Tabular (numeric) feature columns (CHANGE THIS)
TABULAR_COLS = [
    'nova_group', 'fat_100g',
    'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
    'proteins_100g', 'contains_palm_oil', 'vegetarian_status', 'vegan_status',
    'nutrient_level_fat', 'nutrient_level_saturated_fat',
    'nutrient_level_sugars', 'nutrient_level_salt', 'ecoscore_grade', 'ecoscore_score',
    'carbon_footprint_100g', 'additives_count', 'sugar_ratio',
    'energy_density', 'protein_ratio', 'macro_balance', 'healthy_score',
    'log_energy_kcal_100g', 'log_salt_100g'
]

# 3) Text columns (already preprocessed)
TEXT_COLS = [
    "brand_cleaned",
    "allergens_cleaned",
    "ingredients_text_cleaned",
    "countries_cleaned",
    "additives_cleaned",
]

In [27]:
df = pd.read_csv("../FoodFactsCleaned.csv")

# Ensure all text cols exist and are strings
for c in TEXT_COLS:
    if c not in df.columns:
        raise ValueError(f"Missing text column: {c}")
    df[c] = df[c].fillna("").astype(str)

# Concatenate all text into a single field
df["text_concat"] = df[TEXT_COLS].fillna("").agg(" ".join, axis=1)

# Define X and y
X = df[["text_concat"] + TABULAR_COLS]
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", len(X_train), "Test size:", len(X_test))

Train size: 4110 Test size: 1028


In [28]:
preprocessor = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            ngram_range=(1, 2),
            min_df=5,
            max_features=30000
        ), "text_concat"),
        ("num", StandardScaler(with_mean=False), TABULAR_COLS),
    ],
    remainder="drop"
)

# Fit and transform directly
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print("X_train_transformed shape:", X_train_transformed.shape)
print("X_test_transformed shape:", X_test_transformed.shape)

X_train_transformed shape: (4110, 7444)
X_test_transformed shape: (1028, 7444)


In [29]:
models = {
    "LogReg": LogisticRegression(
        max_iter=2000,
        n_jobs=-1,
        solver="saga",
        class_weight="balanced",
    ),

    "LinearSVC": LinearSVC(
        C=1.0,
        class_weight="balanced",
    ),

    "SGDClassifier (hinge SVM)": SGDClassifier(
        loss="hinge",
        max_iter=2000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ),

}


In [30]:
models = {
    "LogReg": LogisticRegression(
        max_iter=2000,
        n_jobs=-1,
        solver="saga",
        class_weight="balanced",
    ),

    "LinearSVC": LinearSVC(
        C=1.0,
        class_weight="balanced",
    ),

    "SGDClassifier (hinge SVM)": SGDClassifier(
        loss="hinge",
        max_iter=2000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ),

}


In [31]:
# =========================================
# 5. Train & evaluate each model (using pre-transformed data)
# =========================================

results = []

for name, clf in models.items():
    print(f"\n=== Training {name} ===")
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)

    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average="macro")

    print(f"{name}")
    print("-" * len(name))
    print(f"Accuracy:  {acc:.4f}")
    print(f"Macro F1:  {f1_macro:.4f}")
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

    results.append({
        "model": name,
        "accuracy": acc,
        "macro_f1": f1_macro
    })


=== Training LogReg ===
LogReg
------
Accuracy:  0.7821
Macro F1:  0.7797

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       193
           1       0.74      0.71      0.73       197
           2       0.73      0.74      0.73       210
           3       0.72      0.78      0.75       197
           4       0.90      0.88      0.89       231

    accuracy                           0.78      1028
   macro avg       0.78      0.78      0.78      1028
weighted avg       0.78      0.78      0.78      1028


=== Training LinearSVC ===
LinearSVC
---------
Accuracy:  0.7179
Macro F1:  0.7138

Classification report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76       193
           1       0.68      0.64      0.66       197
           2       0.63      0.61      0.62       210
           3       0.63      0.69      0.66       197
           4       0.86      0.88



In [32]:
mlp_model = MLPClassifier(
        hidden_layer_sizes=(256, 128),
        activation="relu",
        max_iter=50,
        random_state=42
    )

In [33]:
print(f"\n=== Training MLPClassifier ===")
mlp_model.fit(X_train_transformed, y_train)
y_pred = mlp_model.predict(X_test_transformed)

acc = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

print(f"{name}")
print("-" * len(name))
print(f"Accuracy:  {acc:.4f}")
print(f"Macro F1:  {f1_macro:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))

results.append({
    "model": "MLP",
    "accuracy": acc,
    "macro_f1": f1_macro
})


=== Training MLPClassifier ===
SGDClassifier (hinge SVM)
-------------------------
Accuracy:  0.7588
Macro F1:  0.7553

Classification report:
              precision    recall  f1-score   support

           0       0.86      0.67      0.75       193
           1       0.68      0.75      0.71       197
           2       0.69      0.70      0.69       210
           3       0.72      0.76      0.74       197
           4       0.86      0.90      0.88       231

    accuracy                           0.76      1028
   macro avg       0.76      0.75      0.76      1028
weighted avg       0.76      0.76      0.76      1028





In [34]:
# =========================================
# 6. Summary table
# =========================================

results_df = pd.DataFrame(results).sort_values(by="macro_f1", ascending=False)
print("\n=== Summary: Tabular + TF-IDF Text Models ===")
print(results_df.to_string(index=False))


=== Summary: Tabular + TF-IDF Text Models ===
                    model  accuracy  macro_f1
                   LogReg  0.782101  0.779662
                      MLP  0.758755  0.755311
                LinearSVC  0.717899  0.713787
SGDClassifier (hinge SVM)  0.694553  0.674301
