In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [2]:
# 1) Target column: the label the models are trained to predict
TARGET_COL = "nutriscore_letter"

# 2) Tabular (numeric) feature columns (CHANGE THIS)
# The list order is the order in which these columns are handed to the numeric scaler.
# NOTE(review): several names (e.g. ecoscore_grade, vegetarian_status, nutrient_level_*)
# look categorical; they are fed to StandardScaler, so presumably they are already
# numerically encoded in the CSV -- confirm.
TABULAR_COLS = [
    "nova_group",
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "contains_palm_oil",
    "vegetarian_status",
    "vegan_status",
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
    "ecoscore_grade",
    "ecoscore_score",
    "carbon_footprint_100g",
    "additives_count",
    "sugar_ratio",
    "energy_density",
    "protein_ratio",
    "macro_balance",
    "healthy_score",
    "log_energy_kcal_100g",
    "log_salt_100g",
]

# 3) Text columns (already preprocessed); they are later joined into one field for TF-IDF
TEXT_COLS = [
    "brand_cleaned", "allergens_cleaned", "ingredients_text_cleaned",
    "countries_cleaned", "additives_cleaned",
]

In [4]:
# Load the cleaned dataset, build a single concatenated text field, and make a
# stratified 80/20 train/test split of features (X) and target (y).
df = pd.read_csv("FoodFactsCleaned.csv")

# Ensure all text cols exist and are strings
for c in TEXT_COLS:
    if c not in df.columns:
        raise ValueError(f"Missing text column: {c}")
    # Missing text becomes "" so the join below never sees NaN/float values
    df[c] = df[c].fillna("").astype(str)

# Fail early, listing every missing name, if a tabular feature or the target is
# absent; otherwise the column selections below would raise a bare KeyError.
missing_cols = [col for col in TABULAR_COLS + [TARGET_COL] if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing tabular/target columns: {missing_cols}")

# Concatenate all text into a single field.
# The columns were already NaN-filled and cast to str in the loop above,
# so no further fillna is needed here.
df["text_concat"] = df[TEXT_COLS].agg(" ".join, axis=1)

# Define X and y
X = df[["text_concat"] + TABULAR_COLS]
y = df[TARGET_COL]

# Stratify on y so both splits keep the same class proportions; the fixed
# random_state makes the split reproducible across runs.
# NOTE(review): rows with a missing target are not dropped before stratifying --
# confirm the CSV has none.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", len(X_train), "Test size:", len(X_test))

Train size: 2548 Test size: 637


In [5]:
# Per-branch transformers, named separately so their settings are easy to scan.
# Text branch: word uni- and bi-grams, ignoring terms seen in fewer than 5
# documents, with the vocabulary capped at 30000 entries.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=30000)

# Numeric branch: scale to unit variance without centering.
# NOTE(review): with_mean=False is only mandatory for sparse input; these columns
# come straight from the DataFrame, so centering may have been skipped
# unintentionally -- confirm.
scaler_step = StandardScaler(with_mean=False)

# "text_concat" is passed as a plain string (not a list) so the vectorizer
# receives a 1-D sequence of documents; every column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("text", tfidf_step, "text_concat"),
        ("num", scaler_step, TABULAR_COLS),
    ],
    remainder="drop",
)

# Fit on the training split only, then reuse the fitted state for the test split
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print("X_train_transformed shape:", X_train_transformed.shape)
print("X_test_transformed shape:", X_test_transformed.shape)

X_train_transformed shape: (2548, 4737)
X_test_transformed shape: (637, 4737)
