In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Dropout, Concatenate,
    BatchNormalization, Add, SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

In [3]:
# Load the cleaned dataset and cast the target column to integers in one pass.
# The cast raises if the column holds anything that is not integer-convertible
# (e.g. NaN), which surfaces a bad export immediately.
df = pd.read_csv("FoodFactsCleaned.csv").astype({"nutriscore_letter": int})

In [8]:
# --- Column selection -------------------------------------------------------

# Free-text columns; they are joined (in this order) into one document per row.
TEXT_COLS = [
    "brand_cleaned",
    "allergens_cleaned",
    "ingredients_text_cleaned",
    "countries_cleaned",
    "additives_cleaned",
]

# Numeric feature columns. The list order is the column order of the tabular
# feature matrix, so do not reorder entries casually.
TABULAR_COLS = [
    "nova_group",
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "contains_palm_oil",
    "vegetarian_status",
    "vegan_status",
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
    "ecoscore_grade",
    "ecoscore_score",
    "carbon_footprint_100g",
    "additives_count",
    "sugar_ratio",
    "energy_density",
    "protein_ratio",
    "macro_balance",
    "healthy_score",
    "log_energy_kcal_100g",
    "log_salt_100g",
]

# Column holding the label to predict.
TARGET_COL = "nutriscore_letter"

# --- Reproducibility / text preprocessing -----------------------------------

RANDOM_STATE = 42   # seed passed to every train_test_split call
MAX_WORDS = 30000   # tokenizer vocabulary cap (Tokenizer num_words)
MAX_LEN = 220       # fixed sequence length used when padding / truncating

# --- Training hyper-parameters ----------------------------------------------
# NOTE(review): not referenced in this section; presumably consumed by the
# model-fitting cells further down.

EPOCHS = 20
BATCH_SIZE = 64

In [7]:

# Coerce every free-text column to plain strings (missing values become "")
# so the join below never encounters NaN or a non-string value.
df[TEXT_COLS] = df[TEXT_COLS].fillna("").astype(str)

# One space-separated document per row, built from the text columns in
# TEXT_COLS order.
df["text_concat"] = df[TEXT_COLS].apply(" ".join, axis=1)

# Arrays consumed by the splitting cells: raw text, float32 tabular features,
# and the target labels.
X_text = df["text_concat"].values
X_tab = df[TABULAR_COLS].values.astype(np.float32)
y = df[TARGET_COL].values

In [10]:
# First split: carve off a stratified 15% test set. The remaining
# "train+validation" (`*_tv`) portion is split again in a later step.
# Passing all three arrays in one call keeps their rows aligned.
(
    X_text_tv, X_text_te,
    X_tab_tv, X_tab_te,
    y_tv, y_te,
) = train_test_split(
    X_text,
    X_tab,
    y,
    test_size=0.15,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [11]:
# Second split: divide the train+validation portion into train and validation.
# 0.1765 ~= 0.15 / 0.85, i.e. the validation set is sized to be ~15% of the
# *full* dataset, matching the test set.
(
    X_text_tr, X_text_val,
    X_tab_tr, X_tab_val,
    y_tr, y_val,
) = train_test_split(
    X_text_tv,
    X_tab_tv,
    y_tv,
    test_size=0.1765,
    stratify=y_tv,
    random_state=RANDOM_STATE,
)

In [12]:

# Sanity check: report how many rows landed in each split.
n_train, n_val, n_test = len(y_tr), len(y_val), len(y_te)
print("Train:", n_train, "Val:", n_val, "Test:", n_test)

Train: 3596 Val: 771 Test: 771


In [13]:
# Word-level tokenizer capped at MAX_WORDS entries; words outside the kept
# vocabulary are encoded as the "<OOV>" token instead of being dropped.
tokenizer = Tokenizer(
    num_words=MAX_WORDS,
    oov_token="<OOV>",
)

# The vocabulary is learned from the training texts only, so nothing from the
# validation / test splits leaks into preprocessing.
tokenizer.fit_on_texts(X_text_tr)

def tok_pad(texts):
    """Encode raw texts as fixed-length integer sequences.

    Each text is converted to token ids with the module-level ``tokenizer``;
    the resulting sequences are then padded or truncated at the end
    ("post") so that every row has exactly ``MAX_LEN`` entries.

    Parameters
    ----------
    texts : iterable of str
        Raw documents to encode.

    Returns
    -------
    The padded sequence matrix produced by ``pad_sequences``, one row per
    input text.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        token_ids,
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )
    return padded

# Encode the three text splits with the same fitted tokenizer, in
# train -> validation -> test order.
X_text_tr_pad, X_text_val_pad, X_text_te_pad = (
    tok_pad(split_texts) for split_texts in (X_text_tr, X_text_val, X_text_te)
)

# Number of distinct ids the encoded sequences can contain: the fitted
# vocabulary plus one (Keras word indices start at 1; 0 is the padding value),
# but never more than the MAX_WORDS cap given to the tokenizer.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_WORDS)