In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Dropout, Concatenate,
    BatchNormalization, Add, SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

In [3]:
# Read FoodFactsCleaned.csv and cast the label column to integer class ids
# in the same expression.
df = pd.read_csv("FoodFactsCleaned.csv").astype({"nutriscore_letter": int})

In [8]:
# Free-text columns; they are joined into one document per row for the text branch.
TEXT_COLS = [
    "brand_cleaned",
    "allergens_cleaned",
    "ingredients_text_cleaned",
    "countries_cleaned",
    "additives_cleaned",
]

# Numeric columns fed to the tabular branch (order fixes the feature order).
TABULAR_COLS = [
    "nova_group",
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "contains_palm_oil",
    "vegetarian_status",
    "vegan_status",
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
    "ecoscore_grade",
    "ecoscore_score",
    "carbon_footprint_100g",
    "additives_count",
    "sugar_ratio",
    "energy_density",
    "protein_ratio",
    "macro_balance",
    "healthy_score",
    "log_energy_kcal_100g",
    "log_salt_100g",
]

# Label column.
TARGET_COL = "nutriscore_letter"

# Seed passed to the train/validation/test splits.
RANDOM_STATE = 42

# Text preprocessing: tokenizer vocabulary cap and padded sequence length.
MAX_WORDS = 30_000
MAX_LEN = 220

# Training-loop settings (presumably consumed by a later training cell).
EPOCHS = 20
BATCH_SIZE = 64

In [7]:

# Turn every text column into plain strings (missing values become "") in one
# frame-level operation, then merge them into a single space-separated
# document per row.
df[TEXT_COLS] = df[TEXT_COLS].fillna("").astype(str)
df["text_concat"] = df[TEXT_COLS].agg(" ".join, axis=1)

# Model inputs as arrays: raw documents, float32 feature matrix, labels.
X_text = df["text_concat"].values
X_tab = df[TABULAR_COLS].values.astype(np.float32)
y = df[TARGET_COL].values

In [10]:
# Hold out 15% of the rows as the test set, stratified on the label; the
# remaining "tv" arrays are split into train/validation afterwards.
(X_text_tv, X_text_te,
 X_tab_tv, X_tab_te,
 y_tv, y_te) = train_test_split(
    X_text, X_tab, y,
    test_size=0.15, stratify=y, random_state=RANDOM_STATE,
)

In [11]:
# Split the train+validation pool. 0.1765 is roughly 0.15 / 0.85, so the
# validation set comes out the same size as the 15% test set (771 rows each
# in the recorded run).
(X_text_tr, X_text_val,
 X_tab_tr, X_tab_val,
 y_tr, y_val) = train_test_split(
    X_text_tv, X_tab_tv, y_tv,
    test_size=0.1765, stratify=y_tv, random_state=RANDOM_STATE,
)

In [12]:

# Sanity check: row counts of the three splits.
print(f"Train: {len(y_tr)} Val: {len(y_val)} Test: {len(y_te)}")

Train: 3596 Val: 771 Test: 771


In [13]:
# Learn the vocabulary from the training documents only, so no validation or
# test text influences the word index.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_text_tr)

def tok_pad(texts):
    """Encode raw documents as fixed-length integer sequences.

    Uses the notebook-level ``tokenizer`` to map each document to token ids,
    then pads or truncates every sequence at its end to ``MAX_LEN`` entries.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        The padded array produced by ``pad_sequences``, one row per document.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )

# Encode each split with the tokenizer fitted on the training documents.
X_text_tr_pad, X_text_val_pad, X_text_te_pad = (
    tok_pad(split_texts) for split_texts in (X_text_tr, X_text_val, X_text_te)
)

# Embedding input size: the word index plus one extra slot, capped at MAX_WORDS.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_WORDS)

In [14]:
# Fit the scaler on training rows only, then apply the same transform to every
# split and cast the results back to float32.
scaler = StandardScaler().fit(X_tab_tr)
X_tab_tr_sc, X_tab_val_sc, X_tab_te_sc = (
    scaler.transform(split_features).astype(np.float32)
    for split_features in (X_tab_tr, X_tab_val, X_tab_te)
)

# Number of tabular features seen by the model.
tab_dim = X_tab_tr_sc.shape[1]

print("Vocab:", vocab_size, "Tab dim:", tab_dim)

Vocab: 13832 Tab dim: 25


In [15]:
def residual_dense_block(x, units, dropout=0.30, name="res"):
    """Apply a two-layer dense residual block.

    The main path is Dense(relu) -> BatchNorm -> Dropout -> Dense(linear) ->
    BatchNorm. Its output is added to the skip connection and passed through
    a final ReLU.

    Args:
        x: Input Keras tensor; the last axis is treated as the feature axis.
        units: Width of both Dense layers and of the block output.
        dropout: Dropout rate applied after the first Dense/BatchNorm pair.
        name: Prefix for the names of all layers created by this block.

    Returns:
        Keras tensor whose last dimension is ``units``.
    """
    skip = x
    # Add needs both operands to have the same width. When the input width
    # differs from `units`, project the skip path linearly instead of failing
    # with a shape mismatch. No extra layer is created when widths already match.
    if skip.shape[-1] != units:
        skip = Dense(units, activation=None, name=f"{name}_skip_proj")(skip)

    x = Dense(units, activation="relu", name=f"{name}_d1")(x)
    x = BatchNormalization(name=f"{name}_bn1")(x)
    x = Dropout(dropout, name=f"{name}_do1")(x)
    # Second Dense is linear: the non-linearity is applied after the addition.
    x = Dense(units, activation=None, name=f"{name}_d2")(x)
    x = BatchNormalization(name=f"{name}_bn2")(x)
    x = Add(name=f"{name}_add")([skip, x])
    x = tf.keras.layers.Activation("relu", name=f"{name}_relu")(x)
    return x

In [16]:
def build_early_fusion_model(vocab_size, max_len, tab_dim, num_classes):
    """Build the uncompiled two-input text + tabular classifier.

    The text branch embeds token ids and pools them over the sequence axis
    (average and max pooling, concatenated). The tabular branch is a small
    MLP. Both branch outputs are concatenated and passed through a dense trunk
    containing two residual blocks before the softmax head.

    Args:
        vocab_size: Input dimension of the token embedding.
        max_len: Length of each token-id sequence fed to ``text_in``.
        tab_dim: Number of features fed to ``tab_in``.
        num_classes: Width of the softmax output layer.

    Returns:
        A Keras ``Model`` mapping ``[text_in, tab_in]`` to class probabilities.
    """
    # ---- Text branch: embed, regularise, pool over the sequence axis ----
    tokens = Input(shape=(max_len,), dtype=tf.int32, name="text_in")
    embedded = Embedding(vocab_size, 192, name="txt_embed")(tokens)
    embedded = SpatialDropout1D(0.2, name="txt_spdrop")(embedded)

    pooled = [
        GlobalAveragePooling1D(name="txt_gap")(embedded),
        GlobalMaxPooling1D(name="txt_gmp")(embedded),
    ]
    text_feat = Concatenate(name="txt_pool_concat")(pooled)
    text_feat = Dense(256, activation="relu", name="txt_dense")(text_feat)
    text_feat = Dropout(0.35, name="txt_drop")(text_feat)

    # ---- Tabular branch: BatchNorm followed by two Dense/Dropout stages ----
    tabular = Input(shape=(tab_dim,), dtype=tf.float32, name="tab_in")
    tab_feat = BatchNormalization(name="tab_bn0")(tabular)
    for stage, (width, rate) in enumerate(((256, 0.35), (128, 0.25)), start=1):
        tab_feat = Dense(width, activation="relu", name=f"tab_dense{stage}")(tab_feat)
        tab_feat = Dropout(rate, name=f"tab_drop{stage}")(tab_feat)

    # ---- Early fusion: concatenate the two branch representations ----
    merged = Concatenate(name="early_fusion")([text_feat, tab_feat])

    # ---- Fusion trunk ----
    trunk = Dense(512, activation="relu", name="fusion_dense1")(merged)
    trunk = BatchNormalization(name="fusion_bn1")(trunk)
    trunk = Dropout(0.45, name="fusion_drop1")(trunk)

    trunk = Dense(512, activation="relu", name="fusion_proj")(trunk)
    for res_name in ("fusion_res1", "fusion_res2"):
        trunk = residual_dense_block(trunk, 512, dropout=0.35, name=res_name)

    trunk = Dense(256, activation="relu", name="fusion_dense2")(trunk)
    trunk = Dropout(0.35, name="fusion_drop2")(trunk)

    # Named layer intended as the embedding for downstream classical models.
    embedding = Dense(256, activation="relu", name="final_embedding")(trunk)
    embedding = Dropout(0.25, name="final_emb_drop")(embedding)

    probs = Dense(num_classes, activation="softmax", name="out")(embedding)

    return Model(
        inputs=[tokens, tabular],
        outputs=probs,
        name="TextTab_EarlyFusion_BaselineText",
    )

In [18]:
# sparse_categorical_crossentropy needs integer labels in [0, num_classes).
# Size the output layer from the largest label id rather than the count of
# distinct labels: the two agree when labels are exactly 0..K-1, but the count
# is too small if ids start at 1 or a class id is missing from the data.
_label_ids = np.unique(y)
assert _label_ids.min() >= 0, "class labels must be non-negative integers"
num_classes = int(_label_ids.max()) + 1

In [19]:
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created,
# so re-running this cell starts from the same initialisation. GPU kernels may
# still introduce some nondeterminism during training.
tf.keras.utils.set_random_seed(RANDOM_STATE)

nn_model = build_early_fusion_model(
    vocab_size=vocab_size,
    max_len=MAX_LEN,
    tab_dim=tab_dim,
    num_classes=num_classes,
)

# Integer class-id labels are used directly, hence the sparse cross-entropy.
nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

nn_model.summary()

