# Multimodal Late Fusion Model
## Text + Tabular + Image embeddings -> SVM + XGBoost

## 0) Imports

In [11]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    Dense, Dropout, BatchNormalization,
    GlobalAveragePooling1D, GlobalMaxPooling1D,
    Concatenate
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mobilenet_preprocess

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

## 1) Configuration

In [2]:
# Read the cleaned product table and coerce the target column to integer
# class ids in the same expression.
df = pd.read_csv("FoodFactsCleaned.csv").astype({"nutriscore_letter": int})

In [3]:
# --- Column groups ----------------------------------------------------------
# Free-text columns; they are later joined into a single string per row.
TEXT_COLS = [
    "brand_cleaned",
    "allergens_cleaned",
    "ingredients_text_cleaned",
    "countries_cleaned",
    "additives_cleaned",
]

# Columns fed to the tabular branch (cast to float32 downstream).
TABULAR_COLS = [
    "nova_group",
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "contains_palm_oil",
    "vegetarian_status",
    "vegan_status",
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
    "ecoscore_grade",
    "ecoscore_score",
    "carbon_footprint_100g",
    "additives_count",
    "sugar_ratio",
    "energy_density",
    "protein_ratio",
    "macro_balance",
    "healthy_score",
    "log_energy_kcal_100g",
    "log_salt_100g",
]

TARGET_COL = "nutriscore_letter"   # label column
IMAGE_COL = "image_160_path"       # column holding the image file path

# --- Train/test split -------------------------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Text tokenization ------------------------------------------------------
MAX_WORDS = 30_000   # tokenizer vocabulary cap
MAX_LEN = 200        # padded / truncated sequence length

# --- Image settings ---------------------------------------------------------
IMG_SIZE = (160, 160)

# --- Neural-network training ------------------------------------------------
EPOCHS_TEXT, EPOCHS_TAB, EPOCHS_IMG = 10, 25, 10
BATCH_SIZE = 64

## 2) Basic checks + building concatenated text

In [4]:
# Fail fast if the frame lacks any column the pipeline relies on.
needed = [*TEXT_COLS, *TABULAR_COLS, TARGET_COL, IMAGE_COL]
missing = list(filter(lambda col: col not in df.columns, needed))
if len(missing) > 0:
    raise ValueError(f"Missing columns in df: {missing}")

# Turn every text column into plain strings, with nulls becoming "".
for c in TEXT_COLS:
    df[c] = df[c].where(df[c].notna(), "").astype(str)

# One space-separated document per row, built from all text columns.
df["text_concat"] = df[TEXT_COLS].apply(" ".join, axis=1)


def _usable_image_path(p):
    """Return True when `p` is non-null, non-empty and names an existing file."""
    return bool(pd.notna(p)) and len(str(p)) > 0 and os.path.exists(str(p))


# Keep only rows whose image can actually be read from disk.
df = df[df[IMAGE_COL].apply(_usable_image_path)].copy()

print("Rows after filtering invalid image paths:", len(df))

Rows after filtering invalid image paths: 5138


## 3) Unified Data Splitting across Modalities

In [6]:
# Build one row-aligned array per modality so a single split call keeps
# text, tabular, image path and label of the same product together.
X_text = df["text_concat"].values
X_tab  = df[TABULAR_COLS].values.astype(np.float32)
X_img  = df[IMAGE_COL].astype(str).values
y_raw  = df[TARGET_COL].values

# Always re-encode the target to contiguous ids 0..K-1.
# The downstream softmax heads use `num_classes` output units together with
# `sparse_categorical_crossentropy`, which requires labels in [0, K). A plain
# integer cast would let non-contiguous ids (e.g. 1..5) through unchanged.
# For labels that are already 0..K-1 this mapping is the identity.
# `le.inverse_transform` recovers the original label values.
le = LabelEncoder()
y = le.fit_transform(y_raw)

num_classes = len(le.classes_)
print("Classes:", num_classes)

# Stratified split shared by all modalities.
X_text_tr, X_text_te, X_tab_tr, X_tab_te, X_img_tr, X_img_te, y_tr, y_te = train_test_split(
    X_text, X_tab, X_img, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

print("Train:", len(y_tr), "Test:", len(y_te))

Classes: 5
Train: 4110 Test: 1028


## 4) Text & Tabular Data Preprocessing

In [7]:
# --- Text: vocabulary is learned on the training texts only -----------------
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_text_tr)

seq_tr, seq_te = map(tokenizer.texts_to_sequences, (X_text_tr, X_text_te))

# Pad / truncate at the end so every sequence has exactly MAX_LEN ids.
X_text_tr_pad, X_text_te_pad = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")
    for seqs in (seq_tr, seq_te)
)

# Embedding input size: number of indexed words + 1, capped at MAX_WORDS.
vocab_size = len(tokenizer.word_index) + 1
if vocab_size > MAX_WORDS:
    vocab_size = MAX_WORDS
print("Vocab size:", vocab_size)

# --- Tabular: scaling statistics come from the training rows only -----------
scaler = StandardScaler().fit(X_tab_tr)
X_tab_tr_sc = scaler.transform(X_tab_tr).astype(np.float32)
X_tab_te_sc = scaler.transform(X_tab_te).astype(np.float32)

_, tab_dim = X_tab_tr_sc.shape
print("Tab dim:", tab_dim)

Vocab size: 14981
Tab dim: 25


## 5) Image loading & preprocessing

In [8]:
def load_and_preprocess_image(path):
    """Read an image file and return it as a MobileNetV2-ready float tensor.

    Args:
        path: File path of the image (string or scalar string tensor).

    Returns:
        The image decoded to 3 channels, resized to ``IMG_SIZE``, cast to
        float32 and passed through ``mobilenet_preprocess``.
    """
    raw = tf.io.read_file(path)
    # expand_animations=False keeps animated files as a single 3-D frame.
    decoded = tf.io.decode_image(raw, channels=3, expand_animations=False)
    resized = tf.cast(tf.image.resize(decoded, IMG_SIZE), tf.float32)
    return mobilenet_preprocess(resized)

def make_img_ds(paths, labels, batch_size=64, training=True, shuffle_buffer=2048):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        paths: Sequence of image file paths.
        labels: Sequence of labels aligned with ``paths``.
        batch_size: Number of examples per batch.
        training: When True, examples are shuffled (reshuffled every epoch,
            seeded with ``RANDOM_STATE``); when False, order is preserved.
        shuffle_buffer: Size of the shuffle buffer, used only when
            ``training`` is True.

    Returns:
        A ``tf.data.Dataset`` yielding ``(image_batch, label_batch)``.
    """
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    # Shuffle BEFORE decoding: the buffer then holds (path, label) pairs
    # instead of decoded float32 images, which keeps its memory footprint small.
    if training:
        ds = ds.shuffle(shuffle_buffer, seed=RANDOM_STATE, reshuffle_each_iteration=True)

    def _map(p, y_):
        return load_and_preprocess_image(p), y_

    ds = ds.map(_map, num_parallel_calls=tf.data.AUTOTUNE)

    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Image pipelines: the training set is shuffled, the test set keeps its order
# so predictions stay aligned with y_te.
img_train_ds = make_img_ds(paths=X_img_tr, labels=y_tr, batch_size=32, training=True)
img_test_ds = make_img_ds(paths=X_img_te, labels=y_te, batch_size=32, training=False)

## 6) Modality-specific models

### 6A) TEXT model: SimpleRNN + pooling -> embedding

In [25]:
def build_text_model(vocab_size, max_len, num_classes, embed_dim=64, rnn_units=16):
    """Build and compile the text classifier.

    Architecture: token embedding -> SimpleRNN (full sequence output) ->
    concatenated average and max pooling over time -> Dense(64, relu)
    -> Dropout -> softmax.

    Args:
        vocab_size: Input dimension of the embedding layer.
        max_len: Length of the padded input sequences.
        num_classes: Number of softmax output units.
        embed_dim: Embedding vector size.
        rnn_units: Number of SimpleRNN units.

    Returns:
        A compiled ``Model`` (Adam at 1e-3, sparse categorical cross-entropy,
        accuracy metric). The layer named ``txt_embed_dense`` is the 64-d
        representation placed before the classification head.
    """
    tokens = Input(shape=(max_len,), dtype=tf.int32, name="text_in")

    embedded = Embedding(vocab_size, embed_dim, name="txt_embed")(tokens)
    states = SimpleRNN(
        rnn_units,
        return_sequences=True,
        recurrent_dropout=0.1,
        name="txt_rnn",
    )(embedded)

    # Summarise the per-timestep states with both mean and max over time.
    mean_over_time = GlobalAveragePooling1D(name="txt_gap")(states)
    max_over_time = GlobalMaxPooling1D(name="txt_gmp")(states)
    pooled = Concatenate(name="txt_pool_concat")([mean_over_time, max_over_time])

    features = Dense(64, activation="relu", name="txt_embed_dense")(pooled)
    regularised = Dropout(0.4)(features)

    class_probs = Dense(num_classes, activation="softmax", name="txt_out")(regularised)

    classifier = Model(inputs=tokens, outputs=class_probs)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

# Seed the Python, NumPy and TensorFlow RNGs before the model is built so that
# weight initialisation, dropout masks and batch shuffling (and therefore the
# extracted embeddings) are repeatable when this cell is re-run.
# Note: this also resets the global NumPy / `random` seeds.
tf.keras.utils.set_random_seed(RANDOM_STATE)

text_model = build_text_model(vocab_size, MAX_LEN, num_classes)
text_model.summary()

# validation_split holds out a fraction of the training arrays for
# early stopping / LR scheduling; the test split is never seen here.
text_model.fit(
    X_text_tr_pad, y_tr,
    validation_split=0.2,
    epochs=EPOCHS_TEXT,
    batch_size=BATCH_SIZE,
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
        ReduceLROnPlateau(monitor="val_loss", patience=1, factor=0.5, min_lr=1e-6)
    ],
    verbose=1
)

# Sub-model sharing the trained weights; it stops at the 64-d dense layer
# that sits before the classification head.
text_extractor = Model(
    inputs=text_model.input,
    outputs=text_model.get_layer("txt_embed_dense").output
)

# Text embeddings for both splits, later used as features for fusion.
Z_txt_tr = text_extractor.predict(X_text_tr_pad, batch_size=256, verbose=0)
Z_txt_te = text_extractor.predict(X_text_te_pad, batch_size=256, verbose=0)

print("Text embeddings:", Z_txt_tr.shape, Z_txt_te.shape)

Epoch 1/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 67ms/step - accuracy: 0.2156 - loss: 1.6173 - val_accuracy: 0.2652 - val_loss: 1.5874 - learning_rate: 0.0010
Epoch 2/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - accuracy: 0.3236 - loss: 1.5385 - val_accuracy: 0.4027 - val_loss: 1.4380 - learning_rate: 0.0010
Epoch 3/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 102ms/step - accuracy: 0.3963 - loss: 1.3827 - val_accuracy: 0.4672 - val_loss: 1.2676 - learning_rate: 0.0010
Epoch 4/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 144ms/step - accuracy: 0.4337 - loss: 1.2814 - val_accuracy: 0.4732 - val_loss: 1.2059 - learning_rate: 0.0010
Epoch 5/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 62ms/step - accuracy: 0.4857 - loss: 1.1754 - val_accuracy: 0.5182 - val_loss: 1.1532 - learning_rate: 0.0010
Epoch 6/10
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0

### 6B) TABULAR model: MLP -> embedding

In [26]:
def build_tabular_model(tab_dim, num_classes):
    """Build and compile the MLP classifier for the tabular features.

    Architecture: BatchNorm -> Dense(256) -> Dropout(0.35) -> Dense(128)
    -> Dropout(0.25) -> Dense(128, named ``tab_embedding``) -> Dropout(0.25)
    -> softmax.

    Args:
        tab_dim: Number of input features.
        num_classes: Number of softmax output units.

    Returns:
        A compiled ``Model`` (Adam at 1e-3, sparse categorical cross-entropy,
        accuracy metric).
    """
    features_in = Input(shape=(tab_dim,), dtype=tf.float32, name="tab_in")

    hidden = BatchNormalization()(features_in)

    # Two Dense+Dropout stages, created in the same order as (width, rate).
    for width, drop_rate in ((256, 0.35), (128, 0.25)):
        hidden = Dense(width, activation="relu")(hidden)
        hidden = Dropout(drop_rate)(hidden)

    embedding = Dense(128, activation="relu", name="tab_embedding")(hidden)
    embedding = Dropout(0.25)(embedding)

    class_probs = Dense(num_classes, activation="softmax", name="tab_out")(embedding)

    mlp = Model(inputs=features_in, outputs=class_probs)
    mlp.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return mlp

# Seed the Python, NumPy and TensorFlow RNGs before the model is built so that
# weight initialisation, dropout masks and batch shuffling (and therefore the
# extracted embeddings) are repeatable when this cell is re-run.
# Note: this also resets the global NumPy / `random` seeds.
tf.keras.utils.set_random_seed(RANDOM_STATE)

tab_model = build_tabular_model(tab_dim, num_classes)
tab_model.summary()


# validation_split holds out a fraction of the training arrays for
# early stopping / LR scheduling; the test split is never seen here.
tab_model.fit(
    X_tab_tr_sc, y_tr,
    validation_split=0.2,
    epochs=EPOCHS_TAB,
    batch_size=128,
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
        ReduceLROnPlateau(monitor="val_loss", patience=2, factor=0.5, min_lr=1e-6)
    ],
    verbose=1
)


# Sub-model sharing the trained weights; it stops at the `tab_embedding`
# layer that sits before the classification head.
tab_extractor = Model(
    inputs=tab_model.input,
    outputs=tab_model.get_layer("tab_embedding").output
)

# Tabular embeddings for both splits, later used as features for fusion.
Z_tab_tr = tab_extractor.predict(X_tab_tr_sc, batch_size=256, verbose=0)
Z_tab_te = tab_extractor.predict(X_tab_te_sc, batch_size=256, verbose=0)

print("Tabular embeddings:", Z_tab_tr.shape, Z_tab_te.shape)

Epoch 1/25
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.4398 - loss: 1.3233 - val_accuracy: 0.6034 - val_loss: 0.9532 - learning_rate: 0.0010
Epoch 2/25
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6074 - loss: 0.9200 - val_accuracy: 0.6825 - val_loss: 0.7560 - learning_rate: 0.0010
Epoch 3/25
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6615 - loss: 0.7969 - val_accuracy: 0.7287 - val_loss: 0.6545 - learning_rate: 0.0010
Epoch 4/25
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6977 - loss: 0.7413 - val_accuracy: 0.7287 - val_loss: 0.6347 - learning_rate: 0.0010
Epoch 5/25
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6931 - loss: 0.7114 - val_accuracy: 0.7530 - val_loss: 0.6081 - learning_rate: 0.0010
Epoch 6/25
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16

### 6C) IMAGE model: pretrained CNN + small head -> embedding

In [None]:
def build_image_model(num_classes):
    base = MobileNetV2(weights="imagenet", include_top=False, pooling="avg")
    base.trainable = False

    img_in = Input(shape=(IMG_SIZE[0], IMG_SIZE[1], 3), dtype=tf.float32, name="img_in")
    x = base(img_in)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.35)(x)

    emb = Dense(128, activation="relu", name="img_embedding")(x)
    emb = Dropout(0.25)(emb)

