### Multimodal Classification: text + tabular + image
### Early fusion NN + Feature fusion with XGBoost and SVM

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    Dense, Dropout, Concatenate, GlobalAveragePooling1D
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mobilenet_preprocess
from tensorflow.keras.preprocessing import image as keras_image

In [None]:
"""my plan is to first perform early fusion between the tabular features and the
 text data (using tokenized text as input). This combined representation will be fed 
 into a suitable neural network. In parallel, the image data will be processed using a 
 pretrained neural network to extract visual features. Finally, I will apply feature-level 
 fusion between the learned representations from the text–tabular model and the image model, and use the 
 fused features as input to XGBoost and SVM classifiers for final prediction."""

'my plan is to first perform early fusion between the tabular features and the\n text data (using tokenized text as input). This combined representation will be fed into a suitable neural network. In parallel, the image data will be processed using a pretrained neural network to extract visual features. Finally, I will apply feature-level fusion between the learned representations from the text–tabular model and the image model, and use the fused features as input to XGBoost and SVM classifiers for final prediction.'

In [5]:
df = pd.read_csv("FoodFactsCleaned.csv")

In [6]:
TEXT_COLS = [
    "brand_cleaned",
    "allergens_cleaned",
    "ingredients_text_cleaned",
    "countries_cleaned",
    "additives_cleaned",
]

TABULAR_COLS = [
    'nova_group', 'fat_100g',
    'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
    'proteins_100g', 'contains_palm_oil', 'vegetarian_status', 'vegan_status',
    'nutrient_level_fat', 'nutrient_level_saturated_fat',
    'nutrient_level_sugars', 'nutrient_level_salt', 'ecoscore_grade', 'ecoscore_score',
    'carbon_footprint_100g', 'additives_count', 'sugar_ratio',
    'energy_density', 'protein_ratio', 'macro_balance', 'healthy_score',
    'log_energy_kcal_100g', 'log_salt_100g'
]

TARGET_COL = "nutriscore_letter"         
IMAGE_PATH_COL = "image_160_path"  
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Text tokenization
MAX_WORDS = 30000
MAX_LEN = 200

In [7]:
for c in TEXT_COLS:
    df[c] = df[c].fillna("").astype(str)

df["text_concat"] = df[TEXT_COLS].agg(" ".join, axis=1)

print("Rows after image path filtering:", len(df))

Rows after image path filtering: 5138


In [9]:
X_text = df["text_concat"].values
X_tab = df[TABULAR_COLS].values
X_img_paths = df[IMAGE_PATH_COL].astype(str).values
y = df[TARGET_COL].values


X_text_tr, X_text_te, X_tab_tr, X_tab_te, X_img_tr, X_img_te, y_tr, y_te = train_test_split(
    X_text, X_tab, X_img_paths, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

num_classes = len(np.unique(y))
print("Train:", len(y_tr), "Test:", len(y_te), "Classes:", num_classes)

Train: 4110 Test: 1028 Classes: 5


In [12]:
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_text_tr)

seq_tr = tokenizer.texts_to_sequences(X_text_tr)
seq_te = tokenizer.texts_to_sequences(X_text_te)

X_text_tr_pad = pad_sequences(seq_tr, maxlen=MAX_LEN, padding="post", truncating="post")
X_text_te_pad = pad_sequences(seq_te, maxlen=MAX_LEN, padding="post", truncating="post")

vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)

scaler = StandardScaler()
X_tab_tr_scaled = scaler.fit_transform(X_tab_tr)
X_tab_te_scaled = scaler.transform(X_tab_te)

In [14]:
def build_text_tab_model(vocab_size, max_len, tab_dim, num_classes):
    # Text branch
    text_in = Input(shape=(max_len,), name="text_in")
    x_text = Embedding(
        input_dim=vocab_size,
        output_dim=128,
        input_length=max_len
    )(text_in)

    x_text = GlobalAveragePooling1D()(x_text)
    x_text = Dropout(0.3)(x_text)

    # Tabular branch
    tab_in = Input(shape=(tab_dim,), name="tab_in")
    x_tab = Dense(64, activation="relu")(tab_in)
    x_tab = Dropout(0.3)(x_tab)
    x_tab = Dense(32, activation="relu")(x_tab)

    # Early fusion
    fused = Concatenate()([x_text, x_tab])
    fused = Dense(128, activation="relu", name="text_tab_embedding")(fused)
    fused = Dropout(0.4)(fused)

    out = Dense(num_classes, activation="softmax")(fused)
    model = Model(inputs=[text_in, tab_in], outputs=out)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


In [22]:

tab_dim = X_tab_tr_scaled.shape[1]
text_tab_model = build_text_tab_model(vocab_size, MAX_LEN, tab_dim, num_classes)
text_tab_model.summary()

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
]

history = text_tab_model.fit(
    {"text_in": X_text_tr_pad, "tab_in": X_tab_tr_scaled},
    y_tr,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4799 - loss: 1.2117 - val_accuracy: 0.7056 - val_loss: 0.7846
Epoch 2/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6347 - loss: 0.8298 - val_accuracy: 0.6910 - val_loss: 0.6940
Epoch 3/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6804 - loss: 0.7472 - val_accuracy: 0.7494 - val_loss: 0.6135
Epoch 4/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7013 - loss: 0.7102 - val_accuracy: 0.7567 - val_loss: 0.6041
Epoch 5/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7232 - loss: 0.6843 - val_accuracy: 0.7567 - val_loss: 0.5893
Epoch 6/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7178 - loss: 0.6700 - val_accuracy: 0.7640 - val_loss: 0.5735
Epoch 7/20
[1m103/103[0m 

In [23]:
# Feature extractor: outputs the learned text-tab embedding
text_tab_extractor = Model(
    inputs=text_tab_model.inputs,
    outputs=text_tab_model.get_layer("text_tab_embedding").output
)

Z_texttab_tr = text_tab_extractor.predict({"text_in": X_text_tr_pad, "tab_in": X_tab_tr_scaled}, batch_size=256, verbose=0)
Z_texttab_te = text_tab_extractor.predict({"text_in": X_text_te_pad, "tab_in": X_tab_te_scaled}, batch_size=256, verbose=0)

print("Text-Tab embeddings:", Z_texttab_tr.shape, Z_texttab_te.shape)

Text-Tab embeddings: (4110, 128) (1028, 128)


In [24]:
img_backbone = MobileNetV2(weights="imagenet", include_top=False, pooling="avg")
IMG_SIZE = (160, 160)

  img_backbone = MobileNetV2(weights="imagenet", include_top=False, pooling="avg")


In [25]:
def load_and_preprocess_images(paths, img_size=IMG_SIZE):
    """Loads a batch of images from local paths and returns a float32 tensor"""
    imgs = []
    keep_idx = []
    for i, p in enumerate(paths):
        try:
            img = keras_image.load_img(p, target_size=img_size)
            arr = keras_image.img_to_array(img)
            imgs.append(arr)
            keep_idx.append(i)
        except Exception:
            # skip unreadable image
            pass

    if len(imgs) == 0:
        raise RuntimeError("No images could be loaded. Check paths and formats.")

    x = np.stack(imgs).astype(np.float32)
    x = mobilenet_preprocess(x)
    return x, np.array(keep_idx, dtype=int)