In [2]:
import inspect

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, concatenate_datasets
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
def df_preprocessing(input_file):
  """Load a dish CSV and derive the model's `text` input and binary `label`.

  Args:
    input_file: Path or file-like object accepted by `pd.read_csv`. The file
      must provide the columns dish_name, description, cuisine and diet.

  Returns:
    The loaded DataFrame with missing `description` values replaced by "",
    plus two derived columns:
      text:  "<cuisine> dish: <dish_name>", followed by " - <description>"
             only when the description is non-blank.
      label: 1 for diet == "Vegetarian", 0 for "Non-Vegetarian"; any other
             diet value maps to NaN.
  """
  # 1️⃣ Load dataset
  df = pd.read_csv(input_file)  # Must have: dish_name, description, cuisine, diet

  # 2️⃣ Handle missing description
  df["description"] = df["description"].fillna("")

  # 3️⃣ Create full text input including cuisine
  description = df["description"].astype(str).str.strip()
  base_text = df["cuisine"] + " dish: " + df["dish_name"]
  with_description = base_text + " - " + description
  # Append the " - " separator only when there is a description. Stripping the
  # joined string with str.strip(" - ") would treat " -" as a character set and
  # also eat genuine leading/trailing dashes from the text itself.
  df["text"] = base_text.where(description == "", with_description).str.strip()

  # 4️⃣ Map labels: Vegetarian -> 1, Non-Vegetarian -> 0
  label_mapping = {"Vegetarian": 1, "Non-Vegetarian": 0}
  df["label"] = df["diet"].map(label_mapping)
  return df

# 5 Train-test dataframes: run each CSV through the shared preprocessing step
synthetic_train_df, synthetic_test_df, yelp_train_df, yelp_test_df = (
    df_preprocessing(csv_path)
    for csv_path in (
        "synthetic_train.csv",
        "synthetic_test.csv",
        "yelp_sample_train.csv",
        "yelp_sample_test.csv",
    )
)



In [4]:
# 6️⃣ Wrap every preprocessed DataFrame in a HuggingFace Dataset
(
    synthetic_train_dataset,
    yelp_train_dataset,
    synthetic_test_dataset,
    yelp_test_dataset,
) = map(
    Dataset.from_pandas,
    (synthetic_train_df, yelp_train_df, synthetic_test_df, yelp_test_df),
)

In [5]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): login() is interactive, so an unattended "Run All" will stop
# here waiting for input. The checkpoint loaded below looks like a public one,
# in which case this step may be optional — confirm before removing.
from huggingface_hub import login
login()  # will prompt you to enter your HF token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# 7️⃣ Instantiate the tokenizer that belongs to the multilingual cased BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
)

# 8️⃣ Tokenization function
def tokenize_fn(batch):
    """Encode the batch's "text" entries with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(batch["text"], **encode_options)

# Raw text / metadata columns that are dropped once the token ids exist
columns_to_remove = ["dish_name", "description", "cuisine", "diet", "text"]


def _tokenize_and_prune(dataset):
    """Tokenize `dataset` in batches, then drop whichever raw columns it still has."""
    tokenized = dataset.map(tokenize_fn, batched=True)
    present = [name for name in columns_to_remove if name in tokenized.column_names]
    return tokenized.remove_columns(present)


synthetic_train_dataset = _tokenize_and_prune(synthetic_train_dataset)
yelp_train_dataset = _tokenize_and_prune(yelp_train_dataset)
synthetic_test_dataset = _tokenize_and_prune(synthetic_test_dataset)
yelp_test_dataset = _tokenize_and_prune(yelp_test_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/1794 [00:00<?, ? examples/s]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [7]:
# Sanity check: show the dataset's type and its remaining columns, one per line
print(
    type(synthetic_train_dataset),
    synthetic_train_dataset.column_names,
    sep="\n",
)


<class 'datasets.arrow_dataset.Dataset'>
['label', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
def compute_metrics(pred):
    """Score one evaluation pass.

    `pred` unpacks into (logits, labels). The predicted class is the argmax
    over axis 1 of the logits; the result reports accuracy and F1 under the
    keys "accuracy" and "f1".
    """
    raw_scores, gold = pred
    predicted = np.argmax(raw_scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted)
    return metrics

# ✅ Per-fold result accumulators, pre-seeded with two already-recorded values each
synthetic_acc_scores = [
    0.9805013927576601,
    0.9637883008356546,
]
synthetic_f1_scores = [
    0.9824561403508771,
    0.962536023054755,
]
yelp_acc_scores = [
    0.9106145251396648,
    0.8876404494382022,
]
yelp_f1_scores = [
    0.92,
    0.8947368421052632,
]

# ✅ Shuffled 5-way splitter with a fixed seed so fold membership is repeatable
kf = KFold(shuffle=True, random_state=42, n_splits=5)

# Folds trained in this run. The *_scores lists are pre-seeded with two results
# each (presumably folds 1 and 2 from an earlier session — confirm), so only
# fold 3 is executed here. Add fold numbers to this set to train further folds.
FOLDS_TO_RUN = {3}

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` (the old name emits a FutureWarning). Use whichever name
# the installed version accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Both datasets are split with the same KFold object; `fold` is 1-based.
for fold, ((train_idx_synth, test_idx_synth), (train_idx_yelp, test_idx_yelp)) in enumerate(
        zip(kf.split(synthetic_train_dataset), kf.split(yelp_train_dataset)), 1):

    # Skip silently: the header is printed once, and only for folds that run.
    if fold not in FOLDS_TO_RUN:
        continue

    print(f"\n===== Fold {fold} =====")

    # ✅ Create train and test splits for synthetic
    synthetic_train_split = synthetic_train_dataset.select(train_idx_synth)
    synthetic_test_split = synthetic_train_dataset.select(test_idx_synth)

    # ✅ Create train and test splits for yelp
    yelp_train_split = yelp_train_dataset.select(train_idx_yelp)
    yelp_test_split = yelp_train_dataset.select(test_idx_yelp)

    # ✅ Combine synthetic + yelp train splits
    combined_train_split = concatenate_datasets([synthetic_train_split, yelp_train_split])

    # Load a fresh model for every fold
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=2)

    # `fold` already starts at 1, so it is used as-is; the result/log
    # directories then carry the same number as the saved model directory.
    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold}",
        save_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f"./logs_fold_{fold}",
        load_best_model_at_end=False,
        logging_steps=50
    )

    # ✅ Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=combined_train_split,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )

    # ✅ Train model on combined train split
    trainer.train()

    # ✅ Save model & tokenizer for this fold
    save_path = f"./mbbert_fold_{fold}"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model and tokenizer saved at {save_path}")

    # ✅ Synthetic test evaluation (metric keys carry predict()'s "test_" prefix)
    synthetic_preds = trainer.predict(synthetic_test_split)
    synthetic_metrics = synthetic_preds.metrics
    synthetic_acc_scores.append(synthetic_metrics["test_accuracy"])
    synthetic_f1_scores.append(synthetic_metrics["test_f1"])

    print(f"Synthetic Test - Accuracy: {synthetic_metrics['test_accuracy']:.4f}, "
          f"F1_score: {synthetic_metrics['test_f1']:.4f}")

    # ✅ Yelp test evaluation
    yelp_preds = trainer.predict(yelp_test_split)
    yelp_metrics = yelp_preds.metrics
    yelp_acc_scores.append(yelp_metrics["test_accuracy"])
    yelp_f1_scores.append(yelp_metrics["test_f1"])

    print(f"Yelp Test - Accuracy: {yelp_metrics['test_accuracy']:.4f}, "
          f"F1_score: {yelp_metrics['test_f1']:.4f}")

# ✅ Report the mean of every recorded per-fold score, one section per test set
print("\n===== AVERAGE METRICS ACROSS FOLDS =====")
for heading, acc_values, f1_values in (
    ("Synthetic Test Set:", synthetic_acc_scores, synthetic_f1_scores),
    ("\nYelp Test Set:", yelp_acc_scores, yelp_f1_scores),
):
    print(heading)
    print(f"Average Accuracy: {np.mean(acc_values):.4f}")
    print(f"Average F1_score: {np.mean(f1_values):.4f}")


===== Fold 3 =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.6994
100,0.6162
150,0.4369
200,0.4119
250,0.2967
300,0.3332
350,0.2688


In [12]:
# Display the Yelp F1 scores recorded so far (one entry per recorded fold).
yelp_f1_scores

[0.92, 0.8947368421052632]

In [None]:
# Restore a previously trained checkpoint from disk.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The model weights and the tokenizer files live in the same saved directory
model, tokenizer = (
    loader.from_pretrained("./mbbert_fold_1")
    for loader in (AutoModelForSequenceClassification, AutoTokenizer)
)
