<a href="https://colab.research.google.com/github/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/code/The_Food_Hazard_Detection_Challenge_SemEval_2025_The_BERT_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
%%capture
!pip install torch transformers datasets pandas scikit-learn pickle numpy

In [15]:
%%capture
# Upgrade `datasets` to the newest release (pip output is hidden by %%capture)
!pip install --upgrade datasets

In [16]:
%%capture
# nlpaug provides the augmenters imported later (nlpaug.augmenter.word / .sentence)
!pip install nlpaug
# NOTE(review): sacremoses is presumably required by the MarianMT tokenizers used for back-translation — confirm
!pip install sacremoses

In [17]:
!git clone https://github.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io.git

fatal: destination path 'food-hazard-detection-semeval-2025.github.io' already exists and is not an empty directory.


In [18]:
from transformers import BertTokenizer
import pandas as pd
# Read the training incidents from the cloned repository; column 0 of the CSV becomes the index.
train_csv_path = 'food-hazard-detection-semeval-2025.github.io/data/incidents_train.csv'
data = pd.read_csv(train_csv_path, index_col=0)
# Display one randomly drawn row as a quick sanity check of the columns.
data.sample()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
1366,2015,9,14,uk,Premier Foods recalls Bisto For Chicken Gravy ...,Premier Food recalls some batches of Bisto For...,foreign bodies,"soups, broths, sauces and condiments",metal fragment,gravy granules


In [19]:
import pandas as pd
from datasets import Dataset
from transformers import AdamW, get_scheduler, DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
import pickle
from google.colab import userdata, drive
from typing_extensions import TypedDict
import json
import re
import time
import requests
import google.generativeai as genai
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from sklearn.utils import shuffle

In [9]:
from nltk.corpus import wordnet
import random
import nltk

# Download required NLTK data: the corpora that the `wordnet` corpus reader imported
# above needs. The value of the last call is shown as the cell output.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [10]:
import zipfile
import os

# Directory that holds the zipped NLTK corpora in this environment
nltk_data_path = '/usr/share/nltk_data/corpora'

# Extract wordnet.zip and omw-1.4.zip in place, next to the archives themselves
for archive_name in ('wordnet.zip', 'omw-1.4.zip'):
    archive_path = os.path.join(nltk_data_path, archive_name)
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(nltk_data_path)

print("Unzipping complete!")


Unzipping complete!


In [11]:
from nltk.corpus import wordnet

# Smoke test: a synset lookup raises LookupError when the WordNet corpus cannot be found.
try:
    wordnet.synsets('example')
except LookupError as e:
    print("Error accessing WordNet:", e)
else:
    print("WordNet is accessible.")


WordNet is accessible.


In [13]:
import pandas as pd
import nlpaug.augmenter.word as naw
from tqdm import tqdm

# Back-translation augmentation of the validation split (English -> pivot language -> English).
import torch  # local import so this cell does not rely on the earlier import cell

# Number of augmented copies produced per original row; pivots alternate FR, DE, FR, DE.
AUGMENTATIONS_PER_ROW = 4

# Use the GPU when one is present, otherwise fall back to CPU instead of failing on model load.
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the backtranslation augmenter with MarianMT models (fast and lightweight)
aug_fr = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-fr',  # English to French
    to_model_name='Helsinki-NLP/opus-mt-fr-en',  # French to English
    device=aug_device
)
aug_de = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-de',  # English to German
    to_model_name='Helsinki-NLP/opus-mt-de-en',  # German to English
    device=aug_device
)


def _unwrap_augmented(result, fallback):
    """Return the augmented text as a plain string.

    Recent nlpaug releases return a list from ``augment()`` even for a single input;
    storing that list in a DataFrame cell writes its repr ("['...']") to the CSV.

    Args:
        result: Value returned by ``augmenter.augment``.
        fallback: Original text, used when the augmenter returned an empty list.

    Returns:
        The first augmented string, ``result`` itself when it is not a list/tuple,
        or ``fallback`` for an empty list/tuple.
    """
    if isinstance(result, (list, tuple)):
        return result[0] if result else fallback
    return result


# Load the validation split into its own variable. Assigning it to `data` would replace
# the training frame loaded earlier and change what every later cell trains on.
valid_data = pd.read_csv('food-hazard-detection-semeval-2025.github.io/data/incidents_valid.csv')

# Rows (pandas Series) collected here are turned into a DataFrame after the loop
augmented_data = []

# Perform augmentation with progress bar
for index, row in tqdm(valid_data.iterrows(), total=len(valid_data), desc="Augmenting data"):
    # NOTE(review): if back-translation decoding is deterministic, iterations 0/2 and 1/3
    # produce identical rows — confirm whether these duplicates are intended.
    for i in range(AUGMENTATIONS_PER_ROW):
        augmented_row = row.copy()

        # Choose a backtranslation augmenter in a round-robin fashion
        aug = aug_fr if i % 2 == 0 else aug_de

        try:
            # Apply backtranslation to the title and text
            augmented_row['title'] = _unwrap_augmented(aug.augment(row['title']), row['title'])
            augmented_row['text'] = _unwrap_augmented(aug.augment(row['text']), row['text'])
            augmented_data.append(augmented_row)
        except Exception as e:
            # Best effort: a row that cannot be augmented is reported and skipped
            print(f"Error during augmentation: {e}")

# Convert augmented data to DataFrame
augmented_data_df = pd.DataFrame(augmented_data)

# Save augmented data to a CSV file
output_file = 'augmented_valid_data_backtranslation.csv'
augmented_data_df.to_csv(output_file, index=False)

print(f"Backtranslation augmentation complete. Data saved to '{output_file}'.")


Augmenting data: 100%|██████████| 565/565 [2:35:59<00:00, 16.57s/it]  

Backtranslation augmentation complete. Data saved to 'augmented_valid_data_backtranslation.csv'.





In [10]:
# Oversample selected hazard categories with two nlpaug techniques.

# Categories to augment
categories_to_augment = ["packaging defect", "other hazard", "fraud", "food additives and flavourings"]
augmentation_samples_per_technique = 200  # 200 samples per technique per category

# Use the GPU when one is present, otherwise fall back to CPU.
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize augmenters
synonym_aug = naw.SynonymAug(aug_min=10, aug_max=20)  # Synonym replacement
contextual_aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', aug_p=0.2, device=aug_device)  # Contextual embeddings


def _unwrap_augmented(result, fallback):
    """Return the augmented text as a plain string.

    Recent nlpaug releases return a list from ``augment()`` even for a single input;
    storing that list in a DataFrame cell writes its repr ("['...']") to the CSV.

    Args:
        result: Value returned by ``augmenter.augment``.
        fallback: Original text, used when the augmenter returned an empty list.

    Returns:
        The first augmented string, ``result`` itself when it is not a list/tuple,
        or ``fallback`` for an empty list/tuple.
    """
    if isinstance(result, (list, tuple)):
        return result[0] if result else fallback
    return result


# Augmented rows from every category and technique
augmented_data = []

# Loop through each category to augment
for category in categories_to_augment:
    # Filter data for the current category
    category_data = data[data['hazard-category'] == category]

    if len(category_data) == 0:
        print(f"No data available for category: {category}")
        continue

    for augmenter, name in zip([synonym_aug, contextual_aug], ["synonym", "contextual"]):
        print(f"Augmenting {category} with {name} augmentation...")

        # Reset counter for each technique
        augmented_rows = []
        while len(augmented_rows) < augmentation_samples_per_technique:
            # Shuffle the data to ensure randomness
            category_data = shuffle(category_data)
            # Cycle through the available rows if fewer rows exist
            for _, sample in category_data.iterrows():
                if len(augmented_rows) >= augmentation_samples_per_technique:
                    break

                # Apply augmentation to title and text (unwrapped to plain strings)
                augmented_title = _unwrap_augmented(augmenter.augment(sample['title']), sample['title'])
                augmented_text = _unwrap_augmented(augmenter.augment(sample['text']), sample['text'])

                # Create a new row with augmented data; all other columns are copied unchanged
                new_row = sample.copy()
                new_row['title'] = augmented_title
                new_row['text'] = augmented_text
                augmented_rows.append(new_row)

        # Add the augmented rows to the global list
        augmented_data.extend(augmented_rows)

# Convert augmented data to DataFrame
augmented_data_df = pd.DataFrame(augmented_data)

# Save augmented data to a file
augmented_data_df.to_csv('augmented_data_step1.csv', index=False)

print("Augmentation for multiple categories complete. Data saved to 'augmented_data_step1.csv'.")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Augmenting packaging defect with synonym augmentation...
Augmenting packaging defect with contextual augmentation...
Augmenting other hazard with synonym augmentation...
Augmenting other hazard with contextual augmentation...
Augmenting fraud with synonym augmentation...
Augmenting fraud with contextual augmentation...
Augmenting food additives and flavourings with synonym augmentation...
Augmenting food additives and flavourings with contextual augmentation...
Augmentation for multiple categories complete. Data saved to 'augmented_data_step1.csv'.


In [21]:
# Load the Gemini-generated CSV files and stack them into one frame with a single concat
# (instead of growing a DataFrame through repeated read/concat pairs).
gemini_dir = '/kaggle/input/gemini2/Gemini'

# Files that are loaded. The original cell had the loads for gemini_augmented_2/4/6.csv
# and gemini_augmented_product_2/4.csv commented out, so they stay excluded here.
gemini_files = [
    'gemini_augmented.csv',
    'gemini_augmented_3.csv',
    'gemini_augmented_5.csv',
    'gemini_augmented_product.csv',
    'gemini_augmented_product_3.csv',
    'gemini_augmented_product_5.csv',
]

generated_frames = [pd.read_csv(f'{gemini_dir}/{file_name}', index_col=0) for file_name in gemini_files]
synthesized_df = pd.concat(generated_frames)
# Kept for backward compatibility: the original cell left the last loaded file in `generated_data`.
generated_data = generated_frames[-1]

In [22]:
# Load augmented rows from a Kaggle input dataset; column 0 of the CSV becomes the index.
# NOTE(review): the category-augmentation cell writes 'augmented_data_step1.csv' — confirm
# that 'augmented_data.csv' here is the intended file.
augmented_data_df = pd.read_csv('/kaggle/input/data-aug1/augmented_data.csv', index_col=0)

## Rest of the Code

In [23]:
# Model input = title and body joined by a single space, added as a new column to each frame.
for frame in (data, synthesized_df, augmented_data_df):
    frame['title_text'] = frame['title'] + ' ' + frame['text']

# data['title_text'] = 'Hazard Category: ' + data['hazard-category'] + ' ' + data['title'] + ' ' + data['text']
# synthesized_df['title_text'] = 'Hazard Category: ' + synthesized_df['hazard-category'] + ' ' + synthesized_df['title'] + ' ' + synthesized_df['text']

In [24]:
# Load the DistilBERT tokenizer and keep a local copy alongside the notebook.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer.save_pretrained('./tokenizer')
# tokenizer.save_pretrained('/content/drive/My Drive/FoodHazardData/tokenizer')

def tokenize_function(examples):
    """Tokenize the 'title_text' column of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but deliberately NOT padded:
    this function is applied with ``Dataset.map(batched=True)``, where ``padding=True``
    would pad every row to the longest text of the whole map batch. Padding is left to
    the ``DataCollatorWithPadding`` used by the DataLoaders, which pads per training batch.

    Args:
        examples: Batch mapping that contains a 'title_text' list of strings.

    Returns:
        The tokenizer output for the batch (unpadded).
    """
    return tokenizer(examples['title_text'], truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Label: `Hazard Category`

* Choose your target

In [15]:
label = 'hazard-category' # change this to: 'product-category', 'hazard', 'product' to alter the ground truth
label_encoder = LabelEncoder()
# The encoder is fitted on the original incidents only ...
data['label'] = label_encoder.fit_transform(data[label])
# ... and the same mapping is then applied to the synthetic and augmented frames.
for extra_df in (synthesized_df, augmented_data_df):
    extra_df['label'] = label_encoder.transform(extra_df[label])

# Persist the fitted encoder so integer predictions can be mapped back to label names later.
encoder_path = f'{label}_label_encoder.pkl'
with open(encoder_path, 'wb') as f:
    pickle.dump(label_encoder, f)

In [None]:
# train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)

In [16]:
from nltk.corpus import wordnet
import random
import nltk

# Make sure the corpora used by the WordNet synonym lookup below are downloaded
for nltk_package in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# ----------------------------
# 1. Synonym Replacement Functions
# ----------------------------
def get_synonyms(word):
    """Collect WordNet synonyms of ``word``.

    Every lemma of every synset returned for ``word`` is lower-cased and has its
    underscores turned into spaces; entries equal to the lower-cased input are dropped.

    Args:
        word (str): The word to look up.

    Returns:
        list: Unique synonym strings (order is not defined).
    """
    target = word.lower()
    lemma_names = (
        lemma.name().replace('_', ' ').lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    return list({candidate for candidate in lemma_names if candidate != target})

def synonym_replacement(sentence, n=1):
    """
    Replace up to n distinct words in the sentence with WordNet synonyms.

    Only purely alphabetic tokens are candidates. When a word is chosen, every
    occurrence of it in the sentence is replaced by the same randomly picked synonym.

    Args:
        sentence (str): The original sentence.
        n (int): Maximum number of distinct words to replace; n <= 0 replaces nothing.

    Returns:
        str: The augmented sentence, with tokens re-joined by single spaces.
    """
    words = sentence.split()
    if n <= 0:
        # Previously one word was replaced even for n == 0, because the limit was
        # only checked after the first replacement.
        return ' '.join(words)

    new_words = words.copy()
    # De-duplicate candidates (keeping first-seen order). A repeated word was otherwise
    # visited again after all its occurrences had already been replaced, and was counted
    # as an additional replacement although nothing changed.
    eligible_words = list(dict.fromkeys(word for word in words if word.isalpha()))
    random.shuffle(eligible_words)
    num_replaced = 0

    for random_word in eligible_words:
        synonyms = get_synonyms(random_word)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == random_word else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

# ----------------------------
# 2. Data Splitting
# ----------------------------
# Hold out 10% of the original incidents for evaluation (fixed seed, so the split is repeatable)
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
# Synthetic and augmented rows are appended to the training side only.
# NOTE(review): if the augmented rows were generated from rows that ended up in test_df,
# near-copies of test examples are in the training set — confirm their provenance.
train_df = pd.concat([train_df, synthesized_df, augmented_data_df], ignore_index=True)


print(f"Final augmented training set size: {len(train_df)}")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Final augmented training set size: 11672


In [None]:
# Show how many augmented rows exist per hazard category
augmented_data_df['hazard-category'].value_counts()

* Data preprocessing

In [17]:
# Split the data into training and testing sets
# train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding. With padding=True every batch is padded to its
# longest sequence; a `max_length` argument is ignored in that mode, so the former
# `max_length=16` (which never limited anything) is no longer passed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Convert dataset to PyTorch format, exposing only the columns the model consumes
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader objects (only the training loader is shuffled)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

Map:   0%|          | 0/11672 [00:00<?, ? examples/s]

Map:   0%|          | 0/509 [00:00<?, ? examples/s]

* Choose your model

In [18]:
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
# One output logit per distinct value of the chosen label column
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(data[label].unique()))
# Move model to GPU if available, otherwise keep it on the CPU (a hard-coded 'cuda' fails without a GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


* Train it

In [19]:
# Fine-tune `model` with AdamW and a linearly decaying learning rate (no warm-up).
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 5
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()

# Send batches to whichever device the model lives on instead of assuming CUDA
device = next(model.parameters()).device

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)



  0%|          | 0/7295 [00:00<?, ?it/s]



* Assess it

In [20]:
# model = BertForSequenceClassification.from_pretrained("/content/drive/My Drive/FoodHazardData/bert_hazard_category")

# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Predict on the held-out set and report per-class precision/recall/F1.
model.eval()
# Send batches to whichever device the model lives on instead of assuming CUDA
device = next(model.parameters()).device
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Highest logit per row = predicted class id
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.tolist())

#print(classification_report(test_df.label, total_predictions))
# Map integer ids back to label names so the report is readable
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

                                precision    recall  f1-score   support

                     allergens       0.97      0.99      0.98       188
                    biological       0.99      0.99      0.99       171
                      chemical       1.00      0.94      0.97        35
food additives and flavourings       1.00      1.00      1.00         5
                foreign bodies       1.00      0.98      0.99        58
                         fraud       0.89      0.86      0.87        28
                     migration       1.00      1.00      1.00         1
          organoleptic aspects       1.00      1.00      1.00         3
                  other hazard       1.00      0.93      0.97        15
              packaging defect       0.83      1.00      0.91         5

                      accuracy                           0.98       509
                     macro avg       0.97      0.97      0.97       509
                  weighted avg       0.98      0.98      0.98 

In [None]:
# Save the fine-tuned model under a local directory name
model.save_pretrained("distilbert_hazard_category_gemini_nlpaug")
# model.save_pretrained("/content/drive/My Drive/FoodHazardData/Models/bert_hazard_category_gemini_pubmed")

In [None]:
import numpy as np

# Ensemble evaluation: average the logits of several saved classifiers.

# List of model paths saved in Google Drive
model_paths = [
    "/content/drive/My Drive/FoodHazardData/bert_hazard_category",
    "/content/drive/My Drive/FoodHazardData/Models/bert_hazard_category_gemini",
    # "/content/drive/My Drive/FoodHazardData/bert_hazard_category_model_3"
]

# Load all models
models = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for path in model_paths:
    # BertForSequenceClassification is never imported in this notebook (NameError);
    # AutoModelForSequenceClassification is imported and restores the architecture
    # recorded in the checkpoint's config. A dedicated loop variable is used so the
    # global `model` from the training cells is not overwritten.
    ensemble_member = AutoModelForSequenceClassification.from_pretrained(path)
    ensemble_member.to(device)
    ensemble_member.eval()
    models.append(ensemble_member)

# Evaluate with ensembling
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        # Move batch to GPU if available
        batch = {k: v.to(device) for k, v in batch.items()}

        # Get logits from all models
        logits_list = []
        for ensemble_member in models:
            outputs = ensemble_member(**batch)
            logits_list.append(outputs.logits.cpu().numpy())

        # Ensemble predictions by averaging logits
        avg_logits = np.mean(logits_list, axis=0)  # Average logits from all models

        # Get final predictions by taking the argmax of averaged logits
        predictions = np.argmax(avg_logits, axis=-1)
        total_predictions.extend(predictions.tolist())

# Decode predicted labels and gold labels
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)

# Print classification report
print(classification_report(gold_labels, predicted_labels, zero_division=0))


# Label: `Product Category`

In [25]:
label = 'product-category'
label_encoder = LabelEncoder()
# The encoder is fitted on the original incidents only ...
data['label'] = label_encoder.fit_transform(data[label])
# ... and the same mapping is then applied to the synthetic and augmented frames.
for extra_df in (synthesized_df, augmented_data_df):
    extra_df['label'] = label_encoder.transform(extra_df[label])

# Persist the fitted encoder so integer predictions can be mapped back to label names later.
encoder_path = f'{label}_label_encoder.pkl'
with open(encoder_path, 'wb') as f:
    pickle.dump(label_encoder, f)

In [26]:
import random
import nltk
from nltk.corpus import wordnet
import pandas as pd
from sklearn.model_selection import train_test_split

# Make sure the corpora used by the WordNet synonym lookup below are downloaded
for nltk_package in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# ----------------------------
# 1. Synonym Replacement Functions
# ----------------------------
def get_synonyms(word):
    """Collect WordNet synonyms of ``word``.

    Every lemma of every synset returned for ``word`` is lower-cased and has its
    underscores turned into spaces; entries equal to the lower-cased input are dropped.

    Args:
        word (str): The word to look up.

    Returns:
        list: Unique synonym strings (order is not defined).
    """
    target = word.lower()
    lemma_names = (
        lemma.name().replace('_', ' ').lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    return list({candidate for candidate in lemma_names if candidate != target})

def synonym_replacement(sentence, n=1):
    """
    Replace up to n distinct words in the sentence with WordNet synonyms.

    Only purely alphabetic tokens are candidates. When a word is chosen, every
    occurrence of it in the sentence is replaced by the same randomly picked synonym.

    Args:
        sentence (str): The original sentence.
        n (int): Maximum number of distinct words to replace; n <= 0 replaces nothing.

    Returns:
        str: The augmented sentence, with tokens re-joined by single spaces.
    """
    words = sentence.split()
    if n <= 0:
        # Previously one word was replaced even for n == 0, because the limit was
        # only checked after the first replacement.
        return ' '.join(words)

    new_words = words.copy()
    # De-duplicate candidates (keeping first-seen order). A repeated word was otherwise
    # visited again after all its occurrences had already been replaced, and was counted
    # as an additional replacement although nothing changed.
    eligible_words = list(dict.fromkeys(word for word in words if word.isalpha()))
    random.shuffle(eligible_words)
    num_replaced = 0

    for random_word in eligible_words:
        synonyms = get_synonyms(random_word)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == random_word else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)

# Evaluation set: the validation CSV from the cloned repository, prepared like the training frames
test_df = pd.read_csv('food-hazard-detection-semeval-2025.github.io/data/incidents_valid.csv', index_col=0)
test_df['title_text'] = test_df['title'] + ' ' + test_df['text']
test_df['label'] = label_encoder.transform(test_df[label])

# Training set: all rows of `data` followed by the synthetic and the augmented rows
train_df = pd.concat([data, synthesized_df, augmented_data_df], ignore_index=True)

print(f"Final augmented training set size: {len(train_df)}")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Final augmented training set size: 12745


In [27]:
# Split the data into training and testing sets
# train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding. With padding=True every batch is padded to its
# longest sequence; a `max_length` argument is ignored in that mode, so the former
# `max_length=16` (which never limited anything) is no longer passed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Convert dataset to PyTorch format, exposing only the columns the model consumes
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader objects (only the training loader is shuffled)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

Map:   0%|          | 0/12745 [00:00<?, ? examples/s]

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

* Train

In [28]:
# model_product_category = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
# Fresh DistilBERT classifier with one output logit per distinct value of the label column
model_product_category = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(data[label].unique()))
# Move model to GPU if available, otherwise stay on the CPU (a hard-coded 'cuda' fails without a GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_product_category.to(device)

optimizer = AdamW(model_product_category.parameters(), lr=5e-5)
num_epochs = 5
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model_product_category.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product_category(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7970 [00:00<?, ?it/s]



In [29]:
# model_product_category = BertForSequenceClassification.from_pretrained("/content/drive/My Drive/FoodHazardData/bert_product_category")

# Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_product_category.to(device)

# Predict on the evaluation set and report per-class precision/recall/F1.
model_product_category.eval()
# Send batches to whichever device the model lives on instead of assuming CUDA
device = next(model_product_category.parameters()).device
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product_category(**batch)
        # Highest logit per row = predicted class id
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.tolist())

#print(classification_report(test_df.label, total_predictions, zero_division=0))
# Map integer ids back to label names so the report is readable
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.86      0.86      0.86         7
                      cereals and bakery products       0.81      0.83      0.82        75
     cocoa and cocoa preparations, coffee and tea       0.67      0.67      0.67        15
                                    confectionery       0.84      0.62      0.71        26
dietetic foods, food supplements, fortified foods       0.53      0.64      0.58        14
                                    fats and oils       1.00      0.75      0.86         4
                                   feed materials       0.00      0.00      0.00         1
                            fruits and vegetables       0.73      0.85      0.79        52
                                 herbs and spices       0.59      0.68      0.63        19
                                ices and desserts       0.88      0.88      0.88        2

* Test

In [30]:
# Save the fine-tuned product-category model under a local directory name
model_product_category.save_pretrained("distilbert_product_category")
# model_product_category.save_pretrained("distilbert_product_category")

# Label: `Hazard`

In [None]:
# Target for this section: the fine-grained 'hazard' column
label = 'hazard'
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data[label])

# Split the data into training and testing sets (80/20, fixed seed)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the tokenizer to the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Create DataCollator to handle padding. With padding=True every batch is padded to its
# longest sequence; a `max_length` argument is ignored in that mode, so the former
# `max_length=16` (which never limited anything) is no longer passed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Convert dataset to PyTorch format, exposing only the columns the model consumes
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoader objects (only the training loader is shuffled)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
# BertForSequenceClassification is never imported in this notebook, so the original line
# raised NameError. AutoModelForSequenceClassification (already imported) builds the
# sequence-classification model that matches the 'bert-base-uncased' checkpoint.
# NOTE(review): inputs are tokenized with the notebook's `tokenizer`, which is loaded from
# 'distilbert-base-uncased' — confirm that pairing with this checkpoint is intended.
model_hazard = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
# Move model to GPU if available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_hazard.to(device)

optimizer = AdamW(model_hazard.parameters(), lr=5e-5)

num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model_hazard.train()

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hazard(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Predict on the held-out set and report per-class precision/recall/F1.
model_hazard.eval()
# Send batches to whichever device the model lives on instead of assuming CUDA
device = next(model_hazard.parameters()).device
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hazard(**batch)
        # Highest logit per row = predicted class id
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.tolist())

#print(classification_report(test_df.label, total_predictions, zero_division=0))
# Map integer ids back to label names so the report is readable
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

In [None]:
# Persist the fine-tuned hazard classifier to the local `bert_hazard` directory.
model_hazard.save_pretrained("bert_hazard")

# Label: `product`

In [None]:
# Target column for this section.
label = 'product'

# Encode the string labels as integer ids and keep them in `data['label']`.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data[label])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
train_df, test_df = train_test_split(data, random_state=42, test_size=0.2)

# Build each Hugging Face Dataset and tokenize it in one chained step.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Expose only the model inputs and the target as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Collator that pads every batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)

# Only the training loader shuffles; evaluation keeps the row order of `test_df`.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
# Fine-tune a BERT sequence classifier on the currently selected `label` column.
# Pick the device once: use the GPU when one is present, otherwise fall back to
# the CPU so the cell does not crash on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One output logit per distinct value of the target column.
model_product = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data[label].unique()))
model_product.to(device)

optimizer = AdamW(model_product.parameters(), lr=5e-5)
num_epochs = 3
# The scheduler is stepped once per batch, so it needs the total batch count.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
model_product.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Tensors must live on the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Evaluate `model_product` on the held-out split and print a per-class report.
model_product.eval()
# Run on whatever device the model already lives on instead of assuming CUDA.
eval_device = next(model_product.parameters()).device
total_predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(eval_device) for k, v in batch.items()}
        outputs = model_product(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.tolist())

# Map integer class ids back to their string labels so the report is readable.
predicted_labels = label_encoder.inverse_transform(total_predictions)
gold_labels = label_encoder.inverse_transform(test_df.label.values)
print(classification_report(gold_labels, predicted_labels, zero_division=0))

In [None]:
# Persist the fine-tuned product classifier and the tokenizer to local directories.
# NOTE(review): later cells load the tokenizer from the path "tokenizer", not
# "bert_tokenizer" — confirm which directory is intended.
model_product.save_pretrained("bert_product")
tokenizer.save_pretrained("bert_tokenizer")

In [None]:
!zip bert_baseline.zip bert_*

# Loading a trained baseline

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def predict(texts, model_path, tokenizer_path="tokenizer", batch_size=32):
    """Predict integer class ids for `texts` with a saved BERT classifier.

    The texts are tokenized and run through the model in chunks of
    `batch_size`, so memory use is bounded by the batch rather than by the
    whole input.

    Args:
        texts: A single string or a sequence of strings to classify.
        model_path: Location passed to
            `BertForSequenceClassification.from_pretrained`.
        tokenizer_path: Location passed to `BertTokenizer.from_pretrained`.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list with one predicted class index (int) per input text, in input
        order. An empty input yields an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Accept a bare string as a one-element batch; slicing it would split characters.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return []

    # Load the saved tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    model = BertForSequenceClassification.from_pretrained(model_path)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Set the model to evaluation mode
    model.eval()

    all_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Pad only to the longest text of this batch.
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")

            # Move inputs to the same device as the model
            inputs = {key: value.to(device) for key, value in inputs.items()}

            logits = model(**inputs).logits
            all_predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    return all_predictions

In [None]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

def compute_score(hazards_true, products_true, hazards_pred, products_pred,
                  hazard_encoder=None, product_encoder=None):
  """Average of hazard macro-F1 and product macro-F1 on hazard-correct rows.

  Args:
    hazards_true: Gold hazard labels as strings.
    products_true: Gold product labels as strings.
    hazards_pred: Predicted hazard class ids (integers).
    products_pred: Predicted product class ids (integers).
    hazard_encoder: Optional pre-fitted LabelEncoder used to turn
      `hazards_true` into ids. Pass the encoder the model was trained with so
      gold and predicted ids share one mapping. When None, a fresh encoder is
      fitted on `hazards_true` (the previous behaviour); that only gives the
      same ids if every training class occurs in `hazards_true`.
    product_encoder: Same as `hazard_encoder`, for the product labels.

  Returns:
    (macro-F1 of hazards + macro-F1 of products on rows whose hazard was
    predicted correctly) / 2. The product term is 0.0 when no hazard is
    predicted correctly.
  """
  import numpy as np  # local import so the cell does not rely on a global alias

  def _encode(labels, encoder):
    # Reuse the supplied mapping when there is one; otherwise fit on the labels.
    if encoder is None:
      return LabelEncoder().fit_transform(labels)
    return encoder.transform(labels)

  # Plain arrays make every comparison and mask positional, independent of any
  # pandas index the inputs may carry.
  hazards_true = np.asarray(_encode(hazards_true, hazard_encoder))
  products_true = np.asarray(_encode(products_true, product_encoder))
  hazards_pred = np.asarray(hazards_pred)
  products_pred = np.asarray(products_pred)

  # compute f1 for hazards:
  f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

  # compute f1 for products, restricted to rows with a correct hazard:
  hazard_correct = hazards_pred == hazards_true
  if hazard_correct.any():
    f1_products = f1_score(
      products_true[hazard_correct],
      products_pred[hazard_correct],
      average='macro'
    )
  else:
    # Nothing to score: no credit for products instead of an error on empty input.
    f1_products = 0.0

  return (f1_hazards + f1_products) / 2.

In [None]:
# Empty frame; the Sub-Task 1 loop below adds one column of predictions per category.
predictions = pd.DataFrame()

### Sub-Task 1:

In [None]:
import joblib

# Restore the pre-fitted LabelEncoders from the working directory.
hazard_label_encoder = joblib.load("hazard-category_label_encoder.pkl")
product_label_encoder = joblib.load("product-category_label_encoder.pkl")

# Lookup table keyed by the underscore-style category names used in later cells.
label_encoders = dict(
    hazard_category=hazard_label_encoder,
    product_category=product_label_encoder,
)

In [None]:
# Predict each Sub-Task 1 category and report per-class metrics on integer ids.
for category in ['hazard_category', 'product_category']: #, 'hazard', 'product']:
  # DataFrame columns use dashes; model directories and encoder keys use underscores.
  c = category.replace('_', '-')
  print(c.upper())
  predictions[category] = predict(test_df.title_text.to_list(), f"/content/drive/My Drive/FoodHazardData/bert_{category}")
  # Encode the gold labels with the pre-fitted encoder. Use transform, not
  # fit_transform: re-fitting on the test split overwrites the loaded
  # class-to-id mapping and shifts ids whenever a class is absent from the split.
  label_encoder = label_encoders[category]
  gold = label_encoder.transform(test_df[c])
  print(classification_report(gold, predictions[category], zero_division=0))

In [None]:
# Combined Sub-Task 1 score: gold string labels vs. the integer predictions
# collected in `predictions` (compute_score encodes the gold labels itself).
print(f"Score Sub-Task 1: {compute_score(test_df['hazard-category'], test_df['product-category'], predictions['hazard_category'], predictions['product_category']):.3f}")
# print(f"Score Sub-Task 2: {compute_score(test_df['hazard'], test_df['product'], predictions['hazard'], predictions['product']):.3f}")

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the split.
train_df, test_df = train_test_split(data, random_state=42, test_size=0.2)

# Build each Hugging Face Dataset and tokenize it in one chained step.
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Expose only the model inputs and the target as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Collator that pads every batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, max_length=16)

# Only the training loader shuffles; evaluation keeps the row order of `test_df`.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Testing

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import pickle

# Load validation data (first CSV column is used as the index)
validation_data = pd.read_csv('/content/drive/My Drive/FoodHazardData/incidents_validation.csv', index_col=0)

# Combine title and text for predictions
# NOTE(review): a missing title or text makes the concatenated value NaN —
# confirm the CSV has no empty cells in these two columns.
validation_data['title_text'] = validation_data['title'] + ' ' + validation_data['text']

# Function to tokenize input
def tokenize_function(examples, tokenizer):
    """Tokenize the `title_text` entry of `examples` with `tokenizer`.

    Args:
        examples: Mapping that provides the texts under the key 'title_text'.
        tokenizer: Callable applied to the texts with padding and truncation
            switched on.

    Returns:
        Whatever `tokenizer` returns for those texts.
    """
    texts = examples['title_text']
    return tokenizer(texts, padding=True, truncation=True)

# Function to load model and make predictions
def predict_category(model_path, tokenizer_path, label_encoder_path, validation_data):
    """Classify the `title_text` column of a frame and return string labels.

    Args:
        model_path: Location passed to
            `BertForSequenceClassification.from_pretrained`.
        tokenizer_path: Location passed to `BertTokenizer.from_pretrained`.
        label_encoder_path: Path of a pickled label encoder whose
            `inverse_transform` maps class ids back to labels.
        validation_data: DataFrame to classify; tokenization reads its
            'title_text' column via `tokenize_function`.

    Returns:
        The result of `inverse_transform` on the predicted class ids, one
        entry per row, in row order.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    def _encode(rows):
        # Bind the freshly loaded tokenizer for Dataset.map.
        return tokenize_function(rows, bert_tokenizer)

    # Tokenize the frame and expose only the model inputs as torch tensors.
    encoded = Dataset.from_pandas(validation_data).map(_encode, batched=True)
    encoded.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    loader = torch.utils.data.DataLoader(encoded, batch_size=8)

    # Load the classifier onto the GPU when one is available, else the CPU.
    classifier = BertForSequenceClassification.from_pretrained(model_path)
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(target)
    classifier.eval()

    # pickle.load can execute arbitrary code: only open encoder files you trust.
    with open(label_encoder_path, 'rb') as handle:
        encoder = pickle.load(handle)

    # Collect the arg-max class id of every row, batch by batch.
    class_ids = []
    with torch.no_grad():
        for features in loader:
            on_device = {name: tensor.to(target) for name, tensor in features.items()}
            logits = classifier(**on_device).logits
            class_ids.extend(logits.argmax(dim=-1).tolist())

    return encoder.inverse_transform(class_ids)

# Two-stage prediction on the validation set: hazard-category first, then
# product-category with the predicted hazard prepended to the input text.

# Paths for hazard-category model, tokenizer, and label encoder
hazard_model_path = "/content/drive/My Drive/FoodHazardData/bert_hazard_category"
hazard_tokenizer_path = "tokenizer"
hazard_label_encoder_path = "hazard-category_label_encoder.pkl"

# Predict hazard-category
validation_data['hazard-category'] = predict_category(
    hazard_model_path,
    hazard_tokenizer_path,
    hazard_label_encoder_path,
    validation_data
)

# Combine hazard-category with title and text for product-category prediction
# (this overwrites the plain title+text value of `title_text` set earlier).
validation_data['title_text'] = 'Hazard Category: ' + validation_data['hazard-category'] + ' ' + validation_data['title'] + ' ' + validation_data['text']

# Paths for product-category model, tokenizer, and label encoder
product_model_path = "/content/drive/My Drive/FoodHazardData/Models/bert_product_category_gemini"
product_tokenizer_path = "tokenizer"
product_label_encoder_path = "product-category_label_encoder.pkl"

# Predict product-category
validation_data['product-category'] = predict_category(
    product_model_path,
    product_tokenizer_path,
    product_label_encoder_path,
    validation_data
)

# Save to submission.csv (only the two predicted columns, without the index)
submission = validation_data[['hazard-category', 'product-category']]
submission.to_csv('submission.csv', index=False)

print("Predictions saved to 'submission.csv'")


In [None]:
# Download submission.csv from the Colab runtime to the local machine
# (uses google.colab, so this cell only works inside Colab).
from google.colab import files
files.download('submission.csv')

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
import pickle
from sklearn.model_selection import train_test_split

# Load data (this rebinds the notebook-level `data` frame from the CSV)
data = pd.read_csv('food-hazard-detection-semeval-2025.github.io/data/incidents_train.csv', index_col=0)

# Split data into train and test sets (10% held out here; fixed seed)
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)

# Combine title and text for predictions in test set
test_df['title_text'] = test_df['title'] + ' ' + test_df['text']

# Function to tokenize input
def tokenize_function(examples, tokenizer):
    """Tokenize the `title_text` entry of `examples` with `tokenizer`.

    Args:
        examples: Mapping that provides the texts under the key 'title_text'.
        tokenizer: Callable applied to the texts with padding and truncation
            switched on.

    Returns:
        Whatever `tokenizer` returns for those texts.
    """
    texts = examples['title_text']
    return tokenizer(texts, padding=True, truncation=True)

# Function to load model and make predictions
def predict_category(model_path, tokenizer_path, label_encoder_path, validation_data):
    """Classify the `title_text` column of a frame and return string labels.

    Args:
        model_path: Location passed to
            `BertForSequenceClassification.from_pretrained`.
        tokenizer_path: Location passed to `BertTokenizer.from_pretrained`.
        label_encoder_path: Path of a pickled label encoder whose
            `inverse_transform` maps class ids back to labels.
        validation_data: DataFrame to classify; tokenization reads its
            'title_text' column via `tokenize_function`.

    Returns:
        The result of `inverse_transform` on the predicted class ids, one
        entry per row, in row order.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    def _encode(rows):
        # Bind the freshly loaded tokenizer for Dataset.map.
        return tokenize_function(rows, bert_tokenizer)

    # Tokenize the frame and expose only the model inputs as torch tensors.
    encoded = Dataset.from_pandas(validation_data).map(_encode, batched=True)
    encoded.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    loader = torch.utils.data.DataLoader(encoded, batch_size=8)

    # Load the classifier onto the GPU when one is available, else the CPU.
    classifier = BertForSequenceClassification.from_pretrained(model_path)
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(target)
    classifier.eval()

    # pickle.load can execute arbitrary code: only open encoder files you trust.
    with open(label_encoder_path, 'rb') as handle:
        encoder = pickle.load(handle)

    # Collect the arg-max class id of every row, batch by batch.
    class_ids = []
    with torch.no_grad():
        for features in loader:
            on_device = {name: tensor.to(target) for name, tensor in features.items()}
            logits = classifier(**on_device).logits
            class_ids.extend(logits.argmax(dim=-1).tolist())

    return encoder.inverse_transform(class_ids)

# Two-stage prediction on the held-out split; predictions go into separate
# `*-pred` columns so the gold labels stay available for scoring.

# Paths for hazard-category model, tokenizer, and label encoder
# NOTE(review): this path ends in "bert_hazard_categoryf" while the validation
# cell uses "bert_hazard_category" — confirm the trailing "f" is intentional.
hazard_model_path = "/content/drive/My Drive/FoodHazardData/bert_hazard_categoryf"
hazard_tokenizer_path = "tokenizer"
hazard_label_encoder_path = "hazard-category_label_encoder.pkl"

# Predict hazard-category
test_df['hazard-category-pred'] = predict_category(
    hazard_model_path,
    hazard_tokenizer_path,
    hazard_label_encoder_path,
    test_df
)

# Combine hazard-category with title and text for product-category prediction
# NOTE(review): this prepends the gold 'hazard-category', not the predicted
# 'hazard-category-pred' used by the validation pipeline — confirm this is
# intended. Rows where the hazard prediction is correct are unaffected.
test_df['title_text'] = 'Hazard Category: ' + test_df['hazard-category'] + ' ' + test_df['title'] + ' ' + test_df['text']

# Paths for product-category model, tokenizer, and label encoder
product_model_path = "/content/drive/My Drive/FoodHazardData/bert_product_category3"
product_tokenizer_path = "tokenizer"
product_label_encoder_path = "product-category_label_encoder.pkl"

# Predict product-category
test_df['product-category-pred'] = predict_category(
    product_model_path,
    product_tokenizer_path,
    product_label_encoder_path,
    test_df
)

# Compute classification report and final score
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Average of hazard macro-F1 and product macro-F1 on hazard-correct rows.

    Args:
        hazards_true: Gold hazard labels.
        products_true: Gold product labels.
        hazards_pred: Predicted hazard labels, same length and order.
        products_pred: Predicted product labels, same length and order.

    Returns:
        (macro-F1 of hazards + macro-F1 of products on rows whose hazard was
        predicted correctly) / 2. The product term is 0.0 when no hazard is
        predicted correctly.
    """
    import numpy as np  # local import so the cell does not rely on a global alias

    # Plain arrays make the comparison and the mask positional, so differently
    # indexed pandas Series (or plain lists) are handled the same way.
    hazards_true = np.asarray(hazards_true)
    products_true = np.asarray(products_true)
    hazards_pred = np.asarray(hazards_pred)
    products_pred = np.asarray(products_pred)

    # Compute F1 for hazards
    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    # Compute F1 for products (only for correctly predicted hazards)
    correct_hazards = hazards_true == hazards_pred
    if correct_hazards.any():
        f1_products = f1_score(
            products_true[correct_hazards],
            products_pred[correct_hazards],
            average='macro'
        )
    else:
        # Nothing to score: no credit for products instead of an error on empty input.
        f1_products = 0.0

    return (f1_hazards + f1_products) / 2

# Extract true and predicted labels (all four are columns of `test_df`, so
# they share one index and compare row by row)
hazards_true = test_df['hazard-category']
products_true = test_df['product-category']
hazards_pred = test_df['hazard-category-pred']
products_pred = test_df['product-category-pred']

# Classification report
print("Classification Report for Hazard-Category:")
print(classification_report(hazards_true, hazards_pred))

# Product report is restricted to rows whose hazard prediction matches the gold label.
print("Classification Report for Product-Category (correct hazards only):")
correct_hazards = hazards_true == hazards_pred
print(classification_report(products_true[correct_hazards], products_pred[correct_hazards]))

# Compute final score
final_score = compute_score(hazards_true, products_true, hazards_pred, products_pred)
print(f"Final Score: {final_score}")


In [None]:
import pandas as pd
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """
    Computes the macro-F1 for hazards, then the macro-F1 for products
    *only for rows where hazards_pred == hazards_true*.
    Finally returns the average of those two scores.

    Inputs are compared by position, so all four must have the same length
    and row order. When no hazard is predicted correctly there are no rows to
    score products on, and the product term is 0.0.
    """
    import numpy as np  # local import so the cell does not rely on a global alias

    # Plain arrays make the comparison and the mask positional, so differently
    # indexed pandas Series (or plain lists) are handled the same way.
    hazards_true = np.asarray(hazards_true)
    products_true = np.asarray(products_true)
    hazards_pred = np.asarray(hazards_pred)
    products_pred = np.asarray(products_pred)

    # -- F1 for hazards (macro) --
    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    # -- F1 for products, only on rows where hazard is correct --
    correct_hazard_mask = (hazards_pred == hazards_true)
    if correct_hazard_mask.any():
        f1_products = f1_score(
            products_true[correct_hazard_mask],
            products_pred[correct_hazard_mask],
            average='macro'
        )
    else:
        # No hazard was correct: give no product credit instead of scoring empty input.
        f1_products = 0.0

    return (f1_hazards + f1_products) / 2.0

# -----------------------------
# 1. Read the ground truth
# -----------------------------
df_valid = pd.read_csv("incidents_valid.csv")

# We only need these columns from the ground truth
df_valid = df_valid[["hazard-category", "product-category"]].copy()
df_valid.columns = ["hazard_true", "product_true"]

# -----------------------------
# 2. Read your submission
# -----------------------------
df_sub = pd.read_csv("submission.csv")

# We only need these columns from the submission
df_sub = df_sub[["hazard-category", "product-category"]].copy()
df_sub.columns = ["hazard_pred", "product_pred"]

# -----------------------------
# 3. Concatenate to compare row by row
# -----------------------------
# The comparison is purely positional: row i of the submission is scored
# against row i of the ground truth. A different row count would silently
# produce NaN-padded rows, so fail early with a clear message instead.
if len(df_valid) != len(df_sub):
    raise ValueError(
        f"Row count mismatch: ground truth has {len(df_valid)} rows, "
        f"submission has {len(df_sub)} rows."
    )
df_compare = pd.concat([df_valid, df_sub], axis=1)

# -----------------------------
# 4. Compute final score
# -----------------------------
score = compute_score(
    hazards_true=df_compare["hazard_true"],
    products_true=df_compare["product_true"],
    hazards_pred=df_compare["hazard_pred"],
    products_pred=df_compare["product_pred"]
)

print("Your macro-F1 Score (with the given formula) =", score)


In [None]:
import os
from shutil import make_archive

# save predictions to a new folder:
# NOTE(review): `predictions` is the frame filled in the Sub-Task 1 loop, so it
# holds the integer class ids returned by `predict`, and to_csv also writes the
# index as the first column — confirm this matches the expected submission format.
os.makedirs('./submission/', exist_ok=True)
predictions.to_csv('./submission/submission.csv')

# zip the folder (zipfile can be directly uploaded to codalab):
make_archive('./submission', 'zip', './submission')