<a href="https://colab.research.google.com/github/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/code/The_Food_Hazard_Detection_Challenge_SemEval_2025_The_BERT_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install torch transformers datasets pandas scikit-learn

# Load training data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# # download training data (labeled):
# !wget https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_train.csv

# read the labeled incidents; the first CSV column becomes the index
data = pd.read_csv(filepath_or_buffer='incidents_train.csv', index_col=0)

# hold out 20% of the rows as a development split (fixed seed keeps the split stable)
train_df, dev_df = train_test_split(data, random_state=2024, test_size=0.2)

# peek at one random training row
train_df.sample(n=1)

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
3690,2019,8,21,ca,London Gourmet brand Monster Snack Mix recalle...,Food Recall Warning (Allergen) - London Gourme...,allergens,prepared dishes and snacks,almond,snack mix


In [3]:
# number of distinct product labels in the full labeled data
data["product"].nunique()

1022

In [4]:
# number of distinct product labels that appear in the training split
train_df["product"].nunique()

912

In [5]:
# number of distinct hazard labels that appear in the training split
train_df["hazard"].nunique()

127

In [6]:
# distribution of title lengths, measured in whitespace-separated words
title_words = data["title"].str.split()
title_words.map(len).describe()

count    5082.000000
mean       13.282369
std         5.229355
min         1.000000
25%        10.000000
50%        13.000000
75%        16.000000
max        44.000000
Name: title, dtype: float64

In [7]:
# (distinct hazards, distinct products) over the full labeled data
(data['hazard'].nunique(), data["product"].nunique())

(128, 1022)

In [8]:
# show the labeled frame (pandas elides the middle rows when rendering it)
data

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
0,1994,1,7,us,Recall Notification: FSIS-024-94,Case Number: 024-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,smoked sausage
1,1994,3,10,us,Recall Notification: FSIS-033-94,Case Number: 033-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria spp,sausage
2,1994,3,28,us,Recall Notification: FSIS-014-94,Case Number: 014-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,ham slices
3,1994,4,3,us,Recall Notification: FSIS-009-94,Case Number: 009-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,thermal processed pork meat
4,1994,7,1,us,Recall Notification: FSIS-001-94,Case Number: 001-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,chicken breast
...,...,...,...,...,...,...,...,...,...,...
5979,2022,7,28,hk,Imported biscuit may contain allergen (peanuts),Imported biscuit may contain allergen (peanuts...,allergens,cereals and bakery products,peanuts and products thereof,biscuits
5980,2022,7,28,us,Danny’s Sub and Pizza Recalls Meat Pizza Produ...,023-2022\n\n \n High - Class I\n\n Produc...,fraud,prepared dishes and snacks,inspection issues,pizza
5981,2022,7,29,us,Lyons Magnus Voluntarily Recalls 53 Nutritiona...,"FRESNO, Calif. – July 28, 2022 – Lyons Magnus ...",biological,non-alcoholic beverages,cronobacter spp,non-alcoholic beverages
5982,2022,7,30,us,"Conagra Brands, Inc., Recalls Frozen Beef Prod...",025-2022\n\n \n High - Class I\n\n Misbra...,allergens,"meat, egg and dairy products",eggs and products thereof,frozen beef products


# Train and Evaluate BERT

In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler, DataCollatorWithPadding

# shared tokenizer for every model trained in this notebook
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``title`` field of a (batched) dataset example.

    Titles are truncated to the model's maximum length but deliberately NOT
    padded here: `Dataset.map(batched=True)` would otherwise pad every title
    to the longest title of its whole map chunk. Padding to the longest
    sequence of each mini-batch is left to the `DataCollatorWithPadding`
    that `prepare_data` hands to the DataLoaders.

    Args:
        examples: mapping with a ``'title'`` entry (a string or list of strings).

    Returns:
        The tokenizer output for the titles.
    """
    return tokenizer(examples['title'], truncation=True)

  from .autonotebook import tqdm as notebook_tqdm
2024-11-19 06:42:45.029927: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-19 06:42:45.061330: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-19 06:42:45.061357: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-19 06:42:45.062174: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-19 06:42:45.0

Data preprocessing function:

In [15]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from datasets import Dataset

def prepare_data(label, batch_size=8):
    """Build train/dev DataLoaders and the label encoder for one target column.

    Side effect: (over)writes the ``'label'`` column of the global ``train_df``
    and ``dev_df`` with the integer-encoded values of ``label``. This is kept
    on purpose because the evaluation cells read ``dev_df.label``.

    Args:
        label: name of the target column in ``data`` (e.g. ``'hazard'``).
        batch_size: mini-batch size used by both DataLoaders.

    Returns:
        Tuple ``(train_dataloader, dev_dataloader, label_encoder)``.
    """
    # encode labels: fit on the full data so classes that only occur in the
    # dev split still receive an id
    label_encoder = LabelEncoder()
    label_encoder.fit(data[label])

    train_df['label'] = label_encoder.transform(train_df[label])
    dev_df['label'] = label_encoder.transform(dev_df[label])

    # Convert DataFrame to Hugging Face Dataset and apply the tokenizer
    train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
    dev_dataset = Dataset.from_pandas(dev_df).map(tokenize_function, batched=True)
    print(dev_dataset)

    # Pad each mini-batch to its longest sequence. No max_length is passed:
    # with padding=True the collator pads to the longest item and ignores it.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    # Expose only the model inputs and the target as PyTorch tensors
    model_columns = ['input_ids', 'attention_mask', 'label']
    train_dataset.set_format(type='torch', columns=model_columns)
    dev_dataset.set_format(type='torch', columns=model_columns)

    # Only the training loader is shuffled; dev order must match dev_df rows
    return (
        DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=data_collator),
        DataLoader(dev_dataset, batch_size=batch_size, collate_fn=data_collator),
        label_encoder
    )

Evaluation function:

In [16]:
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  """Average of the hazard macro-F1 and the product macro-F1.

  The product F1 is computed only on the samples whose hazard prediction is
  correct. Inputs may be any equal-length 1-D array-likes (lists, arrays,
  pandas Series); they are compared by position.

  Raises:
    ValueError: if the four inputs do not all have the same shape.
  """
  # positional arrays: makes the boolean mask work for plain lists as well
  hazards_true = np.asarray(hazards_true)
  products_true = np.asarray(products_true)
  hazards_pred = np.asarray(hazards_pred)
  products_pred = np.asarray(products_pred)

  if not (hazards_true.shape == products_true.shape
          == hazards_pred.shape == products_pred.shape):
    raise ValueError("all label and prediction inputs must have the same shape")

  # compute f1 for hazards:
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products, restricted to correctly predicted hazards:
  hazard_correct = hazards_pred == hazards_true
  f1_products = f1_score(
    products_true[hazard_correct],
    products_pred[hazard_correct],
    average='macro'
  )

  return (f1_hazards + f1_products) / 2.

## Sub-Task 1:

### Label: `Hazard Category`

* Data preprocessing

In [17]:
# target column for this model
label = 'hazard-category'

# build the train/dev loaders and the fitted encoder for this target
train_dataloader, dev_dataloader, le_hazard_category = prepare_data(label=label)

Map: 100%|██████████| 4065/4065 [00:00<00:00, 5927.95 examples/s]
Map: 100%|██████████| 1017/1017 [00:00<00:00, 5853.84 examples/s]

Dataset({
    features: ['year', 'month', 'day', 'country', 'title', 'text', 'hazard-category', 'product-category', 'hazard', 'product', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1017
})





* Choose your model

In [8]:
# Size the classification head from the fitted encoder so it always matches
# the integer ids produced for this target.
model_hazard_category = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(le_hazard_category.classes_)
)
# Use the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_hazard_category.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

* Train it

In [9]:
from tqdm.auto import tqdm

optimizer = AdamW(model_hazard_category.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# linear decay of the learning rate to zero over all steps, no warm-up
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Send batches to wherever the model lives instead of hard-coding 'cuda'
device = model_hazard_category.device

model_hazard_category.train()

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hazard_category(**batch)  # batch includes labels, so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# flush the final state of the bar so it is displayed as complete
progress_bar.close()

100%|█████████▉| 1523/1527 [00:32<00:00, 49.12it/s]

* Assess it

In [10]:
from sklearn.metrics import classification_report

model_hazard_category.eval()
# Send batches to wherever the model lives instead of hard-coding 'cuda'
device = model_hazard_category.device

total_predictions = []
with torch.no_grad():
    for batch in dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hazard_category(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.cpu().tolist())

predicted_labels = le_hazard_category.inverse_transform(total_predictions)
# Read the gold labels from the target column itself: dev_df.label holds
# whatever target prepare_data encoded last, which may be a different one.
gold_labels = dev_df['hazard-category'].values
print(classification_report(gold_labels, predicted_labels, zero_division=0))

# keep the decoded predictions for the sub-task score
dev_df['predictions-hazard-category'] = predicted_labels

                                precision    recall  f1-score   support

                     allergens       0.88      0.91      0.89       363
                    biological       0.89      0.93      0.91       349
                      chemical       0.74      0.80      0.77        65
food additives and flavourings       1.00      0.75      0.86         4
                foreign bodies       0.84      0.80      0.82       105
                         fraud       0.77      0.63      0.69        78
          organoleptic aspects       0.40      0.18      0.25        11
                  other hazard       0.52      0.48      0.50        29
              packaging defect       1.00      0.46      0.63        13

                      accuracy                           0.85      1017
                     macro avg       0.78      0.66      0.70      1017
                  weighted avg       0.84      0.85      0.84      1017



In [11]:
# persist the fine-tuned weights, then the encoder's class list next to them
model_hazard_category.save_pretrained(save_directory="bert_hazard_category")
np.save(file="bert_hazard_category/label_encoder.npy", arr=le_hazard_category.classes_)

### Label: `Product Category`

In [12]:
# target column for this model
label = 'product-category'

# build the train/dev loaders and the fitted encoder for this target
train_dataloader, dev_dataloader, le_product_category = prepare_data(label=label)

Map: 100%|██████████| 4065/4065 [00:00<00:00, 9577.45 examples/s]
Map: 100%|██████████| 1017/1017 [00:00<00:00, 9390.81 examples/s]


* Train

In [13]:
# Size the classification head from the fitted encoder so it always matches
# the integer ids produced for this target.
model_product_category = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(le_product_category.classes_)
)
# Use the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_product_category.to(device)

optimizer = AdamW(model_product_category.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# linear decay of the learning rate to zero over all steps, no warm-up
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model_product_category.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product_category(**batch)  # batch includes labels, so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# flush the final state of the bar so it is displayed as complete
progress_bar.close()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1527/1527 [00:34<00:00, 43.88it/s]


* Assess it on the development split

In [14]:
model_product_category.eval()
# Send batches to wherever the model lives instead of hard-coding 'cuda'
device = model_product_category.device

total_predictions = []
with torch.no_grad():
    for batch in dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product_category(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.cpu().tolist())

predicted_labels = le_product_category.inverse_transform(total_predictions)
# Read the gold labels from the target column itself: dev_df.label holds
# whatever target prepare_data encoded last, which may be a different one.
gold_labels = dev_df['product-category'].values
print(classification_report(gold_labels, predicted_labels, zero_division=0))

# keep the decoded predictions for the sub-task score
dev_df['predictions-product-category'] = predicted_labels

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.92      0.92      0.92        12
                      cereals and bakery products       0.69      0.78      0.73       123
     cocoa and cocoa preparations, coffee and tea       0.68      0.71      0.70        42
                                    confectionery       0.41      0.41      0.41        32
dietetic foods, food supplements, fortified foods       0.74      0.71      0.72        24
                                    fats and oils       1.00      1.00      1.00         3
                   food additives and flavourings       0.00      0.00      0.00         2
                           food contact materials       0.00      0.00      0.00         1
                            fruits and vegetables       0.76      0.83      0.79       109
                                 herbs and spices       0.60      0.38      0.46        2

In [15]:
# persist the fine-tuned weights, then the encoder's class list next to them
model_product_category.save_pretrained(save_directory="bert_product_category")
np.save(file="bert_product_category/label_encoder.npy", arr=le_product_category.classes_)

## Evaluate Sub-Task 1

In [16]:
# combine the two category models into the sub-task score on the dev split
score = compute_score(
    hazards_true=dev_df['hazard-category'],
    products_true=dev_df['product-category'],
    hazards_pred=dev_df['predictions-hazard-category'],
    products_pred=dev_df['predictions-product-category'],
)
print(f"Score Sub-Task 1: {score:.3f}")

Score Sub-Task 1: 0.646


## Sub-Task 2:

### Label: `Hazard`

In [19]:
# target column for this model
label = 'hazard'

# build the train/dev loaders and the fitted encoder for this target
train_dataloader, dev_dataloader, le_hazard = prepare_data(label=label)

Map: 100%|██████████| 4065/4065 [00:00<00:00, 5997.36 examples/s]
Map: 100%|██████████| 1017/1017 [00:00<00:00, 5853.86 examples/s]


In [18]:
# Size the classification head from the fitted encoder so it always matches
# the integer ids produced for this target.
model_hazard = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(le_hazard.classes_)
)
# Use the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_hazard.to(device)

optimizer = AdamW(model_hazard.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# linear decay of the learning rate to zero over all steps, no warm-up
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model_hazard.train()

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hazard(**batch)  # batch includes labels, so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# flush the final state of the bar so it is displayed as complete
progress_bar.close()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1527/1527 [00:35<00:00, 42.96it/s]
100%|█████████▉| 1526/1527 [00:31<00:00, 49.07it/s]

In [19]:
model_hazard.eval()
# Send batches to wherever the model lives instead of hard-coding 'cuda'
device = model_hazard.device

total_predictions = []
with torch.no_grad():
    for batch in dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hazard(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.cpu().tolist())

predicted_labels = le_hazard.inverse_transform(total_predictions)
# Read the gold labels from the target column itself: dev_df.label holds
# whatever target prepare_data encoded last, which may be a different one.
gold_labels = dev_df['hazard'].values
print(classification_report(gold_labels, predicted_labels, zero_division=0))

# keep the decoded predictions for the sub-task score
dev_df['predictions-hazard'] = predicted_labels

                                                   precision    recall  f1-score   support

                                        Aflatoxin       0.00      0.00      0.00         4
                                   abnormal smell       0.00      0.00      0.00         1
                                  alcohol content       0.00      0.00      0.00         1
                                        allergens       0.00      0.00      0.00         1
                                           almond       0.56      0.68      0.61        22
                                        amygdalin       0.00      0.00      0.00         2
                           antibiotics, vet drugs       0.00      0.00      0.00         1
                                    bacillus spp.       0.00      0.00      0.00         4
                             bad smell / off odor       0.00      0.00      0.00         3
                                    bone fragment       0.00      0.00      0.00         

In [20]:
# persist the fine-tuned weights, then the encoder's class list next to them
model_hazard.save_pretrained(save_directory="bert_hazard")
np.save(file="bert_hazard/label_encoder.npy", arr=le_hazard.classes_)

### Label: `Product`

In [21]:
# target column for this model
label = 'product'

# build the train/dev loaders and the fitted encoder for this target
train_dataloader, dev_dataloader, le_product = prepare_data(label=label)

Map: 100%|██████████| 4065/4065 [00:00<00:00, 9618.03 examples/s]
Map: 100%|██████████| 1017/1017 [00:00<00:00, 9499.79 examples/s]


In [22]:
# Size the classification head from the fitted encoder so it always matches
# the integer ids produced for this target.
model_product = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(le_product.classes_)
)
# Use the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_product.to(device)

optimizer = AdamW(model_product.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# linear decay of the learning rate to zero over all steps, no warm-up
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
model_product.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product(**batch)  # batch includes labels, so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# flush the final state of the bar so it is displayed as complete
progress_bar.close()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1527/1527 [00:33<00:00, 45.30it/s]


In [23]:
model_product.eval()
# Send batches to wherever the model lives instead of hard-coding 'cuda'
device = model_product.device

total_predictions = []
with torch.no_grad():
    for batch in dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_product(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_predictions.extend(predictions.cpu().tolist())

predicted_labels = le_product.inverse_transform(total_predictions)
# Read the gold labels from the target column itself: dev_df.label holds
# whatever target prepare_data encoded last, which may be a different one.
gold_labels = dev_df['product'].values
print(classification_report(gold_labels, predicted_labels, zero_division=0))

# keep the decoded predictions for the sub-task score
dev_df['predictions-product'] = predicted_labels

                                                                        precision    recall  f1-score   support

                                                Catfishes (freshwater)       0.00      0.00      0.00         2
                                                 Fishes not identified       0.08      0.80      0.14         5
                                                    Groupers (generic)       0.00      0.00      0.00         1
                                              Not classified pork meat       0.00      0.00      0.00         1
                                            Pangas catfishes (generic)       0.00      0.00      0.00         1
                                   Precooked cooked pork meat products       0.00      0.00      0.00         3
                                    Torpedo-shaped catfishes (generic)       0.00      0.00      0.00         1
                                                       adobo seasoning       0.00      0.00      0.00  

In [24]:
# persist the fine-tuned weights, then the encoder's class list next to them
model_product.save_pretrained(save_directory="bert_product")
np.save(file="bert_product/label_encoder.npy", arr=le_product.classes_)

# the tokenizer is shared by all four models, so it is stored once
tokenizer.save_pretrained(save_directory="bert_tokenizer")

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [25]:
!zip bert_baseline.zip bert_*

  adding: bert_hazard/ (stored 0%)
  adding: bert_hazard_category/ (stored 0%)
  adding: bert_product/ (stored 0%)
  adding: bert_product_category/ (stored 0%)
  adding: bert_tokenizer/ (stored 0%)


## Evaluate Sub-Task 2

In [26]:
# combine the hazard and product models into the sub-task score on the dev split
score = compute_score(
    hazards_true=dev_df['hazard'],
    products_true=dev_df['product'],
    hazards_pred=dev_df['predictions-hazard'],
    products_pred=dev_df['predictions-product'],
)
print(f"Score Sub-Task 2: {score:.3f}")

Score Sub-Task 2: 0.118


# Predict test set:

Load test data from Codalab:

In [20]:
# download testing data (conception phase, unlabeled):
# !wget https://codalab.lisn.upsaclay.fr/my/datasets/download/26c12bc0-3878-4edf-8b4a-9682763c0b7e
# !unzip -o 26c12bc0-3878-4edf-8b4a-9682763c0b7e
# !rm 26c12bc0-3878-4edf-8b4a-9682763c0b7e

# read the unlabeled test incidents; the first CSV column becomes the index
test_df = pd.read_csv(filepath_or_buffer='incidents.csv', index_col=0)

# peek at one random test row
test_df.sample(n=1)

Unnamed: 0,year,month,day,country,title,text
136,2014,5,9,au,Woolworths Ltd—Woolworths Select Finishing Sau...,PRA No. 2014/14096 Date published 9 May 2014 P...


Prediction function:

In [28]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

def predict(texts, model_path, tokenizer_path="bert_tokenizer", batch_size=32):
    """Predict string labels for ``texts`` with a fine-tuned classifier saved on disk.

    Args:
        texts: a string or a list of strings to classify.
        model_path: directory holding the saved model and ``label_encoder.npy``.
        tokenizer_path: directory holding the saved tokenizer.
        batch_size: number of texts per forward pass; keeps memory use bounded
            instead of pushing the whole input through the model at once.

    Returns:
        Array with one decoded label per input text, in input order
        (empty when ``texts`` is empty).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # a single string is treated as one text, as the tokenizer itself would
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Load the saved tokenizer
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    # Load the saved label encoder
    # NOTE: allow_pickle=True unpickles the file and can execute arbitrary
    # code; only load label_encoder.npy files from a trusted source.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load(model_path + '/label_encoder.npy', allow_pickle=True)

    # Load the saved model and move it to the GPU if available
    model = BertForSequenceClassification.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Set the model to evaluation mode
    model.eval()

    # Predict chunk by chunk; each chunk is padded to its own longest text
    predicted_ids = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            inputs = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt")
            inputs = {key: value.to(device) for key, value in inputs.items()}
            logits = model(**inputs).logits
            predicted_ids.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    # Decode predictions back to string labels
    return label_encoder.inverse_transform(predicted_ids)

Predict test data:

In [29]:
# one saved model per target column; model folders use '_' where the column uses '-'
target_columns = ['hazard-category', 'product-category', 'hazard', 'product']
test_titles = test_df.title.to_list()

predictions = pd.DataFrame({
    column: predict(test_titles, f"bert_{column.replace('-', '_')}")
    for column in target_columns
})

# peek at one random prediction row
predictions.sample(n=1)

Unnamed: 0,hazard-category,product-category,hazard,product
101,biological,"meat, egg and dairy products",listeria monocytogenes,ham


Save predictions:

In [33]:
import os
from shutil import make_archive

# write the predictions as submission.csv inside a dedicated folder
os.makedirs(name='./submission_BERT/', exist_ok=True)
predictions.to_csv(path_or_buf='./submission_BERT/submission.csv')

# pack that folder into submission_BERT.zip (the archive can be uploaded to codalab as is)
make_archive(base_name='./submission_BERT', format='zip', root_dir='./submission_BERT')

'/home/iir/work/ben/NCKU/IIR/SemEval2025_Task9/code/submission_BERT.zip'