<a href="https://colab.research.google.com/github/dinagalevska/FoodHazardDetection/blob/master/ModernBERT_Food_Hazard_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login
from google.colab import userdata

In [2]:
# Read the Hugging Face access token from Colab's `userdata` store (key 'HF_TOKEN')
# instead of hard-coding the secret in the notebook.
hf_token = userdata.get('HF_TOKEN')

In [3]:
# Authenticate this session against the Hugging Face Hub with the token fetched above.
login(token=hf_token)

In [4]:
!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-86weekm9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-86weekm9
  Resolved https://github.com/huggingface/transformers.git to commit 15bd3e61f8d3680ca472c9314ad07584d20f7b81
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [5]:
!pip install datasets pandas scikit-learn



In [6]:
!pip install triton

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

In [6]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler
from torch.optim import AdamW
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
import torch
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

##### Load data

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# download training data (labeled):
!wget https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_train.csv

data = pd.read_csv('incidents_train.csv', index_col=0)
train_df, dev_df = train_test_split(data, test_size=0.2, random_state=2024)

--2025-01-11 22:01:29--  https://raw.githubusercontent.com/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/refs/heads/main/data/incidents_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12866710 (12M) [text/plain]
Saving to: ‘incidents_train.csv’


2025-01-11 22:01:30 (386 MB/s) - ‘incidents_train.csv’ saved [12866710/12866710]



In [39]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
1062,2014,7,30,au,Marvellous Creations Jelly Popping Candy Beani...,Mondelez Australia Pty Ltd has recalled Marvel...,foreign bodies,"cocoa and cocoa preparations, coffee and tea",plastic fragment,chocolate
1969,2016,11,17,us,"Request Foods, Inc. Issues Allergy Alert On Un...","Holland, MI - Request Foods, Inc. is recalling...",allergens,other food product / mixed,eggs and products thereof,pasta products
1053,2014,7,17,uk,"VBites Foods recalls 'Wot, No Dairy?' desserts","VBites Foods is recalling two 'Wot, No Dairy?'...",allergens,ices and desserts,milk and products thereof,desserts
2200,2017,5,1,ca,Toppits brand Battered Blue Cod Fillet recalle...,Food Recall Warning (Allergen) - Toppits brand...,allergens,seafood,milk and products thereof,cod fillets
276,2006,10,6,us,Oct 6_ 2006_ Iowa_ Firm Recalls Ground Beef___,"WASHINGTON, October 6, 2006 - Jims Market and...",biological,"meat, egg and dairy products",escherichia coli,frozen beef patties


In [40]:
# Summary statistics of the number of whitespace-separated tokens per title.
data.title.str.split().apply(len).describe()

Unnamed: 0,title
count,5082.0
mean,13.282369
std,5.229355
min,1.0
25%,10.0
50%,13.0
75%,16.0
max,44.0


### Train and Evaluate ModernBERT

In [41]:
# Identifier of the pretrained checkpoint that every classifier in this notebook starts from.
MODEL_ID = "answerdotai/ModernBERT-base"

In [42]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def tokenize_function(examples):
    """Tokenize the ``title`` field of a batch of examples.

    Titles longer than 512 tokens are truncated. No padding is applied here:
    the data loaders built later use ``DataCollatorWithPadding``, which pads each
    mini-batch to its own longest sequence. Padding inside a batched ``map`` would
    instead pad every example to the longest title of the whole map batch.

    Args:
        examples: Mapping with a ``'title'`` entry holding the texts to tokenize.

    Returns:
        The tokenizer output for the given titles.
    """
    return tokenizer(examples['title'], truncation=True, max_length=512)


# Persist the tokenizer locally so it can later be reloaded from disk without network access.
save_directory = "./modernbert"
tokenizer.save_pretrained(save_directory)

('./modernbert/tokenizer_config.json',
 './modernbert/special_tokens_map.json',
 './modernbert/tokenizer.json')

In [43]:
def prepare_data(label, train_df, dev_df, batch_size=8):
    """Encode one label column and build the train/dev data loaders.

    The label encoder is fitted on the training split only, so a dev value that
    never occurs in ``train_df[label]`` makes ``LabelEncoder.transform`` raise
    ``ValueError``.

    Side effect: an integer ``'label'`` column is written into both ``train_df``
    and ``dev_df`` in place (code elsewhere in this notebook reads ``dev_df.label``
    after calling this function).

    Args:
        label: Name of the column holding the gold class of each row.
        train_df: Training frame; must contain ``'title'`` and ``label`` columns.
        dev_df: Development frame with the same columns.
        batch_size: Number of examples per mini-batch for both loaders.

    Returns:
        Tuple ``(train_dataloader, dev_dataloader, label_encoder)``; only the
        training loader shuffles its examples.
    """
    label_encoder = LabelEncoder()
    label_encoder.fit(train_df[label])

    # In-place on purpose: callers rely on the 'label' column being present afterwards.
    train_df['label'] = label_encoder.transform(train_df[label])
    dev_df['label'] = label_encoder.transform(dev_df[label])

    # Pads every mini-batch to the length of its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def _to_dataloader(frame, shuffle):
        # Tokenize the frame and expose only the tensors the model consumes.
        dataset = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator)

    return (
        _to_dataloader(train_df, shuffle=True),
        _to_dataloader(dev_df, shuffle=False),
        label_encoder
    )

In [44]:
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  """Average the hazard macro-F1 and the product macro-F1.

  The product F1 is computed only on the rows whose hazard was predicted
  correctly. Inputs are converted to NumPy arrays first, so plain lists,
  arrays and pandas Series (whatever their index) are all compared by position.

  Args:
    hazards_true: Gold hazard labels, one per example.
    products_true: Gold product labels, one per example.
    hazards_pred: Predicted hazard labels, one per example.
    products_pred: Predicted product labels, one per example.

  Returns:
    ``(f1_hazards + f1_products) / 2`` as a float.

  Raises:
    ValueError: If the four inputs do not all have the same length.
  """
  hazards_true = np.asarray(hazards_true)
  products_true = np.asarray(products_true)
  hazards_pred = np.asarray(hazards_pred)
  products_pred = np.asarray(products_pred)

  if not (hazards_true.shape == products_true.shape == hazards_pred.shape == products_pred.shape):
    raise ValueError("compute_score expects four label sequences of identical length")

  # compute f1 for hazards:
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products, restricted to rows whose hazard was predicted correctly:
  hazard_correct = hazards_pred == hazards_true
  f1_products = f1_score(
    products_true[hazard_correct],
    products_pred[hazard_correct],
    average='macro'
  )

  return (f1_hazards + f1_products) / 2.

In [45]:
def train_and_evaluate_model(label, train_df, dev_df, num_labels, num_epochs=3, learning_rate=5e-5):
    """Fine-tune ``MODEL_ID`` on one label column and evaluate it on the dev split.

    Data loaders come from ``prepare_data``. The model is trained with AdamW and a
    linear learning-rate schedule without warm-up, then used to predict the dev
    split. A classification report is printed, and the model plus the label
    encoder's classes are saved under ``modernbert_<label with '-' replaced by '_'>``.

    Args:
        label: Name of the column to predict.
        train_df: Training frame (``'title'`` and ``label`` columns).
        dev_df: Development frame with the same columns.
        num_labels: Size of the classification head; must be at least the number
            of classes known to the label encoder.
        num_epochs: Number of passes over the training data.
        learning_rate: Initial learning rate for AdamW.

    Returns:
        Array of predicted label values for ``dev_df``, in row order.

    Raises:
        ValueError: If ``num_labels`` is smaller than the number of encoded classes.
    """
    train_dataloader, dev_dataloader, label_encoder = prepare_data(label, train_df, dev_df)

    # A head smaller than the label space would only fail later inside the loss computation.
    if num_labels < len(label_encoder.classes_):
        raise ValueError(
            f"num_labels={num_labels} is smaller than the {len(label_encoder.classes_)} "
            f"classes found for '{label}'"
        )

    config = AutoConfig.from_pretrained(MODEL_ID, num_labels=num_labels)
    # NOTE(review): whether the model honours this attribute is not visible here -- confirm.
    config.use_flash_attention_2 = False  # Explicitly disable Flash Attention

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        config=config
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    model.train()
    # The context manager closes the bar once training ends, so it is not
    # re-rendered in the output of later cells.
    with tqdm(total=num_training_steps, desc="Training", leave=True) as progress_bar:
        for _ in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                loss = model(**batch).loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)

    model.eval()
    total_predictions = []
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            total_predictions.extend(torch.argmax(logits, dim=-1).tolist())

    predicted_labels = label_encoder.inverse_transform(total_predictions)
    # Read the gold labels straight from the source column rather than from the
    # 'label' column that the data-preparation step adds as a side effect.
    gold_labels = dev_df[label].to_numpy()

    print(classification_report(gold_labels, predicted_labels, zero_division=0))

    output_dir = f"modernbert_{label.replace('-', '_')}"
    model.save_pretrained(output_dir)
    # Store the class order so predicted indices can be decoded after reloading the model.
    np.save(f"{output_dir}/label_encoder.npy", label_encoder.classes_)

    return predicted_labels

## Sub-Task 1

### Label: Hazard Category

In [46]:
label = 'hazard-category'
# The classification head is sized by the number of distinct values of this column in the full labelled set.
predicted_hazard_labels = train_and_evaluate_model(label, train_df, dev_df, num_labels=len(data[label].unique()))
# Store the predictions next to the gold labels so the sub-task score can be computed later.
dev_df['predictions-hazard-category'] = predicted_hazard_labels

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 1527/1527 [04:33<00:00,  5.55it/s]

                                precision    recall  f1-score   support

                     allergens       0.88      0.89      0.88       363
                    biological       0.88      0.93      0.91       349
                      chemical       0.83      0.75      0.79        65
food additives and flavourings       1.00      0.75      0.86         4
                foreign bodies       0.84      0.85      0.84       105
                         fraud       0.71      0.63      0.67        78
          organoleptic aspects       0.57      0.36      0.44        11
                  other hazard       0.62      0.52      0.57        29
              packaging defect       0.67      0.77      0.71        13

                      accuracy                           0.85      1017
                     macro avg       0.78      0.72      0.74      1017
                  weighted avg       0.85      0.85      0.85      1017



Training: 100%|██████████| 1527/1527 [04:41<00:00,  5.42it/s]

### Label: Product Category

In [47]:
label = 'product-category'
# The classification head is sized by the number of distinct values of this column in the full labelled set.
predicted_product_labels = train_and_evaluate_model(label, train_df, dev_df, num_labels=len(data[label].unique()))
# Store the predictions next to the gold labels so the sub-task score can be computed later.
dev_df['predictions-product-category'] = predicted_product_labels

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 1527/1527 [04:42<00:00,  5.82it/s]

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.79      0.92      0.85        12
                      cereals and bakery products       0.67      0.83      0.74       123
     cocoa and cocoa preparations, coffee and tea       0.78      0.74      0.76        42
                                    confectionery       0.34      0.41      0.37        32
dietetic foods, food supplements, fortified foods       0.70      0.58      0.64        24
                                    fats and oils       1.00      1.00      1.00         3
                   food additives and flavourings       0.00      0.00      0.00         2
                           food contact materials       0.00      0.00      0.00         1
                            fruits and vegetables       0.77      0.79      0.78       109
                                 herbs and spices       0.52      0.50      0.51        2

### Evaluate Sub-Task

In [48]:
# Sub-task 1 score on the dev split: mean of the hazard-category macro-F1 and the
# product-category macro-F1 (the latter only over rows with a correct hazard-category).
score = compute_score(
    dev_df['hazard-category'], dev_df['product-category'],
    dev_df['predictions-hazard-category'], dev_df['predictions-product-category']
)
print(f"Score Sub-Task 1: {score:.3f}")

Training: 100%|██████████| 1527/1527 [04:54<00:00,  5.18it/s]

Score Sub-Task 1: 0.673





## Sub-Task 2

In [49]:
def prepare_data_2(label, train_df, dev_df, batch_size=8):
    """Encode one label column and build the train/dev data loaders.

    The label encoder is fitted on the union of the label values found in
    ``train_df`` and ``dev_df``, so dev values that never occur in the training
    split can still be encoded.

    Side effect: an integer ``'label'`` column is written into both ``train_df``
    and ``dev_df`` in place (code elsewhere in this notebook reads ``dev_df.label``
    after calling this function).

    Args:
        label: Name of the column holding the gold class of each row.
        train_df: Training frame; must contain ``'title'`` and ``label`` columns.
        dev_df: Development frame with the same columns.
        batch_size: Number of examples per mini-batch for both loaders.

    Returns:
        Tuple ``(train_dataloader, dev_dataloader, label_encoder)``; only the
        training loader shuffles its examples.
    """
    # Distinct label values seen in either split.
    all_labels = list(set(train_df[label].unique()).union(dev_df[label].unique()))

    label_encoder = LabelEncoder()
    label_encoder.fit(all_labels)

    # In-place on purpose: callers rely on the 'label' column being present afterwards.
    train_df['label'] = label_encoder.transform(train_df[label])
    dev_df['label'] = label_encoder.transform(dev_df[label])

    # Pads every mini-batch to the length of its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def _to_dataloader(frame, shuffle):
        # Tokenize the frame and expose only the tensors the model consumes.
        dataset = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
        return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator)

    return (
        _to_dataloader(train_df, shuffle=True),
        _to_dataloader(dev_df, shuffle=False),
        label_encoder
    )

In [50]:
def train_and_evaluate_model_2(label, train_df, dev_df, num_labels, num_epochs=3, learning_rate=5e-5):
    """Fine-tune ``MODEL_ID`` on one label column and evaluate it on the dev split.

    Data loaders come from ``prepare_data_2``. The model is trained with AdamW and
    a linear learning-rate schedule without warm-up, then used to predict the dev
    split. A classification report is printed, and the model plus the label
    encoder's classes are saved under ``modernbert_<label with '-' replaced by '_'>``.

    Args:
        label: Name of the column to predict.
        train_df: Training frame (``'title'`` and ``label`` columns).
        dev_df: Development frame with the same columns.
        num_labels: Size of the classification head; must be at least the number
            of classes known to the label encoder.
        num_epochs: Number of passes over the training data.
        learning_rate: Initial learning rate for AdamW.

    Returns:
        Array of predicted label values for ``dev_df``, in row order.

    Raises:
        ValueError: If ``num_labels`` is smaller than the number of encoded classes.
    """
    train_dataloader, dev_dataloader, label_encoder = prepare_data_2(label, train_df, dev_df)

    # A head smaller than the label space would only fail later inside the loss computation.
    if num_labels < len(label_encoder.classes_):
        raise ValueError(
            f"num_labels={num_labels} is smaller than the {len(label_encoder.classes_)} "
            f"classes found for '{label}'"
        )

    config = AutoConfig.from_pretrained(MODEL_ID, num_labels=num_labels)
    # NOTE(review): whether the model honours this attribute is not visible here -- confirm.
    config.use_flash_attention_2 = False  # Explicitly disable Flash Attention

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        config=config
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    model.train()
    # The context manager closes the bar once training ends, so it is not
    # re-rendered in the output of later cells.
    with tqdm(total=num_training_steps, desc="Training", leave=True) as progress_bar:
        for _ in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                loss = model(**batch).loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)

    model.eval()
    total_predictions = []
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            total_predictions.extend(torch.argmax(logits, dim=-1).tolist())

    predicted_labels = label_encoder.inverse_transform(total_predictions)
    # Read the gold labels straight from the source column rather than from the
    # 'label' column that the data-preparation step adds as a side effect.
    gold_labels = dev_df[label].to_numpy()

    print(classification_report(gold_labels, predicted_labels, zero_division=0))

    output_dir = f"modernbert_{label.replace('-', '_')}"
    model.save_pretrained(output_dir)
    # Store the class order so predicted indices can be decoded after reloading the model.
    np.save(f"{output_dir}/label_encoder.npy", label_encoder.classes_)

    return predicted_labels

In [51]:
label = 'hazard'
# The classification head is sized by the number of distinct values of this column in the full labelled set.
predicted_hazard_2_labels = train_and_evaluate_model_2(label, train_df, dev_df, num_labels=len(data[label].unique()))
# Store the predictions next to the gold labels so the sub-task score can be computed later.
dev_df['predictions-hazard'] = predicted_hazard_2_labels

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 1527/1527 [04:34<00:00,  5.55it/s]

                                                   precision    recall  f1-score   support

                                        Aflatoxin       1.00      0.75      0.86         4
                                   abnormal smell       0.00      0.00      0.00         1
                                  alcohol content       0.00      0.00      0.00         1
                                        allergens       0.00      0.00      0.00         1
                                           almond       0.82      0.64      0.72        22
                                        amygdalin       0.00      0.00      0.00         2
                           antibiotics, vet drugs       0.00      0.00      0.00         1
                                    bacillus spp.       0.00      0.00      0.00         4
                             bad smell / off odor       0.00      0.00      0.00         3
                                    bone fragment       1.00      0.60      0.75         

Training: 100%|██████████| 1527/1527 [04:50<00:00,  5.26it/s]


In [52]:
label = 'product'
# The classification head is sized by the number of distinct values of this column in the full labelled set.
predicted_product_2_labels = train_and_evaluate_model_2(label, train_df, dev_df, num_labels=len(data[label].unique()))
# Store the predictions next to the gold labels so the sub-task score can be computed later.
dev_df['predictions-product'] = predicted_product_2_labels

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 1527/1527 [04:34<00:00,  5.73it/s]

                                                                        precision    recall  f1-score   support

                                                Catfishes (freshwater)       0.00      0.00      0.00         2
                                                 Fishes not identified       0.33      0.60      0.43         5
                                                    Groupers (generic)       0.00      0.00      0.00         1
                                              Not classified pork meat       0.00      0.00      0.00         1
                                            Pangas catfishes (generic)       0.00      0.00      0.00         1
                                   Precooked cooked pork meat products       0.00      0.00      0.00         3
                                    Torpedo-shaped catfishes (generic)       0.00      0.00      0.00         1
                                                         Veggie Burger       0.00      0.00      0.00  

Training: 100%|██████████| 1527/1527 [04:49<00:00,  5.27it/s]

In [53]:
# Sub-task 2 score on the dev split: mean of the hazard macro-F1 and the
# product macro-F1 (the latter only over rows with a correctly predicted hazard).
score = compute_score(
    dev_df['hazard'], dev_df['product'],
    dev_df['predictions-hazard'], dev_df['predictions-product']
)
print(f"Score Sub-Task 2: {score:.3f}")

Score Sub-Task 2: 0.288





## Predict test set

In [54]:
# Download the archive, extract it (-o overwrites files from a previous run) and delete the archive.
!wget https://codalab.lisn.upsaclay.fr/my/datasets/download/26c12bc0-3878-4edf-8b4a-9682763c0b7e
!unzip -o 26c12bc0-3878-4edf-8b4a-9682763c0b7e
!rm 26c12bc0-3878-4edf-8b4a-9682763c0b7e

# load test data:
test_df = pd.read_csv('incidents.csv', index_col=0)

# Display one random row as a quick sanity check.
test_df.sample()

--2025-01-11 22:22:09--  https://codalab.lisn.upsaclay.fr/my/datasets/download/26c12bc0-3878-4edf-8b4a-9682763c0b7e
Resolving codalab.lisn.upsaclay.fr (codalab.lisn.upsaclay.fr)... 129.175.8.29
Connecting to codalab.lisn.upsaclay.fr (codalab.lisn.upsaclay.fr)|129.175.8.29|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://miniodis-rproxy.lisn.upsaclay.fr/py3-private/public_data/ee902c30-cff6-4bc0-9525-f6a7531ddeaa/competition/19955/1/data/public_dat.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=EASNOMJFX9QFW4QIY4SL%2F20250111%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250111T222210Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b0d43022c490bffd30c25bc029296126af8b9b856a34f2534182b1f09a0e705d [following]
--2025-01-11 22:22:10--  https://miniodis-rproxy.lisn.upsaclay.fr/py3-private/public_data/ee902c30-cff6-4bc0-9525-f6a7531ddeaa/competition/19955/1/data/public_dat.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=EASNOMJFX

Unnamed: 0,year,month,day,country,title,text
349,2018,10,27,us,"Red Square Foods, Inc., Recalls Frozen Not-Rea...","WASHINGTON, Oct. 27, 2018 – Red Square Foods, ..."


In [55]:
def predict(texts, model_path, tokenizer_path="modernbert", batch_size=32):
    """Predict labels for ``texts`` with a classifier saved on disk.

    The texts are processed in mini-batches so that memory use does not grow
    with the number of inputs.

    Args:
        texts: A single string or a sequence of strings to classify.
        model_path: Directory containing the saved model and ``label_encoder.npy``.
        tokenizer_path: Directory containing the saved tokenizer.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        Array with one decoded label per input text (empty for empty input).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)

    label_encoder = LabelEncoder()
    # SECURITY: allow_pickle=True unpickles the stored class array; only point
    # model_path at files written by a trusted source.
    label_encoder.classes_ = np.load(model_path + '/label_encoder.npy', allow_pickle=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Treat a bare string as one text instead of iterating over its characters.
    texts = [texts] if isinstance(texts, str) else list(texts)

    predicted_ids = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            inputs = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt", max_length=512)
            inputs = {key: value.to(device) for key, value in inputs.items()}
            logits = model(**inputs).logits
            predicted_ids.extend(torch.argmax(logits, dim=-1).tolist())

    return label_encoder.inverse_transform(predicted_ids)

In [56]:
# Run each of the four saved classifiers over the test titles and collect
# one prediction column per target.
test_titles = test_df.title.to_list()
predicted_columns = {}

for column in ['hazard-category', 'product-category', 'hazard', 'product']:
    model_path = f"modernbert_{column.replace('-', '_')}"
    predicted_columns[column] = predict(test_titles, model_path)

predictions = pd.DataFrame(predicted_columns)

predictions.sample()

Unnamed: 0,hazard-category,product-category,hazard,product
547,biological,cereals and bakery products,salmonella,bakery products


In [57]:
import os
from shutil import make_archive

# Write the predictions (without the row index) into a dedicated folder ...
os.makedirs('./submission/', exist_ok=True)
predictions.to_csv('./submission/submission.csv', index=False)

# ... and zip that folder's contents into ./submission.zip.
make_archive('./submission', 'zip', './submission')

'/content/submission.zip'