<a href="https://colab.research.google.com/github/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/code/The_Food_Hazard_Detection_Challenge_SemEval_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load data

Load training data:

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split


# load training data:
data = pd.read_csv('/home/iir/work/ben/NCKU/IIR/SemEval2025_Task9/data/incidents_train.csv', index_col=0)
trainset, devset = train_test_split(data, test_size=0.2, random_state=2024)

trainset.sample()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
75,2000,1,7,au,Woolworths Limited—Homebrand Corn & Chicken fl...,PRA No. 2000/4205 Date published 7 Jan 2000 Pr...,chemical,prepared dishes and snacks,chemical compound (high content),noodles


Load test data from Codalab:

In [3]:
# download testing data (conception phase, unlabeled):
!wget https://codalab.lisn.upsaclay.fr/my/datasets/download/26c12bc0-3878-4edf-8b4a-9682763c0b7e
!unzip -o 26c12bc0-3878-4edf-8b4a-9682763c0b7e
!rm 26c12bc0-3878-4edf-8b4a-9682763c0b7e

# load test data:
testset = pd.read_csv('incidents.csv', index_col=0)

testset.sample()

--2024-11-12 16:40:57--  https://codalab.lisn.upsaclay.fr/my/datasets/download/26c12bc0-3878-4edf-8b4a-9682763c0b7e
Resolving codalab.lisn.upsaclay.fr (codalab.lisn.upsaclay.fr)... 129.175.8.8
Connecting to codalab.lisn.upsaclay.fr (codalab.lisn.upsaclay.fr)|129.175.8.8|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://miniodis-rproxy.lisn.upsaclay.fr/py3-private/public_data/ee902c30-cff6-4bc0-9525-f6a7531ddeaa/competition/19955/1/data/public_dat.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=EASNOMJFX9QFW4QIY4SL%2F20241112%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241112T164059Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=0c2fd5aea347bc3fe044c7bd1f319c24a172794b0329f2fbc555c6000456ebdc [following]
--2024-11-12 16:40:59--  https://miniodis-rproxy.lisn.upsaclay.fr/py3-private/public_data/ee902c30-cff6-4bc0-9525-f6a7531ddeaa/competition/19955/1/data/public_dat.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=EASNOMJFX9Q

Unnamed: 0,year,month,day,country,title,text
119,2013,7,13,us,2009 - torres hillsdale country cheese llc exp...,"FOR IMMEDIATE RELEASE -- March 23, 2009 -- Tor..."


In [5]:
testset

Unnamed: 0,year,month,day,country,title,text
0,1994,5,5,us,Recall Notification: FSIS-017-94,Case Number: 017-94 \n Date Opene...
1,1994,5,12,us,Recall Notification: FSIS-048-94,Case Number: 048-94 \n Date Opene...
2,1995,4,16,us,Recall Notification: FSIS-032-95,Case Number: 032-95 \n Date Opene...
3,1998,7,16,ca,Archive - ALLERGY ALERT -- PRESENCE OF UNDECLA...,PRESENCE OF UNDECLARED NUTS IN ORIGINALE AUGUS...
4,1998,8,6,us,Recall Notification: FSIS-018-98,Case Number: 018-98 Recall Notification Repor...
...,...,...,...,...,...,...
560,2022,6,29,au,The Fresh Salad Co Thai Coconut Wild Rice Prep...,Page Content ​ ​​​​ ​Date publ...
561,2022,7,18,au,Powered by Plants Pty Ltd — Cleanfit Plant Pro...,PRA number 2022/19525 Published date 18 Jul 20...
562,2022,7,20,ca,Certain Enjoy Life brand Soft Baked Cookies – ...,Food recall warning Certain Enjoy Life brand S...
563,2022,7,28,hk,Imported biscuit may contain allergen (peanuts),Imported biscuit may contain allergen (peanuts...


# Classifictaion

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

text_clf_lr = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(2,5), max_df=0.5, min_df=5)),
     ('clf', LogisticRegression(max_iter=1000)),
    ])

## Evaluation
* On 20% of the training data
* As was suggested by [Randl et al (2023)](https://aclanthology.org/2024.findings-acl.459)

In [7]:
from sklearn.metrics import classification_report, f1_score

for label in ('hazard-category', 'product-category', 'hazard', 'product'):
  print(label.upper())
  text_clf_lr.fit(trainset.title, trainset[label])
  print("----: ", trainset.title) 
  print("----: ", trainset[label]) 

  # get development scores:
  devset['predictions-' + label] = text_clf_lr.predict(devset.title)
  print(f'  macro: {f1_score(devset[label], devset["predictions-" + label], zero_division=0, average="macro"):.2f}')
  print(f'  micro: {f1_score(devset[label], devset["predictions-" + label], zero_division=0, average="micro"):.2f}')

  # predict test set:
  testset[label] = text_clf_lr.predict(testset.title)

HAZARD-CATEGORY
----:  1062    Marvellous Creations Jelly Popping Candy Beani...
1969    Request Foods, Inc. Issues Allergy Alert On Un...
1053       VBites Foods recalls 'Wot, No Dairy?' desserts
2200    Toppits brand Battered Blue Cod Fillet recalle...
276        Oct 6_ 2006_ Iowa_ Firm Recalls Ground Beef___
                              ...                        
183     Golden Circle—Meal Variety 4 and 8 Pack Baby Food
2542    Ottogi brand Beef Bone and Vegetable Soup reca...
2780    Labrada Nutrition Issues Allergy Alert on Unde...
2744                         Creative Gourmet Pomegranate
3933    Lin’s Waha Int’l Corp Issues Alert on Undeclar...
Name: title, Length: 4065, dtype: object
----:  1062    foreign bodies
1969         allergens
1053         allergens
2200         allergens
276         biological
             ...      
183          allergens
2542         allergens
2780         allergens
2744        biological
3933         allergens
Name: hazard-category, Length: 4065, d

In [15]:
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  # compute f1 for hazards:
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products:
  f1_products = f1_score(
    products_true[hazards_pred == hazards_true],
    products_pred[hazards_pred == hazards_true],
    average='macro'
  )

  return (f1_hazards + f1_products) / 2.

print(f"Score Sub-Task 1: {compute_score(devset['hazard-category'], devset['product-category'], devset['predictions-hazard-category'], devset['predictions-product-category']):.3f}")
print(f"Score Sub-Task 2: {compute_score(devset['hazard'], devset['product'], devset['predictions-hazard'], devset['predictions-product']):.3f}")

Score Sub-Task 1: 0.449
Score Sub-Task 2: 0.121


# Save file for submission

In [16]:
import os
from shutil import make_archive

# save predictions to a new folder:
os.makedirs('./submission/', exist_ok=True)
testset[['hazard-category', 'product-category', 'hazard', 'product']].to_csv('./submission/submission.csv')

# zip the folder (zipfile can be directly uploaded to codalab):
make_archive('./submission', 'zip', './submission')

'/home/iir/work/ben/NCKU/IIR/SemEval2025_Task9/code/submission.zip'