In [None]:
!pip install transformers datasets torc

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-p

In [None]:
import os
# Switch off the Weights & Biases integration through its environment variable.
# This cell runs before the transformers imports further down in the notebook.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset

import re
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK 'punkt' tokenizer data (word_tokenize is used later for tagging).
nltk.download('punkt')  


# Load the sampled recipe dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/recipe-sampled-0-25/sampled_dataset.csv")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Dataset preparation

In [4]:
# Keep only the two text columns and draw a reproducible 80,000-row sample
# (fixed random_state); the index is reset so rows are numbered from 0.
df_sample = df[["directions", "ingredients"]].sample(n=80000, random_state=42).reset_index(drop=True)
# Preview the first three rows.
df_sample.head(3)

Unnamed: 0,directions,ingredients
0,"['Mix together the cheese, olives, onion, drie...","[""1 cup shredded cheddar cheese"", ""1 cup chopp..."
1,"['Brown meat; drain and set aside.', 'Blend ma...","[""1 pie crust"", ""1/2 lb. ground beef (you can ..."
2,"['Dissolve jello in boiling water.', 'Let cool...","[""2 small orange jello"", ""2 c. boiling water"",..."


In [None]:
import ast

# The CSV stores every list as its Python-literal string, so parse it back into
# a real list. Values that are not strings (i.e. already parsed) are left
# untouched: ast.literal_eval raises on a list, which previously made this cell
# fail when it was executed a second time.
for column in ("ingredients", "directions"):
    df_sample[column] = df_sample[column].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

# Sanity check: both columns of the first row should now be Python lists.
for column in ("ingredients", "directions"):
    print(type(df_sample.loc[0, column]))
    print(df_sample.loc[0, column])

<class 'list'>
['1 cup shredded cheddar cheese', '1 cup chopped pimento stuffed olive', '1 tablespoon minced onion', '1 cup dried beef, chopped', '3/4 - 1 cup mayonnaise', '1 loaf sliced rye cocktail bread']
<class 'list'>
['Mix together the cheese, olives, onion, dried beef and mayo.', 'Spread on slices of rye cocktail bread. place the slices on a cookie sheet and broil until bubbly.']


In [6]:
# Flatten each recipe's list of steps into one space-separated string.
df_sample["text"] = df_sample["directions"].apply(" ".join)
print(df_sample.loc[0, "text"])

Mix together the cheese, olives, onion, dried beef and mayo. Spread on slices of rye cocktail bread. place the slices on a cookie sheet and broil until bubbly.


In this section I clean up the ingredient strings by removing quantities, units and other details that are unlikely to appear in the directions.

In [7]:
import re

# Measurement units stripped from ingredient strings (regex fragments).
_INGREDIENT_UNITS = [
    "teaspoons?", "tsp", "tablespoons?", "tbsp", "cups?", "ounces?", "oz",
    "pounds?", "lb", "grams?", "g", "kilograms?", "kg", "milliliters?", "ml",
    "liters?", "l", "pinch", "clove", "cloves", "slices?", "dash", "cans?",
    "packages?", "bunch", "stalks?", "heads?", "pieces?", "sticks?", "inches?"
]
# Built once instead of on every call; the pattern itself is unchanged.
_INGREDIENT_UNITS_RE = re.compile(
    r'\b(?:' + '|'.join(_INGREDIENT_UNITS) + r')\b', flags=re.IGNORECASE
)


def clean_ingredient(ingredient):
    """Reduce a raw ingredient line to the bare ingredient name.

    Quantities, measurement units, parenthesised notes, the word "of",
    anything after the first comma and a few leftover abbreviations
    ("to", "pt", "pkg", "qt") are removed, then whitespace is normalised.

    Args:
        ingredient: Raw ingredient string, e.g. "1 cup dried beef, chopped".

    Returns:
        The cleaned string, e.g. "dried beef". May be empty.
    """
    # Remove fractions and numbers (e.g., "1", "1/2", "2.5")
    ingredient = re.sub(r'\b\d+([\/\.]\d+)?\b', '', ingredient)

    # Remove common measurement units
    ingredient = _INGREDIENT_UNITS_RE.sub('', ingredient)

    # Remove the "c" / "c." abbreviation for cup
    ingredient = re.sub(r'\b(c\.|c)\b\.?', '', ingredient, flags=re.IGNORECASE)

    # Remove parenthesised notes (first the "( . )" residue, then any group)
    ingredient = re.sub(r'\(\s*\.\s*\)', '', ingredient)
    ingredient = re.sub(r'\([^)]*\)', '', ingredient)

    ingredient = re.sub(r'\bof\b', '', ingredient, flags=re.IGNORECASE)
    ingredient = re.sub(r'^\s*\.\s*', '', ingredient)       # leading dot with spaces
    ingredient = re.sub(r'\.\s*', ' ', ingredient)          # every ". " anywhere
    ingredient = re.sub(r',.*', '', ingredient)             # drop notes after a comma

    # Normalise whitespace BEFORE the abbreviation clean-up. The removals above
    # leave runs of spaces and a leading space, which previously prevented both
    # patterns below from matching (e.g. "1 to 2 cups sugar" stayed "to sugar").
    ingredient = re.sub(r'\s+', ' ', ingredient).strip()

    # Drop a dangling leading dash/bullet, e.g. the "-" left over from a range
    # quantity such as "3/4 - 1 cup mayonnaise".
    ingredient = re.sub(r'^[\-\u2013\u2014*\u2022\s]+', '', ingredient)

    ingredient = re.sub(r'\b(to |pt |pkg |qt )\.?\b', '', ingredient, flags=re.IGNORECASE)
    ingredient = re.sub(r'^to\s+', '', ingredient, flags=re.IGNORECASE)

    # Remove extra spaces
    ingredient = re.sub(r'\s+', ' ', ingredient).strip()

    return ingredient


# Clean every ingredient string of every recipe in the ingredients column.
df_sample["clean_ingredients"] = df_sample["ingredients"].apply(
    lambda raw_items: list(map(clean_ingredient, raw_items))
)

In [8]:
# Inspect the cleaned ingredient lists of rows 0-10 (.loc slicing includes the end label).
print(df_sample.loc[:10, "clean_ingredients"]) 

0     [shredded cheddar cheese, chopped pimento stuf...
1     [pie crust, ground beef, mayonnaise, milk, egg...
2     [small orange jello, boiling water, small crus...
3     [square graham crackers, reduced calorie marga...
4     [cream cheese, sm jar Old English cheese, Lipt...
5     [MIRACLE WHIP Dressing, BREAKSTONE'S or KNUDSE...
6     [FOR THE FILLING:, Fresh Strawberries, - Fresh...
7     [doz mangos, cabbage, celery, brown sugar, sal...
8     [chopped green peppers, chopped red peppers, c...
9     [fryer, uncooked rice, cream chicken soup, dry...
10    [yeast, bread flour, salt, sugar, olive oil, w...
Name: clean_ingredients, dtype: object


Assign labels: "O", "B-FOOD", "I-FOOD"

In [None]:

def check(labels):
    """Validate that an IOB label sequence is well formed.

    Every 'I-FOOD' must directly follow a 'B-FOOD' or another 'I-FOOD'.

    Args:
        labels: Sequence of label strings.

    Raises:
        ValueError: If an 'I-FOOD' opens the sequence or follows any other label.
    """
    previous = None  # no predecessor before the first label
    for i, label in enumerate(labels):
        if label == 'I-FOOD' and previous not in ('B-FOOD', 'I-FOOD'):
            raise ValueError(f"Incoerenza IOB: I-FOOD alla posizione {i} senza B-FOOD precedente.")
        previous = label



def iob_tag_tokens(text, ingredient_list):
    """Tokenize ``text`` and tag ingredient mentions with IOB labels.

    For each ingredient, only its first case-insensitive occurrence in the
    token sequence is tagged: 'B-FOOD' on the first token and 'I-FOOD' on the
    remaining ones. All other tokens keep the label 'O'.

    Args:
        text: Recipe directions as a single string.
        ingredient_list: Cleaned ingredient strings to look for.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists.

    Raises:
        ValueError: Propagated from ``check`` if the labels violate IOB rules.
    """
    tokens = word_tokenize(text)
    labels = ['O'] * len(tokens)
    lowered_tokens = [tok.lower() for tok in tokens]

    for ingredient in ingredient_list:
        target = [tok.lower() for tok in word_tokenize(ingredient)]
        span = len(target)
        if not span:
            continue  # skip empty ingredients

        # Start index of the first window equal to the ingredient, if any.
        last_start = len(tokens) - span
        start = next(
            (i for i in range(last_start + 1) if lowered_tokens[i:i + span] == target),
            None,
        )
        if start is None:
            continue

        # Tag only this first occurrence of the ingredient.
        labels[start] = 'B-FOOD'
        labels[start + 1:start + span] = ['I-FOOD'] * (span - 1)

    check(labels)

    return tokens, labels



In [None]:
# Tag every recipe row by row: each cell of the new column holds the
# (tokens, labels) tuple returned by iob_tag_tokens for that recipe.
df_sample["ner_tokens_labels"] = df_sample.apply(
    lambda row: iob_tag_tokens(row["text"], row["clean_ingredients"]), axis=1
)

USAGE EXAMPLE:
text = "Aggiungi una cipolla tritata e soffriggi in olio."
clean_ingredients = ["cipolla", "olio"]

tokens = ["Aggiungi", "una", "cipolla", "tritata", "e", "soffriggi", "in", "olio", "."]
labels = ["O",        "O",   "B-FOOD",  "O",       "O", "O",         "O",  "B-FOOD", "O"]

FINAL RESULT (a tuple of two lists):
(["Aggiungi", ..., "olio", "."], ["O", ..., "B-FOOD", "O"])


In [11]:
# Print the first recipe token by token next to its IOB label.
tokens, labels = df_sample.loc[0, "ner_tokens_labels"]
for token, label in zip(tokens, labels):
    print(f"{token:15} → {label}")

Mix             → O
together        → O
the             → O
cheese          → O
,               → O
olives          → O
,               → O
onion           → O
,               → O
dried           → B-FOOD
beef            → I-FOOD
and             → O
mayo            → O
.               → O
Spread          → O
on              → O
slices          → O
of              → O
rye             → O
cocktail        → O
bread           → O
.               → O
place           → O
the             → O
slices          → O
on              → O
a               → O
cookie          → O
sheet           → O
and             → O
broil           → O
until           → O
bubbly          → O
.               → O


#### Training phase

In [12]:
!pip install seqeval

from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score
from torch.nn import CrossEntropyLoss
from collections import Counter
import torch.nn as nn
import numpy as np

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=e0cb99ffa0fc705f41d18cc96a2fcf0f09cdf0d1831ff2da4d8130eb49362f6f
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


2025-05-16 08:52:49.672048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747385569.898491      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747385569.963818      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# One record per recipe: its tokens and the matching string NER tags.
hf_data = [
    dict(tokens=recipe_tokens, ner_tags=recipe_tags)
    for recipe_tokens, recipe_tags in df_sample["ner_tokens_labels"]
]


The cell above converts each (tokens, labels) tuple, e.g.:
(
    ["Aggiungi", "una", "cipolla", "tritata", "finemente", ...],
    ["O",       "O",   "B-FOOD", "I-FOOD", "O", ...]
)
into a dictionary:
{
    "tokens": ["Aggiungi", "una", "cipolla", "tritata", "finemente", ...],
    "ner_tags": ["O", "O", "B-FOOD", "I-FOOD", "O", ...]
}


In [None]:
# Collect the distinct NER tags; sorting makes the tag -> id assignment deterministic.
unique_tags = sorted({tag for row in hf_data for tag in row["ner_tags"]})

# Mappings between tag strings and integer ids for encoding/decoding labels.
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# Build the integer-encoded records as a NEW list instead of mutating hf_data.
# The previous in-place version deleted "ner_tags" from every row, so running
# this cell a second time raised a KeyError.
encoded_data = [
    {
        "tokens": row["tokens"],
        "labels": [tag2id[tag] for tag in row["ner_tags"]],
    }
    for row in hf_data
]

# Create a HuggingFace Dataset and split it into train and test sets (80/20, fixed seed).
dataset = Dataset.from_list(encoded_data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Load the cased BERT tokenizer and build the token-classification collator
# from it; both are handed to the Trainer further down in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def tokenize_and_align_labels(example):
    """Tokenize one example into word-pieces and align its NER labels.

    Every word-piece receives a label derived from the word it came from:
      * special tokens (no source word) get -100 so the loss ignores them;
      * the first word-piece of a word gets the word's label;
      * continuation word-pieces get the word's label too, except that a
        ``B-X`` label becomes ``I-X``. Repeating ``B-X`` on every piece would
        turn one entity into several single-piece entities and break the IOB
        scheme at word-piece level.

    Args:
        example: A dataset row with "tokens" (list of words) and "labels"
            (list of integer label ids, one per word).

    Returns:
        The tokenizer output with an added "labels" list, one id per word-piece.
    """
    tokenized = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)

    word_labels = example["labels"]
    labels = []
    previous_word_idx = None
    for word_idx in tokenized.word_ids():
        if word_idx is None:
            # Special token such as [CLS] / [SEP].
            labels.append(-100)
        else:
            label_id = word_labels[word_idx]
            if word_idx == previous_word_idx:
                # Continuation piece of the same word: B-X -> I-X.
                tag = id2tag[label_id]
                if tag.startswith("B-"):
                    # Fall back to the original id if no I- tag exists in the tag set.
                    label_id = tag2id.get("I-" + tag[2:], label_id)
            labels.append(label_id)
        previous_word_idx = word_idx
    tokenized["labels"] = labels
    return tokenized

# Apply the tokenization and label alignment to both splits, one example at a
# time (batched=False, because the function expects a single example).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

Creation of “custom” metrics

In [None]:
def compute_metrics(pred):
    """Compute seqeval precision, recall and F1 for token classification.

    Args:
        pred: A ``(predictions, labels)`` pair as returned by the Trainer.
            - predictions: array of per-class scores; argmax is taken over axis 2.
            - labels: array of gold label ids, with -100 marking ignored positions.

    Returns:
        dict with the keys "precision", "recall" and "f1".
    """
    scores, gold_ids = pred
    predicted_ids = scores.argmax(axis=2)

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real gold label (-100 = ignored token).
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_preds.append([id2tag[p] for p, _ in kept])
        true_labels.append([id2tag[g] for _, g in kept])

    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    return {name: scorer(true_labels, true_preds) for name, scorer in scorers}

Creation of the weighted loss

In [None]:
# Train on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Flatten the per-example label lists of the training split into one list.
all_labels = [
    label_id
    for example in tokenized_dataset["train"]
    for label_id in example["labels"]
]

# Frequency of every real class; -100 marks positions excluded from the loss.
label_counts = Counter(label_id for label_id in all_labels if label_id != -100)
total = sum(label_counts.values())

# Inverse-frequency weights: total / (number_of_seen_classes * class_count),
# so rarer classes weigh more. Classes absent from the training split keep 0.0.
num_seen_classes = len(label_counts)
weights = [
    total / (num_seen_classes * label_counts[class_id]) if class_id in label_counts else 0.0
    for class_id in range(len(tag2id))
]

# Turn the weights into a tensor on the selected device.
weights = torch.tensor(weights).to(device)

In [18]:
from transformers.modeling_outputs import TokenClassifierOutput

class WeightedTokenClassifier(nn.Module):
    """Wrap a token-classification model and compute a class-weighted loss.

    The wrapped model only produces logits; the loss is computed here with a
    ``CrossEntropyLoss`` that uses the given class weights and ignores
    positions labelled -100.
    """

    def __init__(self, base_model, weights):
        """Store the wrapped model and build the weighted loss function.

        Args:
            base_model: Model whose output exposes a ``logits`` attribute.
            weights: Per-class weight tensor passed to ``CrossEntropyLoss``.
        """
        super().__init__()
        self.base_model = base_model
        self.loss_fct = CrossEntropyLoss(weight=weights, ignore_index=-100)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        """Run the wrapped model and, when labels are given, the weighted loss.

        Returns:
            A ``TokenClassifierOutput`` holding loss (or None), logits, and the
            wrapped model's hidden states / attentions when it exposes them.
        """
        # Drop 'num_items_in_batch' if present so it is not forwarded to the wrapped model.
        kwargs.pop("num_items_in_batch", None)

        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        logits = outputs.logits

        if labels is None:
            loss = None
        else:
            # Flatten to (positions, classes) vs (positions,) for the loss.
            num_classes = logits.size(-1)
            loss = self.loss_fct(logits.view(-1, num_classes), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )


In [None]:
# Load BERT with a token-classification head sized to the tag set, wrap it so
# the class-weighted loss is used, and move it to the selected device.
base_model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2id))
model = WeightedTokenClassifier(base_model, weights)
model.to(device)


# Training configuration; outputs are written to the Kaggle working directory.
args = TrainingArguments(
    output_dir="/kaggle/working/",
    run_name="bert-ner-food-v1",  # explicit run name
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=5000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    report_to="none",  # disable logging to wandb
)

# The Trainer ties together model, data splits, collator and metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)



In [None]:
# Fine-tune the model on the training split with the configuration above.
trainer.train()

In [None]:
# Evaluate on the held-out test split; metrics come from compute_metrics.
trainer.evaluate()