In [1]:
!pip install transformers datasets torch

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-p

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  


df = pd.read_csv("/kaggle/input/recipe-sampled-0-25/sampled_dataset.csv")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Preparazione del dataset

In [4]:
df_sample = df[["directions", "ingredients"]].sample(n=100000, random_state=42).reset_index(drop=True)
df_sample.head(3)

Unnamed: 0,directions,ingredients
0,"['Mix together the cheese, olives, onion, drie...","[""1 cup shredded cheddar cheese"", ""1 cup chopp..."
1,"['Brown meat; drain and set aside.', 'Blend ma...","[""1 pie crust"", ""1/2 lb. ground beef (you can ..."
2,"['Dissolve jello in boiling water.', 'Let cool...","[""2 small orange jello"", ""2 c. boiling water"",..."


In [5]:
import ast

df_sample["ingredients"] = df_sample["ingredients"].apply(ast.literal_eval)
df_sample["directions"] = df_sample["directions"].apply(ast.literal_eval)

print(type(df_sample.loc[0, "ingredients"]))  # deve essere <class 'list'>
print(df_sample.loc[0, "ingredients"])        # stampa la lista vera

print(type(df_sample.loc[0, "directions"]))  # deve essere <class 'list'>
print(df_sample.loc[0, "directions"])        # stampa la lista vera

<class 'list'>
['1 cup shredded cheddar cheese', '1 cup chopped pimento stuffed olive', '1 tablespoon minced onion', '1 cup dried beef, chopped', '3/4 - 1 cup mayonnaise', '1 loaf sliced rye cocktail bread']
<class 'list'>
['Mix together the cheese, olives, onion, dried beef and mayo.', 'Spread on slices of rye cocktail bread. place the slices on a cookie sheet and broil until bubbly.']


In [6]:
df_sample["text"] = df_sample["directions"].apply(lambda steps: " ".join(steps))
print(df_sample.loc[0, "text"])

Mix together the cheese, olives, onion, dried beef and mayo. Spread on slices of rye cocktail bread. place the slices on a cookie sheet and broil until bubbly.


Queste regole regex non sono complete, non coprono tutti i casi. Bisognerebbe aggiunge man mano, ma è una operazione complicata

In [7]:
import re

def clean_ingredient(ingredient):
    # Remove fractions and numbers (e.g., "1", "1/2", "2.5")
    ingredient = re.sub(r'\b\d+([\/\.]\d+)?\b', '', ingredient)

    # Common measurement units to remove
    units = [
        "teaspoons?", "tsp", "tablespoons?", "tbsp", "cups?", "ounces?", "oz",
        "pounds?", "lb", "grams?", "g", "kilograms?", "kg", "milliliters?", "ml",
        "liters?", "l", "pinch", "clove", "cloves", "slices?", "dash", "cans?", 
        "packages?", "bunch", "stalks?", "heads?", "pieces?", "sticks?", "inches?"
    ]
    units_pattern = r'\b(?:' + '|'.join(units) + r')\b'
    ingredient = re.sub(units_pattern, '', ingredient, flags=re.IGNORECASE)

    ingredient = re.sub(r'\b(c\.|c)\b\.?', '', ingredient, flags=re.IGNORECASE)

    ingredient = re.sub(r'\(\s*\.\s*\)', '', ingredient)
    ingredient = re.sub(r'\([^)]*\)', '', ingredient)

    ingredient = re.sub(r'\bof\b', '', ingredient, flags=re.IGNORECASE)
    ingredient = re.sub(r'^\s*\.\s*', '', ingredient)       # punto iniziale con spazio
    ingredient = re.sub(r'\.\s*', ' ', ingredient)          # ogni ". " ovunque
    ingredient = re.sub(r',.*', '', ingredient)             # rimuove note dopo virgola

    ingredient = re.sub(r'\b(to |pt |pkg |qt )\.?\b', '', ingredient, flags=re.IGNORECASE)
    ingredient = re.sub(r'^to\s+', '', ingredient, flags=re.IGNORECASE)

    # Remove extra spaces
    ingredient = re.sub(r'\s+', ' ', ingredient).strip()

    return ingredient


# Applica a tutta la colonna ingredients
df_sample["clean_ingredients"] = df_sample["ingredients"].apply(lambda lst: [clean_ingredient(i) for i in lst])

In [8]:
print(df_sample.loc[:10, "clean_ingredients"]) 

0     [shredded cheddar cheese, chopped pimento stuf...
1     [pie crust, ground beef, mayonnaise, milk, egg...
2     [small orange jello, boiling water, small crus...
3     [square graham crackers, reduced calorie marga...
4     [cream cheese, sm jar Old English cheese, Lipt...
5     [MIRACLE WHIP Dressing, BREAKSTONE'S or KNUDSE...
6     [FOR THE FILLING:, Fresh Strawberries, - Fresh...
7     [doz mangos, cabbage, celery, brown sugar, sal...
8     [chopped green peppers, chopped red peppers, c...
9     [fryer, uncooked rice, cream chicken soup, dry...
10    [yeast, bread flour, salt, sugar, olive oil, w...
Name: clean_ingredients, dtype: object


Assegno le label "0" "I-food" "B-food"

In [9]:

def check(labels):
      # 🔍 Verifica coerenza: nessun I-FOOD senza un B-FOOD prima
    for i, label in enumerate(labels):
        if label == 'I-FOOD':
            if i == 0 or labels[i - 1] not in ['B-FOOD', 'I-FOOD']:
                raise ValueError(f"Incoerenza IOB: I-FOOD alla posizione {i} senza B-FOOD precedente.")



def iob_tag_tokens(text, ingredient_list):
    tokens = word_tokenize(text)
    labels = ['O'] * len(tokens)
    
    for ingredient in ingredient_list:
        ingredient_tokens = word_tokenize(ingredient)
        ingredient_len = len(ingredient_tokens)

        if ingredient_len == 0:
            continue  # ignora ingredienti vuoti

        for i in range(len(tokens) - ingredient_len + 1):
            window = tokens[i:i + ingredient_len]
            if [t.lower() for t in window] == [t.lower() for t in ingredient_tokens]:
                labels[i] = 'B-FOOD'
                for j in range(1, ingredient_len):
                    if i + j < len(labels):
                        labels[i + j] = 'I-FOOD'
                break  # evita doppi match dello stesso ingrediente


    check(labels)
    
    return tokens, labels



In [10]:
df_sample["ner_tokens_labels"] = df_sample.apply(
    lambda row: iob_tag_tokens(row["text"], row["clean_ingredients"]), axis=1
)
"""
ESEMPIO UTILIZZO:
text = "Aggiungi una cipolla tritata e soffriggi in olio."
clean_ingredients = ["cipolla", "olio"]

tokens = ["Aggiungi", "una", "cipolla", "tritata", "e", "soffriggi", "in", "olio", "."]
labels = ["O",        "O",   "B-FOOD", "I-FOOD",  "O", "O",         "O", "B-FOOD", "O"]

RISULTATO FINALE:
("Aggiungi", ..., "olio", "."), ["O", ..., "B-FOOD", "O"]
"""

'\nESEMPIO UTILIZZO:\ntext = "Aggiungi una cipolla tritata e soffriggi in olio."\nclean_ingredients = ["cipolla", "olio"]\n\ntokens = ["Aggiungi", "una", "cipolla", "tritata", "e", "soffriggi", "in", "olio", "."]\nlabels = ["O",        "O",   "B-FOOD", "I-FOOD",  "O", "O",         "O", "B-FOOD", "O"]\n\nRISULTATO FINALE:\n("Aggiungi", ..., "olio", "."), ["O", ..., "B-FOOD", "O"]\n'

In [11]:
tokens, labels = df_sample.loc[0, "ner_tokens_labels"]
for t, l in zip(tokens, labels):
    print(f"{t:15} → {l}")

Mix             → O
together        → O
the             → O
cheese          → O
,               → O
olives          → O
,               → O
onion           → O
,               → O
dried           → B-FOOD
beef            → I-FOOD
and             → O
mayo            → O
.               → O
Spread          → O
on              → O
slices          → O
of              → O
rye             → O
cocktail        → O
bread           → O
.               → O
place           → O
the             → O
slices          → O
on              → O
a               → O
cookie          → O
sheet           → O
and             → O
broil           → O
until           → O
bubbly          → O
.               → O


#### Preparazione dell'addestramento

In [12]:
!pip install seqeval

from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score
from torch.nn import CrossEntropyLoss
from collections import Counter
import torch.nn as nn
import numpy as np

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c5fe456d3a01b10be7547e6557e641e2a21fe64cdca447048fd7a79d29591b88
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


2025-05-17 11:01:20.187610: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747479680.386737      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747479680.443769      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
"""
(["Aggiungi", "una", "cipolla", "tritata", "finemente", ...],
 ["O",       "O",   "B-FOOD", "I-FOOD", "O", ...])
DIVENTA:
{
    "tokens": ["Aggiungi", "una", "cipolla", "tritata", "finemente", ...],
    "ner_tags": ["O", "O", "B-FOOD", "I-FOOD", "O", ...]
}
"""

hf_data = [
    {
        "tokens": tokens,
        "ner_tags": labels
    }
    for tokens, labels in df_sample["ner_tokens_labels"]
]


In [14]:
unique_tags = set(tag for row in hf_data for tag in row["ner_tags"])
unique_tags = sorted(unique_tags)

tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
"""
tag2id = {"B-FOOD": 0, "I-FOOD": 1, "O": 2}
id2tag = {0: "B-FOOD", 1: "I-FOOD", 2: "O"}
"""

#Sostituisce "ner_tags" con una nuova chiave "labels" contenente gli ID
for row in hf_data:
    row["labels"] = [tag2id[tag] for tag in row["ner_tags"]]
    del row["ner_tags"]  

#Conversione in un effetivo dataset
dataset = Dataset.from_list(hf_data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [15]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

"""
In teoria, questo metodo permette di andare a tokenizzare e spezzare le parole per inserirle
all'interno di BERT, o comunque per convertirle prima in uno e-branding e inserirle all'interno
di BERT, mantenendo però le etichette corrette. Quindi se spezzo una parola lunga, che era un 
BFOOD, ci saranno alla fine due BFOOD, in teoria.
"""
def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)
    
    word_ids = tokenized.word_ids()
    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["labels"][word_idx])
        else:
            # Se un word viene splittato in più subtoken, replichiamo la label (o metti -100 se vuoi ignorare)
            labels.append(example["labels"][word_idx])
        previous_word_idx = word_idx
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [16]:

def compute_metrics(pred):
    predictions, labels = pred
    predictions = predictions.argmax(axis=2)

    true_labels = []
    true_preds = []

    for pred_seq, label_seq in zip(predictions, labels):
        curr_preds = []
        curr_labels = []
        for p, l in zip(pred_seq, label_seq):
            if l != -100:
                curr_preds.append(id2tag[p])
                curr_labels.append(id2tag[l])
        true_preds.append(curr_preds)
        true_labels.append(curr_labels)

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }


In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Estrai tutte le etichette
all_labels = []
for example in tokenized_dataset["train"]:
    all_labels += example["labels"]

# Conta le etichette escludendo i -100 (token ignorati)
label_counts = Counter([label for label in all_labels if label != -100])
total = sum(label_counts.values())

# Calcola peso inverso della frequenza (più rara = peso più alto)
weights = [0.0] * len(tag2id)
for label_id, count in label_counts.items():
    weights[label_id] = total / (len(label_counts) * count)

weights = torch.tensor(weights).to(device)

In [18]:
from transformers.modeling_outputs import TokenClassifierOutput

class WeightedTokenClassifier(nn.Module):
    def __init__(self, base_model, weights):
        super().__init__()
        self.base_model = base_model
        self.loss_fct = CrossEntropyLoss(weight=weights, ignore_index=-100)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        # Rimuove 'num_items_in_batch' se presente
        kwargs.pop("num_items_in_batch", None)

        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        logits = outputs.logits

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if hasattr(outputs, "hidden_states") else None,
            attentions=outputs.attentions if hasattr(outputs, "attentions") else None,
        )


In [19]:
base_model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2id))
model = WeightedTokenClassifier(base_model, weights)
model.to(device)


args = TrainingArguments(
    output_dir="/kaggle/working/",
    run_name="bert-ner-food-v1",  # nome run esplicito
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=5000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",  # Disabilita logging verso wandb
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

"""
Nel fine-tuning, serve a:uniformare la lunghezza delle sequenze (padding), 
gestire correttamente i batch, 
allineare i token con le label (soprattutto importante nel NER).
"""


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


'\nNel fine-tuning, serve a:uniformare la lunghezza delle sequenze (padding), \ngestire correttamente i batch, \nallineare i token con le label (soprattutto importante nel NER).\n'

In [20]:
trainer.train()

Step,Training Loss
100,0.3657
200,0.253
300,0.2592
400,0.2365
500,0.2342
600,0.2161
700,0.2103
800,0.2161
900,0.2042
1000,0.208


TrainOutput(global_step=15000, training_loss=0.1607112346013387, metrics={'train_runtime': 10435.7333, 'train_samples_per_second': 22.998, 'train_steps_per_second': 1.437, 'total_flos': 0.0, 'train_loss': 0.1607112346013387, 'epoch': 3.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.19696687161922455,
 'eval_precision': 0.38602867455384,
 'eval_recall': 0.9300345418971473,
 'eval_f1': 0.5455968938422406,
 'eval_runtime': 201.9804,
 'eval_samples_per_second': 99.02,
 'eval_steps_per_second': 12.377,
 'epoch': 3.0}

#### Test with Lora

In [22]:
!pip install peft accelerate -q

from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForTokenClassification

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
base_model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2id)
)

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.TOKEN_CLS
)

# Applica LoRA
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()  # Verifica i parametri LoRA addestrabili




Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 108,019,206 || trainable%: 0.2752


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Parametri di training
args = TrainingArguments(
    output_dir="/kaggle/working/ner-lora/",
    run_name="bert-ner-lora",
    do_train=True,
    do_eval=True,
    logging_dir="/kaggle/working/logs",
    logging_steps=100,
    save_steps=5000,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [25]:
trainer.train()

# Valutazione finale
metrics = trainer.evaluate()
print("Evaluation Results:", metrics)

Step,Training Loss
100,0.5123
200,0.1595
300,0.1317
400,0.1265
500,0.1247
600,0.1169
700,0.1085
800,0.1036
900,0.1025
1000,0.0999


Evaluation Results: {'eval_loss': 0.07875818759202957, 'eval_precision': 0.5997103441504247, 'eval_recall': 0.5901350274161211, 'eval_f1': 0.5948841569572786, 'eval_runtime': 210.6964, 'eval_samples_per_second': 94.923, 'eval_steps_per_second': 11.865, 'epoch': 3.0}
