In [None]:
import pandas as pd
import numpy as np
# ! pip install evaluate
# Load your dataset
data = pd.read_csv('TRAIN DATASET PATH')

# A training pair needs both the notice text and its hazard label. A missing
# text would be formatted into the prompt as the literal string "nan", and a
# missing hazard would be written to the CSV as an empty target cell, so such
# rows are dropped before the prompts are built.
data = data.dropna(subset=['text', 'hazard'])

# Prepare the data
# Every input is the fixed instruction followed by the notice text. The prefix
# is spelled out once so the prompt wording lives in a single place; the
# resulting string is the same as the previous per-row f-string produced.
hazard_prompt_prefix = (
    "Extract the exact reason for the food recall from the given text. "
    "Provide only the specific recall reason, without including any other information, "
    "from the following recall notice:\n Text: "
)
data['input'] = hazard_prompt_prefix + data['text'].astype(str)
data['target'] = data['hazard']

# Save to a new file
data[['input', 'target']].to_csv('prepared_dataset.csv', index=False)


In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from huggingface_hub import login
import os

# HuggingFace and WandB login
# Credentials already exported in the environment (HF_TOKEN / WANDB_API_KEY)
# take precedence; the placeholder strings are only a fallback for local edits.
token = os.environ.get("HF_TOKEN", 'YOUR_HUGGING_FACE_TOKEN')
login(token)
# setdefault keeps an existing WANDB_API_KEY instead of overwriting it with the placeholder.
os.environ.setdefault("WANDB_API_KEY", "YOUR_WANDB_API_KEY")

# Load the dataset
dataset = load_dataset('csv', data_files='prepared_dataset.csv')

# Split into training and validation sets
# The fixed seed makes the 90/10 split identical on every run, so evaluation
# numbers from different runs are computed on the same validation rows.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

# Load tokenizer and model
model_name = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "k", "v", "EncDecAttention.q", "EncDecAttention.v"]
)
# Wrap the base model so that only the LoRA adapter weights are trainable,
# then report how many parameters that leaves.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Preprocessing function
def preprocess_function(examples):
    """Tokenize prompt/target pairs and build loss-masked labels.

    Args:
        examples: Mapping with 'input' and 'target' entries. Each entry is
            either a list of strings (as supplied by ``Dataset.map`` with
            ``batched=True``) or a single string.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an extra 'labels' entry: the target token ids (padded/truncated
        to 32 tokens) where every padding id is replaced by -100.
    """
    inputs = tokenizer(examples['input'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['target'], max_length=32, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 marks positions that must not contribute to the loss.
        return [token_id if token_id != pad_id else -100 for token_id in token_ids]

    target_ids = targets['input_ids']
    # In batched mode 'input_ids' is a list of sequences, so the mask has to be
    # applied per sequence. Comparing a whole sequence against the pad id is
    # always "not equal" and would leave every padding token unmasked.
    if len(target_ids) > 0 and isinstance(target_ids[0], (list, tuple)):
        inputs['labels'] = [_mask_padding(sequence) for sequence in target_ids]
    else:
        inputs['labels'] = _mask_padding(target_ids)
    return inputs


# Tokenize both splits; map() hands preprocess_function whole batches of rows.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Training arguments, grouped by concern and expanded into TrainingArguments below.
training_kwargs = dict(
    # Where checkpoints and logs go
    output_dir="./flan-t5-peft-hazardfinal",
    logging_dir='./logs',
    push_to_hub=False,
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and checkpoint on the same 500-step cadence; log every 50 steps
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_kwargs)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Start training
trainer.train()

# Save the trained model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("flan-t5-peft-hazardfinal")


In [None]:
from transformers import pipeline

# Text-to-text generation pipeline used by the extraction helper below.
# "Fine tuned model path" is a placeholder to be replaced with the real location.
fine_tuned_extractor = pipeline(
    task="text2text-generation",
    model="Fine tuned model path",
)


def extract_food_product(text, title):
    """Ask the fine-tuned model for the recall reason (hazard) of one notice.

    Despite its name, this version of the function sends the hazard-extraction
    instruction, not a product-extraction one.

    Args:
        text: Body of the recall notice.
        title: Title of the recall notice; prepended to the body as
            "title: text". A missing value (None or NaN) is treated as empty.

    Returns:
        The generated text, stripped of surrounding whitespace and lower-cased.
    """
    def _as_text(value):
        # Empty CSV cells arrive from pandas as float NaN (NaN != NaN); plain
        # string concatenation on them raises TypeError.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    body = _as_text(text)
    heading = _as_text(title)
    notice = f"{heading}: {body}" if heading else body

    # Same instruction and "\n Text: " framing as the prompts written to
    # prepared_dataset.csv, so inference inputs look like the training inputs.
    # NOTE(review): the training prompts are built from `text` only; the title
    # prefix is kept here because callers pass it. Confirm whether it should
    # be dropped here or added to the training data.
    prompt = (
        "Extract the exact reason for the food recall from the given text. "
        "Provide only the specific recall reason, without including any other information, "
        f"from the following recall notice:\n Text: {notice}"
    )
    result = fine_tuned_extractor(prompt, max_length=32)[0]['generated_text']
    return result.strip().lower()


chunk_size = 500
output_file = 'file1.csv'

# Stream the test set in chunks so results are flushed to disk as they are
# produced instead of being held in memory until the end.
rows_done = 0
for chunk_index, chunk in enumerate(pd.read_csv('TEST DATASET PATH', chunksize=chunk_size)):
    chunk['extracted_hazard'] = chunk.apply(lambda row: extract_food_product(row['text'], row['title']), axis=1)
    # The first chunk (re)creates the file and writes the header; later chunks
    # append. Re-running the cell therefore replaces the previous output
    # instead of appending a second copy with another header row.
    is_first_chunk = chunk_index == 0
    chunk.to_csv(output_file, mode='w' if is_first_chunk else 'a', index=False, header=is_first_chunk)
    # Report the real number of rows handled so far (the last chunk may be short).
    rows_done += len(chunk)
    print(f"{rows_done} rows processed\n")

print(f"Batch extraction complete. The updated dataset has been saved as '{output_file}'.")



In [None]:
import pandas as pd
import numpy as np
# ! pip install evaluate
# Load your dataset
data = pd.read_csv('TRAIN DATASET PATH')

# A training pair needs both the notice text and its product label; rows
# missing either one are dropped.
data = data.dropna(subset=['text', 'product'])

# Build one prompt per row. Formatting the whole 'text'/'title' columns inside
# a single f-string would interpolate the printed representation of the column
# and give every row that same string, so the prompt is assembled column-wise.
# The template ("<instruction><title>: <text>") is the one the product
# inference cell of this notebook sends to the model, so training and
# inference inputs have the same shape.
titles = data['title'].fillna('').astype(str)
texts = data['text'].astype(str)
# Keep the bare text where there is no title; otherwise prefix "title: ".
notices = texts.where(titles == '', titles + ": " + texts)
product_prompt_prefix = "Extract the recall food product from the following food recall text: "
data['input'] = product_prompt_prefix + notices
data['target'] = data['product']

# Save to a new file
data[['input', 'target']].to_csv('prepared_dataset2.csv', index=False)


In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from huggingface_hub import login
import os

# HuggingFace and WandB login
# Credentials already exported in the environment (HF_TOKEN / WANDB_API_KEY)
# take precedence; the placeholder strings are only a fallback for local edits.
token = os.environ.get("HF_TOKEN", 'YOUR_HUGGING_FACE_TOKEN_HERE')
login(token)
# setdefault keeps an existing WANDB_API_KEY instead of overwriting it with the placeholder.
os.environ.setdefault("WANDB_API_KEY", "YOUR_WANDB_API_KEY")

# Load the dataset
dataset = load_dataset('csv', data_files='prepared_dataset2.csv')

# Split into training and validation sets
# The fixed seed makes the 90/10 split identical on every run, so evaluation
# numbers from different runs are computed on the same validation rows.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

# Load tokenizer and model
model_name = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "k", "v", "EncDecAttention.q", "EncDecAttention.v"]
)
# Wrap the base model so that only the LoRA adapter weights are trainable,
# then report how many parameters that leaves.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Preprocessing function
def preprocess_function(examples):
    """Tokenize prompt/target pairs and build loss-masked labels.

    Args:
        examples: Mapping with 'input' and 'target' entries. Each entry is
            either a list of strings (as supplied by ``Dataset.map`` with
            ``batched=True``) or a single string.

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an extra 'labels' entry: the target token ids (padded/truncated
        to 32 tokens) where every padding id is replaced by -100.
    """
    inputs = tokenizer(examples['input'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['target'], max_length=32, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 marks positions that must not contribute to the loss.
        return [token_id if token_id != pad_id else -100 for token_id in token_ids]

    target_ids = targets['input_ids']
    # In batched mode 'input_ids' is a list of sequences, so the mask has to be
    # applied per sequence. Comparing a whole sequence against the pad id is
    # always "not equal" and would leave every padding token unmasked.
    if len(target_ids) > 0 and isinstance(target_ids[0], (list, tuple)):
        inputs['labels'] = [_mask_padding(sequence) for sequence in target_ids]
    else:
        inputs['labels'] = _mask_padding(target_ids)
    return inputs

# Tokenize both splits; map() hands preprocess_function whole batches of rows.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Training arguments, grouped by concern and expanded into TrainingArguments below.
training_kwargs = dict(
    # Where checkpoints and logs go
    output_dir="./flan-t5-peft-product_extract",
    logging_dir='./logs',
    push_to_hub=False,
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and checkpoint on the same 500-step cadence; log every 50 steps
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_kwargs)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Start training
trainer.train()

# Save the trained model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("flan-t5-peft-product_extract")


In [None]:
from transformers import pipeline

# Text-to-text generation pipeline used by the extraction helper below.
# "Fine tuned model path" is a placeholder to be replaced with the real location.
fine_tuned_extractor = pipeline(
    task="text2text-generation",
    model="Fine tuned model path",
)


def extract_food_product(text, title):
    """Ask the fine-tuned model for the recalled food product of one notice.

    Args:
        text: Body of the recall notice.
        title: Title of the recall notice; prepended to the body as
            "title: text". A missing value (None or NaN) is treated as empty.

    Returns:
        The generated text, stripped of surrounding whitespace and lower-cased.
    """
    def _as_text(value):
        # Empty CSV cells arrive from pandas as float NaN (NaN != NaN); plain
        # string concatenation on them raises TypeError.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    body = _as_text(text)
    heading = _as_text(title)
    # With a title the model sees "title: text"; without one, just the text.
    notice = f"{heading}: {body}" if heading else body

    prompt = f"Extract the recall food product from the following food recall text: {notice}"  # Product extraction prompt
    result = fine_tuned_extractor(prompt, max_length=32)[0]['generated_text']
    return result.strip().lower()


chunk_size = 500
output_file = 'file2.csv'

# Stream the test set in chunks so results are flushed to disk as they are
# produced instead of being held in memory until the end.
rows_done = 0
for chunk_index, chunk in enumerate(pd.read_csv('TEST DATASET PATH', chunksize=chunk_size)):
    # This cell runs the product extractor, so the result column is named for
    # the product rather than reusing the hazard column name.
    chunk['extracted_product'] = chunk.apply(lambda row: extract_food_product(row['text'], row['title']), axis=1)
    # The first chunk (re)creates the file and writes the header; later chunks
    # append. Re-running the cell therefore replaces the previous output
    # instead of appending a second copy with another header row.
    is_first_chunk = chunk_index == 0
    chunk.to_csv(output_file, mode='w' if is_first_chunk else 'a', index=False, header=is_first_chunk)
    # Report the real number of rows handled so far (the last chunk may be short).
    rows_done += len(chunk)
    print(f"{rows_done} rows processed\n")

print(f"Batch extraction complete. The updated dataset has been saved as '{output_file}'.")

