In [None]:
! pip install evaluate
! pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu117
! pip install peft
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
# ! pip install /kaggle/input/llm-peft-pkg/pyarrow_hotfix-0.6-py3-none-any.whl
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from huggingface_hub import login
import os

# HuggingFace and WandB login
# Credentials are taken from the environment when available so real secrets
# never have to be written into the notebook. The placeholder strings are only
# a fallback and must be replaced (or the variables exported) before running.
token = os.environ.get("HF_TOKEN", 'give_your_hugging_face_token')
login(token)
# setdefault keeps an already-exported WANDB_API_KEY instead of overwriting it
# with the placeholder.
os.environ.setdefault("WANDB_API_KEY", "give_your_WANDB_API_key")

# Load dataset
df = pd.read_csv("TRAIN-DATASET-PATH")

# Map 'product-category' to numerical labels
label2id = {
    "alcoholic beverages": 0, "cereals and bakery products": 1, "cocoa and cocoa preparations, coffee and tea": 2,
    "confectionery": 3, "dietetic foods, food supplements, fortified foods": 4, "fats and oils": 5,
    "feed materials": 6, "food additives and flavourings": 7, "food contact materials": 8, "fruits and vegetables": 9,
    "herbs and spices": 10, "honey and royal jelly": 11, "ices and desserts": 12, "meat, egg and dairy products": 13,
    "non-alcoholic beverages": 14, "nuts, nut products and seeds": 15, "other food product / mixed": 16,
    "pet feed": 17, "prepared dishes and snacks": 18, "seafood": 19, "soups, broths, sauces and condiments": 20,
    "sugars and syrups": 21
}

df['label'] = df['product-category'].map(label2id)
# Series.map leaves NaN for any category that is not a key of label2id, which
# would silently turn the label column into floats. Fail fast with the
# offending values instead of breaking later during training.
unmapped = df.loc[df['label'].isna(), 'product-category'].unique().tolist()
if unmapped:
    raise ValueError(f"Unknown product-category values (not in label2id): {unmapped}")

# A missing title or text would make the whole concatenation NaN; treat the
# missing part as an empty string so the other part is still used.
df['input'] = df["title"].fillna("") + ": " + df['text'].fillna("")
df = df[['input', 'label']]

# Hold out 10% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap both pandas frames as Hugging Face datasets and bundle them by split name
train_dataset, val_dataset = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})

# Define model checkpoint and label mappings
model_checkpoint = 'openai-community/gpt2-large'
id2label = {i: label for label, i in label2id.items()}

# Load model and tokenizer
# num_labels is derived from the mapping so the two can never drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id, device_map = 'auto'
)
print(model)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Add pad token if none exists: reuse the EOS token as padding. The check now
# runs BEFORE the assignment; previously the pad token was assigned first, so
# the "is None" branch that followed could never execute.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Keep the model config in sync with the tokenizer's pad id.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper
def tokenize_function(examples):
    """Encode the "input" texts of a batch.

    Calls the module-level `tokenizer` with truncation enabled,
    max_length=512 and padding="max_length", and returns its result.
    """
    encoding_options = {"truncation": True, "max_length": 512, "padding": "max_length"}
    return tokenizer(examples["input"], **encoding_options)

# Tokenize every split, batch by batch
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Evaluation metric
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    """Return {"accuracy": value} for a (predictions, labels) pair.

    The predicted class of each row is the argmax over axis 1 of the
    predictions; the score comes from the loaded "accuracy" metric.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_ids, references=references)
    return {"accuracy": result["accuracy"]}

# PEFT configuration: LoRA adapters for sequence classification
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    # NOTE(review): "c_proj" presumably already matches "mlp.c_proj" by name
    # suffix, making the last entry redundant -- confirm against the PEFT docs.
    target_modules=[
        "c_attn",
        "c_proj",
        "c_fc",
        "mlp.c_proj",
    ]
)
model = get_peft_model(model, peft_config)
# print_trainable_parameters() writes the summary itself and returns None, so
# wrapping it in print() only produced an extra "None" line.
model.print_trainable_parameters()

# Hyperparameters
num_epochs = 5
batch_size = 8
lr = 2e-5

training_args = TrainingArguments(
    # Folder where the checkpoints are stored
    output_dir="gpt2-final-product",
    # Optimisation settings
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpointing share the same 500-step schedule
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)
# Assemble the trainer from the model, the tokenized splits and the metric hook
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
)

# Run fine-tuning
trainer.train()

# Save the model and the tokenizer into the same folder
model.save_pretrained("gpt2-final-product")
tokenizer.save_pretrained("gpt2-final-product")

In [None]:

# Load the test set and build the "input" column as "<title>: <text>"
prediction_data = pd.read_csv("TEST DATASET PATH")
# A missing title or text would make the whole concatenation NaN (and the
# tokenizer cannot encode NaN); treat the missing part as an empty string.
prediction_data['input'] = prediction_data['title'].fillna("") + ": " + prediction_data['text'].fillna("")

prediction_dataset = Dataset.from_pandas(prediction_data)

# Tokenization helper for the prediction set
def tokenize_for_prediction(examples):
    """Encode the "input" texts of a batch.

    Calls the module-level `tokenizer` with truncation enabled,
    max_length=512 and padding="max_length", and returns its result.
    """
    tokenizer_kwargs = {"truncation": True, "max_length": 512, "padding": "max_length"}
    return tokenizer(examples["input"], **tokenizer_kwargs)

tokenized_prediction_dataset = prediction_dataset.map(tokenize_for_prediction, batched=True)

# Run inference through the trainer and keep the highest-scoring class per row
raw_predictions = trainer.predict(tokenized_prediction_dataset)
predictions = np.argmax(raw_predictions.predictions, axis=1)

# Translate class ids into category names and attach them to the frame
predicted_labels = list(map(id2label.__getitem__, predictions))
prediction_data["predicted-product-category"] = predicted_labels
# prediction_data.to_csv("predictions.csv", index=False)

print(prediction_data)


In [None]:
from sklearn.metrics import classification_report

# Per-class report comparing the reference categories with the predicted ones
report = classification_report(
    y_true=prediction_data["product-category"],
    y_pred=prediction_data["predicted-product-category"],
)

print("Classification Report:", report, sep="\n")

In [None]:
! pip install evaluate
! pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu117
! pip install peft
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
# ! pip install /kaggle/input/llm-peft-pkg/pyarrow_hotfix-0.6-py3-none-any.whl
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from huggingface_hub import login
import os

# HuggingFace and WandB login
# Credentials are taken from the environment when available so real secrets
# never have to be written into the notebook. The placeholder strings are only
# a fallback and must be replaced (or the variables exported) before running.
token = os.environ.get("HF_TOKEN", 'give_your_hugging_face_token')
login(token)
# setdefault keeps an already-exported WANDB_API_KEY instead of overwriting it
# with the placeholder.
os.environ.setdefault("WANDB_API_KEY", "give_your_WANDB_API_key")

# Load dataset
df = pd.read_csv("TRAIN-DATASET-PATH")

# Map 'hazard-category' to numerical labels
label2id = {
    "allergens": 0,
    "biological": 1,
    "chemical": 2,
    "food additives and flavourings": 3,
    "foreign bodies": 4,
    "fraud": 5,
    "migration": 6,
    "organoleptic aspects": 7,
    "other hazard": 8,
    "packaging defect": 9,
}

df['label'] = df['hazard-category'].map(label2id)
# Series.map leaves NaN for any category that is not a key of label2id, which
# would silently turn the label column into floats. Fail fast with the
# offending values instead of breaking later during training.
unmapped = df.loc[df['label'].isna(), 'hazard-category'].unique().tolist()
if unmapped:
    raise ValueError(f"Unknown hazard-category values (not in label2id): {unmapped}")

# The model input for this task is the raw text column only.
df['input'] = df['text']
df = df[['input', 'label']]

# Hold out 10% of the rows for validation (fixed random_state for a repeatable split)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap both pandas frames as Hugging Face datasets and bundle them by split name
train_dataset, val_dataset = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})

# Define model checkpoint and label mappings
model_checkpoint = 'openai-community/gpt2-large'
id2label = {i: label for label, i in label2id.items()}

# Load model and tokenizer
# num_labels is derived from the mapping so the two can never drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id, device_map = 'auto'
)
print(model)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Add pad token if none exists: reuse the EOS token as padding. The check now
# runs BEFORE the assignment; previously the pad token was assigned first, so
# the "is None" branch that followed could never execute.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Keep the model config in sync with the tokenizer's pad id.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper
def tokenize_function(examples):
    """Encode the "input" texts of a batch.

    Calls the module-level `tokenizer` with truncation enabled,
    max_length=512 and padding="max_length", and returns its result.
    """
    encoding_options = {"truncation": True, "max_length": 512, "padding": "max_length"}
    return tokenizer(examples["input"], **encoding_options)

# Tokenize every split, batch by batch
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Evaluation metric
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    """Return {"accuracy": value} for a (predictions, labels) pair.

    The predicted class of each row is the argmax over axis 1 of the
    predictions; the score comes from the loaded "accuracy" metric.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_ids, references=references)
    return {"accuracy": result["accuracy"]}

# PEFT configuration: LoRA adapters for sequence classification
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    # NOTE(review): "c_proj" presumably already matches "mlp.c_proj" by name
    # suffix, making the last entry redundant -- confirm against the PEFT docs.
    target_modules=[
        "c_attn",
        "c_proj",
        "c_fc",
        "mlp.c_proj",
    ]
)
model = get_peft_model(model, peft_config)
# print_trainable_parameters() writes the summary itself and returns None, so
# wrapping it in print() only produced an extra "None" line.
model.print_trainable_parameters()

# Hyperparameters
num_epochs = 5
batch_size = 8
lr = 2e-5

training_args = TrainingArguments(
    # Folder that receives the checkpoints
    output_dir="gpt2-final-hazard",
    # Optimisation settings
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpointing share the same 500-step schedule
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)
# Assemble the trainer from the model, the tokenized splits and the metric hook
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
)

# Run fine-tuning
trainer.train()

# Save the model and the tokenizer into the same folder
model.save_pretrained("gpt2-final-hazard")
tokenizer.save_pretrained("gpt2-final-hazard")

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the trained model and tokenizer
model_path = "Fine tuned model path"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 10)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Make sure padding is configured on both sides, as it was for training:
# reuse EOS as the pad token if the saved tokenizer has none, and give the
# model config a pad id if the loaded config does not carry one. Without a
# pad id the classifier cannot tell real tokens from padding in padded input.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Label mapping
id2label = {
    0: "allergens",
    1: "biological",
    2: "chemical",
    3: "food additives and flavourings",
    4: "foreign bodies",
    5: "fraud",
    6: "migration",
    7: "organoleptic aspects",
    8: "other hazard",
    9: "packaging defect"
}

# Prediction function
def predict_hazard_category(model, tokenizer, input_text, max_length=512):
    """Predict the hazard category name for a single text.

    Args:
        model: sequence-classification model; calling it with the tokenizer
            output must return an object exposing `.logits`.
        tokenizer: tokenizer callable accepting `truncation`, `max_length`,
            `padding` and `return_tensors` keyword arguments.
        input_text: the text to classify.
        max_length: truncation length in tokens. Defaults to 512, the length
            used when tokenizing the training data in this notebook (the
            previous hard-coded 256 discarded the second half of long texts).

    Returns:
        The entry of `id2label` for the highest-scoring class.
    """
    # A single sequence needs no padding. Leaving it unpadded guarantees that
    # the last position is the last real token (independent of whether the
    # model config knows the pad id) and avoids running the model over
    # hundreds of pad positions.
    inputs = tokenizer(
        input_text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    if inputs["input_ids"].shape[1] == 0:
        # Degenerate text that yields no tokens: fall back to a fully padded
        # sequence so the model still receives a non-empty tensor.
        inputs = tokenizer(
            input_text,
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt"
        )

    # Send the tensors to the device the model actually lives on rather than
    # relying on a module-level `device` variable.
    model_device = next(model.parameters()).device
    inputs = {key: value.to(model_device) for key, value in inputs.items()}

    # Inference only: eval mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Map to label
    return id2label[predicted_class]

# Read the test set
df = pd.read_csv("TEST DATASET PATH")

# Classify every text, one row at a time, and store the predicted label
df['predicted_hazard_category'] = df['text'].map(
    lambda row_text: predict_hazard_category(model, tokenizer, row_text)
)

print(df.head())


In [None]:
# Alternative way to make predictions: batch inference through the trainer

# Read the test set; the model input is the raw text column
prediction_data = pd.read_csv("TEST DATASET PATH").assign(input=lambda frame: frame['text'])
# Wrap the frame as a Hugging Face Dataset
prediction_dataset = Dataset.from_pandas(prediction_data)

# Tokenization helper for the prediction set
def tokenize_for_prediction(examples):
    """Encode the "input" texts of a batch.

    Calls the module-level `tokenizer` with truncation enabled,
    max_length=512 and padding="max_length", and returns its result.
    """
    tokenizer_kwargs = {"truncation": True, "max_length": 512, "padding": "max_length"}
    return tokenizer(examples["input"], **tokenizer_kwargs)

tokenized_prediction_dataset = prediction_dataset.map(tokenize_for_prediction, batched=True)

# Run inference through the trainer and keep the highest-scoring class per row
raw_predictions = trainer.predict(tokenized_prediction_dataset)
predictions = np.argmax(raw_predictions.predictions, axis=1)

# Translate class ids into category names
predicted_labels = list(map(id2label.__getitem__, predictions))

# Attach the predicted names to the DataFrame
prediction_data["predicted-hazard-category"] = predicted_labels
# prediction_data.to_csv("predictions.csv", index=False)

print(prediction_data)


In [None]:
from sklearn.metrics import classification_report

# Per-class report comparing the reference categories with the predicted ones
report = classification_report(
    y_true=prediction_data["hazard-category"],
    y_pred=prediction_data["predicted-hazard-category"],
)

print("Classification Report:", report, sep="\n")