In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    set_seed,
)
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the fine-tuning data; the path is resolved relative to the notebook's working directory.
DATA_PATH = 'fine_tuning_df.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Standardise the column names used by the rest of the notebook:
# "desc" becomes "text" and "simple_categories" becomes "label_name".
column_renames = {"desc": "text", "simple_categories": "label_name"}
df = df.rename(columns=column_renames)

In [4]:
# Peek at the first three rows to confirm the renamed columns.
df.head(n=3)

Unnamed: 0,text,label_name
0,"""Even if you know Hawaiian history you will fi...",Nonfiction
1,In the course of the long debate on the nature...,Nonfiction
2,This engrossing book examines the particular i...,Nonfiction


In [5]:
# Create label mappings.
# The model needs integer class ids, so every distinct string label is given an id
# in order of first appearance in the dataframe (the exact mapping is printed below).
labels = list(df['label_name'].unique())
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

print(f"Label to ID mapping: {label2id}")
print(f"ID to Label mapping: {id2label}")

# Apply the mapping to our dataframe
df['label'] = df['label_name'].map(label2id)

# Convert the pandas DataFrame to a Hugging Face Dataset object
hg_dataset = Dataset.from_pandas(df)

# Split the dataset into training and testing sets (80% train, 20% test).
# The split shuffles the rows; a fixed seed keeps the train/test membership, and
# therefore every metric computed on it, identical across kernel restarts.
SEED = 42
train_test_split = hg_dataset.train_test_split(test_size=0.2, seed=SEED)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print("\nDataset structure:")
print(dataset)

Label to ID mapping: {'Nonfiction': 0, 'Fiction': 1}
ID to Label mapping: {0: 'Nonfiction', 1: 'Fiction'}

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label_name', 'label'],
        num_rows: 3150
    })
    test: Dataset({
        features: ['text', 'label_name', 'label'],
        num_rows: 788
    })
})


In [None]:
model_checkpoint = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Explicit upper bound on the tokenized sequence length.
# `truncation=True` on its own truncates to `tokenizer.model_max_length`; if the
# tokenizer config does not define one, nothing is truncated and a single very long
# text can blow up memory. Cap at 512 tokens, or lower if the tokenizer says so.
MAX_SEQ_LENGTH = min(tokenizer.model_max_length, 512)


def tokenize_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; its "text" entry
            is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to at most
        `MAX_SEQ_LENGTH` tokens per example. Padding is not applied here; it is
        left to the data collator created below.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop the raw string columns now that they have been tokenized / mapped to ids.
tokenized_datasets = tokenized_datasets.remove_columns(["text", "label_name"])
print("\nTokenized dataset structure:")
print(tokenized_datasets)

# Pads each batch at collation time using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 3150/3150 [00:00<00:00, 6090.10 examples/s]
Map: 100%|██████████| 788/788 [00:00<00:00, 7276.31 examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 3150
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 788
    })
})





In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` holds per-class
            scores with the class axis at position 1, or a tuple whose first
            element is that score array.

    Returns:
        dict with keys "accuracy" and "f1" (F1 uses `average="weighted"`).
    """
    predictions, label_ids = eval_pred

    # Some model architectures return several outputs, in which case the
    # predictions arrive as a tuple; the class scores are the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Highest-scoring class per example.
    predicted_ids = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(label_ids, predicted_ids)
    f1 = f1_score(label_ids, predicted_ids, average="weighted")

    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Seed the random number generators *before* loading the model.
# With `ignore_mismatched_sizes=True` the mismatched classification head is
# re-initialised with random weights at load time. Without a seed set here, those
# initial weights (and hence the final metrics) change on every kernel restart.
SEED = 42
set_seed(SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    # Store the human-readable label names in the model config.
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's classification head presumably has a different number of
    # outputs than our label set (TODO confirm); allow it to be replaced by a
    # freshly initialised head instead of raising on the shape mismatch.
    ignore_mismatched_sizes=True
)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyperparameters for the fine-tuning run, gathered in one mapping so they are easy to tweak.
# NOTE(review): the output directory name says "bart" while `model_checkpoint` is a
# DeBERTa model - confirm whether the directory should be renamed.
training_config = dict(
    # Where checkpoints are written.
    output_dir="./bart-fiction-nonfiction-classifier",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # Batch sizes per device.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Keep everything local.
    push_to_hub=False,
)
training_args = TrainingArguments(**training_config)

In [None]:
# Wire the model, data splits, collator and metric function into a Trainer.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
print("\nStarting fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

# NOTE(review): the split evaluated here is the same "test" split passed as
# `eval_dataset` during training, so it also drives best-checkpoint selection.
# The reported score is therefore likely optimistic - consider a separate validation split.
print("\nEvaluating final model on the test set:")
eval_results = trainer.evaluate()
print(eval_results)