In [21]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
from sklearn.metrics import accuracy_score
import pickle

In [67]:
#use GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Tokenizer and Preprocess Data
dataset = load_dataset(
    "parquet",
    data_files={
        "train": r"C:\Users\ymjr1\Desktop\Python course\ag_news\train-ag.parquet",
        "test": r"C:\Users\ymjr1\Desktop\Python course\ag_news\test-ag.parquet"
    }
)

tokenizer = AutoTokenizer.from_pretrained(r"C:\Users\ymjr1\Desktop\Python course\roberta-base")

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
    return tokenized
tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
#Tokenizer
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Using device: cpu


In [68]:
#the number of classess and names
num_labels = dataset["train"].features["label"].num_classes
class_names = dataset["train"].features["label"].names

In [69]:
#load pre_trained model
model = AutoModelForSequenceClassification.from_pretrained(
    r"C:\Users\ymjr1\Desktop\Python course\roberta-base",
    num_labels=num_labels
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at C:\Users\ymjr1\Desktop\Python course\roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
# PEFT Config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias = 'none',
    target_modules = ['query', 'value'],
    task_type="SEQ_CLS",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


In [71]:
#evaluation accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy}

In [72]:
#training args
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    optim="adamw_torch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_dir="./logs"
)

In [80]:
#train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1697,0.182493,0.945
2,0.1536,0.180086,0.947368
3,0.132,0.17955,0.948816


TrainOutput(global_step=22500, training_loss=0.15077081231011286, metrics={'train_runtime': 52277.2384, 'train_samples_per_second': 6.886, 'train_steps_per_second': 0.43, 'total_flos': 2.392609480704e+16, 'train_loss': 0.15077081231011286, 'epoch': 3.0})

In [82]:
#load test data
unlabelled_dataset = pd.read_pickle("C:\\Users\\ymjr1\\Desktop\\Python course\\ag_news\\test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [86]:
#Tokenizer
from torch.utils.data import DataLoader
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
test_dataloader = DataLoader(test_dataset, batch_size=64)

In [95]:
#prediction and save
import numpy as np
model.eval()
preds = [ ]
for batch in test_dataloader:
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        preds.append(predictions.cpu())

preds_np = torch.cat(preds).numpy()
df_output = pd.DataFrame({
    'ID': list(range(len(preds_np))),
    'Label': preds_np
})
df_output.to_csv("output.csv", index=False)
