# Starter Notebook

Install and import required libraries

In [3]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.15.1-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.5-py3-none-win_amd64.whl.metadata (5.1 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp39-cp39-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transfo

In [1]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer

  from .autonotebook import tqdm as notebook_tqdm


## Load Tokenizer and Preprocess Data

In [2]:
# Checkpoint name used for both the tokenizer and the student classifier below.
base_model = 'roberta-base'

# AG News train and test splits, loaded through `datasets.load_dataset`.
dataset = load_dataset('ag_news', split='train')
dataset_test = load_dataset('ag_news', split='test')

In [8]:
# Fetch the NLTK English stopword list and create a Porter stemmer.
# NOTE(review): clean_text below does not currently apply either of them.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
porter_stemmer = PorterStemmer()

# Data cleaning
def clean_text(example):
    """Strip markup, links, punctuation and digits from one example's text.

    The substitutions run in a fixed order: HTML-like tags are removed, then
    URLs, then every character that is neither a word character nor
    whitespace is replaced by a single space, and finally runs of digits are
    removed. Case is preserved; no stopword removal or stemming is applied.

    Args:
        example: Mapping with a "text" entry holding the raw string.

    Returns:
        A dict ``{"text": cleaned_string}``.
    """
    substitutions = (
        (r'<[^>]+>', ''),            # HTML-like tags
        (r'http\S+|www\.\S+', ''),   # URLs
        (r"[^\w\s]", " "),           # punctuation -> one space each
        (r"\d+", ""),                # digit runs
    )

    cleaned = example["text"]
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return {"text": cleaned}

# Apply clean_text example-by-example to both splits and keep the results
# under new names.
dataset_cleaned_train = dataset.map(clean_text)
dataset_cleaned_test = dataset_test.map(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Map: 100%|██████████| 120000/120000 [00:44<00:00, 2684.49 examples/s]
Map: 100%|██████████| 7600/7600 [00:02<00:00, 2666.20 examples/s]


In [4]:
tokenizer = RobertaTokenizer.from_pretrained(base_model)
def length_filter(example):
    """Tell whether an example's untruncated token count lies in [10, 256].

    Args:
        example: Mapping with a "text" entry.

    Returns:
        True when the number of input ids is between 10 and 256 inclusive.
    """
    # Tokenize without truncation so the full sequence length is measured.
    n_tokens = len(tokenizer(example["text"], truncation=False)["input_ids"])
    if n_tokens < 10:
        return False
    return n_tokens <= 256

# Keep only the examples accepted by length_filter (applied to both the train
# and the test split), then report how many examples remain in each.
dataset_cleaned_train = dataset_cleaned_train.filter(length_filter)
dataset_cleaned_test = dataset_cleaned_test.filter(length_filter)
print(f"Train dataset length: {len(dataset_cleaned_train)}")
print(f"Test dataset length: {len(dataset_cleaned_test)}")

Filter: 100%|██████████| 120000/120000 [00:51<00:00, 2321.87 examples/s]
Filter: 100%|██████████| 7600/7600 [00:03<00:00, 1975.34 examples/s]

Train dataset length: 119998
Test dataset length: 7600





In [5]:
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Tokenize a batch of texts with truncation to at most 128 tokens.

    Args:
        examples: Batch mapping whose 'text' entry holds the strings.

    Returns:
        The tokenizer output for the batch (called with ``padding=True``).
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=128,
    )

# Tokenize both cleaned splits in batches, dropping the raw "text" column, and
# rename "label" to "labels" (the name listed in TrainingArguments.label_names
# further down).
tokenized_dataset = dataset_cleaned_train.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset_test = dataset_cleaned_test.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset_test = tokenized_dataset_test.rename_column("label", "labels")

Map: 100%|██████████| 120000/120000 [00:59<00:00, 2018.50 examples/s]
Map: 100%|██████████| 7600/7600 [00:05<00:00, 1476.77 examples/s]


In [6]:
# Extract the number of classes and their names
num_labels = dataset_cleaned_train.features['label'].num_classes
class_names = dataset_cleaned_train.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping (class index -> class name).
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

# Collator built from the tokenizer; it returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up the configuration for the pretrained model and download it from Hugging Face.

## Anything from here on can be modified

In [7]:
# Split the original training set
# split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
# train_dataset = split_datasets['train']
# eval_dataset = split_datasets['test']
# NOTE(review): the tokenized AG News *test* split is used as the evaluation
# set here, so any checkpoint or hyper-parameter choice based on eval accuracy
# is tuned on test data. The commented-out split above would hold out 640
# training examples instead.
train_dataset = tokenized_dataset
eval_dataset = tokenized_dataset_test

## Setup LoRA Config
Set up the PEFT (LoRA) configuration and wrap the base model for fine-tuning.

In [8]:
# Instantiate the RoBERTa sequence classifier from the base checkpoint and
# attach the id2label mapping built above.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Bare expression: shows the module structure as the cell output.
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [9]:
# peft_model.unload()

# Per-layer LoRA settings for the self-attention projections of the 12 encoder
# layers: the first two layers use rank 4 / alpha 8, the remaining ten use
# rank 8 / alpha 16 (alpha / r == 2 everywhere). Dropout is 0.1 throughout.
ATTENTION_PROJECTIONS = ("query", "value", "key")
NUM_ENCODER_LAYERS = 12
LOW_RANK_LAYERS = (0, 1)

lora_configs = [
    {
        "target_modules": [
            f"layer.{layer_idx}.attention.self.{projection}"
            for projection in ATTENTION_PROJECTIONS
        ],
        "r": 4 if layer_idx in LOW_RANK_LAYERS else 8,
        "lora_alpha": 8 if layer_idx in LOW_RANK_LAYERS else 16,
        "lora_dropout": 0.1,
    }
    for layer_idx in range(NUM_ENCODER_LAYERS)
]

# Flatten the per-layer settings into per-module overrides.
per_module_rank = {
    module_name: layer_cfg["r"]
    for layer_cfg in lora_configs
    for module_name in layer_cfg["target_modules"]
}
per_module_alpha = {
    module_name: layer_cfg["lora_alpha"]
    for layer_cfg in lora_configs
    for module_name in layer_cfg["target_modules"]
}

# Build ONE adapter config covering every layer and wrap the base model once.
# Calling get_peft_model() in a loop (one call per layer config) wraps the same
# base model repeatedly, and the resulting peft_model only records the config
# of the last call, so checkpoints saved from it do not describe the adapters
# of the other layers. rank_pattern / alpha_pattern express the per-layer
# differences inside a single config instead.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=list(per_module_rank),
    rank_pattern=per_module_rank,
    alpha_pattern=per_module_alpha,
    lora_dropout=0.1,
    bias = 'none',
    task_type="SEQ_CLS",
    modules_to_save=["classifier"]
)
peft_model = get_peft_model(model, lora_config)



In [49]:
# print("Trainable parameters:")
# for name, param in peft_model.named_parameters():
#     if param.requires_grad:
#         print(name)

In [10]:
# Report the trainable vs. total parameter counts of the wrapped model.
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 999,172 || all params: 125,647,880 || trainable%: 0.7952


## Training Setup

In [11]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        A dict with a single 'accuracy' entry.
    """
    # The predicted class is the arg-max over the last axis of the scores.
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [15]:
# Setup Training args
# Directory name also used when the inference CSV is written at the end of the
# notebook.
output_dir = "results"

# Prefer the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
training_args = TrainingArguments(
    # NOTE(review): this literal duplicates `output_dir` instead of reusing it.
    output_dir="./results",
    # Evaluate and save a checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4, #2e-4
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=12,
    dataloader_num_workers=8,
    use_cpu=False,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    # Must match the column name produced by rename_column("label", "labels").
    label_names=["labels"],
    lr_scheduler_type="linear"
)

def get_trainer(model):
    """Build a Trainer for `model` from the notebook-level training setup.

    Uses the module-level `training_args`, `compute_metrics`, `train_dataset`,
    `eval_dataset` and `data_collator` as they are bound at call time.

    Args:
        model: The model to train.

    Returns:
        A configured `Trainer` instance.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    return Trainer(**trainer_kwargs)

Using device: cuda:0


### Start Training

In [16]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# peft_model.to(device)
# 
# 
# peft_lora_finetuning_trainer = get_trainer(peft_model)
# # peft_lora_finetuning_trainer.train(resume_from_checkpoint="./outputs/checkpoint-1000")
# result = peft_lora_finetuning_trainer.train()

Using device: cuda:0


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2308,0.221259,0.933289
2,0.2086,0.218507,0.935658
3,0.1853,0.206328,0.940263
4,0.1792,0.205982,0.941053
5,0.166,0.208451,0.942237
6,0.1588,0.202387,0.944868
7,0.1476,0.213131,0.944474
8,0.133,0.204317,0.946316
9,0.128,0.208751,0.949211
10,0.1237,0.214002,0.948026


In [23]:
import torch.nn.functional as F

# Teacher for knowledge distillation: roberta-large with a 4-way classification
# head, moved to `device` and put in eval mode.
# NOTE(review): the load warning recorded for this cell reports that the
# classifier head weights are newly initialized, and no fine-tuning of the
# teacher is visible in this notebook, so its logits may carry no task signal.
# Confirm that a fine-tuned teacher checkpoint is not what was intended here.
teacher_model = RobertaForSequenceClassification.from_pretrained("roberta-large",num_labels=4, id2label=id2label)
teacher_model = teacher_model.to(device)
teacher_model.eval()

def distillation_loss(student_logits, teacher_logits, labels, alpha=0.5, T=2.0):
    """Blend a temperature-softened KL term with cross-entropy on hard labels.

    Args:
        student_logits: Raw class scores produced by the student.
        teacher_logits: Raw class scores produced by the teacher.
        labels: Ground-truth class indices.
        alpha: Weight of the soft (distillation) term; the hard term gets
            ``1 - alpha``.
        T: Softmax temperature applied to both sets of logits.

    Returns:
        ``alpha * soft_term + (1 - alpha) * hard_term`` as a scalar tensor.
    """
    student_log_probs = F.log_softmax(student_logits / T, dim=-1)
    teacher_probs = F.softmax(teacher_logits / T, dim=-1)

    # KL divergence summed over classes and averaged over the batch, rescaled
    # by T^2.
    kd_term = (T ** 2) * F.kl_div(student_log_probs, teacher_probs, reduction="batchmean")

    # Standard cross-entropy against the hard labels (no temperature).
    ce_term = F.cross_entropy(student_logits, labels)

    return alpha * kd_term + (1 - alpha) * ce_term

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim


def evaluate_model_TS(model, eval_dataloader):
    """Measure classification accuracy of `model` over a labelled dataloader.

    The model is switched to eval mode and run without gradient tracking on
    the module-level `device`.

    Args:
        model: Model whose output exposes `.logits`.
        eval_dataloader: Iterable of batches with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        The accuracy over all batches (it is also printed).
    """
    model.eval()
    preds = []
    labels = []

    # Gradients are never needed here, so disable autograd for the whole pass.
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            output = model(input_ids=inputs, attention_mask=attention_mask)

            preds.extend(torch.argmax(output.logits, dim=-1).cpu().tolist())
            # Labels are only compared on the host, so they do not need a
            # round trip through the device.
            labels.extend(batch['labels'].tolist())

    accuracy = accuracy_score(labels, preds)
    print(f"Evaluation Accuracy: {accuracy}")
    return accuracy
    
def preprocess_data(examples):
    """Tokenize a batch to a fixed length of 128 tokens and attach its labels.

    Args:
        examples: Batch mapping with "text" (strings) and "label" entries.

    Returns:
        The tokenizer encodings with an added "labels" entry copied from
        ``examples["label"]``.
    """
    # truncation=True is needed alongside padding="max_length": without it,
    # texts longer than 128 tokens keep their full length, producing rows of
    # unequal size that the default DataLoader collation cannot stack.
    encodings = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    encodings["labels"] = examples["label"]
    return encodings


# NOTE: this rebinds train_dataset / eval_dataset (previously the datasets
# handed to get_trainer) to the encodings produced by preprocess_data, which
# are the ones consumed by the manual distillation loop.
train_dataset = dataset_cleaned_train.map(preprocess_data, batched=True)
eval_dataset = dataset_cleaned_test.map(preprocess_data, batched=True)


Map: 100%|██████████| 120000/120000 [00:41<00:00, 2890.41 examples/s]
Map: 100%|██████████| 7600/7600 [00:03<00:00, 2328.83 examples/s]


In [44]:
from tqdm import tqdm
num_epochs = 20
# Expose only the model inputs and labels, as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Accuracy does not depend on batch order, so evaluation is not shuffled.
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False)
peft_model = peft_model.to(device)
# Hand the optimizer only the parameters that are actually trainable; frozen
# base-model weights never receive gradients.
trainable_params = [p for p in peft_model.parameters() if p.requires_grad]
optimizer = optim.AdamW(trainable_params, lr=5e-4)
# Linear decay to zero over all optimizer steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

for epoch in range(num_epochs):
    peft_model.train()
    total_loss = 0
    for idx, batch in enumerate(tqdm(train_dataloader)):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The teacher only provides targets; no gradients flow through it.
        with torch.no_grad():
            teacher_logits = teacher_model(input_ids=inputs, attention_mask=attention_mask).logits

        student_logits = peft_model(input_ids=inputs, attention_mask=attention_mask).logits

        loss = distillation_loss(student_logits, teacher_logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
    # Save the adapter after every epoch; the directory suffix is the 1-based epoch.
    peft_model.save_pretrained(f"./results/checkpoint-{epoch+1}")

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")
    evaluate_model_TS(peft_model, eval_dataloader)



100%|██████████| 3750/3750 [26:55<00:00,  2.32it/s]


Epoch 1, Loss: 0.4221450679620107
Evaluation Accuracy: 0.9378947368421052


100%|██████████| 3750/3750 [26:57<00:00,  2.32it/s]


Epoch 2, Loss: 0.4193462567090988
Evaluation Accuracy: 0.9411842105263157


100%|██████████| 3750/3750 [26:58<00:00,  2.32it/s]


Epoch 3, Loss: 0.41787741750876106
Evaluation Accuracy: 0.9403947368421053


100%|██████████| 3750/3750 [26:57<00:00,  2.32it/s]


Epoch 4, Loss: 0.41572676539421083
Evaluation Accuracy: 0.9356578947368421


100%|██████████| 3750/3750 [26:58<00:00,  2.32it/s]


Epoch 5, Loss: 0.4146461878299713
Evaluation Accuracy: 0.9434210526315789


100%|██████████| 3750/3750 [27:03<00:00,  2.31it/s]


Epoch 6, Loss: 0.41322443083922067
Evaluation Accuracy: 0.9425


100%|██████████| 3750/3750 [58:34<00:00,  1.07it/s]


Epoch 7, Loss: 0.4115957844336828
Evaluation Accuracy: 0.9442105263157895


100%|██████████| 3750/3750 [1:06:29<00:00,  1.06s/it]


Epoch 8, Loss: 0.4105415784200033
Evaluation Accuracy: 0.9463157894736842


100%|██████████| 3750/3750 [1:05:19<00:00,  1.05s/it]


Epoch 9, Loss: 0.4092999792257945
Evaluation Accuracy: 0.9413157894736842


100%|██████████| 3750/3750 [1:07:01<00:00,  1.07s/it]


Epoch 10, Loss: 0.40818362799485525
Evaluation Accuracy: 0.9447368421052632


100%|██████████| 3750/3750 [1:06:42<00:00,  1.07s/it]


Epoch 11, Loss: 0.40670950167973835
Evaluation Accuracy: 0.9448684210526316


100%|██████████| 3750/3750 [1:05:23<00:00,  1.05s/it]


Epoch 12, Loss: 0.40593271228472394
Evaluation Accuracy: 0.9453947368421053


100%|██████████| 3750/3750 [1:07:02<00:00,  1.07s/it]


Epoch 13, Loss: 0.40477193874518075
Evaluation Accuracy: 0.9468421052631579


100%|██████████| 3750/3750 [1:06:58<00:00,  1.07s/it]


Epoch 14, Loss: 0.40356317480405174
Evaluation Accuracy: 0.9460526315789474


100%|██████████| 3750/3750 [1:05:25<00:00,  1.05s/it]


Epoch 15, Loss: 0.4022507468779882
Evaluation Accuracy: 0.9465789473684211


100%|██████████| 3750/3750 [1:07:04<00:00,  1.07s/it]


Epoch 16, Loss: 0.4012085372130076
Evaluation Accuracy: 0.9467105263157894


100%|██████████| 3750/3750 [1:06:02<00:00,  1.06s/it]


Epoch 17, Loss: 0.4000289070447286
Evaluation Accuracy: 0.9468421052631579


100%|██████████| 3750/3750 [1:06:26<00:00,  1.06s/it]


Epoch 18, Loss: 0.3994322786887487
Evaluation Accuracy: 0.9453947368421053


 17%|█▋        | 653/3750 [11:40<55:23,  1.07s/it]  


KeyboardInterrupt: 

In [57]:
# peft_model = PeftModel.from_pretrained(model, "./results/checkpoint-5")
# Rebuild the evaluation loader and score the student model currently in memory.
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# NOTE(review): shuffle=True is not needed for evaluation; evaluate_model_TS
# collects predictions and labels batch by batch, so order does not matter.
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=True)
evaluate_model_TS(peft_model, eval_dataloader)

Evaluation Accuracy: 0.9468421052631579


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Use the following function to run inference on custom inputs.

In [66]:
def classify(model, tokenizer, text):
    """Predict the class of a single text, print it and return its label name.

    The text is tokenized as given. NOTE(review): the clean_text normalization
    applied to the training data is not applied here — confirm that is intended.

    Args:
        model: Sequence classifier whose output exposes `.logits`.
        tokenizer: Tokenizer matching `model`.
        text: One input string (a single prediction is extracted via `.item()`).

    Returns:
        The label name looked up in the module-level `id2label` mapping.
    """
    # Run on the device the model already lives on rather than guessing from
    # CUDA availability, so inputs and weights can never end up on different
    # devices.
    device = next(model.parameters()).device
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)

    # Inference must not use dropout or track gradients; restore the previous
    # train/eval mode afterwards so calling this mid-training has no side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        model.train(was_training)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [94]:
# In a normal string literal "\b" is the backspace control character (the
# recorded output shows its effect: "dwindlinand"). The backslash is escaped
# so the sample text is passed through literally.
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


 Class: 0, Label: World, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [21]:
# from torch.utils.data import DataLoader
# import evaluate
# from tqdm import tqdm
# 
# def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
#     """
#     Evaluate a PEFT model on a dataset.
# 
#     Args:
#         inference_model: The model to evaluate.
#         dataset: The dataset (Hugging Face Dataset) to run inference on.
#         labelled (bool): If True, the dataset includes labels and metrics will be computed.
#                          If False, only predictions will be returned.
#         batch_size (int): Batch size for inference.
#         data_collator: Function to collate batches. If None, the default collate_fn is used.
# 
#     Returns:
#         If labelled is True, returns a tuple (metrics, predictions)
#         If labelled is False, returns the predictions.
#     """
#     # Create the DataLoader
#     eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
#     
# 
#     inference_model.to(device)
#     inference_model.eval()
# 
#     all_predictions = []
#     if labelled:
#         metric = evaluate.load('accuracy')
# 
#     # Loop over the DataLoader
#     for batch in tqdm(eval_dataloader):
#         # Move each tensor in the batch to the device
#         batch = {k: v.to(device) for k, v in batch.items()}
#         with torch.no_grad():
#             outputs = inference_model(**batch)
#         predictions = outputs.logits.argmax(dim=-1)
#         all_predictions.append(predictions.cpu())
# 
#         if labelled:
#             # Expecting that labels are provided under the "labels" key.
#             references = batch["labels"]
#             metric.add_batch(
#                 predictions=predictions.cpu().numpy(),
#                 references=references.cpu().numpy()
#             )
# 
#     # Concatenate predictions from all batches
#     all_predictions = torch.cat(all_predictions, dim=0)
# 
#     if labelled:
#         eval_metric = metric.compute()
#         print("Evaluation Metric:", eval_metric)
#         return eval_metric, all_predictions
#     else:
#         return all_predictions

In [None]:
# Check evaluation accuracy
# dataset_cleaned_test = dataset_test.map(clean_text)
# eval_dataset1 = dataset_cleaned_test.map(preprocess, batched=True, remove_columns=["text"])
# eval_dataset1 = eval_dataset1.rename_column("label", "labels")
# peft_model = PeftModel.from_pretrained(model, "./results/checkpoint-67500")
# _, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

### Run Inference on unlabelled dataset

In [67]:
#Load your unlabelled data
import pickle
import pandas as pd
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open("test_unlabelled38.pkl", "rb") as f:
    unlabelled_dataset = pickle.load(f)
# unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
# Bare expression: shows the loaded object as the cell output.
unlabelled_dataset


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [23]:
# Run inference and save predictions
# peft_model = PeftModel.from_pretrained(model, "./results/checkpoint-67500")
# unlabelled_dataset = unlabelled_dataset.map(clean_text)
# test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
# preds = evaluate_model_TS(peft_model, test_dataset, False, 8, data_collator)
# df_output = pd.DataFrame({
#     'ID': range(len(preds)),
#     'Label': preds.numpy()  # or preds.tolist()
# })
# df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
# print("Inference complete. Predictions saved to inference_output.csv")

Map: 100%|██████████| 8000/8000 [00:00<00:00, 11337.45 examples/s]
Map: 100%|██████████| 8000/8000 [00:04<00:00, 1851.86 examples/s]
100%|██████████| 1000/1000 [00:43<00:00, 22.91it/s]

Inference complete. Predictions saved to inference_output.csv





In [73]:
def preprocess_data1(examples):
    """Tokenize a batch of unlabelled texts to a fixed length of 128 tokens.

    Args:
        examples: Batch mapping whose "text" entry holds the strings.

    Returns:
        The tokenizer encodings for the batch.
    """
    # Pad to max_length rather than to the longest item of each map() batch:
    # map batches and DataLoader batches do not line up, so per-map-batch
    # padding can put rows of different length into one DataLoader batch,
    # which the default collation cannot stack.
    encodings = tokenizer(examples["text"],  truncation=True, padding="max_length", max_length=128)
    return encodings


# Clean and tokenize the unlabelled texts the same way as the training data.
unlabelled_dataset = unlabelled_dataset.map(clean_text)
taggle_dataset = unlabelled_dataset.map(preprocess_data1, batched=True,  remove_columns=["text"])

taggle_dataset.set_format("torch", columns=["input_ids", "attention_mask"])
# Keep the original order so that row i of the output corresponds to example i.
taggle_dataloader = DataLoader(taggle_dataset, batch_size=32, shuffle=False)
# Load the adapter saved by the distillation loop after epoch 17.
# NOTE(review): `model` is the same object that was already wrapped by
# get_peft_model earlier in this notebook, not a freshly loaded base model.
# Confirm the checkpoint also loads correctly after a kernel restart.
peft_model = PeftModel.from_pretrained(model, "./results/checkpoint-17")
peft_model.eval()


Map: 100%|██████████| 8000/8000 [00:00<00:00, 10595.30 examples/s]
Map: 100%|██████████| 8000/8000 [00:05<00:00, 1578.83 examples/s]


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-1): 2 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Modul

In [74]:
import numpy as np
# Collect the arg-max class index for every unlabelled example, batch by batch.
preds = []
for idx, batch in enumerate(taggle_dataloader):
    inputs = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Inference only: no gradient tracking.
    with torch.no_grad():
        output = peft_model(input_ids=inputs, attention_mask=attention_mask)
    
    # Move predictions back to the host before accumulating them.
    preds.extend(torch.argmax(output.logits, dim=-1).cpu().numpy())
    


In [75]:
# One row per unlabelled example: ID is the row position in the dataloader
# order, Label is the predicted class index.
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': np.asarray(preds)  # or preds.tolist()
})
# Make sure the target directory exists, and report the path that was actually
# written (the message previously named a different file).
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "inference_output04181.csv")
df_output.to_csv(output_path, index=False)
print(f"Inference complete. Predictions saved to {output_path}")

Inference complete. Predictions saved to inference_output.csv
