In [40]:
import pandas as pd
import numpy as np
import os
import re


In [41]:
! pip install datasets



In [42]:
from datasets import load_dataset

# Load Subsets
labeled_final = load_dataset("paws", "labeled_final")

In [43]:
labeled_final

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})

In [44]:
train = labeled_final['train'].select(range(500))
test = labeled_final['test'].select(range(50))
valid = labeled_final['validation'].select(range(50))

In [45]:
train[0]

{'id': 1,
 'sentence1': 'In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland .',
 'sentence2': 'In October 1560 , he secretly met with the English ambassador , Nicolas Throckmorton , in Paris , and asked him for a passport to return to Scotland through England .',
 'label': 0}

In [46]:
from google.colab import userdata
HF_TOKEN= userdata.get('HuggingFace')

from huggingface_hub import login
login(token=HF_TOKEN)

In [47]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [48]:
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)




Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# Ensure tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Padding token added as [PAD].")

# Resize model embeddings if new token is added
if tokenizer.pad_token_id is not None and model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",  # Use padding
        truncation=True,
        max_length=128
    )

tokenized_train_dataset = train.map(tokenize_function, batched=True)

Padding token added as [PAD].


In [50]:
tokenized_train_dataset

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'attention_mask'],
    num_rows: 500
})

In [51]:
tokenized_test_dataset = test.map(tokenize_function, batched=True)
tokenized_valid_dataset = valid.map(tokenize_function, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [52]:
# Keep only the necessary features
tokenized_train_dataset = tokenized_train_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["id", "sentence1", "sentence2", "label"]
)
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [53]:
# Keep only the necessary features
tokenized_valid_dataset = tokenized_valid_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["id", "sentence1", "sentence2", "label"]
)

tokenized_valid_dataset

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})

In [54]:
# # Keep only the necessary features
# tokenized_test_dataset = tokenized_test_dataset.map(
#     lambda examples: {
#         "input_ids": examples["input_ids"],
#         "attention_mask": examples["attention_mask"],
#         "labels": examples["label"]
#     },
#     remove_columns=["id", "sentence1", "sentence2", "label"]
# )

tokenized_test_dataset

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [55]:
# import os
# # disable Weights and Biases
# os.environ['WANDB_DISABLED']="true"


In [56]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_lora",         # Output directory for saved models
    eval_strategy="epoch",   # Evaluate after each epoch
    learning_rate=2e-5,          # Learning rate
    per_device_train_batch_size=4, # Training batch size
    per_device_eval_batch_size=4, # Evaluation batch size
    num_train_epochs=2,            # Number of training epochs
    weight_decay=0.01,             # Weight decay
    save_total_limit=2,            # Save only the 2 most recent models
    logging_dir="./logs",          # Log directory
    logging_steps=50,              # Log every 50 steps
    load_best_model_at_end=True,    # Load the best model at the end of training
    save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
    #gradient_accumulation_steps=2, # Added gradient accumulation
    fp16=True,                       # Enabled mixed precision training
    #gradient_checkpointing=True
)



In [57]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                          # Low-rank size
    lora_alpha=32,                # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers (specific to LLaMA)
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none",                  # No bias adaptation
    task_type="CAUSAL_LM"         # Task type: causal language modeling
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

In [58]:
model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,672,512 || trainable%: 0.0689


In [59]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a metric function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the class with the highest score

    # Calculate accuracy, precision, recall, and F1
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [60]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,  # Assuming you have a validation set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [61]:
!nvidia-smi

Thu Jan 16 09:20:35 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0              30W /  70W |  12457MiB / 15360MiB |     17%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [62]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0292,1.042668,0.44,0.472222,0.653846,0.548387
2,1.1094,0.965993,0.44,0.464286,0.5,0.481481




TrainOutput(global_step=126, training_loss=1.4365655505467976, metrics={'train_runtime': 81.2406, 'train_samples_per_second': 12.309, 'train_steps_per_second': 1.551, 'total_flos': 748033671168000.0, 'train_loss': 1.4365655505467976, 'epoch': 2.0})

In [63]:
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.965993344783783, 'eval_accuracy': 0.44, 'eval_precision': 0.4642857142857143, 'eval_recall': 0.5, 'eval_f1': 0.48148148148148145, 'eval_runtime': 1.0181, 'eval_samples_per_second': 49.113, 'eval_steps_per_second': 6.876, 'epoch': 2.0}


In [64]:
test_results = trainer.predict(tokenized_test_dataset)
metrics = compute_metrics(test_results)
print(metrics)

{'accuracy': 0.56, 'precision': 0.4, 'recall': 0.3157894736842105, 'f1': 0.35294117647058826}


In [65]:
# Generate predictions for the test set
predictions = test_results.predictions.argmax(-1)  # Predicted labels
true_labels = test_results.label_ids               # True labels

# Pair predictions with true labels
for i in range(len(predictions)):
    print(f"Sentence1: {tokenized_test_dataset[i]['sentence1']}")
    print(f"Sentence2: {tokenized_test_dataset[i]['sentence2']}")
    print(f"True Label: {true_labels[i]}, Predicted Label: {predictions[i]}\n")

Sentence1: This was a series of nested angular standards , so that measurements in azimuth and elevation could be done directly in polar coordinates relative to the ecliptic .
Sentence2: This was a series of nested polar scales , so that measurements in azimuth and elevation could be performed directly in angular coordinates relative to the ecliptic .
True Label: 0, Predicted Label: 1

Sentence1: His father emigrated to Missouri in 1868 but returned when his wife became ill and before the rest of the family could also go to America .
Sentence2: His father emigrated to America in 1868 , but returned when his wife became ill and before the rest of the family could go to Missouri .
True Label: 0, Predicted Label: 0

Sentence1: In January 2011 , the Deputy Secretary General of FIBA Asia , Hagop Khajirian , inspected the venue together with SBP - President Manuel V. Pangilinan .
Sentence2: In January 2011 , FIBA Asia deputy secretary general Hagop Khajirian along with SBP president Manuel V