In [1]:
import pandas as pd
import numpy as np
import os
import re


In [2]:
! pip install datasets



In [3]:
from datasets import load_dataset

# Load Subsets
labeled_final = load_dataset("paws", "labeled_final")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
labeled_final

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})

In [5]:
# # Access train, test, and validation splits
# labeled_final_train = labeled_final["train"]
# labeled_final_test = labeled_final["test"]
# labeled_final_validation = labeled_final["validation"]

In [6]:
# Define text preprocessing function
def preprocess_text(data):
  for eachsent in range(len(data['sentence1'])):

    # 1. Remove extra spaces
    data['sentence1'][eachsent] = re.sub(r'\s+', ' ', data['sentence1'][eachsent].strip())

    # 2. Remove unwanted text (e.g., URLs, special characters, digits)
    data['sentence1'][eachsent] = re.sub(r"http\S+|www\S+|https\S+", '', data['sentence1'][eachsent])  # Remove URLs
    #data['sentence1'][eachsent] = re.sub(r'[^\w\s]', '', data['sentence1'][eachsent])  # Remove special characters (punctuation)

    # 3. Convert to lowercase
    data['sentence1'][eachsent] = data['sentence1'][eachsent].lower()

    # 4. Normalize text (e.g., contractions)
    contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}
    data['sentence1'][eachsent] = ' '.join([contractions[word] if word in contractions else word for word in data['sentence1'][eachsent].split()])


  for eachsent in range(len(data['sentence2'])):

    # 1. Remove extra spaces
    data['sentence2'][eachsent] = re.sub(r'\s+', ' ', data['sentence2'][eachsent].strip())

    # 2. Remove unwanted text (e.g., URLs, special characters, digits)
    data['sentence2'][eachsent] = re.sub(r"http\S+|www\S+|https\S+", '', data['sentence2'][eachsent])  # Remove URLs
    #data['sentence2'][eachsent] = re.sub(r'[^\w\s]', '', data['sentence2'][eachsent])  # Remove special characters (punctuation)

    # 3. Convert to lowercase
    data['sentence2'][eachsent] = data['sentence1'][eachsent].lower()

    # 4. Normalize text (e.g., contractions)
    contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}
    data['sentence2'][eachsent] = ' '.join([contractions[word] if word in contractions else word for word in data['sentence2'][eachsent].split()])

  return data

In [7]:
train_dataset = labeled_final["train"].map(preprocess_text, batched=True)
valid_dataset = labeled_final["validation"].map(preprocess_text, batched=True)
test_dataset = labeled_final["test"].map(preprocess_text, batched=True)

In [8]:
# train = labeled_final['train'].select(range(500))
# test = labeled_final['test'].select(range(50))
# valid = labeled_final['validation'].select(range(50))

train = train_dataset.select(range(1000))
valid = valid_dataset.select(range(50))
test = test_dataset.select(range(50))


In [9]:
train[0]

{'id': 1,
 'sentence1': 'in paris , in october 1560 , he secretly met the english ambassador , nicolas throckmorton , asking him for a passport to return to england through scotland .',
 'sentence2': 'in paris , in october 1560 , he secretly met the english ambassador , nicolas throckmorton , asking him for a passport to return to england through scotland .',
 'label': 0}

In [10]:
from google.colab import userdata
HF_TOKEN= userdata.get('HuggingFace')

from huggingface_hub import login
login(token=HF_TOKEN)

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

Since LLAMA3 pre-training doesn't have EOS token
* Set the pad_token_id to eos_token_id
* Set pad token ot eos_token

In [12]:
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# # Load the model
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)




In [13]:
!pip install -U bitsandbytes




In [14]:
# Incase of quantization
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model


# Define QLORA configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision
    bnb_4bit_use_double_quant=True,  # Double quantization for stability
    bnb_4bit_quant_type="nf4",  # Quantization type (e.g., NormalFloat4)
    bnb_4bit_compute_dtype=torch.float16  # Computation type
)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                          # Low-rank size
    lora_alpha=32,                # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers (specific to LLaMA)
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none",                  # No bias adaptation
    task_type="SEQ_CLS"         # Task type: causal language modeling ###'SEQ_CLS' ###CAUSAL_LM
)

# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2,  # Explicitly set for binary classification
    device_map="auto"  # Automatically distribute layers across available GPUs
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
#call the prepare_model_for_kbit_training() function to preprocess the quantized model for training.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)


#use the get_peft_model() function to create a PeftModel from the quantized model and configuration.
from peft import get_peft_model

model = get_peft_model(model, lora_config)

Update some model configs
* Must use .cache = False as below or it crashes from my experience

In [16]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [17]:



# # Wrap the model with LoRA
# model = get_peft_model(model, lora_config)

# # Freeze all model parameters except LoRA layers
# for param in model.parameters():
#     param.requires_grad = False

# # Enable gradients for LoRA layers
# for name, param in model.named_parameters():
#     if "lora" in name:
#         param.requires_grad = True

# # Check which parameters require gradients
# for name, param in model.named_parameters():
#     print(f"{name} requires_grad: {param.requires_grad}")

### Tokenizer

In [18]:
# Ensure tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Padding token added as [PAD].")

# Resize model embeddings if new token is added
if tokenizer.pad_token_id is not None and model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",  # Use padding
        truncation=True,
        max_length=128
    )

tokenized_train_dataset = train.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [19]:
tokenized_train_dataset

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [20]:
tokenized_test_dataset = test.map(tokenize_function, batched=True)
tokenized_valid_dataset = valid.map(tokenize_function, batched=True)

In [21]:
# Keep only the necessary features
tokenized_train_dataset = tokenized_train_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["id", "sentence1", "sentence2", "label"]
)
tokenized_train_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [22]:
# Keep only the necessary features
tokenized_valid_dataset = tokenized_valid_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["id", "sentence1", "sentence2", "label"]
)

tokenized_valid_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})

In [23]:
# # Keep only the necessary features
# tokenized_test_dataset = tokenized_test_dataset.map(
#     lambda examples: {
#         "input_ids": examples["input_ids"],
#         "attention_mask": examples["attention_mask"],
#         "labels": examples["label"]
#     },
#     remove_columns=["id", "sentence1", "sentence2", "label"]
# )

tokenized_test_dataset

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [24]:
# Data collator for padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
import os
# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"


In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_lora",         # Output directory for saved models
    learning_rate=2e-5,          # Learning rate
    per_device_train_batch_size=4, # Training batch size
    per_device_eval_batch_size=4, # Evaluation batch size
    num_train_epochs=5,            # Number of training epochs
    weight_decay=0.01,             # Weight decay
    #save_total_limit=2,            # Save only the 2 most recent models
    logging_dir="./logs",          # Log directory
    logging_steps=50,              # Log every 50 steps
    load_best_model_at_end=True,    # Load the best model at the end of training
    eval_strategy="epoch",   # Evaluate after each epoch
    save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
    #gradient_accumulation_steps=2, # Added gradient accumulation
    #fp16=False,                       # Enabled mixed precision training
    #gradient_checkpointing=True
)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [27]:
model.print_trainable_parameters()

trainable params: 856,064 || all params: 1,236,674,560 || trainable%: 0.0692


In [28]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a metric function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the class with the highest score

    # Calculate accuracy, precision, recall, and F1
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [29]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,  # Assuming you have a validation set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

  trainer = Trainer(


In [30]:
!nvidia-smi

Sat Jan 18 09:20:14 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0              33W /  70W |   2139MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [31]:
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7752,0.815398,0.46,0.470588,0.307692,0.372093
2,0.7849,0.760804,0.54,0.565217,0.5,0.530612
3,0.6826,0.738576,0.52,0.538462,0.538462,0.538462
4,0.6429,0.74395,0.52,0.5625,0.346154,0.428571
5,0.5896,0.742243,0.52,0.555556,0.384615,0.454545


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=1250, training_loss=0.7023217407226563, metrics={'train_runtime': 1345.8462, 'train_samples_per_second': 3.715, 'train_steps_per_second': 0.929, 'total_flos': 3740184084480000.0, 'train_loss': 0.7023217407226563, 'epoch': 5.0})

In [32]:
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.7385758757591248, 'eval_accuracy': 0.52, 'eval_precision': 0.5384615384615384, 'eval_recall': 0.5384615384615384, 'eval_f1': 0.5384615384615384, 'eval_runtime': 4.5666, 'eval_samples_per_second': 10.949, 'eval_steps_per_second': 2.847, 'epoch': 5.0}


In [33]:
test_results = trainer.predict(tokenized_test_dataset)
metrics = compute_metrics(test_results)
print(metrics)

{'accuracy': 0.56, 'precision': 0.4117647058823529, 'recall': 0.3684210526315789, 'f1': 0.3888888888888889}


In [34]:
# Generate predictions for the test set
predictions = test_results.predictions.argmax(-1)  # Predicted labels
true_labels = test_results.label_ids               # True labels

# Pair predictions with true labels
for i in range(len(predictions)):
    print(f"Sentence1: {tokenized_test_dataset[i]['sentence1']}")
    print(f"Sentence2: {tokenized_test_dataset[i]['sentence2']}")
    print(f"True Label: {true_labels[i]}, Predicted Label: {predictions[i]}\n")

Sentence1: this was a series of nested angular standards , so that measurements in azimuth and elevation could be done directly in polar coordinates relative to the ecliptic .
Sentence2: this was a series of nested angular standards , so that measurements in azimuth and elevation could be done directly in polar coordinates relative to the ecliptic .
True Label: 0, Predicted Label: 0

Sentence1: his father emigrated to missouri in 1868 but returned when his wife became ill and before the rest of the family could also go to america .
Sentence2: his father emigrated to missouri in 1868 but returned when his wife became ill and before the rest of the family could also go to america .
True Label: 0, Predicted Label: 0

Sentence1: in january 2011 , the deputy secretary general of fiba asia , hagop khajirian , inspected the venue together with sbp - president manuel v. pangilinan .
Sentence2: in january 2011 , the deputy secretary general of fiba asia , hagop khajirian , inspected the venue t