In [1]:
from datetime import datetime
start_time = datetime.now()

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0


In [3]:
import pandas as pd
import numpy as np
import os
import re


In [4]:
! pip install datasets



In [5]:
from datasets import load_dataset

# Load PAWS Subsets
#labeled_final = load_dataset("paws", "labeled_final")

#load MRPC Dataset
labeled_final = load_dataset("glue", "mrpc")

In [6]:
labeled_final

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [7]:
# # Access train, test, and validation splits
# labeled_final_train = labeled_final["train"]
# labeled_final_test = labeled_final["test"]
# labeled_final_validation = labeled_final["validation"]

In [8]:
# Define text preprocessing function
def preprocess_text(data):
  for eachsent in range(len(data['sentence1'])):

    # 1. Remove extra spaces
    data['sentence1'][eachsent] = re.sub(r'\s+', ' ', data['sentence1'][eachsent].strip())

    # 2. Remove unwanted text (e.g., URLs, special characters, digits)
    data['sentence1'][eachsent] = re.sub(r"http\S+|www\S+|https\S+", '', data['sentence1'][eachsent])  # Remove URLs
    #data['sentence1'][eachsent] = re.sub(r'[^\w\s]', '', data['sentence1'][eachsent])  # Remove special characters (punctuation)

    # 3. Convert to lowercase
    data['sentence1'][eachsent] = data['sentence1'][eachsent].lower()

    # 4. Normalize text (e.g., contractions)
    contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}
    data['sentence1'][eachsent] = ' '.join([contractions[word] if word in contractions else word for word in data['sentence1'][eachsent].split()])


  for eachsent in range(len(data['sentence2'])):

    # 1. Remove extra spaces
    data['sentence2'][eachsent] = re.sub(r'\s+', ' ', data['sentence2'][eachsent].strip())

    # 2. Remove unwanted text (e.g., URLs, special characters, digits)
    data['sentence2'][eachsent] = re.sub(r"http\S+|www\S+|https\S+", '', data['sentence2'][eachsent])  # Remove URLs
    #data['sentence2'][eachsent] = re.sub(r'[^\w\s]', '', data['sentence2'][eachsent])  # Remove special characters (punctuation)

    # 3. Convert to lowercase
    data['sentence2'][eachsent] = data['sentence2'][eachsent].lower()

    # 4. Normalize text (e.g., contractions)
    contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}
    data['sentence2'][eachsent] = ' '.join([contractions[word] if word in contractions else word for word in data['sentence2'][eachsent].split()])

  return data

In [9]:
train_dataset = labeled_final["train"].map(preprocess_text, batched=True)
valid_dataset = labeled_final["validation"].map(preprocess_text, batched=True)
test_dataset = labeled_final["test"].map(preprocess_text, batched=True)

In [10]:
# train = labeled_final['train'].select(range(500))
# test = labeled_final['test'].select(range(50))
# valid = labeled_final['validation'].select(range(50))
#---

train = train_dataset.select(range(1000))
valid = valid_dataset.select(range(50))
test = test_dataset.select(range(50))
#---
# train = train_dataset
# valid = valid_dataset
# test = test_dataset

In [11]:
train[0]

{'sentence1': 'amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'referring to him as only " the witness " , amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
# from google.colab import userdata
# HF_TOKEN= userdata.get('HuggingFace')
HF_TOKEN=""
from huggingface_hub import login
login(token=HF_TOKEN)

In [13]:
!pip install -U transformers
!pip install accelerate>=0.26.0



In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from accelerate import init_empty_weights

In [15]:
# with init_empty_weights():
#     model = AutoModelForSequenceClassification.from_pretrained(model_name)

# model = model.half().cuda()  # Use half-precision and move to GPU

Since LLAMA3 pre-training doesn't have EOS token
* Set the pad_token_id to eos_token_id
* Set pad token ot eos_token

In [16]:
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                                           num_labels=2, 
                                                           device_map="auto")




Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Update some model configs
* Must use .cache = False as below or it crashes from my experience

In [17]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [18]:



# # Wrap the model with LoRA
# model = get_peft_model(model, lora_config)

# # Freeze all model parameters except LoRA layers
# for param in model.parameters():
#     param.requires_grad = False

# # Enable gradients for LoRA layers
# for name, param in model.named_parameters():
#     if "lora" in name:
#         param.requires_grad = True

# # Check which parameters require gradients
# for name, param in model.named_parameters():
#     print(f"{name} requires_grad: {param.requires_grad}")

### Tokenizer

In [19]:
# Ensure tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Padding token added as [PAD].")

# Resize model embeddings if new token is added
if tokenizer.pad_token_id is not None and model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",  # Use padding
        truncation=True,
        max_length=128
    )

tokenized_train_dataset = train.map(tokenize_function, batched=True)

In [20]:
tokenized_train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [21]:
tokenized_test_dataset = test.map(tokenize_function, batched=True)
tokenized_valid_dataset = valid.map(tokenize_function, batched=True)

In [22]:
# Keep only the necessary features
tokenized_train_dataset = tokenized_train_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["idx", "sentence1", "sentence2", "label"]
)
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [23]:
# Keep only the necessary features
tokenized_valid_dataset = tokenized_valid_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["idx", "sentence1", "sentence2", "label"]
)

tokenized_valid_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})

In [24]:
# # Keep only the necessary features
# tokenized_test_dataset = tokenized_test_dataset.map(
#     lambda examples: {
#         "input_ids": examples["input_ids"],
#         "attention_mask": examples["attention_mask"],
#         "labels": examples["label"]
#     },
#     remove_columns=["id", "sentence1", "sentence2", "label"]
# )

tokenized_test_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [25]:
# Data collator for padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
import os
# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"


In [27]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_base_model",         # Output directory for saved models
    learning_rate=2e-5,               # Learning rate
    per_device_train_batch_size=1,   # Training batch size
    per_device_eval_batch_size=1,   # Evaluation batch size
    num_train_epochs=10,            # Number of training epochs
    weight_decay=0.01,             # Weight decay
    #save_total_limit=2,            # Save only the 2 most recent models
    logging_dir="./logs",          # Log directory
    logging_steps=50,              # Log every 50 steps
    load_best_model_at_end=True,    # Load the best model at the end of training
    eval_strategy="epoch",          # Evaluate after each epoch
    save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
    gradient_accumulation_steps=48, # Added gradient accumulation
    fp16=True,                       # Enabled mixed precision training
    gradient_checkpointing=True,
    report_to="none", 
)



In [28]:
# model.print_trainable_parameters()

In [29]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a metric function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the class with the highest score

    # Calculate accuracy, precision, recall, and F1
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [30]:
import torch
torch.cuda.empty_cache()


In [31]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   4714 MiB |   4714 MiB |   4714 MiB |    512 B   |
|       from large pool |   4714 MiB |   4714 MiB |   4714 MiB |      0 B   |
|       from small pool |      0 MiB |      0 MiB |      0 MiB |    512 B   |
|---------------------------------------------------------------------------|
| Active memory         |   4714 MiB |   4714 MiB |   4714 MiB |    512 B   |
|       from large pool |   4714 MiB |   4714 MiB |   4714 MiB |      0 B   |
|       from small pool |      0 MiB |      0 MiB |      0 MiB |    512 B   |
|---------------------------------------------------------------

In [32]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,  # Assuming you have a validation set
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

  trainer = Trainer(


In [33]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 21.98 GiB of which 34.44 MiB is free. Including non-PyTorch memory, this process has 21.93 GiB memory in use. Of the allocated memory 21.61 GiB is allocated by PyTorch, and 15.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
results = trainer.evaluate()
print("Evaluation Results:", results)

In [None]:
test_results = trainer.predict(tokenized_test_dataset)
metrics = compute_metrics(test_results)
print(metrics)

In [None]:
# Generate predictions for the test set
predictions = test_results.predictions.argmax(-1)  # Predicted labels
true_labels = test_results.label_ids               # True labels

# Pair predictions with true labels
for i in range(len(predictions)):
    print(f"Sentence1: {tokenized_test_dataset[i]['sentence1']}")
    print(f"Sentence2: {tokenized_test_dataset[i]['sentence2']}")
    print(f"True Label: {true_labels[i]}, Predicted Label: {predictions[i]}\n")