In [100]:
# !pip install --upgrade transformers
# !pip install --upgrade transformers accelerate peft bitsandbytes
# !pip install datasets
# !pip install scikit-learn
# !pip install evaluate

In [101]:
#import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import re
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate

In [102]:
labeled_final = load_dataset("glue", "mrpc")
labeled_final

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [103]:
# Define text preprocessing function
def preprocess_text(data):
    contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}

    def clean_sentence(sentence):
        # 1. Remove extra spaces
        sentence = re.sub(r'\s+', ' ', sentence.strip())

        # 2. Remove URLs
        sentence = re.sub(r"http\S+|www\S+|https\S+", '', sentence)

        # 3. Remove special characters and punctuation (except dots)
        sentence = re.sub(r"[^\w\s.]", '', sentence)

        # 4. Remove consecutive dots
        sentence = re.sub(r'\.{3,}', ' ', sentence)

        # 5. Convert to lowercase
        sentence = sentence.lower()

        # 6. Normalize contractions
        sentence = ' '.join([contractions[word] if word in contractions else word for word in sentence.split()])

        return sentence

    for eachsent in range(len(data['sentence1'])):
        data['sentence1'][eachsent] = clean_sentence(data['sentence1'][eachsent])

    for eachsent in range(len(data['sentence2'])):
        data['sentence2'][eachsent] = clean_sentence(data['sentence2'][eachsent])

    return data

In [104]:
train_dataset = labeled_final["train"].map(preprocess_text, batched=True)
valid_dataset = labeled_final["validation"].map(preprocess_text, batched=True)
test_dataset = labeled_final["test"].map(preprocess_text, batched=True)

In [105]:
# train = train_dataset.select(range(1000))
# valid = valid_dataset.select(range(100))
# test = test_dataset.select(range(50))
#---
train = train_dataset
valid = valid_dataset
test = test_dataset

In [106]:
train[0]

{'sentence1': 'amrozi accused his brother whom he called the witness of deliberately distorting his evidence .',
 'sentence2': 'referring to him as only the witness amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
HF_TOKEN=""
from huggingface_hub import login
login(token=HF_TOKEN)

In [108]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [109]:
# Incase of quantization
# import torch
# from transformers import BitsAndBytesConfig
# from peft import LoraConfig, get_peft_model


# Define QLORA configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision
    bnb_4bit_use_double_quant=True,  # Double quantization for stability
    bnb_4bit_quant_type="nf4",  # Quantization type (e.g., NormalFloat4)
    bnb_4bit_compute_dtype=torch.float16  # Computation type
)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,                          # Low-rank size
    lora_alpha=32,                # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers (specific to LLaMA)
    lora_dropout=0.1,             # Dropout for LoRA layers
    bias="none",                  # No bias adaptation
    task_type="SEQ_CLS"         # Task type: causal language modeling ###'SEQ_CLS' ###CAUSAL_LM
)

# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2,  # Explicitly set for binary classification
    device_map="auto"  # Automatically distribute layers across available GPUs
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [110]:
#call the prepare_model_for_kbit_training() function to preprocess the quantized model for training.
# from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)


#use the get_peft_model() function to create a PeftModel from the quantized model and configuration.
# from peft import get_peft_model

model = get_peft_model(model, lora_config)

In [111]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [112]:
# Ensure tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Padding token added as [PAD].")

# Resize model embeddings if new token is added
if tokenizer.pad_token_id is not None and model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding=True,  # Use padding
        truncation=True,
        max_length=128
    )

tokenized_train_dataset = train.map(tokenize_function, batched=True)

In [113]:
tokenized_train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 3668
})

In [114]:
tokenized_test_dataset = test.map(tokenize_function, batched=True)
tokenized_valid_dataset = valid.map(tokenize_function, batched=True)

In [115]:
# Keep only the necessary features
tokenized_train_dataset = tokenized_train_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["idx", "sentence1", "sentence2", "label"]
)
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3668
})

In [116]:
# Keep only the necessary features
tokenized_valid_dataset = tokenized_valid_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["idx", "sentence1", "sentence2", "label"]
)

tokenized_valid_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})

In [117]:
# Data collator for padding
# from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [118]:
training_args = TrainingArguments(
    output_dir="./results_lora",         # Output directory for saved models
    learning_rate=2e-5,          # Learning rate
    lr_scheduler_type="linear",
    per_device_train_batch_size=16, # Training batch size
    per_device_eval_batch_size=16, # Evaluation batch size
    num_train_epochs=20,            # Number of training epochs
    weight_decay=0.1,             # Weight decay
    warmup_steps=250,
    #save_total_limit=2,            # Save only the 2 most recent models
    logging_dir="./logs",          # Log directory
    logging_steps=25,              # Log every 50 steps
    load_best_model_at_end=True,    # Load the best model at the end of training
    eval_strategy="epoch",   # Evaluate after each epoch
    save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
    gradient_accumulation_steps=4, # Added gradient accumulation
    fp16=True,                       # Enabled mixed precision training
    gradient_checkpointing=True,
    max_grad_norm=1.0,
)

In [119]:
model.print_trainable_parameters()

trainable params: 1,708,032 || all params: 1,237,526,528 || trainable%: 0.1380


In [120]:

# # Define a metric function
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)  # Get the class with the highest score

#     # Calculate accuracy, precision, recall, and F1
#     accuracy = accuracy_score(labels, preds)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

#     return {
#         "accuracy": accuracy,
#         "precision": precision,
#         "recall": recall,
#         "f1": f1,
#     }

# Metrics
metric = evaluate.load("glue", "mrpc")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     results = metric.compute(predictions=predictions, references=labels)

#     print(f"Results keys: {results.keys()}") #add this line

#     accuracy = results["accuracy"]
#     f1 = results["f1"]
#     precision = results.get("precision", None)
#     recall = results.get("recall", None)

#     return {
#         "accuracy": accuracy,
#         "f1": f1,
#         "precision": precision,
#         "recall": recall,
#     }

In [121]:
# from transformers import EarlyStoppingCallback
# Early stopping
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)

In [122]:
# from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,  # Assuming you have a validation set
    #tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [123]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0183,0.870621,0.531863,0.623274
2,0.7805,0.743028,0.64951,0.758037
3,0.6946,0.648401,0.698529,0.795341
4,0.5778,0.556485,0.72549,0.80756
5,0.4944,0.500381,0.767157,0.846527
6,0.4626,0.458546,0.79902,0.864238
7,0.4075,0.467557,0.789216,0.86039
8,0.3632,0.43325,0.808824,0.866438
9,0.3452,0.444877,0.808824,0.865517
10,0.3086,0.438286,0.813725,0.866197


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

TrainOutput(global_step=638, training_loss=0.5182935010676847, metrics={'train_runtime': 802.0147, 'train_samples_per_second': 91.47, 'train_steps_per_second': 1.421, 'total_flos': 2.14746842370048e+16, 'train_loss': 0.5182935010676847, 'epoch': 11.0})

In [124]:
# Evaluate
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.43325018882751465, 'eval_accuracy': 0.8088235294117647, 'eval_f1': 0.8664383561643836, 'eval_runtime': 2.444, 'eval_samples_per_second': 166.938, 'eval_steps_per_second': 10.638, 'epoch': 11.0}
