In [None]:
# !pip install --upgrade transformers
# !pip install --upgrade transformers accelerate peft bitsandbytes
# !pip install datasets
# !pip install scikit-learn
# !pip install evaluate
##!pip install tf-keras
#!pip install wandb
#import os

# Set the WIND_API_KEY environment variable
#os.environ['WIND_API_KEY'] = ''

In [33]:
#import optuna
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import re
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate

In [34]:
labeled_final = load_dataset("glue", "mrpc")
labeled_final

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [35]:
# Define a function to remove noise
def remove_noise(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove special patterns like "< .SPX >", "< .IXIC >"
    text = re.sub(r'< \.[A-Z]+ >', '', text)
    
    # Remove ellipsis (...)
    text = re.sub(r'\.\s*\.\s*\.+', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove special characters like "â€™", "Â½", "Â£", etc.
    text = re.sub(r'[â€™Â½Â£]', '', text)
    
    # Remove single alphabets (e.g., "C")
    text = re.sub(r'\b\w\b', '', text)
    
    # Remove equal (=) sign at the end
    text = re.sub(r'=$', '', text)
    
    # Remove double hyphens (--)
    text = re.sub(r'--+', '', text)
    
    # Remove unwanted quotes
    text = re.sub(r'["“”]', '', text)

    # Fix short words with apostrophes (e.g., "'re" -> "are")
    text = re.sub(r"\s+'re\b", " are", text)
    text = re.sub(r"\b're\b", "are", text)
    text = re.sub(r"\b've\b", "have", text)
    text = re.sub(r"\b'll\b", "will", text)
    text = re.sub(r"\b'd\b", "would", text)
    text = re.sub(r"\b'm\b", "am", text)
    text = re.sub(r"\b's\b", "is", text)
    text = re.sub(r"\b'n\b", "and", text)
    text = text.lower()
    
    return text

def remove_noise_batch(examples):
    examples["cleaned_sentence1"] = [remove_noise(sentence) for sentence in examples["sentence1"]]
    examples["cleaned_sentence2"] = [remove_noise(sentence) for sentence in examples["sentence2"]]
    return examples


In [36]:
# dataset=labeled_final.map(remove_noise_batch, batched=True)
# dataset

In [37]:
# # Define text preprocessing function
# def preprocess_text(data):
#     contractions = {"can't": "cannot", "won't": "will not", "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"}

#     def clean_sentence(sentence):
#         # 1. Remove extra spaces
#         sentence = re.sub(r'\s+', ' ', sentence.strip())

#         # 2. Remove URLs
#         sentence = re.sub(r"http\S+|www\S+|https\S+", '', sentence)

#         # 3. Remove special characters and punctuation (except dots)
#         sentence = re.sub(r"[^\w\s.]", '', sentence)

#         # 4. Remove consecutive dots
#         sentence = re.sub(r'\.{3,}', ' ', sentence)

#         # 5. Convert to lowercase
#         sentence = sentence.lower()

#         # 6. Normalize contractions
#         sentence = ' '.join([contractions[word] if word in contractions else word for word in sentence.split()])

#         return sentence

#     for eachsent in range(len(data['sentence1'])):
#         data['sentence1'][eachsent] = clean_sentence(data['sentence1'][eachsent])

#     for eachsent in range(len(data['sentence2'])):
#         data['sentence2'][eachsent] = clean_sentence(data['sentence2'][eachsent])

#     return data

In [38]:
train_dataset = labeled_final["train"].map(remove_noise_batch, batched=True)
valid_dataset = labeled_final["validation"].map(remove_noise_batch, batched=True)
test_dataset = labeled_final["test"].map(remove_noise_batch, batched=True)

In [39]:
train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'cleaned_sentence1', 'cleaned_sentence2'],
    num_rows: 3668
})

In [40]:
# train = train_dataset.select(range(1000))
# valid = valid_dataset.select(range(100))
# test = test_dataset.select(range(50))
#---
train = train_dataset
valid = valid_dataset
test = test_dataset

In [41]:
train[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0,
 'cleaned_sentence1': 'amrozi accused his brother , whom he called  the witness  , of deliberately distorting his evidence .',
 'cleaned_sentence2': 'referring to him as only  the witness  , amrozi accused his brother of deliberately distorting his evidence .'}

In [None]:
HF_TOKEN=""
from huggingface_hub import login
login(token=HF_TOKEN)

In [43]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [44]:
# Incase of quantization
# import torch
# from transformers import BitsAndBytesConfig
# from peft import LoraConfig, get_peft_model


# Define QLORA configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision
    bnb_4bit_use_double_quant=True,  # Double quantization for stability
    bnb_4bit_quant_type="nf4",  # Quantization type (e.g., NormalFloat4)
    bnb_4bit_compute_dtype=torch.float16  # Computation type
)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,                          # Low-rank size
    lora_alpha=32,                # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # Target attention layers (specific to LLaMA)
    lora_dropout=0.12,             # Dropout for LoRA layers
    bias="none",                  # No bias adaptation
    task_type="SEQ_CLS"         # Task type: causal language modeling ###'SEQ_CLS' ###CAUSAL_LM
)

# Load Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2,  # Explicitly set for binary classification
    device_map="auto"  # Automatically distribute layers across available GPUs
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
#call the prepare_model_for_kbit_training() function to preprocess the quantized model for training.
# from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)


#use the get_peft_model() function to create a PeftModel from the quantized model and configuration.
# from peft import get_peft_model

model = get_peft_model(model, lora_config)

In [46]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [47]:
# Ensure tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Padding token added as [PAD].")

# Resize model embeddings if new token is added
if tokenizer.pad_token_id is not None and model.get_input_embeddings().num_embeddings != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["cleaned_sentence1"],
        examples["cleaned_sentence2"],
        padding=True,  # Use padding
        truncation=True,
        max_length=128
    )

tokenized_train_dataset = train.map(tokenize_function, batched=True)

In [48]:
tokenized_train_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'cleaned_sentence1', 'cleaned_sentence2', 'input_ids', 'attention_mask'],
    num_rows: 3668
})

In [49]:
tokenized_test_dataset = test.map(tokenize_function, batched=True)
tokenized_valid_dataset = valid.map(tokenize_function, batched=True)

In [50]:
# Keep only the necessary features
tokenized_train_dataset = tokenized_train_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["idx", "sentence1", "sentence2", "label",'cleaned_sentence1', 'cleaned_sentence2']
)
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3668
})

In [51]:
# Keep only the necessary features
tokenized_valid_dataset = tokenized_valid_dataset.map(
    lambda examples: {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"]
    },
    remove_columns=["idx", "sentence1", "sentence2", "label"]
)

tokenized_valid_dataset

Dataset({
    features: ['cleaned_sentence1', 'cleaned_sentence2', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})

In [52]:
# Data collator for padding
# from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [53]:
training_args = TrainingArguments(
    output_dir="./results_llama3.2-1B",         # Output directory for saved models
    learning_rate=2e-4,          # Learning rate
    lr_scheduler_type="linear",
    per_device_train_batch_size=16, # Training batch size
    per_device_eval_batch_size=16, # Evaluation batch size
    num_train_epochs=15,            # Number of training epochs
    weight_decay=0.1,             # Weight decay
    warmup_steps=250,
    #save_total_limit=2,            # Save only the 2 most recent models
    logging_dir="./logs",          # Log directory
    logging_steps=25,              # Log every 50 steps
    load_best_model_at_end=True,    # Load the best model at the end of training
    eval_strategy="epoch",   # Evaluate after each epoch
    save_strategy="epoch",         # Change save_strategy to 'epoch' to match eval_strategy
    gradient_accumulation_steps=4, # Added gradient accumulation
    fp16=True,                       # Enabled mixed precision training
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    report_to="wandb"
)

In [54]:
model.print_trainable_parameters()

trainable params: 1,708,032 || all params: 1,237,526,528 || trainable%: 0.1380


In [55]:

# Define a metric function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the class with the highest score

    # Calculate accuracy, precision, recall, and F1
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# # Metrics
# metric = evaluate.load("glue", "mrpc")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     results = metric.compute(predictions=predictions, references=labels)

#     print(f"Results keys: {results.keys()}") #add this line

#     accuracy = results["accuracy"]
#     f1 = results["f1"]
#     precision = results.get("precision", None)
#     recall = results.get("recall", None)

#     return {
#         "accuracy": accuracy,
#         "f1": f1,
#         "precision": precision,
#         "recall": recall,
#     }

In [56]:
# from transformers import EarlyStoppingCallback
# Early stopping
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)

In [57]:
# from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,  # Assuming you have a validation set
    #tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [58]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9953,0.773239,0.595588,0.679245,0.774194,0.723618
2,0.5943,0.513667,0.75,0.761062,0.924731,0.834951
3,0.4845,0.56075,0.718137,0.883178,0.677419,0.766734
4,0.4142,0.476347,0.776961,0.773256,0.953405,0.853933
5,0.3275,0.433941,0.823529,0.848485,0.903226,0.875
6,0.2051,0.408825,0.833333,0.883636,0.870968,0.877256
7,0.1043,0.664317,0.82598,0.846667,0.910394,0.877375
8,0.0511,0.69069,0.818627,0.854671,0.885305,0.869718
9,0.0148,0.932343,0.840686,0.876761,0.892473,0.884547


TrainOutput(global_step=522, training_loss=0.3976029422548082, metrics={'train_runtime': 653.2181, 'train_samples_per_second': 84.229, 'train_steps_per_second': 1.309, 'total_flos': 1.834253490167808e+16, 'train_loss': 0.3976029422548082, 'epoch': 9.0})

In [60]:
# Evaluate
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.4088248312473297, 'eval_accuracy': 0.8333333333333334, 'eval_precision': 0.8836363636363637, 'eval_recall': 0.8709677419354839, 'eval_f1': 0.8772563176895307, 'eval_runtime': 2.3711, 'eval_samples_per_second': 172.071, 'eval_steps_per_second': 10.965, 'epoch': 9.0}


In [61]:
#model
trainer.model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.12, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )