# NLP - CA4 - Q2

* **Name:** Mohammad Mahdi Salmani
* **Student id:** 810102174

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q bitsandbytes datasets
!pip install evaluate
# => Restart session (*)

In [1]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, Trainer, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import notebook_login
import matplotlib.pyplot as plt
import numpy as np
import os,torch,evaluate
from datasets import load_dataset, DatasetDict
from tqdm import tqdm
import re

import warnings
warnings.simplefilter('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device type:', device)

Device type: cuda


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
class Config:
    # Model and tokenizer config
    model_name = "meta-llama/Meta-Llama-3-8B"
    saved_model_path = "/content/drive/MyDrive/NLP/QLoRA/trained_model"

    # Dataset config
    dataset_name = "nyu-mll/multi_nli"
    train_subset_ratio=0.02
    val_subset_ratio=0.02

    # Quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    # LoRA config
    lora_rank = 4
    lora_alpha = 16
    lora_dropout = 0.1
    # lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]
    lora_target_modules = ["q_proj", "k_proj", "v_proj"]

    # Training arguments
    output_dir = "/content/drive/MyDrive/NLP/QLoRA/results"
    logging_dir = './logs'
    save_steps = 200
    eval_steps = 50
    logging_steps = 50
    learning_rate = 1e-4
    per_device_batch_size = 8
    weight_decay = 0.01
    save_total_limit = 2
    num_train_epochs = 1
    optim = "paged_adamw_8bit"
    max_grad_norm=0.3
    warmup_steps=2

config = Config()

In [4]:
notebook_login()
# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Llama3 model

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"

model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config= config.bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)
model

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   7%|7         | 357M/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

## Load LLaMA tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Part1: ICL

In [None]:
dataset = load_dataset(config.dataset_name, split='validation_matched[:4%]')
dataset

Dataset({
    features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
    num_rows: 393
})

### Zero-shot prompt

In [None]:
def create_zero_shot_prompt(premise, hypothesis):
    prompt = f"""Classify the hypothesis as entailment(0), neutral(1), or contradiction(2) with respect to the premise.
Premise: {premise}
Hypothesis: {hypothesis}
Label: """
    return prompt

### One-shot prompt

In [None]:
def create_one_shot_prompt(premise, hypothesis, example_premise, example_hypothesis, example_label):
    prompt = f"""Classify the hypothesis as entailment(0), neutral(1), or contradiction(2) with respect to the premise.
Example:
Premise: {example_premise}
Hypothesis: {example_hypothesis}
Label: {example_label}

Premise: {premise}
Hypothesis: {hypothesis}
Label: """
    return prompt

### Evaluation

In [None]:
# Example:
premise = dataset[40]['premise']
hypothesis = dataset[40]['hypothesis']
label = dataset[40]['label']
sample_premise = dataset[20]['premise']
sample_hypothesis = dataset[20]['hypothesis']
sample_label = dataset[20]['label']

In [None]:
def show_example(premise, hypothesis, label, temperature):
    zero_shot_prompt = create_zero_shot_prompt(premise, hypothesis)
    one_shot_prompt = create_one_shot_prompt(premise, hypothesis, sample_premise, sample_hypothesis, sample_label)
    # print('Prompt:\n', f'\n{prompt}')
    print('True Actual => ', sample_label)

    inputs = tokenizer(zero_shot_prompt, return_tensors='pt').to(device)
    max_length = inputs['input_ids'].shape[1] + 2
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, temperature=temperature, pad_token_id = tokenizer.eos_token_id)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print('Zero-shot Prediction =>',prediction[len(zero_shot_prompt):].split()[0])

    inputs = tokenizer(one_shot_prompt, return_tensors='pt').to(device)
    max_length = inputs['input_ids'].shape[1] + 2
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, temperature=temperature, pad_token_id = tokenizer.eos_token_id)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print('One-shot Prediction =>',prediction[len(one_shot_prompt):].split()[0])

In [None]:
show_example(premise, hypothesis, label, temperature = 0.2)

True Actual =>  1
Zero-shot Prediction => 1
One-shot Prediction => 1


In [None]:
show_example(premise, hypothesis, label, temperature = 0.8)

True Actual =>  1
Zero-shot Prediction => 1
One-shot Prediction => 1


* Zero-shot evaluation

In [None]:
def evaluate_model(model, tokenizer, dataset):
    correct = 0
    total = 0
    for example in tqdm(dataset):
        model.eval()
        label = example['label']

        prompt = create_zero_shot_prompt(example['premise'], example['hypothesis'])
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        max_length = inputs['input_ids'].shape[1] + 2

        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length, temperature=temperature, pad_token_id = tokenizer.eos_token_id)

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        try:
            prediction = int(prediction[len(prompt):][0])
            if prediction == label:
                correct += 1
        except:
            print(f'\nError: {prediction[len(prompt):]}')
        total += 1

    accuracy = correct / total
    print('\n', '-'*10)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    # return accuracy

In [None]:
temperature =0.2
evaluate_model(model, tokenizer, dataset) # temperrature = 0.2

100%|██████████| 393/393 [05:56<00:00,  1.10it/s]


 ----------
Accuracy: 40.46%





* One-shot evaluation

In [None]:
def evaluate_one_shot_model(model, tokenizer, dataset):
    correct = 0
    total = 0
    for example in tqdm(dataset):
        model.eval()

        prompt = create_one_shot_prompt(example['premise'], example['hypothesis'], sample_premise, sample_hypothesis, sample_label)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        max_length = inputs['input_ids'].shape[1] + 2

        with torch.no_grad():
            tokenizer.pad_token = tokenizer.eos_token
            outputs = model.generate(**inputs, max_length=max_length, temperature=temperature, pad_token_id = tokenizer.eos_token_id)

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        try:
            prediction = int(prediction[len(prompt):][0])
            if prediction == label:
                correct += 1
        except:
            # print(f'\nError: {prediction[len(prompt):]}')
            pass
        total += 1

    accuracy = correct / total
    print('\n', '-'*10,'\n')
    print(f"Accuracy: {accuracy * 100:.2f}%")
    # return accuracy

In [None]:
evaluate_one_shot_model(model, tokenizer, dataset) # temperature = 0.2

100%|██████████| 393/393 [06:53<00:00,  1.05s/it]


 ---------- 

Accuracy: 26.72%





In [None]:
def create_few_shot_prompt(premise, hypothesis, example_premise, example_hypothesis, example_label):
    prompt = f"""Classify the hypothesis as entailment(0), neutral(1), or contradiction(2) with respect to the premise.
Example:
Premise: {example_premise[0]}
Hypothesis: {example_hypothesis[0]}
Label: {example_label[0]}

Premise: {example_premise[1]}
Hypothesis: {example_hypothesis[1]}
Label: {example_label[1]}

Premise: {example_premise[2]}
Hypothesis: {example_hypothesis[2]}
Label: {example_label[2]}

Premise: {premise}
Hypothesis: {hypothesis}
Label: """
    return prompt

In [None]:
def evaluate_one_shot_model(model, tokenizer, dataset):
    correct = 0
    total = 0
    for example in tqdm(dataset):
        model.eval()

        prompt = create_one_shot_prompt(example['premise'], example['hypothesis'], sample_premise, sample_hypothesis, sample_label)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        max_length = inputs['input_ids'].shape[1] + 2

        with torch.no_grad():
            tokenizer.pad_token = tokenizer.eos_token
            outputs = model.generate(**inputs, max_length=max_length, temperature=temperature, pad_token_id = tokenizer.eos_token_id)

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        try:
            prediction = int(prediction[len(prompt):][0])
            if prediction == label:
                correct += 1
        except:
            # print(f'\nError: {prediction[len(prompt):]}')
            pass
        total += 1

    accuracy = correct / total
    print('\n', '-'*10,'\n')
    print(f"Accuracy: {accuracy * 100:.2f}%")
    # return accuracy

## Part2: QLoRA (First method)

In [None]:
# ## Load saved model...
# model = AutoModelForCausalLM.from_pretrained(
#     config.saved_model_path,
#     quantization_config= config.bnb_config,
#     device_map="auto",
# )
# model = prepare_model_for_kbit_training(model)
# tokenizer = AutoTokenizer.from_pretrained(config.saved_model_path, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_eos_token = True

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load Multi-NLI dataset

In [None]:
def load_subset_dataset(dataset_name, train_subset_ratio=0.01, val_subset_ratio=0.01):
    dataset = load_dataset(dataset_name)
    # Reduce the dataset size:
    train_subset = dataset['train'].train_test_split(test_size = train_subset_ratio)['test']
    validation_subset = dataset['validation_matched'].train_test_split(test_size=val_subset_ratio)['test']
    subset_dataset = DatasetDict({'train': train_subset, 'validation': validation_subset})
    return subset_dataset

dataset = load_subset_dataset(config.dataset_name)
print(f'Dataset:\n{dataset}')

class_names = dataset["train"].features["label"].names
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}
print(f"Labels: {class_names}")

Downloading readme:   0%|          | 0.00/8.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/214M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.94M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Dataset:
DatasetDict({
    train: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 3928
    })
    validation: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 99
    })
})
Labels: ['entailment', 'neutral', 'contradiction']


In [None]:
def preprocess_function(examples):
    # Input template: [INST] Premise: ... [SEP] Hypothesis: ... [/INST] [LABEL] (entailment/neutral/contradiction) [/LABEL]
    inputs = [
        f"[INST] Premise: {premise} [SEP] Hypothesis: {hypothesis} [/INST] [LABEL] {id2label[label_id]} [/LABEL]"
        for premise, hypothesis, label_id in zip(examples['premise'], examples['hypothesis'], examples['label'])
    ]
    model_inputs = tokenizer(inputs, max_length=300, padding='max_length', truncation=True)
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/3928 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

### Lora Config

In [None]:
lora_config = LoraConfig(
    r=config.lora_rank,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    task_type="CAUSAL_LM",
    target_modules=config.lora_target_modules
)

model = get_peft_model(model, lora_config)

In [None]:
model.print_trainable_parameters()

trainable params: 8,126,464 || all params: 8,038,387,712 || trainable%: 0.1011


### Trainer Config

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP/QLoRA/results",
    logging_dir='./logs',
    save_total_limit=config.save_total_limit,
    num_train_epochs=config.num_train_epochs,
    max_steps=200,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_steps=200,
    eval_steps=50,
    logging_steps=50,
    learning_rate=config.learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=config.weight_decay,
    report_to="none",
    optim=config.optim,
    fp16=False,  # Disable fp16 and bf16 since the model is quantized
    bf16=False,
    max_grad_norm= 0.3,
    warmup_ratio= 0.3,
    group_by_length=True,
    load_best_model_at_end=True,
    lr_scheduler_type= "linear",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator= transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
torch.cuda.empty_cache()
model.config.use_cache = False

### Train model

#### First try:
* 0.25% of data
* template: `[INST] {premise} [SEP] {hypothesis} [/INST] NLI Label: `
* Using `SFTTrainer` and with labeled dataset (CasualLM)

In [None]:
trainer_results = trainer.train()

Step,Training Loss,Validation Loss
200,2.3609,2.062937
400,2.0346,2.028962
600,2.0231,2.016631
800,1.99,2.010569
1000,1.9895,2.005716
1200,1.9767,2.003806


In [None]:
trainer_results.metrics

{'train_runtime': 9452.8292,
 'train_samples_per_second': 1.039,
 'train_steps_per_second': 0.13,
 'total_flos': 1.2386585831079936e+17,
 'train_loss': 2.060707319054619,
 'epoch': 1.0}

#### Second try:
* Using `Trainer` and `DataCollatorForLanguageModeling`.
* template: `[INST] {premise} [SEP] {hypothesis} [/INST] NLI Label: {label}`

In [None]:
trainer_results = trainer.train()

In [None]:
trainer.evaluate()

Step,Training Loss,Validation Loss
250,2.2078,2.086363
500,2.0362,2.064372
750,2.0574,2.053003
1000,2.026,2.045347
1250,1.996,2.039289
1251,1.996,2.039223


{'eval_loss': 2.0392234325408936}

#### Third try:
* Change template and some hyperparameters, train on 0.2% of data.
* template: `[INST] Premise: ... [SEP] Hypothesis: ... [/INST] [LABEL] ... [/LABEL]`
* lora on these modules: ["q_proj", "k_proj", "v_proj"] -> 2M params

In [None]:
trainer_results = trainer.train()

In [None]:
## => Stop training before completed because of GPU limitation in colab
results = trainer.evaluate()

Step,Training Loss,Validation Loss
50,3.1154,2.165173
100,1.9551,1.893183
150,1.8789,1.827371
154,1.8789,1.825255


#### Forth try:
* Same as previous try but in 0.1% of data:
* lora on these modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"] -> 8M params

In [None]:
trainer_results = trainer.train()

Step,Training Loss,Validation Loss
50,2.6394,1.906286
100,1.8531,1.715609
150,1.793,1.689091
200,1.7699,1.682383


In [None]:
trainer_results

TrainOutput(global_step=200, training_loss=2.013865661621094, metrics={'train_runtime': 6651.3908, 'train_samples_per_second': 0.241, 'train_steps_per_second': 0.03, 'total_flos': 2.163758727168e+16, 'train_loss': 2.013865661621094, 'epoch': 0.4073319755600815})

### Merge Weights

In [None]:
model.save_pretrained("/content/drive/MyDrive/dataset/trained_model")
tokenizer.save_pretrained("/content/drive/MyDrive/dataset/trained_model")

('/content/drive/MyDrive/dataset/trained_model/tokenizer_config.json',
 '/content/drive/MyDrive/dataset/trained_model/special_tokens_map.json',
 '/content/drive/MyDrive/dataset/trained_model/tokenizer.json')

In [None]:
def merge_lora_weights(model):
    for name, param in model.named_parameters():
        if "lora" in name:
            original_name = name.replace("lora", "weight")
            if hasattr(model, original_name):
                original_param = getattr(model, original_name)
                original_param.data += param.data
                param.data = original_param.data

merge_lora_weights(model)

In [None]:
model.save_pretrained("/content/drive/MyDrive/NLP/QLoRA/merged_model")

### Evaluation & Prediction

In [None]:
def evaluate_model(model, tokenizer, dataset):
    correct = 0
    total = 0
    for example in tqdm(dataset):
        model.eval()
        label = example['label']

        prompt = f"{example['premise']} [SEP] {example['hypothesis']} [SEP] Label: "
        inputs = tokenizer(prompt, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=2, temperature=temperature, pad_token_id = tokenizer.eos_token_id)

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        try:
            prediction = int(prediction[len(prompt):][0])
            if prediction == label:
                correct += 1
        except:
            print(f'\nError: {prediction[len(prompt):]}')
        total += 1

    accuracy = correct / total
    print('\n', '-'*10)
    print(f"Accuracy: {accuracy * 100:.2f}%")

* First try results

In [None]:
def predict(sentence):
    inputs = tokenizer(sentence, return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, pad_token_id=tokenizer.pad_token_id, max_new_tokens = 2, temperature=0.1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

premise = dataset["validation"]["premise"][10]
hypothesis = dataset["validation"]["hypothesis"][10]
label = dataset["validation"]["label"][10]
sentence = f"{premise} [SEP] {hypothesis} [SEP] Label: "
prediction = predict(sentence)
print(sentence)
print(f"Actual label: {label}")
print(f"Predicted label: {prediction[len(sentence):]}")

Maybe I am too. [SEP] It's possible that I am also. [SEP] Label: 
Actual label: 0
Predicted label: 0 


In [None]:
results = trainer.evaluate()
results

{'eval_loss': 2.0038058757781982,
 'eval_runtime': 72.2687,
 'eval_samples_per_second': 3.404,
 'eval_steps_per_second': 0.429,
 'epoch': 1.0}

In [None]:
temperature = 0.3
evaluate_model(model, tokenizer, dataset["validation"])

100%|██████████| 246/246 [03:11<00:00,  1.29it/s]


 ----------
Accuracy: 32.11%





* Second try

In [None]:
def predict(sentence):
    inputs = tokenizer(sentence, return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, pad_token_id=tokenizer.pad_token_id, max_new_tokens = 2, temperature=0.1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

premise = dataset["validation"]["premise"][20]
hypothesis = dataset["validation"]["hypothesis"][20]
label = dataset["validation"]["label"][20]
sentence = f"[INS] {premise} [SEP] {hypothesis} [/INS] NLI Label: "
prediction = predict(sentence)
print(sentence)
print(f"Actual label: {id2label[label]}")
print(f"Predicted label: {prediction[len(sentence):]}")

[INS] The Women's Haven, which provides shelter and outreach to domestic-violence victims, already has a full-time attorney. [SEP] There is a full-time attorney at the Women's Haven already. [/INS] NLI Label: 
Actual label: entailment
Predicted label:  entailment


In [None]:
def evaluate_model(model, tokenizer, dataset):
    correct = 0
    total = 0
    for example in tqdm(dataset):
        model.eval()
        label = example['label']

        prompt = f"[INS] {premise} [SEP] {hypothesis} [/INS] NLI Label: "
        inputs = tokenizer(prompt, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=2, temperature=temperature, pad_token_id = tokenizer.eos_token_id)

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        try:
            prediction = label2id[prediction[len(prompt):].split()[0].lower()]
            if prediction == label:
                correct += 1
        except:
            # print(f'\nError:{prediction[len(prompt):].split()[0].lower()}*')
            pass
        total += 1

    accuracy = correct / total
    print('\n', '-'*10)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    # return accuracy

In [None]:
temperature = 0.1
evaluate_model(model, tokenizer, dataset["validation"])

100%|██████████| 197/197 [01:22<00:00,  2.39it/s]

 ----------
Accuracy: 65.99%


* Third try

In [None]:
def predict(sentence):
    inputs = tokenizer(sentence, return_tensors="pt").to('cuda')
    with torch.no_grad():
        outputs = model.generate(**inputs, pad_token_id=tokenizer.pad_token_id, max_new_tokens = 10, temperature=0.1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split('[LABEL] ')[-1].split(' [/LABEL]')[0].strip()

premise = dataset["validation"]["premise"][100]
hypothesis = dataset["validation"]["hypothesis"][100]
label = dataset["validation"]["label"][100]
prompt = f"[INST] Premise: {premise} [SEP] Hypothesis: {hypothesis} [/INST]"
print(prompt)
print(f"Actual label: {id2label[label]}")
prediction = predict(prompt)
print(f"Predicted label: {prediction}")

[INST] Premise: do you really romance [SEP] Hypothesis: Do you really love him? [/INST]
Actual label: neutral
Predicted label: neutral


In [None]:
def evaluate_model(model, tokenizer, dataset, temperature=0.2):
    model.eval()
    total, correct = 0, 0
    for example in tqdm(dataset):
        prompt = f"[INST] Premise: {example['premise']} [SEP] Hypothesis: {example['hypothesis']} [/INST]"
        inputs = tokenizer(prompt, return_tensors='pt')
        labels = example['label']
        with torch.no_grad():
            # outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=350)
            outputs = model.generate(**inputs, max_new_tokens=10, temperature=temperature, pad_token_id = tokenizer.eos_token_id)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('[LABEL] ')[-1].split(' [/LABEL]')[0].strip()
        if prediction == id2label[labels]:
            correct += 1
        # else:
            # print(prediction)
        total += 1
    accuracy = correct / total
    return accuracy

In [None]:
# Evaluate model before merging, 0.2% of data, 155 step trainig
accuracy = evaluate_model(model, tokenizer, dataset['validation'])
print(f"\nAccuracy: {accuracy*100:.2f} %")

100%|██████████| 197/197 [05:47<00:00,  1.76s/it]

Accuracy: 68.52 %


In [None]:
# Evaluate merged model, 0.2% of data, 155 step trainig
accuracy = evaluate_model(model, tokenizer, dataset['validation'])
print(f"\nAccuracy: {accuracy*100:.2f} %")

100%|██████████| 197/197 [05:58<00:00,  1.82s/it]

Accuracy: 73.10 %


* Forth try

In [None]:
# Evaluate model before merging, 0.1% of data, 200 step trainig
accuracy = evaluate_model(model, tokenizer, dataset['validation'])
print(f"\nAccuracy: {accuracy*100:.2f} %")

100%|██████████| 99/99 [03:43<00:00,  2.26s/it]


Accuracy: 81.82 %





In [None]:
# Evaluate merged model, 0.1% of data, 200 step trainig
accuracy = evaluate_model(model, tokenizer, dataset['validation'])
print(f"\nAccuracy: {accuracy*100:.2f} %")

100%|██████████| 99/99 [03:48<00:00,  2.31s/it]


Accuracy: 80.81 %





## Part3: QLoaRA (Second method)

### Load Multi-NLI dataset

In [6]:
def load_subset_dataset(dataset_name, train_subset_ratio=0.1, val_subset_ratio=0.01):
    dataset = load_dataset(dataset_name)
    # Reduce the dataset size:
    train_subset = dataset['train'].train_test_split(test_size = train_subset_ratio)['test']
    validation_subset = dataset['validation_matched'].train_test_split(test_size=val_subset_ratio)['test']
    subset_dataset = DatasetDict({'train': train_subset, 'validation': validation_subset})
    return subset_dataset

dataset = load_subset_dataset(config.dataset_name)
print(f'Dataset:\n{dataset}')

class_names = dataset["train"].features["label"].names
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}
print(f"Labels: {class_names}")

Downloading readme:   0%|          | 0.00/8.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/214M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.94M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Dataset:
DatasetDict({
    train: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 39271
    })
    validation: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 99
    })
})
Labels: ['entailment', 'neutral', 'contradiction']


### Load customized model

In [6]:
class CustomLLaMAForSequenceClassification(nn.Module):
    def __init__(self, model_name, quantization_config, num_labels):
        super(CustomLLaMAForSequenceClassification, self).__init__()
        self.base_model = AutoModel.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
        )
        self.base_model = prepare_model_for_kbit_training(self.base_model)
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
            outputs = self.base_model(input_ids, attention_mask=attention_mask)
            logits = self.classifier(outputs.last_hidden_state[:, 0, :])
            # return logits
            loss = None
            if labels is not None:
                loss = nn.CrossEntropyLoss()(logits, labels)
            return {"loss": loss, "logits": logits}

In [8]:
model = CustomLLaMAForSequenceClassification(
    config.model_name,
    quantization_config=config.bnb_config,
    num_labels=len(class_names),
    )
model

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

CustomLLaMAForSequenceClassification(
  (base_model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
   

In [8]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def preprocess_function(examples):
    inputs = [f"[INST] Premise: {premise} [SEP] Hypothesis: {hypothesis} [/INST]" for premise, hypothesis in zip(examples['premise'], examples['hypothesis'])]
    model_inputs = tokenizer(inputs, max_length=300, padding='max_length', truncation=True)
    model_inputs["labels"] = examples["label"]
    return model_inputs

remove_columns = ['premise', 'hypothesis', 'promptID', 'pairID', 'premise_binary_parse', 'premise_parse', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre']
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=remove_columns)

Map:   0%|          | 0/39271 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7855
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 197
    })
})

### LoRA Config

In [11]:
lora_config = LoraConfig(
    r=config.lora_rank,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    task_type="TASK_SEQUENCE_CLASSIFICATION",
    target_modules=config.lora_target_modules
)
model.base_model = get_peft_model(model.base_model, lora_config)

In [12]:
model.base_model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 7,507,283,968 || trainable%: 0.0314


### Trainer Config

In [13]:
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(p: transformers.EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return accuracy_metric.compute(predictions=preds, references=p.label_ids)

In [14]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP/QLoRA_2/results",
    logging_dir='./logs',
    save_total_limit=config.save_total_limit,
    num_train_epochs=config.num_train_epochs,
    max_steps=200,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_steps=config.save_steps,
    eval_steps=config.eval_steps,
    logging_steps=config.logging_steps,
    learning_rate=config.learning_rate,
    per_device_train_batch_size=config.per_device_batch_size,
    per_device_eval_batch_size=config.per_device_batch_size,
    weight_decay=config.weight_decay,
    report_to="none",
    optim=config.optim,
    fp16=False,
    bf16=False,
    max_grad_norm=config.max_grad_norm,
    warmup_steps=config.warmup_steps,
    group_by_length=True,
    load_best_model_at_end=True,
    lr_scheduler_type= "linear",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [15]:
torch.cuda.empty_cache()
model.base_model.config.use_cache = False

In [16]:
trainer_results = trainer.train()

Step,Training Loss,Validation Loss,Accuracy
50,1.2372,1.305407,0.292929
100,1.1616,1.161492,0.333333
150,1.127,1.103412,0.333333
200,1.1109,1.108272,0.292929


### Merge Weights

In [22]:
model.base_model.save_pretrained("/content/drive/MyDrive/NLP/QLoRA/trained_model")
tokenizer.save_pretrained("/content/drive/MyDrive/NLP/QLoRA/trained_model")

('/content/drive/MyDrive/NLP/QLoRA/trained_model/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/QLoRA/trained_model/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/QLoRA/trained_model/tokenizer.json')

In [23]:
def merge_lora_weights(model):
    for name, param in model.base_model.named_parameters():
        if "lora" in name:
            original_name = name.replace("lora", "weight")
            if hasattr(model.base_model, original_name):
                original_param = getattr(model.base_model, original_name)
                original_param.data += param.data
                param.data = original_param.data

merge_lora_weights(model)

In [None]:
model.save_pretrained("/content/drive/MyDrive/NLP/QLoRA/merged_model")

### Evaluation & Prediction

In [19]:
def evaluate_model(model, tokenizer, dataset):
    model.eval()
    total, correct = 0, 0
    for example in tqdm(dataset):
        # "[INST] Premise: ... [SEP] Hypothesis: ... [/INST]"
        inputs = tokenizer(f"[INST] Premise: {example['premise']} [SEP] Hypothesis: {example['hypothesis']} [/INST]", return_tensors='pt', max_length=300, padding='max_length', truncation=True).to('cuda')
        labels = example['label']
        with torch.no_grad():
            outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        prediction = torch.argmax(outputs['logits'], dim=-1).item()
        if prediction == labels:
            correct += 1
        total += 1
    accuracy = correct / total
    return accuracy

In [24]:
# Evaluate merged model.
accuracy = evaluate_model(model, tokenizer, dataset["validation"])
print(f"\nAccuracy: {accuracy*100:.2f} %")

100%|██████████| 99/99 [02:51<00:00,  1.74s/it]


Accuracy: 29.29 %



