<a href="https://colab.research.google.com/github/mdeniz1/datascienceprojects/blob/main/QLoRaFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Mehmet Baki Deniz

**4-bit quantization of Llama-3-8B-Instruct using the AWQ method**

In [None]:
!pip install autoawq
!pip install nvidia-ml-py3

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from huggingface_hub import notebook_login
from huggingface_hub import HfApi
# Log in to the Hugging Face Hub; the checkpoint loaded below and the later upload both go through it.
notebook_login()

In [None]:
# Source checkpoint on the Hub and the local directory name for the quantized copy.
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
quant_path = "Llama-3-8B-Instruct-AWQ-4bit"
# AWQ settings: 4-bit weights, quantization group size 128, zero-point enabled.
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit":4}

# Hub destination used by the create_repo / upload_folder cells further down.
# Those cells reference these two names, which were never defined in the notebook.
username = "mdeniz1"
MODEL_NAME = quant_path

In [None]:
# Load the source checkpoint through the AutoAWQ wrapper, together with its fast tokenizer.
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Quantize the loaded model with the settings in quant_config.
# The recorded run below took roughly 29 minutes for 32 progress steps.
model.quantize(tokenizer, quant_config=quant_config)

Repo card metadata block was not found. Setting CardData to empty.
AWQ: 100%|██████████| 32/32 [28:45<00:00, 53.92s/it]


In [None]:
# Write the quantized weights (safetensors) and the tokenizer files into the same folder.
save_dir = "./"+quant_path
model.save_quantized(save_dir, safetensors=True)
tokenizer.save_pretrained(save_dir)

('./Llama-3-8B-Instruct-AWQ-4bit/tokenizer_config.json',
 './Llama-3-8B-Instruct-AWQ-4bit/special_tokens_map.json',
 './Llama-3-8B-Instruct-AWQ-4bit/tokenizer.json')

In [None]:
# No token literal in the notebook: without an explicit token, HfApi uses the
# credentials stored by the notebook_login() call made earlier.
api = HfApi()

In [None]:
# exist_ok=True keeps this cell re-runnable after the repository has been created once.
api.create_repo(
    repo_id = f"{username}/{MODEL_NAME}",
    repo_type="model",
    exist_ok=True
)

RepoUrl('https://huggingface.co/mdeniz1/Llama-3-8B-Instruct-AWQ-4bit', endpoint='https://huggingface.co', repo_type='model', repo_id='mdeniz1/Llama-3-8B-Instruct-AWQ-4bit')

In [None]:
# Upload the folder written by save_quantized / save_pretrained. The path is built
# from quant_path the same way as in the save cell, so the two cannot drift apart
# and the cell does not depend on the working directory being /content.
api.upload_folder(
    repo_id = f"{username}/{MODEL_NAME}",
    folder_path = "./"+quant_path
)

**FINE TUNING**

In [None]:
pip install autoawq peft datasets

In [None]:
def prepare_split(tokenizer):
    """Build the small tokenized training split used for fine-tuning.

    Loads the train split of ``tatsu-lab/alpaca``, keeps rows 10-19, renders
    each row's ``instruction`` and ``output`` into one ``text`` string and
    tokenizes that text.

    Args:
        tokenizer: Callable applied to a batch of text strings.

    Returns:
        The dataset with the ``text`` column plus whatever fields the
        tokenizer call returns.
    """
    template = "<s>[INST] {prompt} [/INST] {output}</s>"

    def to_text(row):
        # Only `instruction` and `output` are rendered; the row's `input` field is not used.
        return {"text": template.format(prompt=row["instruction"], output=row["output"])}

    subset = datasets.load_dataset("tatsu-lab/alpaca", split="train").select(range(10, 20))
    text_only = subset.map(to_text).select_columns(["text"])
    return text_only.map(lambda batch: tokenizer(batch["text"]), batched=True)

In [None]:
import datasets
from awq import AutoAWQForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)



# Quantized checkpoint on the Hub (same repo id as the RepoUrl output in the upload section).
model_path = "mdeniz1/Llama-3-8B-Instruct-AWQ-4bit"

# NOTE(review): fuse_layers=False is presumably needed so LoRA can attach to the unfused modules — confirm.
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


In [None]:

# Tokenized Alpaca rows produced by prepare_split.
data_train = prepare_split(tokenizer)

# LoRA adapter settings: rank 4, alpha 8, dropout 0.5, no bias terms, causal-LM task.
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.5,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False
)

# NOTE(review): model.model is presumably the underlying transformers model inside the
# AutoAWQ wrapper — confirm. `model` is rebound here, so re-running this cell operates on
# the PEFT-wrapped model rather than on the AutoAWQ wrapper.
model = get_peft_model(model.model, lora_config)

model.print_trainable_parameters()

# NOTE(review): max_steps=100 is set alongside num_train_epochs=1, and save_steps=100
# alongside save_strategy="epoch" — confirm which of each pair is meant to take effect.
training_arguments = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=1,
    optim="adamw_torch",
    num_train_epochs=1,
    learning_rate=1e-4,
    evaluation_strategy="no",
    max_steps = 100,
    save_strategy="epoch",
    save_steps=100,
    logging_steps=50,
    eval_steps=None,
    load_best_model_at_end=False
)

# No eval dataset is passed; mlm=False configures the collator for causal (not masked) language modelling.
trainer = Trainer(
    model=model,
    train_dataset=data_train,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



In [None]:
# Run the fine-tuning loop and keep the object returned by Trainer.train().
training_stats=trainer.train()

In [None]:
from huggingface_hub import notebook_login


# Log in again before pushing the adapter (needed if this section runs in a new session).
notebook_login()



In [None]:
# Publish the trained adapter and the tokenizer to the same Hub repository.
hub_repo_id = "mdeniz1/llama3-8b-Instruct-4b-QLoRA-finetuned"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

**EVALUATION OF THE FINE-TUNED MODEL WITH BLEU AND ROUGE SCORES**

In [None]:
pip install evaluate rouge-score


In [None]:
pip install peft

  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13.0->peft)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.13.0->peft)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.13.0->peft)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.13.0->peft)
  Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)
Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.13.0->peft)
  Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)
Collecting nvidia-nccl-cu12==2.19.3 (from torch>=1.13.0->peft)
  Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166

In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

*Loading the QLoRA fine-tuned model*

In [None]:


# Hub ids of the quantized base checkpoint and of the LoRA adapter trained on top of it.
base_model = "mdeniz1/Llama-3-8B-Instruct-AWQ-4bit"
adapter_model = "mdeniz1/llama3-8b-Instruct-4b-QLoRA-finetuned"

# `model` is the quantized base; `model1` is that base with the adapter attached
# and is the object used for generation in the inference cell.
model = AutoModelForCausalLM.from_pretrained(base_model)
model1 = PeftModel.from_pretrained(model, adapter_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# NOTE(review): .to("cuda") and .eval() are called on `model` while generation uses
# `model1` — presumably both share the same underlying modules; confirm.
model = model.to("cuda")
model.eval()

*loading the first 10 rows*

In [None]:
import datasets

# The first ten rows of the Alpaca train split serve as the evaluation set.
data = datasets.load_dataset("tatsu-lab/alpaca", split="train").select(range(10))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

*Inferring the predictions for the first 10 rows of the Alpaca dataset with the QLoRA fine-tuned model*

In [None]:
import torch

# Generate one completion per evaluation row with the adapter-equipped model (model1).
outputs_ft=[]
# Iterate over rows directly: indexing data['input'][i] re-read the whole column on every pass.
for row in data:
  # Prompt layout is unchanged: "[<input>. <instruction>]"
  prompt=f"[{row['input']}. {row['instruction']}]"

  # Move the full encoding (input_ids and attention_mask) to the GPU so that
  # generate() receives the attention mask as well.
  inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

  with torch.no_grad():
      outputs = model1.generate(**inputs, max_new_tokens=150)
  # batch_decode accepts the tensor directly; only the first (and only) sequence is kept.
  text=tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
  outputs_ft.append(text)


*Saving the results to a pandas DataFrame and writing it to Google Drive as a CSV file*

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files of generations are stored there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Write inside "My Drive" so the file matches the location the evaluation cell
# reads back ('/content/drive/My Drive/ft.csv').
path='/content/drive/My Drive/ft.csv'
# One column, 'prompts', holding the decoded text produced for each evaluation row.
df = pd.DataFrame(outputs_ft, columns=['prompts'])

df.to_csv(path)

*Loading the quantized model*

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

# Hub id of the 4-bit AWQ checkpoint. This rebinds quant_path, which held the local folder name earlier.
quant_path = "mdeniz1/Llama-3-8B-Instruct-AWQ-4bit"

# Load model
model2 = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
# Rebinds `tokenizer`; it is loaded from the same repository as the quantized weights.
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
# NOTE(review): `streamer` is not used by any cell visible in this notebook.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)



*Inferring the predictions with the 4-bit quantized model*

In [None]:
import torch

# Generate one completion per evaluation row with the 4-bit quantized model (model2).
outputs_Q_4b=[]
# Iterate over rows directly: indexing data['input'][i] re-read the whole column on every pass.
for row in data:
  # Prompt layout is unchanged: "[<input>. <instruction>]"
  prompt=f"[{row['input']}. {row['instruction']}]"

  # Move the full encoding (input_ids and attention_mask) to the GPU so that
  # generate() receives the attention mask as well.
  inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

  with torch.no_grad():
      outputs = model2.generate(**inputs, max_new_tokens=512)
  # batch_decode accepts the tensor directly; only the first (and only) sequence is kept.
  text=tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
  outputs_Q_4b.append(text)


In [None]:
path = '/content/drive/My Drive/Q4b.csv'

# Persist the quantized-model generations before reading them back, using the same
# single 'prompts' column as ft.csv. No visible cell writes Q4b.csv, so on a fresh
# run the read below would otherwise fail with a missing file.
pd.DataFrame(outputs_Q_4b, columns=['prompts']).to_csv(path)
df_qtb = pd.read_csv(path)


In [None]:
# Plain Python list of the quantized model's generated texts, in row order.
qtb_list = df_qtb['prompts'].to_list()

*Calculating ROUGE Scores for both the quantized and QLoRA fine-tuned model*

In [None]:
#4-bit quantized model
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# Read the reference column once instead of on every iteration.
reference_texts = data['output']
scores_qtb_rouge = []
for i in range(len(data)):
  # RougeScorer.score expects (target, prediction): dataset answer first, model text second.
  # The F-measure is the same either way, but precision and recall are not.
  score = scorer.score(reference_texts[i], qtb_list[i])
  scores_qtb_rouge.append(score['rouge1'].fmeasure)


In [None]:
# Read the fine-tuned model's generations back from Drive and keep them as a plain list.
path = '/content/drive/My Drive/ft.csv'
df_ft = pd.read_csv(path)
ft_list = df_ft['prompts'].to_list()

In [None]:
#rouge1 scores for the QLoRA fine-tuned model
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# Read the reference column once instead of on every iteration.
reference_texts = data['output']
scores_ft_rouge = []
for i in range(len(data)):
  # RougeScorer.score expects (target, prediction): dataset answer first, model text second.
  # The F-measure is the same either way, but precision and recall are not.
  score = scorer.score(reference_texts[i], ft_list[i])
  scores_ft_rouge.append(score['rouge1'].fmeasure)

In [None]:
# ROUGE-1 F-measure per evaluation row for the 4-bit quantized model.
scores_qtb_rouge

[0.1277533039647577,
 0.051428571428571435,
 0.23364485981308414,
 0.1970649895178197,
 0.2880794701986755,
 0.005,
 0.13736263736263737,
 0.2622950819672131,
 0.02054794520547945,
 0.03738317757009346]

In [None]:
# ROUGE-1 F-measure per evaluation row for the fine-tuned model.
scores_ft_rouge

[0.2802547770700637,
 0.1276595744680851,
 0.4126984126984127,
 0.40609137055837563,
 0.4982456140350877,
 0.017094017094017096,
 0.3194444444444445,
 0.2881844380403458,
 0.05925925925925926,
 0.1111111111111111]

*Calculating BLEU Scores for both the quantized and QLoRA fine-tuned model*

In [None]:
import evaluate

bleu = evaluate.load("bleu")
# Read the reference column once instead of on every iteration.
gold_outputs = data['output']
scores_ft_bleu=[]

for i in range(len(ft_list)):
    # BLEU is not symmetric: the model text is the prediction and the dataset
    # answer is the reference (one list of references per prediction).
    results = bleu.compute(predictions=[ft_list[i]], references=[[gold_outputs[i]]])
    scores_ft_bleu.append(results['bleu'])



Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
import evaluate

# Load the BLEU evaluation metric
bleu = evaluate.load("bleu")
# Read the reference column once instead of on every iteration.
gold_outputs = data['output']
scores_qtb_bleu=[]

# Iterate over the quantized model's own outputs (not the fine-tuned list).
for i in range(len(qtb_list)):
    # BLEU is not symmetric: the model text is the prediction and the dataset
    # answer is the reference (one list of references per prediction).
    results = bleu.compute(predictions=[qtb_list[i]], references=[[gold_outputs[i]]])
    scores_qtb_bleu.append(results['bleu'])



In [None]:
# BLEU score per evaluation row for the fine-tuned model.
scores_ft_bleu

[0.007777269083703896,
 4.402520529973925e-06,
 0.058414699248373274,
 0.05342988663725387,
 0.14822858254730467,
 0.0,
 0.01205654369513532,
 0.0,
 0.0,
 1.7019792781268697e-08]

In [None]:
# BLEU score per evaluation row for the 4-bit quantized model.
(scores_qtb_bleu)


[1.8414468030513951e-06,
 3.7034576975352583e-19,
 0.00038858047812507146,
 0.00040011860528254504,
 0.01296820771842633,
 0.0,
 7.87754105250895e-08,
 0.0,
 0.0,
 3.94777979221673e-24]