## This notebook contains code for Quantization-Aware Training (QAT)

## In this Note I tried to Fine tune one of the datasets "PIQA" and quantize it to perform Quantization-Aware Training using LORA and unsloth 

## I saved the model weights and uploaded them to hugging face- 
https://huggingface.co/Mubinmodi007/Llama-3.2-1B-finetuned

## Evaluation and Explanation of the model is in LLM_Quant_testing_accuarcy.ipynb

Note: The execution of some cells might be not in order(had to run some cells because Colab session expired.


In [1]:
%%capture
!pip install unsloth
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install trl
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer

In [4]:
max_seq_length = 2048
dtype = None  # None for auto detection
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B",  # Use LLaMA 3.2 11B when available
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="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"  # Required for gated models
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=True,  # Enable rank-stabilized LoRA
)

Unsloth 2024.9.post4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [6]:
from datasets import load_dataset
piqa = load_dataset("ybisk/piqa")
#boolq = load_dataset("google/boolq")
#winogrande = load_dataset("allenai/winogrande", "winogrande_xl")
#arc = load_dataset("allenai/ai2_arc", "ARC-Challenge")

piqa.py:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

The repository for ybisk/piqa contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ybisk/piqa.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/815k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1838 [00:00<?, ? examples/s]

In [7]:
def preprocess_piqa(example):
    return {
        'input_text': f"<piqa> Goal: {example['goal']} Solution1: {example['sol1']} Solution2: {example['sol2']}",
        'label': int(example['label'])
    }

piqa = piqa.map(preprocess_piqa)


Map:   0%|          | 0/16113 [00:00<?, ? examples/s]

Map:   0%|          | 0/3084 [00:00<?, ? examples/s]

Map:   0%|          | 0/1838 [00:00<?, ? examples/s]

In [7]:
#def preprocess_boolq(example):
#    return {
 #       'input_text': f"<boolq> Question: {example['question']} Passage: {example['passage']}",
  #      'label': 1 if example['answer'] else 0
   # }

#boolq = boolq.map(preprocess_boolq)


Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [8]:
#def preprocess_arc(example):
    # Adjust this according to the actual structure of your dataset
    #choices = " ".join([f"Option {i+1}: {choice}" for i, choice in enumerate(example['choices'])])
    #return {
       # 'input_text': f"<arc> Question: {example['question']} {choices}",
       # 'label': example['answerKey']
    #}

#arc = arc.map(preprocess_arc)

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

Map:   0%|          | 0/1172 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

In [10]:
#def preprocess_winogrande(example):
    #return {
       # 'input_text': f"<winogrande> Sentence: {example['sentence']} Option1: {example['option1']} Option2: {example['option2']}",
       # 'label': 0 if example['answer'] == '1' else 1
   # }

#winogrande = winogrande.map(preprocess_winogrande)


Map:   0%|          | 0/40398 [00:00<?, ? examples/s]

Map:   0%|          | 0/1767 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

In [8]:
# Step 5: Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.03,
    group_by_length=True,
    seed=3407,
)

In [10]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=piqa['train'],  # Access the 'train' split of the dataset
    dataset_text_field="input_text",
    max_seq_length=max_seq_length,
    args=training_args,
)

trainer_stats = trainer.train()

Map:   0%|          | 0/16113 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,113 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 16
\        /    Total batch size = 16 | Total steps = 3,021
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
10,2.07
20,2.2519
30,2.0642
40,1.9751
50,2.0064
60,1.6439
70,1.604
80,1.6565
90,1.7174
100,1.7134


In [15]:
# Hugging Face API authentication
from huggingface_hub import notebook_login, HfApi

notebook_login()
username = "Mubinmodi007"
api = HfApi(token="your_huggingface_token")  # Replace with your actual token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from huggingface_hub import create_repo
quant_path="Llama-3.2-1B-finetuned"
repo_url = create_repo(repo_id=f"{username}/{quant_path}", repo_type="model", private=False)
print(f"Repository created: {repo_url}")

Repository created: https://huggingface.co/Mubinmodi007/Llama-3.2-1B-finetuned


In [20]:
from transformers import AutoTokenizer
from huggingface_hub import HfApi, HfFolder
import os

# Save the model and tokenizer locally
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Get the token from the login process
token = HfFolder.get_token()

# Create the HfApi instance using the obtained token
api = HfApi(token=token)

# Set your Hugging Face username and the desired repository name
username = "Mubinmodi007"  # Replace with your actual username
repo_name = "Llama-3.2-1B-finetuned"  # Replace with your desired model name

# Create the full repository ID
repo_id = f"{username}/{repo_name}"

# Check the contents of the local directory where the model is saved
print("Contents of the local model directory:")
print(os.listdir("lora_model"))

# Upload the files to Hugging Face
api.upload_folder(
    repo_id=repo_id,
    folder_path="lora_model",
    commit_message="Upload LoRA model"
)
print("Model uploaded successfully to Hugging Face")

Contents of the local model directory:
['adapter_model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'adapter_config.json', 'README.md']


  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Model uploaded successfully to Hugging Face


In [39]:
def preprocess_piqa(example):
    prompt = f"Goal: {example['goal']}\nChoose the correct solution:\n1. {example['sol1']}\n2. {example['sol2']}\nlabel:"
    return {"input_text": prompt, "label": str(example['label'] + 1)}

In [40]:
print(piqa['test'][0])

{'goal': 'how do you puncture a vein?', 'sol1': 'hit it at the wrong angle and make it bleed.', 'sol2': 'pop it.', 'label': -1, 'input_text': '<piqa> Goal: how do you puncture a vein? Solution1: hit it at the wrong angle and make it bleed. Solution2: pop it.'}


In [43]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
from unsloth import FastLanguageModel  # Import FastLanguageModel from unsloth

# Assuming you have already loaded your model

# Prepare the model for inference
model = FastLanguageModel.for_inference(model)

# Load the tokenizer (replace with your model's tokenizer if different)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define the preprocessing function for PIQA
def preprocess_piqa(example):
    return {
        'text': f"Goal: {example['goal']} Solution1: {example['sol1']} Solution2: {example['sol2']} Answer:",
        'label': example['label']
    }

# Modify the evaluate_model function to handle PIQA specifics
def evaluate_model(model, tokenizer, dataset, preprocess_func):
    correct = 0
    total = 0

    model.eval()  # Set the model to evaluation mode
    device = next(model.parameters()).device  # Get the device the model is on

    for example in tqdm(dataset):
        processed = preprocess_func(example)
        input_text = processed['text']

        inputs = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=2048)
        inputs = {k: v.to(device) for k, v in inputs.items() if k != 'token_type_ids'}

        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=5)

        predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).split()[-1]

        # PIQA-specific: Check if the predicted answer matches the correct solution
        if predicted_answer == '1' and processed['label'] == 0:
            correct += 1
        elif predicted_answer == '2' and processed['label'] == 1:
            correct += 1

        total += 1

    accuracy = correct / total if total > 0 else 0
    return {"accuracy": accuracy, "correct": correct, "total": total}

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the appropriate device
model = model.to(device)

# Evaluate the model on the PIQA test set
results = evaluate_model(model, tokenizer, piqa['test'], preprocess_piqa)

# Print the results
print(f"Evaluation Results on PIQA test set:")
print(f"Correct: {results['correct']}")
print(f"Total: {results['total']}")
print(f"Accuracy: {results['accuracy']:.2%}")

100%|██████████| 3084/3084 [10:08<00:00,  5.07it/s]

Evaluation Results on PIQA test set:
Correct: 0
Total: 3084
Accuracy: 0.00%



