In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Prerequisites

Before delving into the fine-tuning process, ensure that you have the following prerequisites in place:

1. **GPU**: This tutorial cannot run on free Google Colab; it requires more powerful GPUs, such as the A100.
2. **Python Packages**: Ensure that you have the necessary Python packages installed. You can use the following commands to install them:

Let's begin by checking if your GPU is correctly detected:

In [None]:
!nvidia-smi

Mon May 27 04:15:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Let's define a wrapper function which will get completion from the model from a user question

## Step 1 - Install necessary packages
First, install the dependencies below to get started. As these features are available on the main branches only, we need to install the libraries below from source.

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets scipy
!pip install -q trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.

## Step 2 - Model loading
We'll load the model using QLoRA quantization to reduce the usage of memory


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

Now we specify the model ID and then we load it with our previously defined quantization configuration.

In [None]:
from huggingface_hub import notebook_login
from google.colab import output

output.enable_custom_widget_manager()
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Run a inference on the base model. The model does not seem to understand our instruction and gives us a list of questions related to our query.

In [None]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <s>
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]
  </s>
  <s>

  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)


  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return (decoded[0])

In [None]:
result = get_completion(query="code the fibonacci series in python using reccursion", model=model, tokenizer=tokenizer)
print(result)

<s> 
  <s> 
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  code the fibonacci series in python using reccursion
  [/INST]
  </s> 
  <s> 

  </s> 
   Here is a Python code that uses recursion to generate the Fibonacci series:

   ````python
   def fibonacci(n):
     if n <= 0:
       return 0
     elif n == 1:
       return 1
     else:
       return(fibonacci(n-1) + fibonacci(n-2))
   
   # Example
   for i in range(10):
       print(fibonacci(i))
   ````

   The `fibonacci()` function takes an argument `n` which is the position of the desired Fibonacci number in the Fibonacci series. The function uses recursion to calculate the Fibonacci number by calling itself twice with the argument of `n-1` and `n-2`. If the value of `n` is less than or equal to `0`, the function returns `0`, and if the function is called for `n=1`, the function returns `1`.</s>


In [None]:
import re
code_block = re.search(r'python(.*?)', result, re.DOTALL)
if code_block:
  code = code_block.group(1).strip()
  print(code)
else:
  print("No code block found.")




In [None]:
code

''

## Step 3 - Load dataset for finetuning

### Lets Load the Dataset

For this tutorial, we will fine-tune Mistral 7B Instruct for code generation.

We will be using this [dataset](https://huggingface.co/datasets/TokenBender/code_instructions_122k_alpaca_style) which is curated by [TokenBender (e/xperiments)](https://twitter.com/4evaBehindSOTA) and is an excellent data source for fine-tuning models for code generation. It follows the alpaca style of instructions, which is an excellent starting point for this task. The dataset structure should resemble the following:

```json
{
  "instruction": "Create a function to calculate the sum of a sequence of integers.",
  "input": "[1, 2, 3, 4, 5]",
  "output": "# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum"
}
```

In [None]:
from datasets import load_dataset

dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")
dataset

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/169M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/121959 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'text', 'instruction', 'output'],
    num_rows: 121959
})

In [None]:
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,input,text,instruction,output
0,"[1, 2, 3, 4, 5]",Below is an instruction that describes a task....,Create a function to calculate the sum of a se...,# Python code\ndef sum_sequence(sequence):\n ...
1,"str1 = ""Hello ""\nstr2 = ""world""",Below is an instruction that describes a task....,Develop a function that will add two strings,"def add_strings(str1, str2):\n """"""This func..."
2,,Below is an instruction that describes a task....,Design a data structure in C++ to store inform...,#include <map>\n#include <string>\n\nclass Gro...
3,"[3, 1, 4, 5, 9, 0]",Below is an instruction that describes a task....,Implement a sorting algorithm to sort a given ...,def bubble_sort(arr):\n n = len(arr)\n \n ...
4,Not applicable,Below is an instruction that describes a task....,Design a Swift application for tracking expens...,import UIKit\n\nclass ExpenseViewController: U...
5,Not Applicable,Below is an instruction that describes a task....,Create a REST API to convert a UNIX timestamp ...,<?php\n$timestamp = $_GET['timestamp'];\n\nif(...
6,website: www.example.com \ndata to crawl: phon...,Below is an instruction that describes a task....,Generate a Python code for crawling a website ...,import requests\nimport re\n\ndef crawl_websit...
7,,Below is an instruction that describes a task....,Create a Python list comprehension to get the ...,"[x*x for x in [1, 2, 3, 5, 8, 13]]"
8,,Below is an instruction that describes a task....,Create a MySQL query to find the most expensiv...,SELECT * FROM products ORDER BY price DESC LIM...
9,Not applicable,Below is an instruction that describes a task....,Create a data structure in Java for storing an...,public class Library {\n \n // map of books in...


In [None]:
df.shape

(121959, 4)

Instruction Fintuning - Prepare the dataset under the format of "prompt" so the model can better understand :
1. the function generate_prompt : take the instruction and output and generate a prompt
2. shuffle the dataset
3. tokenizer the dataset

In [None]:
df.to_parquet('/content/drive/MyDrive/CodeBot/Mistral-7B/Dataset/CodeAssitantRawData.parquet')

### Formatting the Dataset

Now, let's format the dataset in the required [Mistral-7B-Instruct-v0.1 format](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1).



We'll put each instruction and input pair between `[INST]` and `[/INST]` output after that, like this:

```
<s>[INST] What is your favorite condiment? [/INST]
Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavor to whatever I'm cooking up in the kitchen!</s>
```

You can use the following code to process your dataset and create a JSONL file in the correct format:

In [None]:
def generate_prompt(data_point):

    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
               'appropriately completes the request.\n\n'

    # Samples with additional context into.
    if data_point['input']:
        text = f"""<s>[INST]{prefix_text} {data_point["instruction"]} here are the inputs {data_point["input"]} [/INST]{data_point["output"]}</s>"""
    # Without
    else:
        text = f"""<s>[INST]{prefix_text} {data_point["instruction"]} [/INST]{data_point["output"]} </s>"""
    return text

# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

We'll need to tokenize our data so the model can understand.


In [None]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map:   0%|          | 0/121959 [00:00<?, ? examples/s]

Split dataset into 90% for training and 10% for testing

In [None]:
dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
training_data = train_data.to_pandas()
testing_data = test_data.to_pandas()

In [None]:
training_data.shape,testing_data.shape

((97567, 7), (24392, 7))

In [None]:
training_data.head(2)

Unnamed: 0,input,text,instruction,output,prompt,input_ids,attention_mask
0,"['apple', 'banana', 'orange', 'strawberry', 'g...",Below is an instruction that describes a task....,Create an Angular application for displaying a...,import { Component } from '@angular/core';\n\n...,<s>[INST]Below is an instruction that describe...,"[1, 1, 733, 16289, 28793, 20548, 336, 349, 396...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Name: John Doe; Age: 30; Gender: Male,Below is an instruction that describes a task....,Create a JavaScript class to represent a perso...,"class Person {\n constructor(name, age, gen...",<s>[INST]Below is an instruction that describe...,"[1, 1, 733, 16289, 28793, 20548, 336, 349, 396...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
training_data['prompt'].iloc[0]

'<s>[INST]Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n Create an Angular application for displaying a list of items. The application should enable users to sort the list of items. here are the inputs [\'apple\', \'banana\', \'orange\', \'strawberry\', \'grape\'] [/INST]import { Component } from \'@angular/core\';\n\n@Component({\n selector: \'my-app\',\n templateUrl: \'./app.component.html\',\n styleUrls: [ \'./app.component.css\' ]\n})\nexport class AppComponent  {\n  name = \'Angular\';\n\n  fruits = [\'apple\', \'banana\', \'orange\', \'strawberry\', \'grape\'];\n\n  sortFruits() {\n    this.fruits.sort();\n  }\n\n}\n\n<div>\n  <button (click)="sortFruits()">Sort</button>\n  <ul>\n    <li *ngFor="let fruit of fruits">{{fruit}}</li>\n  </ul>\n</div></s>'

In [None]:
training_data.to_parquet('/content/drive/MyDrive/CodeBot/Mistral-7B/Dataset/SampleTrainingData.parquet')
testing_data.to_parquet('/content/drive/MyDrive/CodeBot/Mistral-7B/Dataset/SampleTestingData.parquet')

### After Formatting, We should get something like this

```json
{
"text":"<s>[INST] Create a function to calculate the sum of a sequence of integers. here are the inputs [1, 2, 3, 4, 5] [/INST]
# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum</s>",
"instruction":"Create a function to calculate the sum of a sequence of integers",
"input":"[1, 2, 3, 4, 5]",
"output":"# Python code def sum_sequence(sequence): sum = 0 for num in,
 sequence: sum += num return sum"
"prompt":"<s>[INST] Create a function to calculate the sum of a sequence of integers. here are the inputs [1, 2, 3, 4, 5] [/INST]
# Python code def sum_sequence(sequence): sum = 0 for num in sequence: sum += num return sum</s>"

}
```

While using SFT (**[Supervised Fine-tuning Trainer](https://huggingface.co/docs/trl/main/en/sft_trainer)**) for fine-tuning, we will be only passing in the “text” column of the dataset for fine-tuning.

In [None]:
print(test_data)

Dataset({
    features: ['input', 'text', 'instruction', 'output', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 24392
})


## Step 4 - Apply Lora  
Here comes the magic with peft! Let's load a PeftModel and specify that we are going to use low-rank adapters (LoRA) using get_peft_model utility function and  the prepare_model_for_kbit_training method from PEFT.

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

Use the following function to find out the linear layers for fine tuning.
QLoRA paper : "We find that the most critical LoRA hyperparameter is how many LoRA adapters are used in total and that LoRA on all linear transformer block layers is required to match full finetuning performance."

In [None]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [None]:
modules = find_all_linear_names(model)
print(modules)

['down_proj', 'up_proj', 'v_proj', 'o_proj', 'q_proj', 'gate_proj', 'k_proj']


In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [None]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")


Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


## Step 5 - Run the training!

In [None]:
from huggingface_hub import notebook_login
notebook_login()

Setting the training arguments:
* for the reason of demo, we just ran it for few steps (100) just to showcase how to use this integration with existing tools on the HF ecosystem.

In [None]:
# from datasets import load_dataset
# data = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split='train')
# data = data.train_test_split(test_size=0.1)
# train_data = data["train"]
# test_data = data["test"]

In [None]:
# import transformers

# tokenizer.pad_token = tokenizer.eos_token


# trainer = transformers.Trainer(
#     model=model,
#     train_dataset=train_data,
#     eval_dataset=test_data,
#     args=transformers.TrainingArguments(
#         per_device_train_batch_size=1,
#         gradient_accumulation_steps=4,
#         warmup_ratio=0.03,
#         max_steps=100,
#         learning_rate=2e-4,
#         fp16=True,
#         logging_steps=1,
#         output_dir="outputs_mistral_b_finance_finetuned_test",
#         optim="paged_adamw_8bit",
#         save_strategy="epoch",
#     ),
#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
# )


### Fine-Tuning with qLora and Supervised Fine-Tuning

We're ready to fine-tune our model using qLora. For this tutorial, we'll use the `SFTTrainer` from the `trl` library for supervised fine-tuning. Ensure that you've installed the `trl` library as mentioned in the prerequisites.

In [None]:
#new code using SFTTrainer
import transformers

from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

max_steps is given, it will override any value given in num_train_epochs


Start the training

### Let's start the training process

In [None]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()




Step,Training Loss
1,2.5216
2,1.9731
3,1.5858
4,1.086
5,1.4027
6,1.0239
7,0.9907
8,0.7124
9,0.8221
10,0.8608


TrainOutput(global_step=100, training_loss=0.666908483505249, metrics={'train_runtime': 2236.9171, 'train_samples_per_second': 0.179, 'train_steps_per_second': 0.045, 'total_flos': 3734778424246272.0, 'train_loss': 0.666908483505249, 'epoch': 0.004099746840632591})

 Share adapters on the 🤗 Hub

In [None]:
new_model = "mistralai-Code-Instruct-Finetune-IK" #Name of the model you will be pushing to huggingface model hub

In [None]:
trainer.model.save_pretrained(new_model)

In [None]:
save_directory = "/content/drive/MyDrive/CodeBot/Mistral-7B/Training"
trainer.model.save_pretrained(save_directory)

In [None]:
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig.from_pretrained('/content/drive/MyDrive/CodeBot/Mistral-7B/Training')
model = get_peft_model(model, lora_config)

In [None]:
model.push_to_hub("Mistral-7B-qlora-finetunined-IK")

adapter_model.safetensors:   0%|          | 0.00/84.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/muskwaa/Mistral-7B-qlora-finetunined-IK/commit/421d8a3ae40d49cab9a931655bd8608f36d4f061', commit_message='Upload model', commit_description='', oid='421d8a3ae40d49cab9a931655bd8608f36d4f061', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("muskwaa/Mistral-7B-qlora-finetunined-IK")

OSError: None is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

## Merging Adaptor and BaseModel

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
from safetensors.torch import load_file

In [None]:
from transformers import AutoModel, AutoTokenizer,AutoConfig, AutoModelForSequenceClassification
from safetensors import safe_open
import torch

# Paths to the adapter files
adapter_config_path = "/content/drive/MyDrive/CodeBot/Mistral-7B/Training/adapter_config.json"
adapter_weights_path = "/content/drive/MyDrive/CodeBot/Mistral-7B/Training/adapter_model.safetensors"


adapter_weights = load_file(adapter_weights_path)

# Add the adapter weights to the base model
base_model.load_state_dict(adapter_weights, strict=False)



_IncompatibleKeys(missing_keys=['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gat

In [None]:
# Save the merged model locally
save_directory = "/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik"
base_model.save_pretrained(save_directory)

# Save the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik/tokenizer_config.json',
 '/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik/special_tokens_map.json',
 '/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik/tokenizer.model',
 '/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik/added_tokens.json',
 '/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik/tokenizer.json')

In [None]:
base_model.push_to_hub("mistral-qlora-finetunined-ik-assitant")

model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/muskwaa/mistral-qlora-finetunined-ik-assitant/commit/279df01d313e94ab6a85d63e6add75ced994bfff', commit_message='Upload MistralForCausalLM', commit_description='', oid='279df01d313e94ab6a85d63e6add75ced994bfff', pr_url=None, pr_revision=None, pr_num=None)

# Loading the Model

In [None]:
save_directory = "/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik"

# Load the model from the directory
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

# Load the tokenizer from the directory
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Example usage: Tokenizing and making a prediction
inputs = tokenizer("code the fibonacci series in python using reccursion", return_tensors="pt")
outputs = model(**inputs)

print(outputs)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SequenceClassifierOutputWithPast(loss={'logits': tensor([[-5.9375,  1.6328]], dtype=torch.float16, grad_fn=<ToCopyBackward0>), 'past_key_values': ((tensor([[[[-5.0781e-01,  1.2598e-01,  6.7578e-01,  ..., -1.6250e+00,
           -2.1562e+00, -2.1875e+00],
          [ 6.0391e+00, -4.3789e+00, -2.8750e+00,  ...,  2.1406e+00,
            1.2578e+00,  6.3232e-01],
          [ 1.2793e+00, -1.8291e+00,  2.8467e-01,  ...,  8.0029e-01,
           -7.7344e-01, -1.6797e+00],
          ...,
          [ 2.1055e+00,  6.2266e+00,  4.8984e+00,  ...,  2.5000e+00,
            1.6738e+00,  7.6562e-01],
          [ 8.4375e+00,  4.0508e+00,  3.4355e+00,  ...,  2.4375e+00,
            1.4385e+00,  4.8633e-01],
          [ 1.5928e+00, -4.3945e-03,  3.1567e-01,  ...,  4.2822e-01,
            3.7378e-01,  2.9248e-01]],

         [[ 6.9336e-02,  7.0312e-02,  7.2754e-02,  ...,  3.5625e+00,
           -3.6875e+00,  1.7578e+00],
          [ 1.4221e-01, -3.0737e-01, -1.4941e-01,  ..., -2.9688e+00,
            2.937

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# ik_tokenizer = AutoTokenizer.from_pretrained("muskwaa/mistral-qlora-finetunined-ik-assitant")
# ik_model = AutoModelForCausalLM.from_pretrained("muskwaa/mistral-qlora-finetunined-ik-assitant")

In [None]:
# # Push the model and tokenizer to the Hugging Face Model Hub

# new_model.push_to_hub(new_model, use_temp_dir=False)
# tokenizer.push_to_hub(new_model, use_temp_dir=False)

## Step 6 Evaluating the model qualitatively: run an inference!



In [None]:
def get_completion_merged(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <s>
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]
  </s>


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)

  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return (decoded[0])

In [None]:
from transformers import MistralForCausalLM

In [None]:
save_directory = "/content/drive/MyDrive/CodeBot/Mistral-7B/Training/FineTunedModel/mistral-finetuned-ik"

# Load the model from the directory
model = MistralForCausalLM.from_pretrained(save_directory)

# Load the tokenizer from the directory
tokenizer = AutoTokenizer.from_pretrained(save_directory)

result = get_completion_merged(query="code the fibonacci series in python using reccursion", model=model, tokenizer=tokenizer)
print(result)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


<s> 
  <s> 
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  code the fibonacci series in python using reccursion
  [/INST]
  </s> 


  </s>

  Here is an example of how you can code the Fibonacci series in Python using recursion:


  def fibonacci(n):
    if n <= 1:
      return n
    else:
      return fibonacci(n-1) + fibonacci(n-2)

  
  # Example usage:
  print(fibonacci(10))
  output: 55
 

  Note: Recursion can be efficient for small values of n, but it may not be practical for larger values due to the high number of function calls. An iterative approach may be more efficient for larger values of n.</s>


In [None]:
query = "I was solving problem on dynamic programming, basically the fibonacci series using dynamic programming, i want to get started with memoziation, how do i get started, i do not want code for this."
result = get_completion_merged(query=query, model=model, tokenizer=tokenizer)
print(result)

<s> 
  <s> 
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  I was solving problem on dynamic programming, basically the fibonacci series using dynamic programming, i want to get started with memoziation, how do i get started, i do not want code for this.
  [/INST]
  </s> 


  </s> 

  To get started with memoization, you can begin by creating an auxiliary array or hash table to store the values that have already been calculated. This will allow you to avoid redundant calculations and improve the efficiency of your algorithm. Here's an example of how you can implement memoization in the context of the fibonacci series:
```scss
int fibonacci(int n, int[] memo) {
  if (n == 1) {
    return 0;
  } else if (n == 2) {
    return 1;
  } else {
    memo[n] = fibonacci(n-1, memo) + fibonacci(n-2, memo);
    return memo[n];
  }
}
int fibonacciMemo(int n) {
  int memo[] = new int[n+1];
  return fibonacci(n, memo);
}
```
In thi

In [None]:
!pip install transformers



In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets scipy
!pip install -q trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.t

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import MistralForCausalLM

tokenizer = AutoTokenizer.from_pretrained("muskwaa/mistral-qlora-finetunined-ik-assitant")
model = MistralForCausalLM.from_pretrained("muskwaa/mistral-qlora-finetunined-ik-assitant")

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


RuntimeError: No GPU found. A GPU is needed for quantization.