#### Install Dependencies

In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml)

#### Confirm CUDA

In [2]:
import torch
torch.cuda.is_available()

True

#### Load Base Model

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
#    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

##### View Model Summary

In [4]:
print(model)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear8bitLt(in_features=1024, out_features=3072, bias=True)
          (dense): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear8bitLt(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementw

In [5]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

#### Helper Function

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

#### Obtain LoRA Model

In [7]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 786432 || all params: 560001024 || trainable%: 0.14043402892063284


#### Load Sample Dataset

In [8]:
from datasets import load_dataset

qa_dataset = load_dataset("squad_v2")

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

```
### CONTEXT
{context}

### QUESTION
{question}

### ANSWER
{answer}</s>
```

In [9]:
import random

num_samples = 1000

# Generate random indices
random_indices = random.sample(range(len(qa_dataset['train'])), num_samples)

# Sample the records
sampled_records = qa_dataset['train'].select(random_indices)

In [10]:
 # Print the first few records from the training set
for i in range(5):
    print(f"Record {i+1}: {sampled_records[i]}")

Record 1: {'id': '5acd149907355d001abf33b9', 'title': 'Pain', 'context': "The pain signal travels from the periphery to the spinal cord along an A-delta or C fiber. Because the A-delta fiber is thicker than the C fiber, and is thinly sheathed in an electrically insulating material (myelin), it carries its signal faster (5–30 m/s) than the unmyelinated C fiber (0.5–2 m/s). Pain evoked by the (faster) A-delta fibers is described as sharp and is felt first. This is followed by a duller pain, often described as burning, carried by the C fibers. These first order neurons enter the spinal cord via Lissauer's tract.", 'question': 'What is the name for an insulating material?', 'answers': {'text': [], 'answer_start': []}}
Record 2: {'id': '56e07be97aa994140058e55f', 'title': 'Saint_Helena', 'context': "The first aircraft, a South African Beechcraft King Air 200, landed at the new airport on 15 September 2015, prior to conducting a series of flights to calibrate the airport's radio navigation e

In [11]:
def create_prompt(context, question, answer):
  if len(answer["text"]) < 1:
    answer = "Cannot Find Answer"
  else:
    answer = answer["text"][0]
  prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer}</s>"
  return prompt_template

mapped_qa_dataset = sampled_records.map(lambda samples: tokenizer(create_prompt(samples['context'], samples['question'], samples['answers'])))

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

#### Train LoRA

In [12]:
import transformers

trainer = transformers.Trainer(
    model=model,
    train_dataset=mapped_qa_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.4802
2,3.435
3,3.3549
4,3.385
5,3.1527
6,3.3006
7,3.3911
8,3.3453
9,3.5057
10,3.6101


TrainOutput(global_step=100, training_loss=3.0147631549835205, metrics={'train_runtime': 359.11, 'train_samples_per_second': 4.455, 'train_steps_per_second': 0.278, 'total_flos': 756973824245760.0, 'train_loss': 3.0147631549835205, 'epoch': 1.6})

In [13]:
# Save the model to a directory
# Save the model to a directory
model_save_path = "./my_finetuned_model"

model.save_pretrained(model_save_path)

In [14]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_config = PeftConfig.from_pretrained(model_save_path)
model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

qa_model = get_peft_model(model, peft_config)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [15]:
# Assuming 'model' is your PyTorch model
device = next(qa_model.parameters()).device
print("Model is on device:", device)

Model is on device: cpu


```
### CONTEXT
{context}

### QUESTION
{question}

### ANSWER
{answer}</s>

```

In [16]:
from IPython.display import display, Markdown

def make_inference(context, question):
  batch = tokenizer(f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n", return_tensors='pt')

  device = next(qa_model.parameters()).device
  batch = {k: v.to(device) for k, v in batch.items()}

  with torch.cuda.amp.autocast():
    output_tokens = qa_model.generate(**batch, max_new_tokens=30)

  display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))

In [17]:
context = " Chandrayaan-3 was launched    Satish Dhawan Space Centre on 14 July 2023. The spacecraft entered lunar orbit on 5 August, and the lander touched down near the Lunar south pole on 23 August 2023"
question = "When was chandaryan-3 launched? "

make_inference(context, question)

### CONTEXT
 Chandrayaan-3 was launched    Satish Dhawan Space Centre on 14 July 2023. The spacecraft entered lunar orbit on 5 August, and the lander touched down near the Lunar south pole on 23 August 2023

### QUESTION
When was chandaryan-3 launched? 

### ANSWER
The spacecraft was launched on 14 July 2023. The spacecraft entered lunar orbit on 5 August, and the lander touched down near the Lunar south pole