In [None]:
!pip install torch transformers accelerate bitsandbytes peft trl datasets wandb
!pip install datasets pandas treelib

In [None]:
!python -m bitsandbytes

In [5]:
import json
from collections import defaultdict
import pandas as pd
from datasets import load_dataset

# Load dataset
dataset = load_dataset("OpenAssistant/oasst1", split="train")

# Convert to Pandas for easy manipulation
df = dataset.to_pandas()

# Filter for English conversations
df = df[df["lang"] == "en"]

# Sort messages by conversation tree and index
df = df.sort_values(by=["message_tree_id"])
df = df.sort_index()

# Group by conversation trees
conversation_trees = defaultdict(list)
for _, row in df.iterrows():
    conversation_trees[row["message_tree_id"]].append(row)

# Function to extract multi-turn conversations
def process_conversations(conversation_list, output_full_tree=True):
    formatted_data = []
    
    # Maintain context
    context = ""
    response = ""  # ✅ Define response outside the loop to avoid scope issues

    for i, message in enumerate(conversation_list):
        if message["role"] == "prompter":  # User's message
            context += f"\n\n### User:\n{message['text']}"
            response = ""  # Reset response for the new user prompt
            
            
        elif message["role"] == "assistant":
            if not response:
                response = message["text"]  # ✅ First response in thread

            if output_full_tree:
                # Store the structured prompt-response pair
                formatted_data.append({"instruction": context.strip(), "response": response})
                
                # Append response to context for next turns
                context += f"\n\n### Assistant:\n{response}"

            else:
                if i == 1:
                    # Store the structured prompt-response pair
                    formatted_data.append({"instruction": context.strip(), "response": response})
                
                break
            
        #print(i, context)
    return formatted_data

# Apply processing to all conversation trees
final_data = []
for conversation in conversation_trees.values():
    final_data.extend(process_conversations(conversation, output_full_tree=False))
    #print(conversation)
    #break

# Save the processed dataset
with open("oasst1_multi_turn_phi2.json", "w") as f:
    json.dump(final_data, f, indent=4)

#df.sample(1).transpose().to_dict()

In [6]:
from datasets import load_dataset

# Load JSON dataset
dataset = load_dataset("json", data_files="oasst1_multi_turn_phi2.json")

# Load Phi-2 tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    # Ensure instructions and responses are processed as lists
    instructions = examples["instruction"]
    responses = examples["response"]

    # Concatenate instruction and response for each example
    texts = [f"{inst}\n\n### Assistant:\n{resp}" for inst, resp in zip(instructions, responses)]

    # Tokenize the batch of texts
    return {'input_text': texts} #tokenizer(texts, padding="max_length", truncation=True, max_length=1024)


# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Map:   0%|          | 0/3484 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

device_map = {"": 0}
model_name = "microsoft/phi-2"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map=device_map
)
model.config.use_cache = False
print(model)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_la

In [8]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2"
]

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

In [9]:
from trl import SFTConfig
import time

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 8
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "constant"
run_name = f"phi2-qlora-run-{int(time.time())}"
max_seq_length = 256

sft_config = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    dataset_text_field="input_text",
    max_seq_length=max_seq_length,
    run_name=run_name,
    report_to="none"
)

In [10]:
from trl import SFTTrainer

max_seq_length = 256

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
)

trainer.train()

Converting train dataset to ChatML:   0%|          | 0/3484 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3484 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3484 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2856 > 2048). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/3484 [00:00<?, ? examples/s]

Step,Training Loss
10,11.6459
20,12.3385
30,13.3166
40,11.0515
50,13.0837
60,12.5535
70,10.8255
80,13.6944
90,11.0348
100,11.2969


TrainOutput(global_step=500, training_loss=11.38444842529297, metrics={'train_runtime': 7163.9105, 'train_samples_per_second': 2.233, 'train_steps_per_second': 0.07, 'total_flos': 5.233902520516608e+16, 'train_loss': 11.38444842529297})

In [13]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "microsoft/phi-2"
device_map = {"": 0}

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map=device_map,
)

from peft import PeftModel
new_model = "/kaggle/working/results/checkpoint-500"
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
from transformers import pipeline

prompt = "\n\n### User:\nWhat was the role of indian revolutionaries in indian independence movement ?\n\n### Assistant:\n"  # change to your desired prompt
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=500)
result = gen(prompt)
print(result[0]['generated_text'])

Device set to use cuda:0




### User:
What was the role of indian revolutionaries in indian independence movement ?

### Assistant:
Indian revolutionaries played a significant role in the Indian independence movement. They were individuals who used various means, including violence, to fight against British colonial rule and to promote the idea of an independent India.

The Indian revolutionaries were inspired by the ideas of nationalism and freedom that were spreading throughout the world at the time. They were influenced by the writings of Indian leaders such as Mahatma Gandhi, Jawaharlal Nehru, and Subhash Chandra Bose, who advocated for non-violent resistance and armed struggle against British rule.

The revolutionaries used various methods to carry out their activities, including assassinations of British officials, bombings, and sabotage. Some of the most famous Indian revolutionaries include Bhagat Singh, Sukhdev Thapar, and Rash Behari Bose.

The revolutionaries were also involved in the formation of va

In [22]:
prompt = '''### User:\nExplain CNN to a five year old.\n\n### Assistant:\nA Convolutional Neural Network (CNN) is like a special kind of computer brain that can look at pictures and understand what they are about. It's like a big puzzle solver that can find patterns and shapes in pictures.\nImagine you have a picture of a cat. A CNN can look at each tiny piece of the picture, called a pixel, and figure out what color it is. Then it can look at the colors of the pixels around it and figure out if they make up the cat's fur or its eyes. It can do this over and over again, looking at different parts of the picture, until it understands the whole picture and knows it's a cat.\nCNNs are used in lots of different things, like helping robots see and understand the world around them, or helping doctors look at X-rays and find problems in our bodies. They're really smart and can do lots of things that humans can do, but even better!\nSo, in short, a CNN is a special kind of computer brain that can look at pictures and understand what they are about by finding patterns and shapes in them. It's like a big puzzle solver that can do things that humans can do, but even better!\nI hope that helps you understand what a CNN is! Let me know if you have any other questions.\n\n### User:\nWhat does it mean by convolution here?\n\n### Assistant:\n'''  # change to your desired prompt
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=500)
result = gen(prompt)
print(result[0]['generated_text'])

Device set to use cuda:0


### User:
Explain CNN to a five year old.

### Assistant:
A Convolutional Neural Network (CNN) is like a special kind of computer brain that can look at pictures and understand what they are about. It's like a big puzzle solver that can find patterns and shapes in pictures.
Imagine you have a picture of a cat. A CNN can look at each tiny piece of the picture, called a pixel, and figure out what color it is. Then it can look at the colors of the pixels around it and figure out if they make up the cat's fur or its eyes. It can do this over and over again, looking at different parts of the picture, until it understands the whole picture and knows it's a cat.
CNNs are used in lots of different things, like helping robots see and understand the world around them, or helping doctors look at X-rays and find problems in our bodies. They're really smart and can do lots of things that humans can do, but even better!
So, in short, a CNN is a special kind of computer brain that can look at picture

In [None]:
'''### User:\nExplain CNN to a five year old.\n\n### Assistant:\nA Convolutional Neural Network (CNN) is like a special kind of computer brain that can look at pictures and understand what they are about. It's like a big puzzle solver that can find patterns and shapes in pictures.\nImagine you have a picture of a cat. A CNN can look at each tiny piece of the picture, called a pixel, and figure out what color it is. Then it can look at the colors of the pixels around it and figure out if they make up the cat's fur or its eyes. It can do this over and over again, looking at different parts of the picture, until it understands the whole picture and knows it's a cat.\nCNNs are used in lots of different things, like helping robots see and understand the world around them, or helping doctors look at X-rays and find problems in our bodies. They're really smart and can do lots of things that humans can do, but even better!\nSo, in short, a CNN is a special kind of computer brain that can look at pictures and understand what they are about by finding patterns and shapes in them. It's like a big puzzle solver that can do things that humans can do, but even better!\nI hope that helps you understand what a CNN is! Let me know if you have any other questions.\n\n### User:\nWhat does it mean by convolution here?'''

In [37]:
import torch

import gc

gc.collect()  # ✅ Python garbage collection
torch.cuda.empty_cache()
!nvidia-smi

Sat Mar  8 06:36:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             32W /  250W |   16191MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                