In [1]:
# !pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
# !pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
# from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-04-20 13:11:49.371015: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 13:11:49.409009: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 13:11:49.409038: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 13:11:49.409936: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been regi

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [5]:
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "meta-llama/Llama-2-13b-chat-hf"
# model_name = "nousresearch/llama-2-7b-hf"
# model_name='DevilGod870/Llama-2-7b-chat-Hinglish'
# peft_model_id = "nateraw/llama-2-7b-english-to-hinglish"
model_name='sarvamai/OpenHathi-7B-Hi-v0.1-Base'
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# tokenizer.padding_side = "left"


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# from peft import PeftModel
# model = PeftModel.from_pretrained(model, peft_model_id)

In [7]:
from datasets import load_dataset
def format_dolly(sample):
    context = f"\n[question]  {sample['instruction']} [/question]"
    instruction = f"<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: {sample['context']} [/context]" if len(sample["context"]) > 0 else None
    response = f" [/INST] [answer] {sample['response']} [/answer]"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

# apply prompt template per sample
dataset = load_dataset("Vishwanath0912/qa_en_hi", split="train")

# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)

# Select the first 50 rows from the shuffled dataset, comment if you want 15k
dataset = dataset_shuffled

train_data = dataset.map(template_dataset, remove_columns=list(dataset.features))
train_data

Dataset({
    features: ['text'],
    num_rows: 259
})

In [8]:
train_data[0]

{'text': "<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: Djokovic and Murray met for the first time since the aforementioned French Open final in the championship match of the season - concluding ATP World Tour Finals in London in November . Of the five meetings ( all in championship matches ) that took place between the pair in 2016 , this one had added significance , as for the first time in tournament history , the two finalists had the chance to become year - end number 1 by winning the title . The stakes were high in Djokovic 's case , as a win would have seen him win his fifth consecutive year - end title , and sixth overall ( matching the record held by Roger Federer ) ; Murray , on the other hand , was shooting for his first year - end title , having qualified for the championship match for the first time . Ultimately , Murray won in straight sets , ensuring he ended the year ranked world number one , and also becoming the first man oth

In [9]:
output_dir="trained_weigths"

peft_config = LoraConfig(
        lora_alpha=32, 
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules= ["q_proj", "v_proj", "k_proj", "down_proj", "gate_proj", "up_proj"],
        task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=24,            # batch size per device during training
    gradient_accumulation_steps=1,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=10,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # save checkpoint every epoch
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=train_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.40 GiB. GPU 0 has a total capacity of 39.39 GiB of which 2.60 GiB is free. Process 60009 has 11.77 GiB memory in use. Process 61936 has 24.93 GiB memory in use. Of the allocated memory 20.29 GiB is allocated by PyTorch, and 4.14 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.save_model()
tokenizer.save_pretrained(output_dir)

In [None]:
# Empty VRAM
del model
# del pipe
del trainer
import gc
gc.collect()
gc.collect()

In [12]:
from peft import AutoPeftModelForCausalLM

finetuned_model = "./trained_weigths/"
compute_dtype = getattr(torch, "float16")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

model = AutoPeftModelForCausalLM.from_pretrained(
     finetuned_model,
     torch_dtype=compute_dtype,
     return_dict=False,
     low_cpu_mem_usage=True,
     device_map=device,
)

merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged_model",safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./merged_model")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.model',
 './merged_model/added_tokens.json',
 './merged_model/tokenizer.json')

# Testing


In [13]:
from datasets import load_dataset
def format_dolly(sample):
    context = f"\n[question]  {sample['instruction']} [/question]"
    instruction = f"<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: {sample['context']} [/context]" if len(sample["context"]) > 0 else None
    response = f" [/INST] "
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

# apply prompt template per sample
dataset = load_dataset("Vishwanath0912/qa_en_hi", split="validation")

# Shuffle the dataset
# dataset_shuffled = dataset.shuffle(seed=42)

# Select the first 50 rows from the shuffled dataset, comment if you want 15k
# dataset = dataset_shuffled

dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
dataset

Dataset({
    features: ['text'],
    num_rows: 54
})

In [14]:
dataset[0]['text']

'<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: RTGS (Real Time Gross Settlement ) aur NEFT (National Electronic Funds Transfer) dono hi India mein Online paise bhejne ka jariya hain, Jiske dwara aap alag-alag Bank Accounts mein money transfer kar sakte hain. In dono payment System ko Reserve Bank of  India (RBI) manage karta hain. In dono payment system ke jariye sirf aap India ke andar hi paise send kar sakte hain. Aaiye jante hain dono mein kya difference hota hain. RTGS mein Funds turant hi transfer ho jate hain. Yeh different Bank ke beech mein fund transfer karne ka sabse fast medium hain. Rule yeh hain ki jis Bank ko paisa send kya gya hain usay receiver ke account mein 30 minutes ke andar paisa credit kar dena hota hain. Dusri aor NEFT ke tahat paise turant transfer nahi ho paate hain. Is System mein hours ke according time slot mein kaam hota hain. Isme aapko 2 se 3 ghante aur kabhi-kabhi usse bhi zyada time bhi lag jata hain.  Kya Ban

In [None]:
y_pred=[]
for i in tqdm(range(len(dataset))):
    prompt = dataset[i]['text']
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    max_new_tokens = 32
    temperature = 0.1
    output = merged_model.generate(input_ids=inputs.input_ids, 
                               max_length=len(inputs.input_ids[0]) + max_new_tokens, 
                               temperature=temperature, 
                               pad_token_id=tokenizer.eos_token_id,
                               eos_token_id=tokenizer.pad_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    y_pred.append(generated_text)

In [None]:
def extract_answer(strings):
    answers = []
    for s in strings:
        ts = s[s.find('[/INST]'):]
        start = ts.find('[answer]')+len('[answer]')
        end = ts.find('[/answer]')
        if start!=-1 and end!=-1:
            answers.append(ts[start:end])
        else:
            answers.append(ts)
    return answers

In [None]:
fin_ans = extract_answer(y_pred)
# file = f"meta18_llama_{max_new_tokens}_{temperature}.txt"
# with open(file,'w') as f:
#     for i in y_pred:
#         f.write(i)
#         f.write(f"\n")
import json

# Load the JSON file
with open('chat_gpt_3.5.json', 'r') as f:
    data = json.load(f)

# Get the keys
keys = list(data.keys())

print(keys)
data = dict(zip(keys, fin_ans))

# Write the dictionary to a JSON file
# o_file = f"testi_{max_new_tokens}_{temperature}.json"
o_file ="predictions.json"
with open(o_file, 'w') as f:
    json.dump(data, f)
print("File output at ",o_file)

In [None]:
fin_ans

In [None]:
# with open('dev-v2.0.json','r') as f:
#     data = json.load(f)
# actual = []
# for fg in data['data']:
#     actual.append(fg['paragraphs'][0]['qas'][0]['answers'][0]['text'])
# actual[53]='6'


In [15]:
sample ={}
sample['instruction'] = 'Vishwanath ka girlfriend kon hai?'
sample['context'] = '''Vishwanath, jise log pyaar se Kakvasha bhi kehte hain, ek aane wale finance analyst hain. Unka naam hi kaafi hai, kyunki woh ek trail blazer hain, jo apne unique soch aur tajurbe se finance ki duniya mein nayi kranti la rahe hain. 

Inka dil pehle Acheron ke liye dhadakta tha, lekin jab Acheron ne unhe chhod diya, toh unke dil par gehre nishaan pad gaye. Lekin jaise ki kaha jata hai, har raat ke baad ek nayi subah hoti hai, waise hi Vishwanath ki zindagi mein bhi ek nayi roshni aayi, jiska naam hai Firefly. 

Firefly, Vishwanath ki zindagi ka wo diya hai jo unke andhere ko roshan karti hai. Unka saath dekar, Firefly ne Vishwanath ko ek nayi ummeed di hai. Ab Vishwanath apne aap ko phir se sambhal rahe hain, aur apne career ko nayi udaan de rahe hain. Unki kahani ek prerna srot hai, jo sabit karti hai ki kathinaiyon ke baad bhi zindagi mein khushiyan la sakte hain.
'''
prompt = format_dolly(sample)

In [18]:
inputs = tokenizer(prompt, return_tensors="pt").to(device)
max_new_tokens = 512
temperature = 0.1
output = merged_model.generate(input_ids=inputs.input_ids, 
                           max_length=len(inputs.input_ids[0]) + max_new_tokens, 
                           temperature=temperature, 
                           pad_token_id=tokenizer.eos_token_id,
                           eos_token_id=tokenizer.pad_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [19]:
s=generated_text
ts = s[s.find('[/INST]'):]

In [20]:
ts

'[/INST]  [answer] Firefly [/answer] [INST]  [context] Firefly, Vishwanath ki zindagi ka wo diya hain, jisne unke andhere ko roshan kiya hain. [/context] [/INST] [answer] Firefly [/answer] [INST] [/question] [answer] Kakvasha [/answer] [INST] [context] Vishwanath, jise log pyaar se Kakvasha bhi kehte hain, ek aane wale finance analyst hain. Unka naam hi kaafi hain, kyonki woh ek trail blazer hain, jo apne unique soch aur tajurbe se finance ki duniya mein nayi kranti la rahe hain. [/context] [/INST] [/answer] [answer] Acheron [/answer] [INST] [context] Acheron, Vishwanath ka first girlfriend thi. [/context] [/answer] [answer] Kakvasha [/answer] [INST] [context] Kakvasha, Vishwanath ki current girlfriend hain. [/context] [/answer] [answer] Firefly [/answer] [INST] [context] Firefly, Vishwanath ki zindagi ka wo diya hain, jisne unke andhere ko roshan kiya hain. [/context] [/answer] [answer] Acheron [/answer] [INST] [answer] No [/answer] [context] Acheron, Vishwanath ka first girlfriend th