In [None]:
# !pip install "transformers==4.35" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken"

In [None]:
%pip install -U bitsandbytes
%pip install -U accelerate
%pip install -U trl
%pip install -U transformers
%pip install -U peft
%pip install -U datasets==2.16.0

Collecting datasets>=2.21.0 (from trl)
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers>=4.46.0 (from trl)
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers>=4.46.0->trl)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached datasets-3.2.0-py3-none-any.whl (480 kB)
Downloading transformers-4.48.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokeniz

In [1]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Read the question/answer pairs from the CSV file into a DataFrame and
# preview the first five rows.
# NOTE(review): absolute path, presumably a Colab upload location; adjust when
# running elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/Conversation.csv")
df.head(n=5)

In [None]:
# Combine each question/answer pair into the single training string that is fed
# to the TinyLlama model. The layout is: the literal 'Question:' line, the
# question, a blank line, the literal 'Answer:' line, then the answer.
question_block = 'Question:\n' + df['question']
answer_block = '\n\nAnswer:\n' + df['answer']
df['text'] = question_block + answer_block
df.head()

In [None]:
# Remove the columns that are not needed for training (this drops columns, not
# rows), leaving the combined 'text' column for the trainer.
# errors='ignore' keeps the cell working when a listed column is absent, e.g.
# a CSV written without an index has no 'Unnamed: 0' column.
df1 = df.drop(columns=['Unnamed: 0', 'question', 'answer'], errors='ignore')


In [None]:
# Wrap the prepared DataFrame in a Hugging Face `Dataset` and show its summary
# as the cell output.
train_datasets = Dataset.from_pandas(df=df1)
train_datasets

## Finetuning

In [None]:
# Hugging Face Hub id of the base checkpoint; the tokenizer and the model are
# both loaded from this id in the cells below.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without storing the access token in
# the notebook. The token is read from the HF_TOKEN environment variable; when
# it is unset or empty, token=None is passed so that login() falls back to its
# own interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# dtype handed to bitsandbytes as the 4-bit compute dtype
compute_dtype = torch.float16

# bitsandbytes quantization settings: 4-bit loading with the "nf4" quant type,
# double quantization switched off, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
%%time

# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
%%time
# load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

In [None]:
# LoRA adapter hyper-parameters (chosen by the author after the QLoRA paper):
# rank 64, alpha 16, dropout 0.1, no bias terms, causal-LM task type.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)


In [None]:
# Training hyper-parameters. Full argument reference:
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
# NOTE(review): the output directory name mentions "llama2-7b" although
# `model_id` is a TinyLlama checkpoint. The reload cell reads the same literal
# path, so rename both together if you change it.
args = TrainingArguments(
    output_dir='llama2-7b-tuned-qna',
    seed=42,
    fp16=True,
    learning_rate=2e-4,
    num_train_epochs=10,            # adjust based on the data size
    per_device_train_batch_size=2,  # use 4 if you have more GPU RAM
    save_strategy="epoch",          # alternative: "steps"
    # evaluation_strategy="epoch",
)


In [None]:
# Build the supervised fine-tuning trainer from the quantized model, the LoRA
# config, the tokenizer and the training arguments. Training text is read from
# the 'text' field of `train_datasets`, with packing enabled.
# NOTE(review): max_seq_length=1042 may be a transposition of 1024; confirm.
trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=train_datasets,
    # eval_dataset=test,
    dataset_text_field='text',
    max_seq_length=1042,
    packing=True,
)

In [None]:
# Run the fine-tuning loop; the object returned by train() is echoed as the
# cell output.
training_output = trainer.train()
training_output

In [None]:
# Persist the fine-tuned model to local disk. No path is passed, so the trainer
# chooses the destination itself. That is presumably the `output_dir` given to
# TrainingArguments ('llama2-7b-tuned-qna'), the same directory the reload cell
# reads from; confirm against the installed library version.
trainer.save_model()

# Merge the base model and adapters and save it


In [None]:
# clean memory
# Empty VRAM
# del model
# del trainer
# import gc
# gc.collect()
# gc.collect()
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# reload the save model
%%time
from peft import AutoPeftModelForCausalLM

new_model = AutoPeftModelForCausalLM.from_pretrained(
    'llama2-7b-tuned-qna',
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Test the model

In [None]:
# Query the fine-tuned model using the SAME template the training text was
# built with ('Question:\n<question>\n\nAnswer:\n'). A bare question does not
# match what the adapter was trained on.
question = "what is AI?"
prompt = f"Question:\n{question}\n\nAnswer:\n"

# Tokenize and move the tensors to whichever device the model was placed on
# (device_map="auto"), instead of assuming CUDA.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(new_model.device)

# The attention mask is passed explicitly because the pad token is the EOS
# token, so generate() cannot infer the mask reliably on its own.
outputs = new_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=200,
    do_sample=True,
    top_p=0.9,
    temperature=0.6,
)

# batch_decode accepts the generated tensor directly.
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print(result)



In [None]:
# Merge LoRA and base model: call merge_and_unload() on the reloaded PEFT model
# and keep the result as `merged_model`, which the following cells save locally
# and push to the Hub.
merged_model = new_model.merge_and_unload()

In [None]:
# Write the merged weights (safe_serialization=True) and the tokenizer files
# into one local directory.
merged_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0_merged"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

In [None]:
import os

from huggingface_hub import login

# Authenticate before pushing to the Hub. The token is read from the HF_TOKEN
# environment variable rather than being written into the notebook; when it is
# unset or empty, token=None is passed so that login() falls back to its own
# interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
%%time
hf_model_repo = "Finetune_for_personal_assistant"
merged_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)


In [None]:
import time

# Hub repository holding the pushed merged model. Fill in the full repository
# id (e.g. "<user>/Finetune_for_personal_assistant") before running.
hf_model_repo = ""
if not hf_model_repo:
    # Fail with an actionable message instead of an obscure loader error.
    raise ValueError(
        "hf_model_repo is empty - set it to the Hub repository id of the pushed model."
    )

# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)

# Load the model with the same 4-bit quantization config used for training
model = AutoModelForCausalLM.from_pretrained(
    hf_model_repo,
    quantization_config=bnb_config,
    device_map="auto",
)

# Alternative questions:
# question = "What is the name of the new vision foundation model introduced in the paper?"
# question = "How does the Florence-2 model take user instructions?"
# question = "What is the output format of the tasks that the Florence-2 model can handle?"
question = "What is the main challenge addressed by the paper?"

# Use exactly the template the training text was built with.
prompt = f"Question:\n{question}\n\nAnswer:\n"

# Generate response. `%%time` is only valid as the first line of a cell, so the
# generation step is timed manually here.
start = time.perf_counter()
# Move the inputs to the device the model was placed on (device_map="auto").
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=200,
    do_sample=True,  # temperature only takes effect when sampling is enabled
    temperature=0.6,
)
elapsed = time.perf_counter() - start

result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print the result
print(f"Generated response:\n{result}")
print(f"Generation time: {elapsed:.2f} s")




In [None]:
# Ad-hoc query against the model loaded from the Hub: put the question text
# between the quotes. The prompt uses exactly the template the training text
# was built with.
question = ""
prompt = f"Question:\n{question}\n\nAnswer:\n"

# Generate response
# Move the inputs to the device the model was placed on (device_map="auto").
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=200,
    do_sample=True,  # temperature only takes effect when sampling is enabled
    temperature=0.1,
)

result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print the result
print(f"Generated response:\n{result}")