In [None]:
#installing and importing all necessary libraries. Make sure you are connected to a T4 GPU on colab or have a GPU with cuda installed
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Log in to Hugging Face from the command line; enter your access token when asked.
!huggingface-cli login

In [None]:
# Read the Hugging Face access token without echoing it, so the secret does not
# end up in the notebook's saved cell output.
from getpass import getpass

hf_token = getpass('Enter hugging face token :')

In [None]:
# You can use a custom dataset here; the slice bounds control how many rows go
# into the training and evaluation sets.
train_data, eval_data = (
    load_dataset("vibhorag101/phr_mental_therapy_dataset", split=rows)
    for rows in ("train[:50000]", "train[51000:52000]")
)

In [None]:
# Report the number of rows in each split (evaluation first, then training).
for label, split in (("Evaluation", eval_data), ("Training", train_data)):
    print(f"{label} data length : ", len(split))

In [None]:
# Parameters

# --- Model identifiers and locations ---
model_name = "meta-llama/Llama-2-7b-chat-hf"  # make sure you have access to this model
new_model = "llama-2-7b-custom"  # directory the trained adapter is saved to
output_dir = "./results"
device_map = {"": 0}

# --- LoRA (consumed by LoraConfig) ---
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# --- 4-bit quantization (consumed by BitsAndBytesConfig) ---
use_4bit = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"  # name of a torch dtype, resolved with getattr(torch, ...)
use_nested_quant = False

# --- Precision and batching ---
fp16, bf16 = False, False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True

# --- Optimisation schedule ---
num_train_epochs = 1
max_steps = -1
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
lr_scheduler_type = "constant"
warmup_ratio = 0.03
group_by_length = True

# --- Checkpointing and logging cadence (in steps) ---
save_steps = 100
logging_steps = 5

# --- Supervised fine-tuning trainer options ---
max_seq_length = None
packing = False

In [None]:
# System prompt placed in the <<SYS>> section of every prompt sent to the model.
# Adjacent string literals are concatenated into one string.
system_message = (
    "You are a helpful and joyous mental therapy assistant. "
    "Always answer as helpfully and cheerfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content."
    "Please ensure that your responses are socially unbiased and positive in nature. "
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information. "
    "Do not assume anything. "
)

In [None]:
# Quantization: resolve the dtype name to the torch dtype object and build the
# bitsandbytes configuration from the parameters cell.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


# Load the base model with the quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    token=hf_token,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer with the same access token as the model, so both loads
# authenticate the same way against the gated repository.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    token=hf_token,
)

# Reuse the end-of-sequence token for padding and pad on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# LoRA adapter configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Previously omitted, so the value from the parameters cell was ignored.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=20,  # run evaluation on eval_data every 20 steps
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train, then save the trained model under the `new_model` directory
trainer.train()
trainer.model.save_pretrained(new_model)


In [None]:
# Test the model
logging.set_verbosity(logging.CRITICAL)
message = input("Enter a command : ")
# Llama-2 chat format: the system block is followed by a blank line, then the
# user message, then " [/INST]". The earlier "\{message}" put a literal
# backslash in front of the user message.
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{message} [/INST]"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
result = pipe(prompt)
print(result[0]['generated_text'])

In [None]:
# Interactive chat loop; you can add history context by modifying the prompt.
# The pipeline does not depend on the message, so it is built once before the loop.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
print("Type 'exit' or 'quit' (or submit an empty line) to stop.")
while True:
  message = input("Enter a command : ")
  # Leave the loop cleanly instead of requiring a kernel interrupt.
  if message.strip().lower() in {"", "exit", "quit"}:
    break
  # Llama-2 chat format: blank line after <</SYS>>, space before [/INST].
  # The earlier "\{message}" put a literal backslash in front of the message.
  prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{message} [/INST]"
  result = pipe(prompt)
  print(result[0]['generated_text'])

In [None]:
# You can save the model directly, but in my case I ran out of GPU memory, hence I tried other alternatives.
# model_path is defined here because no earlier cell defines it.
model_path = "./llama-2-7b-custom-saved"  # change to your preferred path
model.save_pretrained(model_path)
# save_pretrained is a method, not a free function: save the tokenizer next to the model.
tokenizer.save_pretrained(model_path)

Pushing the model to the Hugging Face Hub

In [None]:
# Upload the model and then the tokenizer to the same Hugging Face repository.
# Make sure the token you logged in with has write access.
hf_repo_id = ""  # enter your repo name
for artifact in (model, tokenizer):
    artifact.push_to_hub(hf_repo_id)

In [None]:
# Then you can use your model directly from the Hub
from transformers import pipeline
hf_repo_id = ""  # enter your repo name
# Fixed typo: the variable is hf_repo_id (hf_rep_id raised a NameError).
pipe = pipeline("text-generation", model=hf_repo_id)
message = input("Enter a command : ")
# Llama-2 chat format: blank line after <</SYS>>, space before [/INST].
# The earlier "\{message}" put a literal backslash in front of the message.
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{message} [/INST]"
result = pipe(prompt)
print(result[0]['generated_text'])

In [None]:
# Or load the model and tokenizer explicitly
from transformers import AutoTokenizer, AutoModelForCausalLM
hf_repo_id = ""  # enter your repo name
# you can add quantization parameters to your model


tokenizer = AutoTokenizer.from_pretrained(hf_repo_id)
model = AutoModelForCausalLM.from_pretrained(hf_repo_id)


pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
message = input("Enter a command : ")
# Llama-2 chat format: blank line after <</SYS>>, space before [/INST].
# The earlier "\{message}" put a literal backslash in front of the message.
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{message} [/INST]"
result = pipe(prompt)
print(result[0]['generated_text'])

If you don't want to push the model to Hugging Face, you can try saving the model to Google Drive instead.

Merging the adapter model with base model

Make sure you restart the session and re-run the imports and parameters cells before running the cell below.

In [None]:
# Merge the fine-tuned adapter into the base model and save the result in Google Drive
from google.colab import drive
drive.mount('/content/drive')

model_path = "/content/drive/MyDrive/llama-2-7b-custom"  # change to your preferred path

# Reload the base model in FP16
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)
# Attach the saved LoRA weights to the base model and merge them in
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Reload the tokenizer so it can be saved alongside the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Save the merged model and the tokenizer to the same directory
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Load the saved model from Google Drive
from google.colab import drive
drive.mount('/content/drive')
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
model_path = ""  # enter the Drive path the model was saved to
# you can add quantization parameters to your model


tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)


pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

message = input("Enter a command : ")
# Llama-2 chat format: blank line after <</SYS>>, space before [/INST].
# The earlier "\{message}" put a literal backslash in front of the message.
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{message} [/INST]"
result = pipe(prompt)
print(result[0]['generated_text'])