In [1]:
# !pip install -U accelerate
# !pip install -U bitsandbytes
# !pip install -U trl
# !pip install -U peft
# !pip install -U transformers
# !pip install -U datasets
# !pip install -U ipywidgets

# Load and Prepare Data

In [2]:
from datasets import load_dataset

dataset = load_dataset("vibhorag101/phr-mental-therapy-dataset-conversational-format-1024-tokens", trust_remote_code=True)
# dataset = dataset.shuffle(seed=42).select(range(10000))
dataset02 = dataset['val'].shuffle(seed=42).select(range(1000))

In [3]:
dataset02

Dataset({
    features: ['id', 'messages'],
    num_rows: 1000
})

In [4]:
dataset['train'][0]
# print(format_prompt(dataset[0])['text'])

{'id': 'identity_37265',
 'messages': [{'content': "You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.",
   'role': 'system'},
  {'content': "I'm grateful for your help. I really need someone to talk to about him.",
   'role': 'user'},
  {'content': "I'm here for you. Can you tell me a bit more about what's been going on?",
   'role': 'assistant'},
  {'content': "Well, things have been tense between us lately. We've been together for a while, but it feels like we're drifting apart. I don't know what to do.",
   'role'

In [5]:
from transformers import AutoTokenizer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
template_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
template_tokenizer

LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
def format_prompt(example):
    chat = example['messages']
    prompt = template_tokenizer.apply_chat_template(chat, tokenize=False)
    return {"text": prompt}

print(format_prompt(dataset['train'][0])['text'])

train_dataset = dataset['train'].map(format_prompt)
eval_dataset = dataset02.map(format_prompt)

<|system|>
You are a helpful and joyous mental therapy assistant. Always answer as helpfully and cheerfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
I'm grateful for your help. I really need someone to talk to about him.</s>
<|assistant|>
I'm here for you. Can you tell me a bit more about what's been going on?</s>
<|user|>
Well, things have been tense between us lately. We've been together for a while, but it feels like we're drifting apart. I don't know what to do.</s>
<|assistant|>
It sounds like you're concerned about the state of your relationship. Have you noticed any specific changes or

# Model Configuration for Training

In [8]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=True,
)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

In [12]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [13]:
# model

# LoRA Configuration for PEFT Fine tuning

In [14]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

peft_config = LoraConfig(
    lora_alpha = 32,
    lora_dropout = 0.1,
    r = 64,
    bias = "none",
    task_type = "CAUSAL_LM",
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

# Model Training

In [15]:
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer

output_dir = "train_dir_mental_health_assistant"

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # max_steps=1500,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=2,
    logging_steps=50,
    fp16=True,
    gradient_checkpointing=True,
    report_to=[],

    evaluation_strategy="steps", 
    eval_steps=50,
    do_eval=True
)

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512,
    peft_config=peft_config,
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  super().__init__(


Step,Training Loss,Validation Loss
50,0.8666,0.729117
100,0.7127,0.697103
150,0.6916,0.67981
200,0.6833,0.666315
250,0.6639,0.65773
300,0.6544,0.649343
350,0.6481,0.643451
400,0.6379,0.639027
450,0.6452,0.635131
500,0.6276,0.629317


TrainOutput(global_step=5570, training_loss=0.5532980939430223, metrics={'train_runtime': 24431.2012, 'train_samples_per_second': 1.824, 'train_steps_per_second': 0.228, 'total_flos': 1.4850670948297114e+17, 'train_loss': 0.5532980939430223, 'epoch': 1.9998204829009962})

In [16]:
trainer.model.save_pretrained("TinyLlama-1.1B-qlora-mental-health")

# Load pretrained PEFT Model for Prediction

In [19]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora-mental-health",
    device_map = 'auto',
)

merged_model = model.merge_and_unload()

In [23]:
from transformers import pipeline

prompt = """<|user|>
I feel very lonely and inadequate
<|assistant|>
"""

pipe = pipeline(task='text-generation', model=merged_model, tokenizer=tokenizer)
output = pipe(prompt)
print(output[0]['generated_text'])

<|user|>
I feel very lonely and inadequate
<|assistant|>
I'm sorry to hear that. Loneliness and inadequacy can be tough emotions to deal with. Can you tell me more about what's been going on?
