## Finetune an Open-source LLM model

### Installing modules

In [1]:
!pip install -q -U pyarrow==14.0.1
!pip install -q -U fsspec==2023.10.0

!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.0
!pip install python-dotenv



### Login to Hugging Face

In [2]:
from huggingface_hub import login
from google.colab import userdata

HF_token = userdata.get('HF_TOKEN')
login(token=HF_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Load Model

In [3]:
import accelerate
import bitsandbytes as bnb

print(f"Accelerate version: {accelerate.__version__}")
print(f"BitsAndBytes version: {bnb.__version__}")

Accelerate version: 0.27.1
BitsAndBytes version: 0.42.0


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#Load the model and Tokenizer
model_id = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Importing Dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset("Amod/mental_health_counseling_conversations")
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [6]:
# Convert HF dataset to pandas Dataframe
df = dataset["train"].to_pandas()
df.sample(10)

Unnamed: 0,Context,Response
1672,We have been fighting a lot and have 3 kids bu...,"Hello, I do not live in the California area. H..."
227,I'm depressed. I have been for years. I hide i...,"Hi Georgia, There's a really good lesson here...."
219,After he got home from the hospital he was ang...,I appreciate that you are concerned about your...
1612,I have had a crush on this guy for years. I la...,"You'll only find out whether or not it is ""too..."
709,My brother just broke up with his girlfriend. ...,"Hi Tampa, I get that this is a loss for all of..."
2556,"He is verbally abusive. When he gets mad, he j...",Good for you on recognizing your own feelings....
2296,"I was raped by multiple men, and now I can't s...","Hello Utah, thank you for writing with your qu..."
1226,We don't have sex a lot. I cheat when we argue...,"Hello, and thank you for your question. The qu..."
91,A few years ago I was making love to my wife w...,"For starters, know that this is a normal exper..."
300,I've been having this ongoing problem for most...,Not having support from your family for such a...


### Generate Prompt

In [7]:
def generate_prompt(data_point):
    # Generate prompt
    prefix_text = 'You are a mental health counselor engaging in a conversation with a client.\n'\
                  'The goal is to provide empathetic responses, offer support, and guide the client through their thoughts and feelings.\n' \
                  'Respond to the following client statement with a thoughtful and understanding reply.\n'

    # Samples with additional context info
    if data_point['Context']:
        text = f"""<start_of_turn>user {prefix_text}{data_point["Context"]} <end_of_turn>\n<start_of_turn>model{data_point["Response"]} <end_of_turn>"""
    return text

# Add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset["train"]]
dataset = dataset["train"].add_column("prompt", text_column)
dataset

Dataset({
    features: ['Context', 'Response', 'prompt'],
    num_rows: 3512
})

In [8]:
print(dataset[0]['prompt'])

<start_of_turn>user You are a mental health counselor engaging in a conversation with a client.
The goal is to provide empathetic responses, offer support, and guide the client through their thoughts and feelings.
Respond to the following client statement with a thoughtful and understanding reply.
I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.
   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.
   How can I change my feeling of being worthless to everyone? <end_of_turn>
<start_of_turn>modelIf everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many 

In [9]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [10]:
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset["train"]
test_data = dataset["test"]

print(train_data)
print(test_data)

Dataset({
    features: ['Context', 'Response', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 3160
})
Dataset({
    features: ['Context', 'Response', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 352
})


#### Low-Rank Adaptation for LLMs

In [11]:
# Automated selection of target modules
import bitsandbytes as bnb

def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)


modules = find_all_linear_names(model)
print(modules)

['k_proj', 'q_proj', 'o_proj', 'v_proj', 'down_proj', 'gate_proj', 'up_proj']


In [12]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

print(model)

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # Causal Language Modeling (e.g., autoregressive models like GPT)
)

model = get_peft_model(model, lora_config)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
     

In [13]:
# Number of trainable parameters
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


### Training your Model

In [16]:
import transformers
from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='right'
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=2500,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        warmup_steps=0.03,
        max_steps=100,
        learning_rate=1e-4,
        logging_steps=10,
        output_dir="outputs",
        optim="adamw_torch",
        save_strategy="epoch",
        report_to="none"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.9901
20,1.9586
30,2.0014
40,1.9078
50,1.9252
60,2.0031
70,1.9251
80,2.0139
90,1.9815
100,1.9968


Checkpoint destination directory outputs/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=100, training_loss=1.9703652954101563, metrics={'train_runtime': 830.3909, 'train_samples_per_second': 0.963, 'train_steps_per_second': 0.12, 'total_flos': 4248517917032448.0, 'train_loss': 1.9703652954101563, 'epoch': 0.25})

### Pushing Model to HuggingFace

In [17]:
new_model = "gemma-2b-instruct-ft-mental-health-conv_v2"

trainer.model.save_pretrained(new_model)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model locally
# save_adapter=True, save_config=True
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/noelabu/gemma-2b-instruct-ft-mental-health-conv_v2/commit/92090707c1edec9277953364c738f649d92d0e89', commit_message='Upload tokenizer', commit_description='', oid='92090707c1edec9277953364c738f649d92d0e89', pr_url=None, pr_revision=None, pr_num=None)

## Evaluation Metrics

In [14]:
# Load/define base (non-finetuned) vs finetuned models for comparison

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from math import exp
import bitsandbytes as bnb  # Ensure bitsandbytes is installed if using quantization
from peft import PeftModel  # If using PEFT models

# Define your Hugging Face username and model IDs
username = "noelabu"
finetuned_model_id = f"{username}/gemma-2b-instruct-ft-mental-health-conv_v2"
base_model_id = "google/gemma-2b-it"

# Set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "google/gemma-2b-it"

base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
base_tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# Load the Finetuned Model and Tokenizer from HF
finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_id)
finetuned_model = AutoModelForCausalLM.from_pretrained(
                  finetuned_model_id,
                  torch_dtype=torch.float16,
                  device_map="auto"
)

# Ensure the model is in evaluation mode
base_model.eval()
finetuned_model.eval()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaRM

### Perplexity

In [16]:
test_data

Dataset({
    features: ['Context', 'Response', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 352
})

In [17]:
test_data_subset = test_data.shuffle(seed=42).select(range(10))  # Get 10 random rows

In [21]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from math import exp

def calculate_perplexity(model, tokenizer, dataset):
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for example in dataset:
            inputs = tokenizer(example["prompt"], return_tensors="pt").to("cuda")
            labels = inputs.input_ids
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            total_loss += loss.item() * inputs.input_ids.size(1)
            total_tokens += inputs.input_ids.size(1)

    perplexity = exp(total_loss / total_tokens)
    return perplexity

base_perplexity = calculate_perplexity(base_model, base_tokenizer, test_data_subset)
finetuned_perplexity = calculate_perplexity(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned Perplexity: {base_perplexity}")
print(f"Finetuned Perplexity: {finetuned_perplexity}\n")

Non-finetuned Perplexity: 57.76777174589178
Finetuned Perplexity: 7.864154692054048



## Semantic Similarity

In [19]:
import torch
from torch.nn.functional import cosine_similarity

def calculate_semantic_similarity(model, tokenizer, dataset):
    similarities = []
    model.eval()

    with torch.no_grad():
        for example in dataset:
            # Encode the input prompt
            input_ids = tokenizer.encode(example["prompt"], return_tensors="pt").to("cuda")
            # Generate output from the model
            output_ids = model.generate(input_ids, max_length=2500)
            generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Tokenize generated and reference texts
            gen_inputs = tokenizer(generated_text, return_tensors='pt').to("cuda")
            ref_inputs = tokenizer(example["Response"], return_tensors='pt').to("cuda")

            # Get embeddings from the last hidden state
            gen_outputs = model(**gen_inputs, output_hidden_states=True, return_dict=True)
            ref_outputs = model(**ref_inputs, output_hidden_states=True, return_dict=True)

            # Average pooling of the embeddings
            gen_embedding = gen_outputs.hidden_states[-1].mean(dim=1).squeeze()
            ref_embedding = ref_outputs.hidden_states[-1].mean(dim=1).squeeze()

            # Compute cosine similarity
            cosine_score = cosine_similarity(gen_embedding, ref_embedding, dim=0).item()
            similarities.append(cosine_score)

    average_similarity = sum(similarities) / len(similarities)
    return average_similarity

base_semantic_similarity = calculate_semantic_similarity(base_model, base_tokenizer, test_data_subset)
finetuned_semantic_similarity = calculate_semantic_similarity(finetuned_model, finetuned_tokenizer, test_data_subset)
print(f"Non-finetuned Semantic Similarity: {base_semantic_similarity:.4f}")
print(f"Finetuned Semantic Similarity: {finetuned_semantic_similarity:.4f}\n")

Non-finetuned Semantic Similarity: 0.9667
Finetuned Semantic Similarity: 0.9076

