## Fine-tuning Phi-3-mini-QLoRA

Based on the Microsoft Phi-3 Cookbook QLoRA fine-tuning notebook: https://github.com/microsoft/Phi-3CookBook/blob/main/code/04.Finetuning/Phi-3-finetune-qlora-python.ipynb

Install required packages

In [None]:
# from IPython.display import clear_output
# !pip install -qqq --upgrade bitsandbytes transformers peft accelerate datasets trl flash_attn
# !pip install huggingface_hub
# !pip install python-dotenv
# !pip install wandb -qqq
# !pip install absl-py nltk rouge_score
# !pip list | grep transformers
# clear_output()

In [1]:
from IPython.display import clear_output
!pip install peft==0.8.2
!pip install bitsandbytes==0.42.0
!pip install accelerate==0.26.1
!pip install datasets==2.16.1
!pip install GPUtil
!pip install transformers==4.38.0
!pip install huggingface-hub
!pip install trl
!pip install rouge_score
!pip install -U sentence-transformers
clear_output()

In [2]:
# Get the scripts from remote source
!git clone https://github.com/mlgomez0/Health_Therapist
!mv Health_Therapist/backend/ml_models/modules/model_tester.py .
!mv Health_Therapist/backend/ml_models/datasets/dataset.csv .
!rm -rf Health_Therapist

Cloning into 'Health_Therapist'...
remote: Enumerating objects: 646, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 646 (delta 11), reused 11 (delta 1), pack-reused 585[K
Receiving objects: 100% (646/646), 22.44 MiB | 16.24 MiB/s, done.
Resolving deltas: 100% (253/253), done.


# Import packages

In [3]:
import torch
from random import randrange
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
from datasets import Dataset, DatasetDict

# Template Exploration

In [None]:
model_name = 'acorreal/phi3-mental-health'
adapter_name = 'acorreal/adapter-phi-3-mini-mental-health'
compute_dtype = torch.bfloat16

In [None]:
%%time

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=compute_dtype)
model = PeftModel.from_pretrained(model, adapter_name)
model = model.merge_and_unload()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

# Print model name
print("Model:", model.name_or_path)

In [None]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Test the template
pipe.tokenizer.apply_chat_template([
    {
        "role": "user",
        "content": "Hello, I am stressed"}
    ],
    tokenize=False,
    add_generation_prompt=True
)

In [None]:
def predict(prompt: str) -> str:
    """Generate a reply to one user message using the module-level ``pipe``.

    The message is wrapped as a single ``user`` turn, rendered through the
    pipeline tokenizer's chat template (generation prompt included), and passed
    to the pipeline with sampling enabled. The first ``len(rendered prompt)``
    characters of the generated text are dropped (presumably the echoed prompt)
    and the remainder is stripped.

    NOTE: a three-argument ``predict`` defined further down this notebook
    rebinds this name once that cell has been run.

    Args:
        prompt: Raw user message.

    Returns:
        The generated continuation with surrounding whitespace removed.
    """
    chat = [{"role": "user", "content": prompt}]
    rendered = pipe.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    generation_kwargs = {
        "max_new_tokens": 256,
        "do_sample": True,
        "num_beams": 1,
        "temperature": 0.3,
        "top_k": 50,
        "top_p": 0.95,
        "max_time": 180,
    }
    outputs = pipe(rendered, **generation_kwargs)
    # Show the raw pipeline output while exploring the template.
    print(outputs)
    completion = outputs[0]['generated_text'][len(rendered):]
    return completion.strip()

In [None]:
%%time
predict("i am going through some things with my feelings and myself i barely sleep and i do nothing but think about how i am worthless and how i should not be here i have never tried or contemplated suicide i have always wanted to fix my issues but i never get around to it how can i change my feeling of being worthless to everyone")

# Testing Generated Responses

## microsoft/Phi-3-mini-4k-instruct

In [4]:
# load all model responses
import json
def load_responses(dataset, path, pipeline, tokenizer, save_every=10):
  """Generate a response for every chat in ``dataset`` and checkpoint them as JSON.

  For each item, the content of the first message (``chat['messages'][0]['content']``)
  is passed to ``predict`` and the result is paired with ``chat['input']``.
  The accumulated pairs are rewritten to ``path`` on every ``save_every``-th
  item (starting with the first) and once more after the loop, so the file
  always ends up holding every response, not only those up to the last
  periodic checkpoint.

  Args:
    dataset: Iterable of mappings with ``'messages'`` and ``'input'`` keys.
    path: Destination JSON file; overwritten at each checkpoint.
    pipeline: Generation pipeline forwarded to ``predict``.
    tokenizer: Tokenizer forwarded to ``predict``.
    save_every: Positive number of items between periodic checkpoints.

  Returns:
    List of ``(input, generated_text)`` tuples in dataset order (JSON stores
    each tuple as a two-element list).
  """
  result = []
  saved_count = 0  # how many pairs the most recent checkpoint contained
  for counter, chat in enumerate(dataset):
    gen_text = predict(chat['messages'][0]['content'], pipeline, tokenizer)
    user_input = chat['input']  # renamed from `input` to avoid shadowing the builtin
    print(f"Got prediction: {counter}")
    result.append((user_input, gen_text))
    if counter % save_every == 0:
      saved_count = _save_responses(result, path)
  # Final flush: without it, everything generated after the last periodic
  # checkpoint would be returned to the caller but never written to disk.
  if saved_count != len(result):
    _save_responses(result, path)
  return result


def _save_responses(responses, path):
  """Overwrite ``path`` with the JSON encoding of ``responses``; return how many were written."""
  with open(path, 'w') as f:
    print(f"Saving {len(responses)} responses in file")
    json.dump(responses, f)
  return len(responses)


### Preparing the dataset

In [5]:
df = pd.read_csv('dataset.csv')
df.columns = ['input', 'output']
df['instruction'] = "You are a mental health assistant. Your job is to provide emotional support, actively listen, and offer practical suggestions for well-being. Respond empathically and do not give specific medical advice or diagnoses. Always make sure the user feels heard and supported. If the user mentions suicidal thoughts, encourage them to seek professional help immediately. Here's the conversation so far:\n\n"
df.head()

Unnamed: 0,input,output,instruction
0,i am going through some things with my feeling...,if everyone thinks you are worthless then mayb...,You are a mental health assistant. Your job is...
1,i am going through some things with my feeling...,hello and thank you for your question and seek...,You are a mental health assistant. Your job is...
2,i am going through some things with my feeling...,first thing i would suggest is getting the sle...,You are a mental health assistant. Your job is...
3,i am going through some things with my feeling...,therapy is essential for those that are feelin...,You are a mental health assistant. Your job is...
4,i am going through some things with my feeling...,i first want to let you know that you are not ...,You are a mental health assistant. Your job is...


In [6]:
# Load the dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 2747
})

In [7]:
model_id = "microsoft/Phi-3-mini-4k-instruct"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings



tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def create_message_column(row):
    """Build the two-turn chat (user, then assistant) for one dataset row.

    The user turn is the row's ``instruction`` followed by a newline, the
    literal `` Input: `` marker and the row's ``input``; the assistant turn is
    the row's ``output`` rendered as text.

    Args:
        row: Mapping with ``'instruction'``, ``'input'`` and ``'output'`` keys.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where each turn is a dict
        with ``"content"`` and ``"role"`` keys.
    """
    user_turn = {
        "content": f"{row['instruction']}\n Input: {row['input']}",
        "role": "user",
    }
    assistant_turn = {"content": f"{row['output']}", "role": "assistant"}
    return {"messages": [user_turn, assistant_turn]}

def format_dataset_chatml(row):
    """Render a row's ``messages`` through the global tokenizer's chat template.

    Args:
        row: Mapping whose ``"messages"`` entry is the chat to render.

    Returns:
        ``{"text": rendered}`` where ``rendered`` is the untokenized template
        output, produced without a trailing generation prompt.
    """
    rendered = tokenizer.apply_chat_template(
        row["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    return {"text": rendered}

In [14]:
dataset_chatml = dataset.map(create_message_column)
dataset_chatml = dataset_chatml.map(format_dataset_chatml)

Map:   0%|          | 0/2747 [00:00<?, ? examples/s]

Map:   0%|          | 0/2747 [00:00<?, ? examples/s]

In [15]:
dataset_chatml

Dataset({
    features: ['input', 'output', 'instruction', 'messages', 'text'],
    num_rows: 2747
})

In [None]:
dataset_chatml[0]

{'input': 'i am going through some things with my feelings and myself i barely sleep and i do nothing but think about how i am worthless and how i should not be here i have never tried or contemplated suicide i have always wanted to fix my issues but i never get around to it how can i change my feeling of being worthless to everyone',
 'output': 'if everyone thinks you are worthless then maybe you need to find new people to hang out withseriously the social context in which a person lives is a big influence in selfesteemotherwise you can go round and round trying to understand why you are not worthless then go back to the same crowd and be knocked down againthere are many inspirational messages you can find in social media maybe read some of the ones which state that no person is worthless and that everyone has a good purpose to their lifealso since our culture is so saturated with the belief that if someone does not feel good about themselves that this is somehow terriblebad feelings 

In [16]:
dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
dataset_chatml

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'messages', 'text'],
        num_rows: 2609
    })
    test: Dataset({
        features: ['input', 'output', 'instruction', 'messages', 'text'],
        num_rows: 138
    })
})

### Importing the models

In [None]:
original_model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
finetuned_model_id = 'acorreal/phi3-mental-health'
finetuned_model = AutoModelForCausalLM.from_pretrained(finetuned_model_id, trust_remote_code=True)

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

In [None]:
original_pipeline = pipeline("text-generation", model=original_model, tokenizer=tokenizer)

In [None]:
finetuned_pipeline = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer)

In [None]:
def predict(prompt, pipeline, tokenizer):
    """Generate a reply to one user message with the given text-generation pipeline.

    The message is wrapped as a single ``user`` turn and rendered with
    ``pipeline.tokenizer``'s chat template (generation prompt included). The
    rendered prompt is sampled from with fixed generation settings, and the
    first ``len(rendered prompt)`` characters of the generated text are dropped
    (presumably the echoed prompt) before stripping.

    Args:
        prompt: Raw user message.
        pipeline: Callable pipeline exposing a ``tokenizer`` attribute.
        tokenizer: Accepted for call-site compatibility but not used; the
            pipeline's own tokenizer renders the template.

    Returns:
        The generated continuation with surrounding whitespace removed.
    """
    chat = [{"role": "user", "content": prompt}]
    rendered = pipeline.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    generation_kwargs = {
        "max_new_tokens": 256,
        "do_sample": True,
        "num_beams": 1,
        "temperature": 0.3,
        "top_k": 50,
        "top_p": 0.95,
        "max_time": 180,
    }
    outputs = pipeline(rendered, **generation_kwargs)
    completion = outputs[0]['generated_text'][len(rendered):]
    return completion.strip()

In [None]:
predict(dataset_chatml['test'][0]['messages'][0]['content'], original_pipeline, tokenizer) #Original Model



"I hear you, and it sounds like you're going through a really tough time. It's important to recognize that you're taking a brave step by acknowledging these patterns and seeking change. Building a secure relationship starts with understanding and respecting your own needs and boundaries. It's also crucial to work on self-esteem and independence, which can help you attract healthier relationships.\n\n\nHere are some practical steps you might consider:\n\n\n1. **Self-Reflection**: Take time to reflect on what you truly want in a relationship and why you're drawn to certain types of partners. Understanding your motivations can be the first step toward change.\n\n\n2. **Self-Care**: Make sure you're taking care of your physical and emotional well-being. Engage in activities that make you feel good about yourself and that you can enjoy on your own.\n\n\n3. **Boundaries**: Learn to set clear boundaries with others. This can help prevent you from being taken advantage of and teach others how 

In [None]:
predict(dataset_chatml['test'][0]['messages'][0]['content'], finetuned_pipeline, tokenizer) #Finetuned Model

"I hear you, and it's really brave of you to share these feelings. It sounds like you're searching for a sense of security and connection, which is completely understandable. Relationships can be complex, and it's not uncommon to feel stuck in patterns that aren't serving us.\n\n\nFirstly, it's important to recognize that you have the strength to change these patterns. It's a significant step to be aware of them. Perhaps we can explore some ways to build self-reliance and self-esteem, which can help you feel more comfortable being alone and doing things independently.\n\n\nOne approach could be to engage in activities that you enjoy and that make you feel good about yourself. This could be a hobby, exercise, or volunteering. These activities can help you grow and feel more confident in your own company.\n\n\nAlso, it might be helpful to reflect on what you're looking for in a relationship. What qualities do you value in a partner? Understanding these can guide you towards relationships

In [None]:
responses_fine_tuned_model = load_responses(dataset_chatml['test'], 'fine_tuned_model0.json', finetuned_pipeline, tokenizer)

In [None]:
responses_fine_tuned_model_train = load_responses(dataset_chatml['train'], 'fine_tuned_train_model0.json', finetuned_pipeline, tokenizer)

Got prediction: 0
Saving 1 responses in file
Got prediction: 1
Got prediction: 2
Got prediction: 3
Got prediction: 4
Got prediction: 5
Got prediction: 6
Got prediction: 7
Got prediction: 8




Got prediction: 9
Got prediction: 10
Saving 11 responses in file
Got prediction: 11
Got prediction: 12
Got prediction: 13
Got prediction: 14
Got prediction: 15
Got prediction: 16
Got prediction: 17
Got prediction: 18
Got prediction: 19
Got prediction: 20
Saving 21 responses in file
Got prediction: 21
Got prediction: 22
Got prediction: 23
Got prediction: 24
Got prediction: 25
Got prediction: 26
Got prediction: 27
Got prediction: 28
Got prediction: 29
Got prediction: 30
Saving 31 responses in file
Got prediction: 31
Got prediction: 32
Got prediction: 33
Got prediction: 34
Got prediction: 35
Got prediction: 36
Got prediction: 37
Got prediction: 38
Got prediction: 39
Got prediction: 40
Saving 41 responses in file
Got prediction: 41
Got prediction: 42
Got prediction: 43
Got prediction: 44
Got prediction: 45
Got prediction: 46
Got prediction: 47
Got prediction: 48
Got prediction: 49
Got prediction: 50
Saving 51 responses in file
Got prediction: 51
Got prediction: 52
Got prediction: 53
Got pr

In [None]:
responses_original_model = load_responses(dataset_chatml['test'], 'original_model.json', original_pipeline, tokenizer)

In [None]:
responses_original_model_train = load_responses(dataset_chatml['train'], 'original_model_train.json', original_pipeline, tokenizer)

## Model Responses

In [None]:
# Fine-tuned model responses on the test split. Read the file that the
# load_responses(...) call above actually writes ('fine_tuned_model0.json');
# no cell in this notebook produces 'fine_tuned_model.json', so the old name
# fails on a fresh Restart & Run All.
model_responses_df = pd.read_json('fine_tuned_model0.json')
model_responses_df.columns = ['input', 'output']
model_responses_df.head()

Unnamed: 0,input,output
0,i want a secure relationship with someone that...,It sounds like you're going through a really t...
1,my boyfriend is in recovery from drug addictio...,I'm truly sorry to hear that you're going thro...
2,is this something i should be worried about sh...,I'm really glad you're reaching out and sharin...
3,my friend is abusing her prescription medicine...,I'm really sorry to hear that you're going thr...
4,i terminated my counseling relationship with a...,I'm really glad to hear that you're considerin...


In [None]:
original_model_responses_df = pd.read_json('original_model.json') # model responses for testing
original_model_responses_df.columns = ['input', 'output']
original_model_responses_df.head()

Unnamed: 0,input,output
0,i want a secure relationship with someone that...,"I hear you, and it sounds like you're going th..."
1,my boyfriend is in recovery from drug addictio...,I'm truly sorry to hear that you're going thro...
2,is this something i should be worried about sh...,I'm really glad you're reaching out to talk ab...
3,my friend is abusing her prescription medicine...,I'm really concerned to hear about what you're...
4,i terminated my counseling relationship with a...,I'm really sorry to hear that you've been havi...


In [25]:
model_train_responses_df = pd.read_json('fine_tuned_train_model0.json') # model responses for training
model_train_responses_df.columns = ['input', 'output']
model_train_responses_df.head()

Unnamed: 0,input,output
0,i do not love my sister i would never wish her...,I hear you expressing a complex mix of emotion...
1,every winter i find myself getting sad because...,I'm really glad you've reached out and shared ...
2,i keep having these random thoughts that i do ...,I'm really sorry to hear that you're going thr...
3,i feel like i was born in the wrong body i fee...,I'm really sorry to hear that you're feeling t...
4,my boyfriend is in ireland for days and i am a...,I'm really sorry to hear that you're feeling t...


In [11]:
model_train_responses_df.shape

(1891, 2)

In [26]:
# Dataset.to_pandas takes (batch_size, batched), not a column list, so the list
# passed before was ignored and every column came back. Convert first, then
# select the two columns explicitly.
train_df = dataset_chatml['train'].to_pandas()[["input", "output"]]

In [27]:
model_train_responses_df['therapist_output'] = train_df['output'][:model_train_responses_df.shape[0]]

In [None]:
original_model_responses_df.shape

(138, 2)

In [None]:
model_responses_df.shape

(138, 2)

In [None]:
# Dataset.to_pandas takes (batch_size, batched), not a column list, so the list
# passed before was ignored and every column came back. Convert first, then
# select the two columns explicitly.
test_df = dataset_chatml['test'].to_pandas()[["input", "output"]]

In [None]:
model_responses_df['therapist_output'] = test_df['output']

In [None]:
model_responses_df.shape

(138, 3)

In [None]:
original_model_responses_df['therapist_output'] = test_df['output']

In [None]:
original_model_responses_df.to_csv('original_model_responses.csv') # saving as csv

In [None]:
model_responses_df.to_csv('model_responses.csv') # saving as csv

In [39]:
model_train_responses_df.to_csv('model_train_responses.csv', index=False) # saving as csv

In [None]:
model_responses_df.head()

Unnamed: 0,input,output,therapist_output
0,i want a secure relationship with someone that...,It sounds like you're going through a really t...,good for you on your keen awareness of your di...
1,my boyfriend is in recovery from drug addictio...,I'm truly sorry to hear that you're going thro...,my empathy goes out to you relationships are t...
2,is this something i should be worried about sh...,I'm really glad you're reaching out and sharin...,it can be tricky to figure out if a child is t...
3,my friend is abusing her prescription medicine...,I'm really sorry to hear that you're going thr...,first of all where did you find the pills did ...
4,i terminated my counseling relationship with a...,I'm really glad to hear that you're considerin...,yes your former social worker should return yo...


In [None]:
model_responses_df.to_excel('model_responses.xlsx') # saving as excel

In [None]:
original_model_responses_df.to_excel('original_model_responses.xlsx') # saving as excel

## Rouge Test

In [46]:
from model_tester import ModelTester

In [None]:
model_responses_df = pd.read_csv('model_responses.csv')

In [None]:
original_model_responses_df = pd.read_csv('original_model_responses.csv')

In [43]:
model_train_responses_df = pd.read_csv('model_train_responses.csv')

In [44]:
model_train_responses_df.shape

(1891, 3)

In [None]:
finetune_model_tester = ModelTester(model_responses_df['therapist_output'], model_responses_df['output']) # Initiate tester class
finetune_model_tester.calculate_rouge_score() # Calculate Rouge Score
rouge_score = finetune_model_tester.test_results
rouge_score_df = pd.DataFrame(rouge_score)
rouge_score_df.head()

Unnamed: 0,rouge1,rouge2,rougeL
precision,0.345152,0.041858,0.165014
recall,0.315464,0.036597,0.156128
fmeasure,0.300732,0.035537,0.145425


In [None]:
original_model_tester = ModelTester(original_model_responses_df['therapist_output'], original_model_responses_df['output']) # Initiate tester class
original_model_tester.calculate_rouge_score() # Calculate Rouge Score
original_rouge_score = original_model_tester.test_results
original_rouge_score_df = pd.DataFrame(original_rouge_score)
original_rouge_score_df.head()

Unnamed: 0,rouge1,rouge2,rougeL
precision,0.34659,0.040629,0.169133
recall,0.303819,0.033308,0.151799
fmeasure,0.293892,0.033484,0.144125


In [47]:
model_train_tester = ModelTester(model_train_responses_df['therapist_output'], model_train_responses_df['output']) # Initiate tester class
model_train_tester.calculate_rouge_score() # Calculate Rouge Score
model_train_rouge_score = model_train_tester.test_results
model_train_rouge_score_df = pd.DataFrame(model_train_rouge_score)
model_train_rouge_score_df.head()

Unnamed: 0,rouge1,rouge2,rougeL
precision,0.343122,0.043622,0.164482
recall,0.329095,0.039788,0.163465
fmeasure,0.303918,0.037534,0.147016


## LLM Test

In [48]:
import re
import requests
from google.colab import userdata

In [52]:
# Hugging Face API configuration

api_token = userdata.get('HF_TOKEN')

# Headers for the request
headers = {
    "Authorization": f"Bearer {api_token}"
}

# The model to use
model_name = "mistralai/Mistral-7B-v0.1"

# The endpoint for the Hugging Face inference API
api_url = f"https://api-inference.huggingface.co/models/{model_name}"

### Testing Phi3

In [50]:
def create_template(text_1, text_2, input):
  """Build the LLM-judge prompt for one (question, reference, candidate) triple.

  Args:
    text_1: Reference answer, inserted as the mental health therapist answer.
    text_2: Candidate answer, inserted as the machine generated answer.
    input: The user question.

  Returns:
    The prompt text, ending with an open ``Total rating: `` line for the judge
    model to complete.
  """
  template = """
    You will be given a user_question, a mental_health_therapist_answer and a machine_generated_answer.
    Your task is to provide a 'total rating' scoring how well the machine_generated_answer answers
    the user concerns expressed in the user_question. Use the mental_health_therapist_answer as example of
    a good answer. Give your answer as a float on a scale of 0 to 10, where 0 means that the
    machine_generated_answer is not helpful at all, and 10 means that the answer completely and helpfully
    addresses the user_question.

    Provide your feedback as follows:

    Feedback:::
    Total rating: (your rating, as a float between 0 and 10)

    Now here are the question and answers.

    User Question: {input}
    Mental Health Therapist Answer: {text_1}
    Machine Generated Answer: {text_2}

    Feedback:::
    Total rating: """

  # Substituted values are inserted verbatim; braces inside them are not re-parsed.
  return template.format(input=input, text_1=text_1, text_2=text_2)

def get_llm_scores(model_name, prompt, api_url, timeout=120):
  """Ask the judge model to complete ``prompt`` and parse the rating it produces.

  The prompt is POSTed to ``api_url`` using the module-level ``headers``. The
  first ``Total rating:`` that is followed by a number in the returned text is
  taken as the score. Both integer ("8") and decimal ("7.5") ratings are
  accepted, with any amount of whitespace after the colon.

  Args:
    model_name: Not used by this function (the endpoint is fully determined by
      ``api_url``); kept so existing calls remain valid.
    prompt: Full judge prompt.
    api_url: Inference endpoint URL.
    timeout: Seconds to wait for the HTTP response before giving up.

  Returns:
    The parsed rating as a ``float``, or ``None`` when the request fails, the
    status code is not 200, the payload has an unexpected shape, or no rating
    is found.
  """
  # The data payload for the POST request.
  # NOTE(review): generation parameters are passed through unchanged; confirm
  # the endpoint honours `max_length` for this task.
  data = {
      "inputs": prompt,
      "parameters": {
          "max_length": 100,
          "num_return_sequences": 1
      }
  }

  # A timeout keeps one stalled connection from hanging a long scoring loop;
  # transport errors are reported the same way as HTTP errors (return None).
  try:
    response = requests.post(api_url, headers=headers, json=data, timeout=timeout)
  except requests.RequestException as exc:
    print(f"Request failed: {exc}")
    return None

  if response.status_code != 200:
    print(f"Request failed with status code {response.status_code}: {response.text}")
    return None

  try:
    generated_text = response.json()[0]["generated_text"]
  except (ValueError, LookupError, TypeError):
    # Body was not JSON, or not a non-empty list of {"generated_text": ...}.
    print("Similarity score not found in the response.")
    return None

  # The instruction line "Total rating: (your rating, ...)" inside the prompt is
  # followed by "(" and therefore never matches.
  similarity_score = re.search(r'Total rating:\s*(\d+(?:\.\d+)?)', generated_text)
  if similarity_score is None:
    print("Similarity score not found in the response.")
    return None

  score = float(similarity_score.group(1))
  print(f"Similarity Score: {score}")
  return score


In [None]:
result = [] # calculating the score using LLMs for fine-tuned model

for i in range(model_responses_df.shape[0]):
  prompt = create_template(model_responses_df['therapist_output'][i], model_responses_df['output'][i], model_responses_df['input'][i])
  score = get_llm_scores(model_name, prompt, api_url)
  result.append(score)


Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 5.0
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 0.0
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 5.0
Similarity

In [None]:
df_llm_eval = pd.DataFrame(result, columns=["acorreal/phi3-mental-health"])
df_llm_eval.head()

Unnamed: 0,acorreal/phi3-mental-health
0,7.5
1,8.5
2,7.5
3,7.5
4,8.5


In [None]:
df_llm_eval.to_csv('llm_eval.csv')

In [None]:
df_llm_eval = pd.read_csv('llm_eval.csv')
df_llm_eval.head()

Unnamed: 0.1,Unnamed: 0,acorreal/phi3-mental-health
0,0,7.5
1,1,8.5
2,2,7.5
3,3,7.5
4,4,8.5


In [None]:
result = [] # calculating the score using LLMs for the original model

for i in range(original_model_responses_df.shape[0]):
  prompt = create_template(original_model_responses_df['therapist_output'][i], original_model_responses_df['output'][i], original_model_responses_df['input'][i])
  score = get_llm_scores(model_name, prompt, api_url)
  result.append(score)

Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 7.5
Similarity Score: 8.5
Similarity Score: 7.5
Similarity score not found in the response.
Similarity

## Comparing Models

In [None]:
df_llm_eval = pd.read_csv('llm_eval.csv')
df_llm_eval[['acorreal/phi3-mental-health']].mean()

acorreal/phi3-mental-health    7.601449
dtype: float64

In [None]:
df_llm_eval["microsoft/Phi-3-mini-4k-instruct"] = result

In [None]:
df_llm_eval.head()

Unnamed: 0.1,Unnamed: 0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct
0,0,7.5,7.5
1,1,8.5,7.5
2,2,7.5,7.5
3,3,7.5,7.5
4,4,8.5,7.5


In [None]:
df_llm_eval.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
df_llm_eval.to_csv('llm_eval.csv', index=False)

In [64]:
df_llm_eval = pd.read_csv('llm_eval.csv')
df_llm_eval.head()

Unnamed: 0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct
0,7.5,7.5
1,8.5,7.5
2,7.5,7.5
3,7.5,7.5
4,8.5,7.5


In [66]:
llm_test_result = df_llm_eval.mean().to_frame(name='mean').T

In [67]:
llm_test_result

Unnamed: 0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct
mean,7.601449,7.525547


In [None]:
# NOTE(review): in top-to-bottom order `result` was last assigned by the
# original-model scoring loop earlier in this notebook; the training-set scores
# are only computed in the cell that follows this one. Unless that cell has
# already been run, this column holds the original model's test scores under a
# "train" label — confirm the intended execution order.
df_llm_eval_train = pd.DataFrame(result, columns=["acorreal/phi3-mental-health/train"])
df_llm_eval_train.head()

In [81]:
result = [] # calculating the score using LLMs for fine-tuned model with training set
counter = 0
for i in range(model_train_responses_df.shape[0]):
  prompt = create_template(model_train_responses_df['therapist_output'][i], model_train_responses_df['output'][i], model_train_responses_df['input'][i])
  score = get_llm_scores(model_name, prompt, api_url)
  result.append(score)
  counter += 1
  # Periodic checkpoint so a long run can be resumed or inspected from the JSON file.
  if counter % 100 == 0:
    print(f"Saving {len(result)} responses in file")
    with open('llm_eval_train.json', 'w') as f:
        json.dump(result, f)

# Final flush: scores gathered after the last multiple of 100 would otherwise
# never be written to the file.
if counter % 100 != 0:
  print(f"Saving {len(result)} responses in file")
  with open('llm_eval_train.json', 'w') as f:
      json.dump(result, f)

In [69]:
import numpy as np

In [73]:
# read result from json
with open('llm_eval_train.json', 'r') as f:
    result = json.load(f)

In [75]:
result = result[:300]

In [77]:
result = np.array(result, dtype=np.float64)

In [78]:
result = np.array(result)
result = result[~np.isnan(result)]

In [79]:
llm_test_result['acorreal/phi3-mental-health/train'] = np.array(result).mean()

In [80]:
llm_test_result

Unnamed: 0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct,acorreal/phi3-mental-health/train
mean,7.601449,7.525547,7.611864
