# Comparing Models Performance

In [None]:
!pip install rouge_score



## Importing modules

In [None]:
# Get the scripts from remote source
!git clone https://github.com/mlgomez0/Health_Therapist
!mv Health_Therapist/backend/ml_models/modules .
!mv Health_Therapist/backend/ml_models/datasets .
!mv Health_Therapist/backend/ml_models/LLMs_fine_tuning/phi3-qlora .
!mv Health_Therapist/requirements.txt .
!rm -rf Health_Therapist

Cloning into 'Health_Therapist'...
remote: Enumerating objects: 646, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 646 (delta 11), reused 11 (delta 1), pack-reused 585[K
Receiving objects: 100% (646/646), 22.83 MiB | 8.02 MiB/s, done.
Resolving deltas: 100% (253/253), done.
mv: cannot move 'Health_Therapist/backend/ml_models/modules' to './modules': Directory not empty
mv: cannot move 'Health_Therapist/backend/ml_models/datasets' to './datasets': Directory not empty
mv: cannot move 'Health_Therapist/backend/ml_models/LLMs_fine_tuning/phi3-qlora' to './phi3-qlora': Directory not empty


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
import numpy as np
from scipy import stats
from scipy.stats import wilcoxon

In [None]:
fine_tuned_phi3_model_responses = pd.read_csv('phi3-qlora/model_responses.csv', index_col=False)
original_phi3_model_responses = pd.read_csv('phi3-qlora/original_model_responses.csv', index_col=False)

In [None]:
fine_tuned_phi3_model_responses.head()

Unnamed: 0,input,output,therapist_output
0,i want a secure relationship with someone that...,It sounds like you're going through a really t...,good for you on your keen awareness of your di...
1,my boyfriend is in recovery from drug addictio...,I'm truly sorry to hear that you're going thro...,my empathy goes out to you relationships are t...
2,is this something i should be worried about sh...,I'm really glad you're reaching out and sharin...,it can be tricky to figure out if a child is t...
3,my friend is abusing her prescription medicine...,I'm really sorry to hear that you're going thr...,first of all where did you find the pills did ...
4,i terminated my counseling relationship with a...,I'm really glad to hear that you're considerin...,yes your former social worker should return yo...


## Topic Modeling

In [10]:
text = fine_tuned_phi3_model_responses['input'] + ' ' + fine_tuned_phi3_model_responses['therapist_output']

In [11]:
df = pd.read_csv('datasets/dataset.csv')

In [12]:
vectorizer = CountVectorizer(min_df=10, max_df=0.5, stop_words="english") # To transform the text into a numerical representation
topic_model = NMF(n_components=10)

topic_pipeline = Pipeline(
    [
        ("vectorizer", vectorizer),
        ("topic_model", topic_model),
    ]
)

In [13]:
topic_pipeline.fit(text)

In [14]:
# dictionary to map the topics

topics_map = {
    0: "Work-Life Balance",
    1: "Other",
    2: "Romantic Relationships",
    3: "Therapy and Therapist",
    4: "Negative Thoughts",
    5: "Parenthood",
    6: "People Feelings",
    7: "Anxiety and Depression",
    8: "Anxiety and Depression",
    9: "Counseling Issues"
}

In [15]:
topics = topic_pipeline.transform(text)
topics = np.argmax(topics, axis=1)
topic_names = [topics_map[topic] for topic in topics]

## Rouge Score with Topic Modeling

In [16]:
from modules.model_tester import ModelTester

In [17]:
rouge_tester_fine_tuned = ModelTester(fine_tuned_phi3_model_responses['therapist_output'], fine_tuned_phi3_model_responses['output'])
rouge_tester_original = ModelTester(original_phi3_model_responses['therapist_output'], original_phi3_model_responses['output'])

In [18]:
rouge_scores_fine_tuned = rouge_tester_fine_tuned.calculate_rouge_score_by_item()
rouge_scores_original = rouge_tester_original.calculate_rouge_score_by_item()

In [19]:
len(rouge_scores_fine_tuned['rouge1']['precision'])

138

In [20]:
rouge_df = pd.DataFrame()

In [21]:
for key in rouge_scores_fine_tuned.keys():
  for val in rouge_scores_fine_tuned[key].keys():
    fine_tuned_model_name = "fine-tuned_" + key + "_" + val
    original_model_name = "original_" + key + "_" + val
    rouge_df[fine_tuned_model_name] = rouge_scores_fine_tuned[key][val]
    rouge_df[original_model_name] = rouge_scores_original[key][val]

rouge_df['topic'] = topic_names
rouge_df.head()

Unnamed: 0,fine-tuned_rouge1_precision,original_rouge1_precision,fine-tuned_rouge1_recall,original_rouge1_recall,fine-tuned_rouge1_fmeasure,original_rouge1_fmeasure,fine-tuned_rouge2_precision,original_rouge2_precision,fine-tuned_rouge2_recall,original_rouge2_recall,fine-tuned_rouge2_fmeasure,original_rouge2_fmeasure,fine-tuned_rougeL_precision,original_rougeL_precision,fine-tuned_rougeL_recall,original_rougeL_recall,fine-tuned_rougeL_fmeasure,original_rougeL_fmeasure,topic
0,0.336066,0.295337,0.312977,0.435115,0.324111,0.351852,0.033058,0.026042,0.030769,0.038462,0.031873,0.031056,0.180328,0.15544,0.167939,0.229008,0.173913,0.185185,Romantic Relationships
1,0.445087,0.455696,0.37561,0.35122,0.407407,0.396694,0.046512,0.044586,0.039216,0.034314,0.042553,0.038781,0.179191,0.170886,0.15122,0.131707,0.164021,0.14876,Romantic Relationships
2,0.506667,0.465517,0.102981,0.073171,0.171171,0.126464,0.027027,0.0,0.005435,0.0,0.00905,0.0,0.253333,0.241379,0.051491,0.03794,0.085586,0.065574,Parenthood
3,0.521739,0.542169,0.186047,0.174419,0.274286,0.26393,0.032967,0.04878,0.011673,0.015564,0.017241,0.023599,0.26087,0.301205,0.093023,0.096899,0.137143,0.146628,People Feelings
4,0.172973,0.169312,0.367816,0.367816,0.235294,0.231884,0.01087,0.026596,0.023256,0.05814,0.014815,0.036496,0.086486,0.111111,0.183908,0.241379,0.117647,0.152174,Counseling Issues


In [22]:
topics = rouge_df['topic'].unique()
topics

array(['Romantic Relationships', 'Parenthood', 'People Feelings',
       'Counseling Issues', 'Other', 'Anxiety and Depression',
       'Negative Thoughts', 'Therapy and Therapist', 'Work-Life Balance'],
      dtype=object)

In [23]:
def statistical_test_rouge(data, topics, col1, col2):
  result = []
  for topic in topics:
    data_t = data.loc[data['topic'] == topic]
    data1 = data_t[col1]
    data2 = data_t[col2]
    data1 = data1.dropna()
    data2 = data2.dropna()
    _, p_value = wilcoxon(data1, data2)
    result.append((topic, p_value, len(data1)))
  return result

In [24]:
def get_rouge_scores_df(data, topics, col1, col2):
  rouge_ttest = statistical_test_rouge(data, topics, col1, col2)
  df_result = data[['topic', col1, col2]].groupby('topic').mean()
  df_result['p_value'] = 0.0
  df_result['n'] = 0
  for result in rouge_ttest:
    df_result.loc[result[0], 'p_value'] = result[1]
    df_result.loc[result[0], 'n'] = result[2]
  return df_result


In [25]:
from IPython.display import display

In [26]:
cols = rouge_df.columns
for i in range(0, len(cols) - 1, 2):
  partial_rouge_df = get_rouge_scores_df(rouge_df, topics, cols[i], cols[i + 1])
  print(f"ROUGE scores between {cols[i]} and {cols[i + 1]}:")
  display(partial_rouge_df.style.set_properties(**{'text-align': 'center'}).set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'center')]}]
  ))
  print("\n" + "-"*80 + "\n")

ROUGE scores between fine-tuned_rouge1_precision and original_rouge1_precision:


Unnamed: 0_level_0,fine-tuned_rouge1_precision,original_rouge1_precision,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.347094,0.348194,0.879293,34
Counseling Issues,0.301464,0.306824,0.408243,26
Negative Thoughts,0.342085,0.367882,0.21875,6
Other,0.369295,0.331799,0.015625,7
Parenthood,0.320798,0.335238,0.803955,15
People Feelings,0.372615,0.379737,0.578206,25
Romantic Relationships,0.375831,0.360225,0.088654,17
Therapy and Therapist,0.301859,0.29343,0.6875,6
Work-Life Balance,0.513283,0.538579,1.0,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rouge1_recall and original_rouge1_recall:




Unnamed: 0_level_0,fine-tuned_rouge1_recall,original_rouge1_recall,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.32448,0.316535,0.565645,34
Counseling Issues,0.316336,0.310879,0.886403,26
Negative Thoughts,0.368321,0.292958,0.03125,6
Other,0.293831,0.27224,0.296875,7
Parenthood,0.298963,0.288357,0.388186,15
People Feelings,0.313644,0.296986,0.172959,25
Romantic Relationships,0.330195,0.336992,0.679116,17
Therapy and Therapist,0.272494,0.250529,0.5625,6
Work-Life Balance,0.2182,0.218232,1.0,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rouge1_fmeasure and original_rouge1_fmeasure:


Unnamed: 0_level_0,fine-tuned_rouge1_fmeasure,original_rouge1_fmeasure,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.309158,0.302275,0.34285,34
Counseling Issues,0.282353,0.282349,0.980084,26
Negative Thoughts,0.290141,0.252528,0.15625,6
Other,0.289533,0.26476,0.21875,7
Parenthood,0.282395,0.286328,0.977966,15
People Feelings,0.309963,0.306974,0.490786,25
Romantic Relationships,0.335375,0.330825,0.611221,17
Therapy and Therapist,0.266584,0.23065,0.0625,6
Work-Life Balance,0.297508,0.296488,1.0,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rouge2_precision and original_rouge2_precision:




Unnamed: 0_level_0,fine-tuned_rouge2_precision,original_rouge2_precision,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.042317,0.043429,0.921715,34
Counseling Issues,0.035695,0.034753,0.764394,26
Negative Thoughts,0.055203,0.045224,0.15625,6
Other,0.027919,0.032875,0.46875,7
Parenthood,0.034148,0.029211,0.396726,15
People Feelings,0.051515,0.046141,0.325392,25
Romantic Relationships,0.045193,0.047298,0.963226,17
Therapy and Therapist,0.023873,0.019104,0.273322,6
Work-Life Balance,0.085653,0.107392,0.5,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rouge2_recall and original_rouge2_recall:




Unnamed: 0_level_0,fine-tuned_rouge2_recall,original_rouge2_recall,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.043169,0.037731,0.400418,34
Counseling Issues,0.032447,0.034336,0.986134,26
Negative Thoughts,0.045355,0.024157,0.043114,6
Other,0.015904,0.016484,0.8125,7
Parenthood,0.030506,0.02551,0.463071,15
People Feelings,0.042489,0.034509,0.074143,25
Romantic Relationships,0.038431,0.041964,0.234321,17
Therapy and Therapist,0.019817,0.020949,0.654721,6
Work-Life Balance,0.031761,0.038034,0.5,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rouge2_fmeasure and original_rouge2_fmeasure:




Unnamed: 0_level_0,fine-tuned_rouge2_fmeasure,original_rouge2_fmeasure,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.038844,0.03821,0.610589,34
Counseling Issues,0.031309,0.030959,0.822184,26
Negative Thoughts,0.039953,0.026015,0.0625,6
Other,0.017006,0.018055,0.8125,7
Parenthood,0.030313,0.026041,0.509797,15
People Feelings,0.042349,0.036174,0.107315,25
Romantic Relationships,0.040046,0.042309,0.579056,17
Therapy and Therapist,0.021012,0.01874,0.715001,6
Work-Life Balance,0.045173,0.053784,0.5,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rougeL_precision and original_rougeL_precision:


Unnamed: 0_level_0,fine-tuned_rougeL_precision,original_rougeL_precision,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.16644,0.167768,0.95963,34
Counseling Issues,0.150856,0.151364,0.802801,26
Negative Thoughts,0.165679,0.188586,0.21875,6
Other,0.171366,0.173521,0.9375,7
Parenthood,0.153139,0.1652,0.25238,15
People Feelings,0.173319,0.177899,0.287229,25
Romantic Relationships,0.175238,0.171918,0.547668,17
Therapy and Therapist,0.150877,0.162736,0.6875,6
Work-Life Balance,0.241375,0.265025,1.0,2





--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rougeL_recall and original_rougeL_recall:




Unnamed: 0_level_0,fine-tuned_rougeL_recall,original_rougeL_recall,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.164493,0.156542,0.289477,34
Counseling Issues,0.160944,0.157517,0.765198,26
Negative Thoughts,0.19481,0.15371,0.043114,6
Other,0.134166,0.143722,0.465209,7
Parenthood,0.146579,0.147644,0.972125,15
People Feelings,0.151902,0.143531,0.088277,25
Romantic Relationships,0.154973,0.163712,0.432626,17
Therapy and Therapist,0.136263,0.132789,0.68583,6
Work-Life Balance,0.106011,0.10965,0.5,2



--------------------------------------------------------------------------------

ROUGE scores between fine-tuned_rougeL_fmeasure and original_rougeL_fmeasure:


Unnamed: 0_level_0,fine-tuned_rougeL_fmeasure,original_rougeL_fmeasure,p_value,n
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anxiety and Depression,0.150529,0.145545,0.101225,34
Counseling Issues,0.142142,0.14089,0.802801,26
Negative Thoughts,0.14509,0.131433,0.09375,6
Other,0.131938,0.137566,0.8125,7
Parenthood,0.135697,0.142723,0.229309,15
People Feelings,0.146823,0.14574,0.691519,25
Romantic Relationships,0.156868,0.158842,0.889969,17
Therapy and Therapist,0.133616,0.124202,0.4375,6
Work-Life Balance,0.143166,0.148087,0.5,2



--------------------------------------------------------------------------------



## LLM Evaluation with Topic Modeling

In [None]:
llm_eval = pd.read_csv('phi3-qlora/llm_eval.csv')
llm_eval.head()

Unnamed: 0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct
0,7.5,7.5
1,8.5,7.5
2,7.5,7.5
3,7.5,7.5
4,8.5,7.5


In [None]:
llm_eval['topic'] = topic_names

In [None]:
llm_eval.head()

Unnamed: 0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct,topic
0,7.5,7.5,Romantic Relationships
1,8.5,7.5,Romantic Relationships
2,7.5,7.5,Parenthood
3,7.5,7.5,People Feelings
4,8.5,7.5,Counseling Issues


In [None]:
# getting the tipics in the dataset
topics = llm_eval['topic'].unique()
topics

array(['Romantic Relationships', 'Parenthood', 'People Feelings',
       'Counseling Issues', 'Other', 'Anxiety and Depression',
       'Negative Thoughts', 'Therapy and Therapist', 'Work-Life Balance'],
      dtype=object)

In [None]:
def statistical_test(data, topics):
  result = []
  for topic in topics:
    data_t = data.loc[data['topic'] == topic]
    data1 = data_t['acorreal/phi3-mental-health']
    data2 = data_t['microsoft/Phi-3-mini-4k-instruct']
    try:
      _, p_value = wilcoxon(data1, data2)
    except Exception as e:
      p_value = np.nan
      print(f"Error processing topic '{topic}': {e}")
    result.append((topic, p_value))
  return result


In [None]:
results = statistical_test(llm_eval, topics)

Error processing topic 'Work-Life Balance': zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.




In [None]:
results

[('Romantic Relationships', 0.7388826803635273),
 ('Parenthood', 0.31731050786291415),
 ('People Feelings', 0.17971249487899976),
 ('Counseling Issues', nan),
 ('Other', 0.6547208460185769),
 ('Anxiety and Depression', 0.8515548977976366),
 ('Negative Thoughts', 0.6547208460185769),
 ('Therapy and Therapist', 0.4142161782425252),
 ('Work-Life Balance', nan)]

In [None]:
llm_t_test = llm_eval.groupby('topic').mean()

In [None]:
llm_t_test['p_value'] = 0.0
for result in results:
  llm_t_test.loc[result[0], 'p_value'] = result[1]

In [None]:
llm_t_test.style.set_properties(**{'text-align': 'center'}).set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'center')]}]
)

Unnamed: 0_level_0,acorreal/phi3-mental-health,microsoft/Phi-3-mini-4k-instruct,p_value
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anxiety and Depression,7.575758,7.69697,0.851555
Counseling Issues,7.777778,7.730769,
Negative Thoughts,7.833333,6.583333,0.654721
Other,6.428571,7.142857,0.654721
Parenthood,7.633333,7.566667,0.317311
People Feelings,7.7,7.56,0.179712
Romantic Relationships,7.529412,7.617647,0.738883
Therapy and Therapist,7.833333,6.583333,0.414216
Work-Life Balance,7.5,7.5,
