# Analysis for Zero Shot FlowJudge and GLIDER

Here, we look at the initial results obtained when running FlowJudge and GLIDER with zero-shot prompts.

In [None]:
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [None]:
# Load the zero-shot function-calling results for each judge model.
# Both paths are relative to the notebook's working directory.
flowjudge_data = pd.read_csv(filepath_or_buffer="data/flowjudge_fc_results.csv")
glider_data = pd.read_csv(filepath_or_buffer="data/glider_fc_results.csv")

In [None]:
# Lookup from a `b_id` value to the boolean ground-truth label used when
# scoring the judges: only "ST-Perfect" is treated as a positive example,
# every other listed id is a negative one.
mapping = {
    b_id: b_id == "ST-Perfect"
    for b_id in (
        "ST-Perfect",
        "ir-ST-Perfect",
        "ST-Imperfect",
        "ir-ST-Imperfect",
        "ST-External",
        "ir-ST-External",
    )
}

In [None]:
print("FlowJudge")
# Ground truth comes from the benchmark id; the prediction is the judge's
# score binarised at "> 2" (scores above 2 count as a positive verdict).
flowjudge_true = flowjudge_data.b_id.map(mapping)
flowjudge_pred = flowjudge_data.score > 2
# scikit-learn metrics expect (y_true, y_pred) in that order. Passing them the
# other way round swaps precision with recall and transposes the confusion
# matrix, so the ground truth must be the first argument.
print(f1_score(flowjudge_true, flowjudge_pred))
print(precision_score(flowjudge_true, flowjudge_pred))
print(recall_score(flowjudge_true, flowjudge_pred))
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(flowjudge_true, flowjudge_pred))

In [None]:
print("GLIDER")
print(f1_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(precision_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(recall_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(confusion_matrix(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))

# Analysis for few shot FlowJudge

Here, we look at two different metrics to evaluate how FlowJudge performed as a function-calling judge. First, we assess how stable FlowJudge is across repeated runs. We do this by passing the same data points to FlowJudge three times (i.e. running the experiment three times) and then computing the `cohen_kappa_score` pairwise between the three runs. Second, for each run, we check the F1 score, precision, recall, and confusion matrix.

In [13]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, cohen_kappa_score
import pandas as pd

In [4]:
# Load the few-shot FlowJudge results (three scored runs plus ground truth).
# The path is relative to the notebook's working directory, like the zero-shot
# CSVs read from "data/" earlier in this notebook, rather than a user-specific
# absolute home-directory path that only resolves on one machine.
# NOTE(review): assumes the file lives in the same data/ directory — confirm.
hammerbench = pd.read_csv("data/function_call_experiment.csv")

## Cohen Kappa

In [14]:
# Agreement between the scores FlowJudge gave on run 0 and run 1.
cohen_kappa_score(hammerbench["score_run_0"], hammerbench["score_run_1"])

0.26344492678403253

In [15]:
# Agreement between the scores FlowJudge gave on run 1 and run 2.
cohen_kappa_score(hammerbench["score_run_1"], hammerbench["score_run_2"])

0.27004185295065264

In [16]:
# Agreement between the scores FlowJudge gave on run 0 and run 2.
cohen_kappa_score(hammerbench["score_run_0"], hammerbench["score_run_2"])

0.27486275759596934

## Performance Metrics

In [19]:
# List the columns of the loaded results frame to see which per-run score and
# explanation fields, and which ground-truth field, are available.
hammerbench.columns

Index(['label', 'messages', 'tools', 'explanation_run_0', 'score_run_0',
       'explanation_run_1', 'score_run_1', 'explanation_run_2', 'score_run_2',
       'gt_label'],
      dtype='object')

In [37]:
# Binarisation table for the few-shot judge scores: of the scores 0, 1 and 2,
# only 2 is treated as a positive (True) verdict.
score_to_bool = {score: score == 2 for score in (0, 1, 2)}

In [40]:
# Preview the binarised run-0 predictions. Any score that is not a key of
# score_to_bool (including a missing score) is treated as a negative verdict.
# Looking the score up with a False default yields a bool Series directly and
# avoids `.map(dict).fillna(False)`, whose silent object -> bool downcast in
# fillna is deprecated in pandas.
hammerbench["score_run_0"].map(lambda score: score_to_bool.get(score, False))

  hammerbench["score_run_0"].map(score_to_bool).fillna(False)


0        False
1        False
2        False
3        False
4        False
         ...  
13049    False
13050    False
13051    False
13052    False
13053    False
Name: score_run_0, Length: 13054, dtype: bool

In [43]:
# Report macro-averaged F1 / precision / recall and the confusion matrix for
# each of the three FlowJudge runs against the ground-truth labels.
for i in range(3):
    print("----------RUN_{}-----------".format(i))
    y_true = hammerbench.gt_label
    # Binarise this run's scores once and reuse the result for every metric.
    # Scores that are not keys of score_to_bool (including missing scores)
    # count as a negative verdict. The lookup with a False default yields a
    # bool Series directly, avoiding the deprecated object -> bool downcast
    # performed by `.map(dict).fillna(False)`.
    y_pred = hammerbench["score_run_{}".format(i)].map(lambda score: score_to_bool.get(score, False))
    print("F1 Score: {}".format(f1_score(y_true, y_pred, average="macro", labels=[False, True])))
    print("Precision: {}".format(precision_score(y_true, y_pred, average="macro", labels=[False, True])))
    print("Recall: {}".format(recall_score(y_true, y_pred, average="macro", labels=[False, True])))
    # Rows are true labels, columns are predicted labels, ordered [False, True].
    print("Confusion Matrix: \n\n {} \n".format(confusion_matrix(y_true, y_pred, labels=[False, True])))

----------RUN_0-----------
F1 Score: 0.5035039230266493
Precision: 0.6131466994997525
Recall: 0.5196778041969499
Confusion Matrix: 

 [[10743   195]
 [ 1995   121]] 

----------RUN_1-----------
F1 Score: 0.5046584311020123
Precision: 0.6251617793110659
Recall: 0.5206912064252164
Confusion Matrix: 

 [[10760   178]
 [ 1994   122]] 

----------RUN_2-----------
F1 Score: 0.49889597981284217
Precision: 0.6054650923850642
Recall: 0.5172079630126981
Confusion Matrix: 

 [[10751   187]
 [ 2007   109]] 



  print("F1 Score: {}".format(f1_score(hammerbench.gt_label, hammerbench["score_run_{}".format(i)].map(score_to_bool).fillna(False), average="macro", labels=[False, True])))
  print("Precision: {}".format(precision_score(hammerbench.gt_label, hammerbench["score_run_{}".format(i)].map(score_to_bool).fillna(False), average="macro", labels=[False, True])))
  print("Recall: {}".format(recall_score(hammerbench.gt_label, hammerbench["score_run_{}".format(i)].map(score_to_bool).fillna(False), average="macro", labels=[False, True])))
  print("Confusion Matrix: \n\n {} \n".format(confusion_matrix(hammerbench.gt_label, hammerbench["score_run_{}".format(i)].map(score_to_bool).fillna(False), labels=[False, True])))
  print("F1 Score: {}".format(f1_score(hammerbench.gt_label, hammerbench["score_run_{}".format(i)].map(score_to_bool).fillna(False), average="macro", labels=[False, True])))
  print("Precision: {}".format(precision_score(hammerbench.gt_label, hammerbench["score_run_{}".format(i)].map(sc