Download Evaluation Run Output

In [None]:
# Load environment variables from a .env file before anything else reads them
from dotenv import load_dotenv
load_dotenv()

# Extend the import path with ../common so that the local helper modules
# (presumably where pf_sdk_utils lives — confirm) can be imported below
import sys
sys.path.append('./../common')

from pf_sdk_utils import PromptFlowUtils

# Create the helper object and initialize its client with cloud=True
# NOTE(review): presumably this targets the cloud workspace rather than a local one — confirm in pf_sdk_utils
pf_utils = PromptFlowUtils()
pf_utils.initialize_pf_client(cloud = True)

In [None]:
def download_or_display_outputs(executed_runs):
    """Download the output of every run and return the output path of the last one.

    Each run is passed to ``download_output`` together with a target path of the
    form ``./runs/<run name>``.

    Args:
        executed_runs: Iterable of runs. Each item is either the run name as a
            ``str`` or an object exposing a ``name`` attribute.

    Returns:
        The target path of the LAST run with ``.jsonl`` appended
        (presumably the file written by ``download_output`` — confirm in output_utils).

    Raises:
        ValueError: If ``executed_runs`` is empty, because there is no path to return.
    """
    from output_utils import download_output

    output_runs_dir = "./runs"
    output_file_path = None
    for run in executed_runs:
        # A run may be referenced by its name or by an object carrying `.name`
        run_name = run if isinstance(run, str) else run.name
        output_file_path = f"{output_runs_dir}/{run_name}"
        download_output(run, output_file_path)

    # Without this check an empty input would fail with an opaque UnboundLocalError
    if output_file_path is None:
        raise ValueError("executed_runs must contain at least one run")
    return f"{output_file_path}.jsonl"

In [None]:
# Name of the evaluation run to analyze; change this value to inspect a different run
evaluation_run_name = "prasann_evaluation_experiment_step2_variant_0_28200658"

# Look up the run by name, download its output and keep the returned .jsonl path
evaluation_run  = pf_utils.get_run(evaluation_run_name)
eval_out_file = download_or_display_outputs([evaluation_run])

Read the evaluation output

In [None]:
import pandas as pd

# Display all columns and never truncate cell contents when showing DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# lines=True: the file is read as JSON Lines, i.e. one JSON object per line
eval_out_df = pd.read_json(eval_out_file, lines=True)

Expand the `scores` column of the evaluation output into individual score columns

In [None]:
# Expand every entry of the `scores` column into separate columns.
# The guard keeps this cell re-runnable: once `scores` has been expanded and
# dropped, running the cell again is a no-op instead of raising a KeyError.
if 'scores' in eval_out_df.columns:
    eval_scores_df = eval_out_df['scores'].apply(pd.Series)

    # Append the expanded score columns to the original frame and drop the raw `scores` column
    eval_out_df = pd.concat([eval_out_df, eval_scores_df], axis=1).drop('scores', axis=1)

Analyze errors

In [None]:
# Keep only the rows whose false-positive count is greater than zero
eval_df_fp = eval_out_df.loc[eval_out_df["false_positive"].gt(0)]

In [None]:
# Keep only the rows flagged with a big parse error (flag greater than zero)
eval_df_error = eval_out_df.loc[eval_out_df["pred_big_parse_error"].gt(0)]

In [None]:
# Count how many error rows carry each distinct parse-error message
eval_df_error['pred_big_parse_error_msg'].value_counts()

Analyze individual big parse errors

In [None]:
# create a new data-frame to compare the keywords between the predictions and the ground truth
import json

def create_comparison_df(input_df: pd.DataFrame):
    """Compare predicted keywords against ground-truth keywords, row by row.

    For every row of ``input_df`` the predictions string is stripped of
    Markdown code fences (```` ```json ```` / ```` ``` ````), parsed as JSON, and
    the ``"keyword"`` value of each prediction is collected. These are compared
    with the keys of the row's ground truth.

    Args:
        input_df: Frame with the columns ``inputs.fsn``,
            ``inputs.predictions_str`` (JSON text) and ``inputs.ground_truth``
            (an object with ``.keys()``, e.g. a dict).

    Returns:
        A DataFrame with one row per input row and the columns
        ``fsn``, ``seller`` (ground-truth keywords), ``llms`` (predicted
        keywords), ``missing`` (ground-truth keywords absent from the
        predictions) and ``extra`` (predicted keywords absent from the ground
        truth). An empty input yields an empty frame with these columns.

    Raises:
        json.JSONDecodeError: If a predictions string is not valid JSON.
    """
    columns = ['fsn', 'seller', 'llms', 'missing', 'extra']
    if input_df.empty:
        return pd.DataFrame(columns=columns)

    row_values = zip(
        input_df["inputs.fsn"],
        input_df["inputs.predictions_str"],
        input_df["inputs.ground_truth"],
    )

    records = []
    for fsn, predictions_str, ground_truth in row_values:
        # The predictions may be wrapped in a Markdown code fence; strip it before parsing
        predictions = json.loads(predictions_str.replace("```json", "").replace("```", ""))

        # This function is also used on rows whose predictions are not a list:
        # treat a single object as a one-element list, anything else as no predictions.
        if isinstance(predictions, dict):
            predictions = [predictions]
        elif not isinstance(predictions, list):
            predictions = []

        # Entries that are not objects with a "keyword" field contribute no keyword
        prediction_keywords = [
            prediction["keyword"]
            for prediction in predictions
            if isinstance(prediction, dict) and "keyword" in prediction
        ]
        keywords = list(ground_truth.keys())

        records.append({
            'fsn': fsn,
            'seller': keywords,
            'llms': prediction_keywords,
            'missing': [item for item in keywords if item not in prediction_keywords],
            'extra': [item for item in prediction_keywords if item not in keywords],
        })

    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records, columns=columns)

In [None]:
# Select the rows whose predictions could not be parsed as JSON and print the raw prediction strings
parsing_error_df = eval_df_error.loc[
    eval_df_error["pred_big_parse_error_msg"].eq("Error parsing predictions as json")
]
print(parsing_error_df["inputs.predictions_str"])

In [None]:
# Rows whose predicted keywords differ from the ground-truth keywords
eval_df_error_msg = eval_df_error.loc[
    eval_df_error["pred_big_parse_error_msg"].eq("Predictions keywords different than GT keywords")
]
# Build the per-row keyword comparison and show how many rows are affected
compare_prediction_diff_df = create_comparison_df(eval_df_error_msg)
len(compare_prediction_diff_df)

In [None]:
# Rows whose predictions are not a list or contain a different number of keywords than the ground truth
eval_df_error_not_alist = eval_df_error.loc[
    eval_df_error["pred_big_parse_error_msg"].eq(
        "Predictions not a list or # of keywords in predictions is different than # of GT keywords."
    )
]
# Build the per-row keyword comparison and show how many rows are affected
compare_keywords_not_a_list_df = create_comparison_df(eval_df_error_not_alist)
len(compare_keywords_not_a_list_df)