## Results Processing

In [None]:
import os 
import ast  
import json
import numpy as np
from dotenv import dotenv_values
import pandas as pd
from langfuse import Langfuse

# Shared Langfuse client; later cells use it to look up dataset runs and fetch traces.
langfuse = Langfuse()

In [2]:
# Copy every key/value pair returned by dotenv_values() into the notebook's
# global namespace, so each key becomes a plain variable.
# NOTE(review): PROJECT_DIR is never assigned explicitly in the visible cells,
# so it presumably comes from this loop — confirm the key is defined.
for i,v in dotenv_values().items():
    globals()[i]=v 

# Folder holding the result CSV files that open_results() reads.
RESULTS_FOLDER = os.path.join(PROJECT_DIR, "results")

## Necessary functions

In [3]:
def langfuse_run_ids(dataset_name, dataset_run_name):
    """Return the trace ids recorded for one run of a Langfuse dataset.

    Args:
        dataset_name: Name of the Langfuse dataset.
        dataset_run_name: Name of the run inside that dataset.

    Returns:
        A list holding the ``trace_id`` of every entry in the run's
        ``dataset_run_items``, in the order they are listed there.
    """
    run = langfuse.get_dataset_run(
        dataset_name=dataset_name,
        dataset_run_name=dataset_run_name,
    )
    return [item.trace_id for item in run.dataset_run_items]

In [4]:
def open_results(dataset):
    """Load a results CSV from ``RESULTS_FOLDER`` into a DataFrame.

    Args:
        dataset: File name of the CSV, relative to ``RESULTS_FOLDER``.

    Returns:
        The file contents as parsed by ``pandas.read_csv`` with default options.
    """
    csv_path = os.path.join(RESULTS_FOLDER, dataset)
    return pd.read_csv(csv_path)

In [5]:
def parse_cell(cell):
    """Unwrap a value stored as a one-element list literal.

    Cells such as ``"[0.75]"`` are parsed and reduced to their single
    element; anything that does not look like a list literal is passed
    through untouched.

    Args:
        cell: A single cell value of any type.

    Returns:
        * the sole element, when ``cell`` is a string spelling a
          one-element list;
        * ``np.nan``, when ``cell`` is a ``"[...]"`` string that cannot be
          parsed or does not parse to a one-element list;
        * ``cell`` itself in every other case.
    """
    looks_like_list = (
        isinstance(cell, str) and cell.startswith('[') and cell.endswith(']')
    )
    if not looks_like_list:
        return cell

    try:
        value = ast.literal_eval(cell)
    # These are the failures literal_eval documents for malformed input.
    # A bare ``except`` would additionally swallow KeyboardInterrupt and
    # SystemExit, making a long-running cell impossible to interrupt.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return np.nan

    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return np.nan

In [13]:
def get_results(dataset):
    """Average the evaluation metric columns of a results table.

    The metric columns are cleaned first: missing values are replaced by 0
    and every cell is passed through ``parse_cell``. The cleaned values are
    written back into ``dataset`` (the caller's DataFrame is modified in
    place), then each column is averaged.

    Args:
        dataset: DataFrame containing every column named in ``columns``.

    Returns:
        dict mapping each metric column name to its mean. Cells that
        ``parse_cell`` turned into NaN are skipped by the mean.

    Raises:
        KeyError: if a metric column is missing from ``dataset``.
    """
    columns = [
        'precision',
        'recall',
        'f1',
        'semantic similarity',
        'jaccard_similarity',
        'r_precision',
        'r_recall',
        'totalCost',
        'latency',
    ]
    # Map element-wise through Series.map: DataFrame.applymap is deprecated
    # since pandas 2.1 and its replacement, DataFrame.map, does not exist in
    # older releases. Series.map is available in both.
    cleaned = dataset[columns].fillna(0).apply(
        lambda column: column.map(parse_cell)
    )
    dataset[columns] = cleaned
    metrics = cleaned.mean(skipna=True).to_dict()
    # metrics['hallucinations'] = cleaned[cleaned['semantic similarity'] == -1].shape[0]
    return metrics

In [None]:
def get_dataset_item_id(row):
    """Extract ``dataset_item_id`` from a JSON-encoded string.

    Args:
        row: JSON text expected to decode to an object with a
            ``dataset_item_id`` key. Non-string values (for example NaN)
            are tolerated.

    Returns:
        The value stored under ``dataset_item_id``, or ``None`` when ``row``
        is not a string, is not valid JSON, does not decode to an object,
        or lacks that key.
    """
    try:
        return json.loads(row)['dataset_item_id']
    # TypeError: non-string input or decoded value is not a mapping;
    # ValueError: invalid JSON (json.JSONDecodeError subclasses it);
    # KeyError: the key is absent.
    except (TypeError, ValueError, KeyError):
        return None

In [None]:
def convert_2_json(s):
    """Decode a JSON string, returning ``None`` when it cannot be decoded.

    Args:
        s: JSON text. Non-string values (for example NaN or ``None``) are
            tolerated.

    Returns:
        The decoded Python value, or ``None`` when ``s`` is not a string or
        is not valid JSON.
    """
    try:
        return json.loads(s)
    # TypeError: input is not str/bytes; ValueError: invalid JSON
    # (json.JSONDecodeError is a ValueError subclass).
    except (TypeError, ValueError):
        return None

In [None]:
def get_first_GEN(trace):
    """Return the earliest GENERATION observation of a trace.

    Args:
        trace: Object exposing ``observations``; every observation must
            provide ``type`` and ``start_time``.

    Returns:
        The observation of type ``'GENERATION'`` with the smallest
        ``start_time``. Observations with equal start times keep their
        original relative order.

    Raises:
        IndexError: if the trace contains no GENERATION observation.
    """
    generations = list(
        filter(lambda item: item.type == 'GENERATION', trace.observations)
    )
    generations.sort(key=lambda item: item.start_time)
    return generations[0]

In [None]:
def get_second_GEN(trace):
    """Collect the usage details of every GENERATION after the first one.

    Despite the name, this does not return a single observation: all
    GENERATION observations are ordered by ``start_time``, the earliest is
    dropped, and the ``usage_details`` of the rest are returned.

    Args:
        trace: Object exposing ``observations``; every observation must
            provide ``type``, ``start_time`` and ``usage_details``.

    Returns:
        list of ``usage_details`` values in start-time order. Empty when the
        trace has fewer than two GENERATION observations.
    """
    ordered = sorted(
        (item for item in trace.observations if item.type == 'GENERATION'),
        key=lambda item: item.start_time,
    )
    later_generations = ordered[1:]
    return list(map(lambda item: item.usage_details, later_generations))

In [None]:
def get_output(obs):
    """Look up ``'output_reasoning'`` in ``obs``, tolerating missing data.

    Args:
        obs: Mapping-like object indexed by key; may also be ``None`` or
            another value that does not support key lookup.

    Returns:
        ``obs['output_reasoning']`` when the lookup succeeds, otherwise
        ``np.nan``.
    """
    try:
        return obs['output_reasoning']
    # Only lookup failures are treated as "no value"; a bare ``except``
    # would also hide KeyboardInterrupt and unrelated bugs.
    except (KeyError, TypeError, IndexError):
        return np.nan

## Read results

In [None]:
# Trace ids that belong to the Langfuse dataset run under analysis.
trace_ids = langfuse_run_ids("GSCESP", "NoHybrid-o4mini-extract")

# Keep only the rows of the results CSV produced by that run. The explicit
# copy makes `results` an independent DataFrame rather than a filtered slice,
# so writing columns into it later (as get_results does) does not raise
# SettingWithCopyWarning.
results = open_results("25052025.csv")
results = results[results["id"].isin(trace_ids)].copy()
results.shape

(224, 36)

In [None]:
# Mean of each metric column over the selected run. get_results also writes the
# cleaned metric values back into `results`.
get_results(results)

{'precision': 0.6276099696712942,
 'recall': 0.562891709499326,
 'f1': 0.5871794871794872,
 'semantic similarity': 0.5467914886693027,
 'jaccard_similarity': 0.43817907596003935,
 'r_precision': 0.17869935724748123,
 'r_recall': 0.6535446489431067,
 'hallucinations': 0}

In [None]:
# Attach the dataset item id found in each row's metadata JSON.
# DataFrame.assign returns a new frame, so nothing is written into a filtered
# slice of another DataFrame (no SettingWithCopyWarning).
results = results.assign(item_id=results['metadata'].apply(get_dataset_item_id))

# Drop the rows for which no dataset item id could be extracted.
results = results[results['item_id'].notna()]

# Decode the JSON-encoded output/input columns; cells that cannot be decoded
# become None. The columns are assigned by name instead of through attribute
# access (`results.output = ...`).
results = results.assign(
    output=results['output'].apply(convert_2_json),
    input=results['input'].apply(convert_2_json),
)

In [None]:
# Per-row sizes; rows whose value is None stay None.
n_candidates = results['output'].apply(lambda x: len(x['final_answer']) if x is not None else None) # number of entries in the output's 'final_answer'
len_cc = results['input'].apply(lambda x: len(x['clinical_note'].split(' ')) if x is not None else None ) # number of space-separated tokens in the input's 'clinical_note'

In [None]:
# Fetch the full Langfuse trace of every result row and keep its `.data` payload.
traces = results['id'].apply(lambda trace_id: langfuse.fetch_trace(trace_id).data)
# Earliest GENERATION observation of each trace (the first call).
first_obs = traces.apply(get_first_GEN)
# usage_details of every later GENERATION observation of each trace.
second_obs = traces.apply(get_second_GEN)

In [161]:
# Pair each clinical-note length with the 'input' usage value of every
# follow-up generation of the same row; rows without a length are skipped.
token_count = [
    (note_length, usage['input'])
    for note_length, usage_list in zip(len_cc.values, second_obs)
    if not pd.isna(note_length)
    for usage in usage_list
]
# Split the pairs into two parallel lists.
X = [note_length for note_length, _ in token_count]
Y = [input_tokens for _, input_tokens in token_count]

In [None]:
# Usage figures and cost of the first generation of every trace; a key missing
# from usage_details yields None.
reasoning_tokens = first_obs.apply(lambda gen: gen.usage_details.get('output_reasoning'))
output_tokens = first_obs.apply(lambda gen: gen.usage_details.get('output'))
cost = first_obs.apply(lambda gen: gen.calculated_total_cost)