In [1]:
import pandas as pd
import json
import glob
import os
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
def load_jsonl_to_dataframes(directory="."):
    """
    Load all .jsonl files in the specified directory and its subdirectories into DataFrames,
    storing them in a dictionary with filenames as keys.

    Args:
    directory (str): The directory to search for .jsonl files. Defaults to the current directory.

    Returns:
    dict: A dictionary where each key is the filename and the value is the corresponding DataFrame.
    """
    data_frames = {}
    # Search for all .jsonl files in the directory and subdirectories
    for file_path in glob.glob(os.path.join(directory, '**/*.jsonl'), recursive=True):
        data = []
        with open(file_path, 'r') as file:
            for line in file:
                data.append(json.loads(line))
        # Create a DataFrame from list of dictionaries
        df = pd.DataFrame(data)
        # Extract filename without extension as the key
        filename = os.path.splitext(os.path.basename(file_path))[0]
        data_frames[filename] = df
    return data_frames

In [4]:
# Example usage:
dataframes = load_jsonl_to_dataframes()  # Use the function for the current directory
file_keys = dataframes.keys()
file_key_list = list(file_keys)
file_key_list

['gpt_turbo_test_results_exp0_ep10',
 'gpt_turbo_test_results_exp0_ep20',
 'gpt_turbo_test_results_exp0_ep3',
 'gpt_turbo_test_results_exp1_ep10',
 'gpt_turbo_test_results_exp1_ep3',
 'gpt_turbo_test_results_exp2_ep10',
 'gpt_turbo_test_results_exp2_ep3',
 'gpt_turbo_train_results_exp0_ep10',
 'gpt_turbo_train_results_exp0_ep20',
 'gpt_turbo_train_results_exp0_ep3',
 'gpt_turbo_train_results_exp1_ep10',
 'gpt_turbo_train_results_exp1_ep3',
 'gpt_turbo_train_results_exp2_ep10',
 'gpt_turbo_train_results_exp2_ep3']

In [15]:
exp_name = file_key_list[0]
df_output = dataframes[exp_name]
df_output

Unnamed: 0,test_question,test_answer,response,num_epoch,context_training_stat,detailed_prompt_stat
0,"Hello, Doctor. I'm calling about one of my she...","{""animal"": ""Sheep"", ""shown_signs"": {""Anae"": -1...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
1,"Hello, Doctor. I'm calling about one of my cal...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": -...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
2,"Hello, Doctor. I'm calling about one of my she...","{""animal"": ""Sheep"", ""shown_signs"": {""Anae"": -1...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
3,"Hello, Doctor. I'm calling about one of my she...","{""animal"": ""Sheep"", ""shown_signs"": {""Anae"": -1...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
4,"Hello, Doctor. I'm calling about one of my cal...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 0...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
5,"Hello, Doctor. I'm calling about one of my goa...","{""animal"": ""Goat"", ""shown_signs"": {""Anae"": -1,...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
6,"Hello, Doctor. I'm calling about one of my cal...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 0...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
7,"Hello, Doctor. I'm calling about one of my she...","{""animal"": ""Sheep"", ""shown_signs"": {""Anae"": 0,...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
8,"Hello, Doctor. I'm calling about one of my she...","{""animal"": ""Sheep"", ""shown_signs"": {""Anae"": -1...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0
9,"Hello, Doctor. I'm calling about one of my goa...","{""animal"": ""Goat"", ""shown_signs"": {""Anae"": -1,...","{""animal"": ""Cattle"", ""shown_signs"": {""Anae"": 1...",10,0,0


In [16]:
def check_missing_extra_signs(test_answer, response):
    # Convert JSON strings to dictionaries
    test_answer_dict = json.loads(test_answer)
    response_dict = json.loads(response)
    
    # Extract shown_signs from both dictionaries
    test_signs = set(test_answer_dict.get('shown_signs', {}).keys())
    response_signs = set(response_dict.get('shown_signs', {}).keys())
    
    # Identify missing and extra signs
    missing_signs = test_signs - response_signs
    extra_signs = response_signs - test_signs
    
    return missing_signs, extra_signs, len(missing_signs), len(extra_signs)


def check_shown_signs_metrics(test_answer, response):
    # Convert JSON strings to dictionaries
    test_answer_dict = json.loads(test_answer)
    response_dict = json.loads(response)
    
    # Extract shown_signs from both dictionaries
    test_signs = test_answer_dict.get('shown_signs', {})
    response_signs = response_dict.get('shown_signs', {})
    
    # Get all unique signs
    all_signs = set(test_signs.keys()).union(set(response_signs.keys()))
    
    # Create lists for true values and predicted values
    y_true = [test_signs.get(sign, 0) for sign in all_signs]
    y_pred = [response_signs.get(sign, 0) for sign in all_signs]
    
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)

    return precision, recall, f1

def check_shown_signs_accuracy(test_answer, response):
    # Convert JSON strings to dictionaries
    test_answer_dict = json.loads(test_answer)
    response_dict = json.loads(response)
    
    # Extract shown_signs from both dictionaries
    test_signs = test_answer_dict.get('shown_signs', {})
    response_signs = response_dict.get('shown_signs', {})
    
    # Calculate the accuracy of shown_signs
    correct_matches = 0
    total_signs = len(test_signs)
    
    for sign, value in test_signs.items():
        if sign in response_signs and response_signs[sign] == value:
            correct_matches += 1
    
    # Calculate accuracy as a percentage
    accuracy = (correct_matches / total_signs) * 100 if total_signs > 0 else 0

    return accuracy

def check_exact_match(test_answer, response):
    # Convert JSON strings to dictionaries
    test_answer_dict = json.loads(test_answer)
    response_dict = json.loads(response)
    
    # Compare dictionaries
    exact_match = test_answer_dict == response_dict

    return exact_match

def check_animal_match(test_answer, response):
    # Convert JSON strings to dictionaries
    test_answer_dict = json.loads(test_answer)
    response_dict = json.loads(response)
    
    # Compare the animal fields
    animal_match = test_answer_dict.get('animal') == response_dict.get('animal')

    return animal_match

In [17]:
def check_result_for_df(dataframe, exp_name):
    # Initialize lists to store the results
    exact_match_results = []
    animal_match_results = []
    shown_signs_accuracy_results = []
    shown_signs_precision_results = []
    shown_signs_recall_results = []
    shown_signs_f1_score_results = []
    missing_signs_results = []
    extra_signs_results = []
    num_missing_signs_results = []
    num_extra_signs_results = []

    # Iterate over each row in the dataframe
    for i in range(len(dataframe)):
        test_answer = dataframe['test_answer'][i]
        response = dataframe['response'][i]
        
        exact_match = check_exact_match(test_answer, response)
        animal_match = check_animal_match(test_answer, response)
        shown_signs_accuracy = check_shown_signs_accuracy(test_answer, response)
        precision, recall, f1 = check_shown_signs_metrics(test_answer, response)
        missing_signs, extra_signs, num_missing_signs, num_extra_signs = check_missing_extra_signs(test_answer, response)

        # Append the results to the lists
        exact_match_results.append(exact_match)
        animal_match_results.append(animal_match)
        shown_signs_accuracy_results.append(shown_signs_accuracy)
        shown_signs_precision_results.append(precision)
        shown_signs_recall_results.append(recall)
        shown_signs_f1_score_results.append(f1)
        missing_signs_results.append(list(missing_signs))
        extra_signs_results.append(list(extra_signs))
        num_missing_signs_results.append(num_missing_signs)
        num_extra_signs_results.append(num_extra_signs)

    # Create a new dataframe with the original data and the results
    result_df = pd.DataFrame({
        'exp_name': exp_name,
        'exact_match': exact_match_results,
        'animal_match': animal_match_results,
        'shown_signs_accuracy': shown_signs_accuracy_results,
        'shown_signs_precision': shown_signs_precision_results,
        'shown_signs_recall': shown_signs_recall_results,
        'shown_signs_f1_score': shown_signs_f1_score_results,
        'missing_signs': missing_signs_results,
        'extra_signs': extra_signs_results,
        'num_missing_signs': num_missing_signs_results,
        'num_extra_signs': num_extra_signs_results
    })
    
    return result_df

In [18]:
def calculate_average_metrics(result_df):
    """
    Calculate the average metrics for a given result dataframe.
    
    Args:
    result_df (pd.DataFrame): The result dataframe containing metrics.
    
    Returns:
    pd.DataFrame: A dataframe containing the average metrics.
    """
    average_metrics = {
        'exact_match': result_df['exact_match'].mean(),
        'animal_match': result_df['animal_match'].mean(),
        'shown_signs_accuracy': result_df['shown_signs_accuracy'].mean(),
        'shown_signs_precision': result_df['shown_signs_precision'].mean(),
        'shown_signs_recall': result_df['shown_signs_recall'].mean(),
        'shown_signs_f1_score': result_df['shown_signs_f1_score'].mean(),
        'num_missing_signs': result_df['num_missing_signs'].mean(),
        'num_extra_signs': result_df['num_extra_signs'].mean()
    }
    
    average_metrics_df = pd.DataFrame(average_metrics, index=[result_df['exp_name'].iloc[0]])
    
    return average_metrics_df

In [19]:
def update_and_save_compiled_results(avg_result_df, filename='compiled_results.csv'):
    """
    Update the compiled results DataFrame with new results and save it locally.
    If the file already exists, it updates the DataFrame; otherwise, it creates a new one.

    Args:
    result_df (pd.DataFrame): The result DataFrame containing the new results.
    filename (str): The filename for the compiled results CSV file.

    Returns:
    None
    """
    # Check if the file exists
    if os.path.exists(filename):
        compiled_results_df = pd.read_csv(filename)
    else:
        # Create an empty DataFrame with the specified columns
        compiled_results_df = pd.DataFrame(columns=[
            'exp_num', 'epoch_num', 'train/test', 'exact_match', 'animal_match',
            'shown_signs_accuracy', 'shown_signs_precision', 'shown_signs_recall',
            'shown_signs_f1_score', 'num_missing_signs', 'num_extra_signs'
        ])
    
    # Extract expnum, epochnum, and train/test from the exp_name
    exp_name = avg_result_df.index[0]
    parts = exp_name.split('_')
    epoch_num = parts[-1].replace('ep', '')
    exp_num = parts[-2].replace('exp', '')
    train_test = parts[-4]
    model_name = parts[0]
    
    # Create a dictionary with the new data
    new_data = {
        'exp_num': exp_num,
        'epoch_num': epoch_num,
        'train/test': train_test,
        'model': model_name,
        'exact_match': round(avg_result_df.at[exp_name, 'exact_match'], 2),
        'animal_match': round(avg_result_df.at[exp_name, 'animal_match'], 2),
        'shown_signs_accuracy': round(avg_result_df.at[exp_name, 'shown_signs_accuracy'], 2),
        'shown_signs_precision': round(avg_result_df.at[exp_name, 'shown_signs_precision'], 2),
        'shown_signs_recall': round(avg_result_df.at[exp_name, 'shown_signs_recall'], 2),
        'shown_signs_f1_score': round(avg_result_df.at[exp_name, 'shown_signs_f1_score'], 2),
        'num_missing_signs': round(avg_result_df.at[exp_name, 'num_missing_signs'], 2),
        'num_extra_signs': round(avg_result_df.at[exp_name, 'num_extra_signs'], 2)
    }
    
    # Convert the dictionary to a DataFrame
    new_df = pd.DataFrame(new_data, index=[exp_name])
    
    # Update the compiled results DataFrame
    compiled_results_df = pd.concat([compiled_results_df, new_df], ignore_index=True)

    compiled_results_df = compiled_results_df.drop_duplicates(ignore_index=True)
    
    # Save the updated compiled results DataFrame to CSV
    compiled_results_df.to_csv(filename, index=False)

    return compiled_results_df

In [20]:
result_df = check_result_for_df(df_output, exp_name)
result_df

Unnamed: 0,exp_name,exact_match,animal_match,shown_signs_accuracy,shown_signs_precision,shown_signs_recall,shown_signs_f1_score,missing_signs,extra_signs,num_missing_signs,num_extra_signs
0,gpt_turbo_test_results_exp0_ep10,False,False,33.333333,0.401786,0.333333,0.363971,"[Deprs, Oc_Nas, Dehyd]",[],3,0
1,gpt_turbo_test_results_exp0_ep10,False,True,53.333333,0.575,0.533333,0.552941,[],[],0,0
2,gpt_turbo_test_results_exp0_ep10,False,False,50.0,0.555556,0.5,0.511111,"[Deprs, Oc_Nas, Dehyd]",[],3,0
3,gpt_turbo_test_results_exp0_ep10,False,False,38.888889,0.537037,0.388889,0.444444,"[Deprs, Oc_Nas, Dehyd]",[],3,0
4,gpt_turbo_test_results_exp0_ep10,False,True,60.0,0.666667,0.6,0.605128,[],[],0,0
5,gpt_turbo_test_results_exp0_ep10,False,False,55.555556,0.740741,0.555556,0.625,"[Dehyd, Dep, OcNasDis]",[],3,0
6,gpt_turbo_test_results_exp0_ep10,False,True,13.333333,0.941667,0.133333,0.139259,[],[],0,0
7,gpt_turbo_test_results_exp0_ep10,False,False,33.333333,0.662037,0.333333,0.411111,"[Deprs, Oc_Nas, Dehyd]",[],3,0
8,gpt_turbo_test_results_exp0_ep10,False,False,38.888889,0.473214,0.388889,0.426471,"[Deprs, Oc_Nas, Dehyd]",[],3,0
9,gpt_turbo_test_results_exp0_ep10,False,False,33.333333,0.809524,0.333333,0.472222,"[Dehyd, Dep, OcNasDis]",[],3,0


In [21]:
avg_result_df = calculate_average_metrics(result_df)
avg_result_df

Unnamed: 0,exact_match,animal_match,shown_signs_accuracy,shown_signs_precision,shown_signs_recall,shown_signs_f1_score,num_missing_signs,num_extra_signs
gpt_turbo_test_results_exp0_ep10,0.0,0.275862,44.482759,0.621442,0.446743,0.488515,2.172414,0.0


In [22]:
for i in range(len(file_key_list)):
    try:
        exp_name = file_key_list[i]
        df_output = dataframes[exp_name]
        result_df = check_result_for_df(df_output, exp_name)
        avg_result_df = calculate_average_metrics(result_df)
        compiled_results_df = update_and_save_compiled_results(avg_result_df)
        print(f'Results added for : {exp_name}')
    except Exception as e:
        print(f'Failed to process {exp_name}: {e}')


  compiled_results_df = pd.concat([compiled_results_df, new_df], ignore_index=True)


Results added for : gpt_turbo_test_results_exp0_ep10
Results added for : gpt_turbo_test_results_exp0_ep20
Results added for : gpt_turbo_test_results_exp0_ep3
Results added for : gpt_turbo_test_results_exp1_ep10
Results added for : gpt_turbo_test_results_exp1_ep3
Results added for : gpt_turbo_test_results_exp2_ep10
Results added for : gpt_turbo_test_results_exp2_ep3
Results added for : gpt_turbo_train_results_exp0_ep10
Results added for : gpt_turbo_train_results_exp0_ep20
Results added for : gpt_turbo_train_results_exp0_ep3
Results added for : gpt_turbo_train_results_exp1_ep10
Results added for : gpt_turbo_train_results_exp1_ep3
Results added for : gpt_turbo_train_results_exp2_ep10
Results added for : gpt_turbo_train_results_exp2_ep3
