In [1]:
import json
import pandas as pd
import re

from tfob import TFOb,  get_dss, get_bhsa

In [2]:
# Load the BHSA corpus handle through the project-local `tfob` helper; the call
# prints the node-type summary shown below.
# NOTE(review): `BHSA` is not referenced again in the visible cells -- confirm
# whether this load is still needed.
BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [156]:
# Model outputs to evaluate. Each entry is a 4-tuple that the evaluation loop
# unpacks as (model_temp, size, status, path):
#   model_temp -- model label, including the "(T=0)" setting
#   size       -- which validation set the file belongs to ("large"/"medium"/"small")
#   status     -- "base" or "fine-tuned"
#   path       -- JSONL file holding that model's outputs
list_model_output_paths = [
    # GPT-4o mini, base
    ("GPT-4o mini (T=0)", "large", "base", "data/fine_tuning_datasets/basic_models_outputs/output_validation_large_gpt4o_mini.jsonl"), 
    ("GPT-4o mini (T=0)", "medium", "base", "data/fine_tuning_datasets/basic_models_outputs/output_validation_medium_gpt4o_mini.jsonl"),
    ("GPT-4o mini (T=0)", "small", "base", "data/fine_tuning_datasets/basic_models_outputs/output_validation_small_gpt4o_mini.jsonl"),
    
    # GPT-4o, base
    ("GPT-4o (T=0)", "large", "base", "data/fine_tuning_datasets/basic_models_outputs/output_validation_large_gpt4o.jsonl"), 
    ("GPT-4o (T=0)", "medium", "base", "data/fine_tuning_datasets/basic_models_outputs/output_validation_medium_gpt4o.jsonl"),
    ("GPT-4o (T=0)", "small", "base", "data/fine_tuning_datasets/basic_models_outputs/output_validation_small_gpt4o.jsonl"),
    
    # GPT-4o mini, fine-tuned (outputs stored under trial_1)
    ("GPT-4o mini (T=0)", "large", "fine-tuned", "data/fine_tuning_datasets/trial_1/model_outputs/output_validation_large_temp_0.jsonl"),
    ("GPT-4o mini (T=0)", "medium", "fine-tuned","data/fine_tuning_datasets/trial_1/model_outputs/output_validation_medium_temp_0.jsonl"),
    ("GPT-4o mini (T=0)", "small", "fine-tuned","data/fine_tuning_datasets/trial_1/model_outputs/output_validation_small_temp_0.jsonl"),
    
    # GPT-4o, fine-tuned (outputs stored under trial_5)
    ("GPT-4o (T=0)", "large", "fine-tuned", "data/fine_tuning_datasets/trial_5/model_outputs/output_4o_validation_large_temp_0.jsonl"),
    ("GPT-4o (T=0)", "medium", "fine-tuned", "data/fine_tuning_datasets/trial_5/model_outputs/output_4o_validation_medium_temp_0.jsonl"),
    ("GPT-4o (T=0)", "small", "fine-tuned", "data/fine_tuning_datasets/trial_5/model_outputs/output_4o_validation_small_temp_0.jsonl")   
]

In [168]:
# Gold-standard clause annotations (JSON), loaded below into `gold_data`.
gold_path = "data/verses_clauses_dict.json"
# Single model output file used for the step-by-step walkthrough in the next
# cells, before the loop over every entry of `list_model_output_paths`.
model_path = "data/fine_tuning_datasets/basic_models_outputs/output_validation_large_gpt4o.jsonl"

In [169]:
# get the verses and count the clauses for each verse

# Define functions

def load_gold_json(path):
    """Read the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as gold_file:
        raw_text = gold_file.read()
    return json.loads(raw_text)

def load_model_output_jsonl(path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line of the file at `path` is decoded with `json.loads`;
    records are returned in file order. Blank or whitespace-only lines (for
    example a trailing empty line) are skipped instead of raising
    `json.JSONDecodeError`.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [170]:
# Gold annotations as a dict; each value holds a 'clauses' list and a
# 'complexity' label (see the sample entry displayed in the next cell).
gold_data = load_gold_json(gold_path)

In [171]:
list(gold_data.values())[0]

{'clauses': ['בראשית ברא אלהים את השמים ואת הארץ'], 'complexity': 'simple'}

In [172]:
#gold_data

In [173]:
# Tally the gold verses per bucket: verses whose complexity is not "simple",
# and simple verses with at most 1, exactly 2, 3 or 4, and 5 or more clauses.

tot_1_clause_verses = tot_2_clauses_verses = tot_3_clauses_verses = 0
tot_4_clauses_verses = tot_5plus_clauses_verses = tot_complex_verses = 0

for verse_info in gold_data.values():
    verse_clauses = verse_info["clauses"]

    # Anything not labelled "simple" is counted as complex, whatever its length.
    if verse_info["complexity"] != "simple":
        tot_complex_verses += 1
        continue

    n_clauses = len(verse_clauses)
    if n_clauses >= 5:
        tot_5plus_clauses_verses += 1
    elif n_clauses == 4:
        tot_4_clauses_verses += 1
    elif n_clauses == 3:
        tot_3_clauses_verses += 1
    elif n_clauses == 2:
        tot_2_clauses_verses += 1
    else:
        # Zero- and one-clause verses share the first bucket.
        tot_1_clause_verses += 1

print(
    tot_1_clause_verses,
    tot_2_clauses_verses,
    tot_3_clauses_verses,
    tot_4_clauses_verses,
    tot_5plus_clauses_verses,
    tot_complex_verses,
)

1281 2198 2296 2084 3791 1434


In [174]:
# Records of the single model selected by `model_path`: one decoded JSON object
# per line of the file.
model_data = load_model_output_jsonl(model_path)

In [175]:
#model_data

In [176]:
def extract_result(line):
    """Return ``(verse_text, raw_json_text)`` for one model-output record.

    Parameters
    ----------
    line : dict
        Record with a "verse" string and a "parsed_clauses" string.
        NOTE(review): "verse" is assumed to look like "<label>: <verse text>";
        confirm against the files that produce these records.

    Returns
    -------
    tuple of (str, str)
        The stripped text after the first colon of ``line["verse"]``, and
        ``line["parsed_clauses"]`` with Markdown code fences, their optional
        "json" language tag, and all newlines removed.

    Raises
    ------
    IndexError
        If ``line["verse"]`` contains no colon.
    """
    # Split on the first colon only, so a colon inside the verse text does not
    # truncate the key used to look the verse up in the gold data.
    verse = line["verse"].split(":", 1)[1].strip()

    raw = line["parsed_clauses"]
    # Drop the fences and the language tag attached to them. A blanket
    # replace("json", "") would also delete "json" inside the clause strings.
    raw = re.sub(r"```(?:json)?", "", raw, flags=re.IGNORECASE)
    raw = raw.replace("\n", "")
    # Also accept a bare leading "json" tag that is not preceded by a fence.
    raw = re.sub(r"^\s*json\s*(?=[\[{])", "", raw, flags=re.IGNORECASE)
    return verse, raw

In [177]:
def get_category(verse_info):
    """Return the evaluation bucket for one gold verse entry.

    `verse_info` is a dict with a "clauses" list and a "complexity" label.
    Entries labelled "complex" map to "complex"; every other entry is bucketed
    by its number of clauses (0 or 1, 2, 3, 4, or 5 and more).
    """
    verse_clauses = verse_info["clauses"]

    if verse_info["complexity"] == "complex":
        return "complex"

    n_clauses = len(verse_clauses)
    if n_clauses <= 1:
        return "simple_1_clause"

    labels_by_count = {
        2: "simple_2_clauses",
        3: "simple_3_clauses",
        4: "simple_4_clauses",
    }
    # Any count above 4 falls through to the open-ended bucket.
    return labels_by_count.get(n_clauses, "simple_5plus_clauses")

In [178]:
def compare_model_to_gold(gold_dict, model_output_list):
    """Score every model prediction as an exact match against the gold clauses.

    Parameters
    ----------
    gold_dict : dict
        Maps a verse string to a dict with "clauses" (list of str) and
        "complexity".
    model_output_list : list of dict
        Model records; each one is passed to `extract_result` to obtain the
        verse string and the raw JSON text of the predicted clauses.

    Returns
    -------
    pandas.DataFrame
        One row per gold verse that the model produced an output for, with the
        columns "row" (position of the record in `model_output_list`), "verse",
        "category" (from `get_category`) and "correct". The columns are present
        even when no verse matches.

    Notes
    -----
    * A prediction is correct only when its whitespace-stripped clause list
      equals the stripped gold list, element by element and in order.
    * Output that is not valid JSON, or that decodes to anything other than a
      list of strings, is unusable and always scored as incorrect -- also when
      the gold clause list is empty.
    * If a verse occurs more than once in `model_output_list`, the last
      occurrence is the one that is scored.
    """
    columns = ["row", "verse", "category", "correct"]

    # Index the predictions by verse text; None marks an unusable output.
    model_map = {}
    for i, entry in enumerate(model_output_list):
        verse, raw_clauses = extract_result(entry)

        try:
            predicted = json.loads(raw_clauses)
        except json.JSONDecodeError:
            predicted = None

        # Valid JSON can still have the wrong shape (an object, a number, a
        # list holding null, ...); only a list of strings can be compared.
        if isinstance(predicted, list) and all(isinstance(c, str) for c in predicted):
            model_map[verse] = (i, [c.strip() for c in predicted])
        else:
            model_map[verse] = (i, None)

    results = []
    for verse, info in gold_dict.items():
        if verse not in model_map:
            continue  # model never predicted on this verse

        row_idx, predicted_clauses = model_map[verse]
        gold_clauses = [c.strip() for c in info["clauses"]]

        results.append({
            "row": row_idx,
            "verse": verse,
            "category": get_category(info),
            "correct": predicted_clauses is not None and predicted_clauses == gold_clauses,
        })

    return pd.DataFrame(results, columns=columns)

In [179]:
# Per-verse exact-match results for the single model loaded from `model_path`.
df = compare_model_to_gold(gold_data, model_data)

In [180]:
df.category.unique()

array(['simple_2_clauses', 'simple_3_clauses', 'simple_1_clause',
       'simple_5plus_clauses', 'simple_4_clauses', 'complex'],
      dtype=object)

In [181]:
# Long-format tally: one row per (category, correct) pair that occurs in `df`.
pair_sizes = df.groupby(['category', 'correct']).size()
counts = pair_sizes.reset_index(name='count')

In [182]:
counts

Unnamed: 0,category,correct,count
0,complex,False,286
1,simple_1_clause,False,227
2,simple_1_clause,True,37
3,simple_2_clauses,False,136
4,simple_2_clauses,True,280
5,simple_3_clauses,False,275
6,simple_3_clauses,True,209
7,simple_4_clauses,False,331
8,simple_4_clauses,True,95
9,simple_5plus_clauses,False,704


In [183]:
# 1. Per-category tally with one column per value of `correct`
counts = df.pivot_table(index='category', columns='correct', aggfunc='size', fill_value=0)

# 2. Guarantee both outcome columns exist: pivot_table only creates a column
#    for values that actually occur, so a run with no correct (or no incorrect)
#    verse would otherwise raise KeyError in step 4.
counts = counts.reindex(columns=pd.Index([False, True], name='correct'), fill_value=0)

# 3. Use string column names 'True' and 'False'
counts = counts.rename(columns={True: 'True', False: 'False'})

# 4. Accuracy per category, as a percentage of all scored verses in that category
counts['percentage'] = counts['True'] / (counts['True'] + counts['False']) * 100

# 5. Turn 'category' back into a regular column
counts = counts.reset_index()

In [184]:
counts

correct,category,False,True,percentage
0,complex,286,0,0.0
1,simple_1_clause,227,37,14.015152
2,simple_2_clauses,136,280,67.307692
3,simple_3_clauses,275,209,43.181818
4,simple_4_clauses,331,95,22.300469
5,simple_5plus_clauses,704,36,4.864865


In [192]:
# Build one per-category accuracy table for every model output file and collect
# the tables in `all_counts`. After the loop, `df`, `model_data` and `counts`
# hold the values of the last entry of `list_model_output_paths`.
all_counts = []

for model_temp, size, status, path in list_model_output_paths:
    model_data = load_model_output_jsonl(path)
    df = compare_model_to_gold(gold_data, model_data)

    # 1 Per-category tally with one column per value of `correct`
    counts = df.pivot_table(index='category', columns='correct', aggfunc='size', fill_value=0)

    # 2 Guarantee both outcome columns exist: pivot_table only creates a column
    #   for values that actually occur, so a model with no correct (or no
    #   incorrect) verse would otherwise raise KeyError in step 4.
    counts = counts.reindex(columns=pd.Index([False, True], name='correct'), fill_value=0)

    # 3 Use string column names 'True' and 'False'
    counts = counts.rename(columns={True: 'True', False: 'False'})

    # 4 Accuracy per category (percentage of True out of total), one decimal
    counts['percentage'] = (counts['True'] / (counts['True'] + counts['False']) * 100).round(1)

    # 5 Make 'category' a regular column and tag the rows with the run metadata
    counts = counts.reset_index()
    counts["model_temp"] = model_temp
    counts["size"] = size
    counts["status"] = status
    all_counts.append(counts)

In [193]:
counts

correct,category,False,True,percentage,model_temp,size,status
0,complex,17,4,19.0,GPT-4o (T=0),small,fine-tuned
1,simple_1_clause,1,18,94.7,GPT-4o (T=0),small,fine-tuned
2,simple_2_clauses,8,22,73.3,GPT-4o (T=0),small,fine-tuned
3,simple_3_clauses,8,28,77.8,GPT-4o (T=0),small,fine-tuned
4,simple_4_clauses,10,20,66.7,GPT-4o (T=0),small,fine-tuned
5,simple_5plus_clauses,24,40,62.5,GPT-4o (T=0),small,fine-tuned


In [194]:
# Stack the per-model tables into one frame with a fresh 0..n-1 row index.
df_all_counts = pd.concat(all_counts, ignore_index=True)

In [195]:
df_all_counts

correct,category,False,True,percentage,model_temp,size,status
0,complex,286,0,0.0,GPT-4o mini (T=0),large,base
1,simple_1_clause,237,27,10.2,GPT-4o mini (T=0),large,base
2,simple_2_clauses,183,233,56.0,GPT-4o mini (T=0),large,base
3,simple_3_clauses,329,155,32.0,GPT-4o mini (T=0),large,base
4,simple_4_clauses,349,77,18.1,GPT-4o mini (T=0),large,base
5,simple_5plus_clauses,705,35,4.7,GPT-4o mini (T=0),large,base
6,complex,109,0,0.0,GPT-4o mini (T=0),medium,base
7,simple_1_clause,99,16,13.9,GPT-4o mini (T=0),medium,base
8,simple_2_clauses,63,92,59.4,GPT-4o mini (T=0),medium,base
9,simple_3_clauses,133,63,32.1,GPT-4o mini (T=0),medium,base


In [196]:
# Persist the combined per-category accuracy table; index=False leaves the row
# index out of the CSV.
df_all_counts.to_csv("data/clause_number_results_assessment.csv", index=False)