In [1]:
import json
import pandas as pd
import re

from tfob import TFOb,  get_dss, get_bhsa

In [2]:
# Corpus handle returned by the project's `tfob.get_bhsa()` helper; it is passed
# to `TFOb.all(...)` further down to enumerate verse objects.
BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [3]:
# Remove pandas' display limits: every row, every column and the full width of
# each cell are rendered when a frame is shown.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

50

In [4]:
# Load two JSON files into `other_correct_verses` / `other_correct_clauses`.
# The encoding is pinned to UTF-8 (as the other loaders in this notebook do) so
# that reading does not depend on the platform's default encoding.
# NOTE(review): presumably these hold Hebrew verse/clause strings — confirm.
with open("correct_verses.json", encoding="utf-8") as verses_file:
    other_correct_verses = json.load(verses_file)

with open("correct_clauses.json", encoding="utf-8") as clauses_file:
    other_correct_clauses = json.load(clauses_file)

In [5]:
# Book names used to restrict the verse selection below (order preserved).
prose_books = [
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy",
    "Joshua", "Judges",
    "1_Samuel", "2_Samuel",
    "1_Kings", "2_Kings",
    "Jonah", "Ruth", "Esther", "Daniel",
    "Ezra", "Nehemiah",
    "1_Chronicles", "2_Chronicles",
]

In [6]:
# All verse objects of the corpus, restricted to the books listed in `prose_books`.
# NOTE(review): relies on the project-local `TFOb.all(...).filter_in(book=...)` API — semantics not visible here.
verses_bhsa = TFOb.all("verse", BHSA).filter_in(book=prose_books)

In [7]:
# The Hebrew consonants (regular and final forms) followed by the space
# character, in the order: regular form first, then its final form.
hebrew_alphabet = list("אבגדהוזחטיכךלמםנןסעפףצץקרשת") + [" "]

# Set version of the same characters, for constant-time membership tests.
heb_alph_set = set(hebrew_alphabet)

def heb_without_diac(clause):
    """Return the text of ``clause`` stripped of diacritics.

    ``clause`` must expose a ``.text`` attribute. The text is converted to
    ``str`` and every character that is not in ``heb_alph_set`` (Hebrew
    consonants and the space) is dropped; the remaining characters keep
    their original order.
    """
    raw_text = str(clause.text)
    return "".join(char for char in raw_text if char in heb_alph_set)
    

def extract_result(line):
    """Split one model-output record into ``(verse_text, raw_clauses)``.

    Args:
        line: mapping with a ``"verse"`` entry of the form
            ``"<prompt prefix>: <verse text>"`` and a ``"parsed_clauses"``
            entry holding the model's answer as text.

    Returns:
        A tuple of the verse text (everything after the first colon,
        stripped) and the answer with Markdown code fences, the word
        ``json`` and newlines removed.

    Raises:
        IndexError: if ``line["verse"]`` contains no colon.
    """
    # Split on the first colon only, so a colon inside the verse text itself
    # cannot truncate the verse.
    verse_text = line["verse"].split(":", 1)[1].strip()
    raw_clauses = (
        line["parsed_clauses"]
        .replace("```", "")
        .replace("json", "")
        .replace("\n", "")
    )
    return verse_text, raw_clauses

In [8]:
# Lookup from a verse's diacritic-free, stripped text to its verse object.
# Verses whose cleaned text is identical share one key, so the later verse in
# `verses_bhsa` overwrites the earlier one.
verses_bhsa_dict = {heb_without_diac(verse).strip(): verse for verse in verses_bhsa}

In [9]:
# Input files: gold segmentation (JSON) and the model output to evaluate (JSONL).
gold_path = "data/verses_clauses_dict.json"
model_path = "data/fine_tuning_datasets/basic_models_outputs/output_validation_large_gpt4o_mini.jsonl"

In [112]:
# Metadata describing the run being evaluated; copied verbatim into the result
# tables below. All four values are strings (including `temperature`).
model = "GPT-4o mini"
fine_tuned_status = "no"
training_dataset_size = "large"
temperature = "0"

In [113]:
# version 2

# --- 1. Load Data ---
def load_gold_json(path):
    """Read the UTF-8 JSON file at ``path`` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        gold = json.load(handle)
    return gold

def load_model_output_jsonl(path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Blank lines (for
    example a trailing empty line at the end of the file) are skipped instead
    of raising ``json.JSONDecodeError``.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        List of decoded objects, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if raw_line.strip():
                records.append(json.loads(raw_line))
    return records

# --- 2. Embedded clause detection --- (Olivier Lauzanne)

def is_sub_list(a, b):
    """Check if ``a`` occurs in ``b`` as one contiguous run. (O. Lauzanne)

    Only start positions where ``a`` can still fit are examined. An empty
    ``a`` is contained in every ``b``, including an empty one.

    Args:
        a: candidate sub-sequence.
        b: sequence to search in; slices of ``b`` are compared to ``a`` with
            ``==``, so both should be of the same sequence type.

    Returns:
        True if some slice ``b[i:i + len(a)]`` equals ``a``, else False.
    """
    span = len(a)
    return any(b[start:start + span] == a for start in range(len(b) - span + 1))


def find_cut_clauses(verse):
    """List the cleaned texts of the "cut" clauses of a verse.

    ``verse`` is a key of ``verses_bhsa_dict``. A clause counts as cut when
    its word ids are not one contiguous run inside the verse's word ids
    (see ``is_sub_list``). Each cut clause is returned as its diacritic-free,
    stripped text, in the order the clauses are yielded by the verse object.
    """
    verse_node = verses_bhsa_dict[verse]

    cut_texts = []
    for clause in verse_node.to_clauses:
        if is_sub_list(clause.to_words.ids, verse_node.to_words.ids):
            continue
        cut_texts.append(heb_without_diac(clause).strip())
    return cut_texts
            
    
def find_embedded_clauses(verse, debug=False):
    """Collect the cleaned texts of clauses that sit inside a cut clause.

    ``verse`` is a key of ``verses_bhsa_dict``. For every cut clause (a clause
    whose word ids are not contiguous in the verse), the verse words lying
    strictly between the clause's first and last word are inspected; any such
    word that does not belong to the cut clause contributes the clause that
    owns it.

    Args:
        verse: cleaned verse text used as lookup key.
        debug: when true, print each cut clause object and its text.

    Returns:
        A set of diacritic-free, stripped clause texts.
    """
    verse_node = verses_bhsa_dict[verse]
    verse_clauses = verse_node.to_clauses

    # Clauses whose words are interrupted by something else in the verse.
    interrupted = []
    for candidate in verse_clauses:
        if not is_sub_list(candidate.to_words.ids, verse_node.to_words.ids):
            interrupted.append(candidate)

    if debug:
        for item in interrupted:
            print(item)
            print(item.text)

    # Map each word id to the clause it belongs to (later clauses win on ties).
    owner_of_word = {}
    for candidate in verse_clauses:
        for word_id in candidate.to_words.ids:
            owner_of_word[word_id] = candidate

    embedded_texts = set()
    for item in interrupted:
        start = verse_node.to_words.ids.index(item.to_words.ids[0])
        stop = verse_node.to_words.ids.index(item.to_words.ids[-1])

        # Words strictly between the cut clause's first and last word.
        for word_id in verse_node.to_words.ids[start + 1:stop]:
            if word_id not in item.to_words.ids:
                embedded_texts.add(heb_without_diac(owner_of_word[word_id]).strip())

    return embedded_texts

# Comparison

# Accumulators filled by compare_model_to_gold(). They are module-level and are
# not reset by the function, so calling it again appends to the existing content.
correct_verses_local = []
correct_clauses_local = []

def compare_model_to_gold(gold_dict, model_output_list):
    """Score the model's clause segmentation of each verse against gold.

    Args:
        gold_dict: mapping ``verse text -> {"clauses": [...], "complexity": ...}``.
        model_output_list: records accepted by ``extract_result``.

    Returns:
        A ``pandas.DataFrame`` with one row per gold verse for which the model
        produced an output. It holds the gold/predicted/matched/missed clause
        lists, counts for embedded, cut, non-embedded, non-cut and "other"
        clauses, and ``complete_verse_correct`` (1 when the predicted list
        equals the gold list, order included).

    Side effects:
        Prints the number of outputs that were not valid JSON, and extends the
        module-level ``correct_clauses_local`` / ``correct_verses_local`` lists.
    """
    results = []

    # Index model output by cleaned verse
    n_bad_json = 0
    model_map = {}
    for i, entry in enumerate(model_output_list):
        verse, raw_clauses = extract_result(entry)
        try:
            predicted = json.loads(raw_clauses)
            model_map[verse] = (i, [c.strip() for c in predicted])
        except json.JSONDecodeError:
            # Keep the verse with an empty prediction so every gold clause counts as missed.
            model_map[verse] = (i, [])
            n_bad_json += 1

    print("Number of bad JSON: ", n_bad_json)

    for verse, info in gold_dict.items():
        # Skip first: verses the model never saw need no corpus lookups at all.
        if verse not in model_map:
            continue

        row_idx, predicted_clauses = model_map[verse]

        gold_clauses = [c.strip() for c in info["clauses"]]
        complexity = info["complexity"]
        embedded = find_embedded_clauses(verse)
        cut_clauses = find_cut_clauses(verse)

        matched = [c for c in predicted_clauses if c in gold_clauses]
        correct_clauses_local.extend(matched)

        missed = [c for c in gold_clauses if c not in predicted_clauses]
        embedded_found = [c for c in predicted_clauses if c in embedded]
        embedded_missed = [c for c in embedded if c not in predicted_clauses]
        cut_clauses_found = [c for c in predicted_clauses if c in cut_clauses]
        cut_clauses_missed = [c for c in cut_clauses if c not in predicted_clauses]
        non_embedded = [c for c in gold_clauses if c not in embedded]
        non_embedded_found = [c for c in predicted_clauses if c in non_embedded]
        non_cut_clauses = [c for c in gold_clauses if c not in cut_clauses]
        non_cut_clauses_found = [c for c in predicted_clauses if c in non_cut_clauses]

        # "Other" = gold clauses that are neither cut nor embedded.
        other_clauses = [c for c in gold_clauses if c not in cut_clauses and c not in embedded]
        other_clauses_found = [c for c in predicted_clauses if c in other_clauses]

        # Order-sensitive comparison of the full clause lists.
        complete_verse_correct = int(predicted_clauses == gold_clauses)
        if complete_verse_correct:
            correct_verses_local.append(verse)

        results.append({
            "row": row_idx,
            "verse": verse,
            "complexity": complexity,

            # Clause info
            "gold_clauses": gold_clauses,
            "predicted_clauses": predicted_clauses,
            "matched_clauses": matched,
            "missed_clauses": missed,

            # Counts
            "total_gold": len(gold_clauses),
            "matched_count": len(matched),
            "missed_count": len(missed),
            "embedded_count": len(embedded),
            "embedded_found_count": len(embedded_found),
            "embedded_missed_count": len(embedded_missed),
            "cut_clauses_count": len(cut_clauses),
            "cut_clauses_found": len(cut_clauses_found),
            "cut_clauses_missed": len(cut_clauses_missed),
            "non_embedded_count": len(non_embedded),
            "non_embedded_found": len(non_embedded_found),
            "non_cut_clauses_count": len(non_cut_clauses),
            "non_cut_clauses_found": len(non_cut_clauses_found),
            "other_clauses_count": len(other_clauses),
            "other_clauses_found": len(other_clauses_found),
            "complete_verse_correct": complete_verse_correct,

            # Raw lists kept for reference/debugging
            "embedded_clauses": embedded,
            "embedded_found_list": embedded_found,
            "embedded_missed_list": embedded_missed
        })

    return pd.DataFrame(results)

In [114]:
# Read the gold segmentation and the model's raw outputs from the configured paths.
gold_dict = load_gold_json(gold_path)
model_output = load_model_output_jsonl(model_path)

In [115]:
# One row per evaluated verse; the call also prints how many outputs were not valid JSON.
df_results = compare_model_to_gold(gold_dict, model_output)

Number of bad JSON:  0


In [116]:
#print(model_path)

In [117]:
# Check whether the list of clauses returned by the model adds words to, or removes words from, the original verse it was given.

def clean_text(text, hebrew_alphabet):
    """Reduce ``text`` to the characters of ``hebrew_alphabet``.

    Every character not contained in ``hebrew_alphabet`` is replaced by a
    space; runs of whitespace are then collapsed to a single space and the
    result is stripped at both ends.
    """
    kept = []
    for char in text:
        kept.append(char if char in hebrew_alphabet else ' ')
    collapsed = re.sub(r'\s+', ' ', ''.join(kept))
    return collapsed.strip()

# Round-trip check: does joining the model's clauses give back the prompt verse?
identical_verses = 0  # reconstructed text equals the original verse string exactly
verses = 0  # number of model outputs examined
same_as_original_words = 0  # the distinct words of both texts are made of the same letters

for item in model_output:
    original_verse = item["verse"].split(": ", 1)[1]

    # The answer is taken apart by hand: drop the outer brackets, split on
    # '", "' and strip the remaining quotes.
    # NOTE(review): this differs from compare_model_to_gold(), which uses
    # json.loads after removing code fences — confirm the two should differ.
    clause_string = item["parsed_clauses"].strip()[1:-1]
    clauses = [clause.strip().strip('"') for clause in clause_string.split('", "')]

    reconstructed_verse = ' '.join(clause.strip() for clause in clauses)
    reconstructed_clean = clean_text(reconstructed_verse, hebrew_alphabet)

    verses += 1

    if original_verse == reconstructed_clean:
        # Identical strings necessarily contain identical words.
        identical_verses += 1
        same_as_original_words += 1
        continue

    # Strings differ: still accept the verse when the distinct words of both
    # texts consist of exactly the same letters (covers equal word sets and
    # words that were merged or split differently).
    original_words = set(original_verse.split())
    reconstructed_words = set(reconstructed_clean.split())
    if sorted(''.join(original_words)) == sorted(''.join(reconstructed_words)):
        same_as_original_words += 1

















































































































































































































































































































































In [118]:
# Round-trip rates as percentages of all examined verses.
percentage_exact_restitution = identical_verses / verses * 100
percentage_exact_words_restitution = same_as_original_words / verses * 100

# Append this run as one new row.
# NOTE(review): `df_input_output` is not created in any visible cell; it must
# already exist in the kernel. Re-running this cell appends the row again.
_consistency_row = {
    "model": model,
    "temperature": temperature,
    "fine_tuned": fine_tuned_status,
    "training_dataset_size": training_dataset_size,
    "percentage_exact_restitution": percentage_exact_restitution,
    "percentage_exact_words_restitution": percentage_exact_words_restitution,
}
df_input_output = pd.concat(
    [df_input_output, pd.DataFrame([_consistency_row])],
    ignore_index=True,
)

# show df
df_input_output

Unnamed: 0,model,temperature,fine_tuned,training_dataset_size,percentage_exact_restitution,percentage_exact_words_restitution
0,GPT-4o,0,yes,small,94.5,99.0
1,GPT-4o,0,yes,medium,92.3,99.2
2,GPT-4o,0,yes,large,89.067278,98.96789
3,GPT-4o mini,0,yes,small,95.0,99.0
4,GPT-4o mini,0,yes,medium,94.4,98.9
5,GPT-4o mini,0,yes,large,90.902141,98.776758
6,GPT-4o,0,no,small,87.0,87.0
7,GPT-4o,0,no,medium,89.1,89.2
8,GPT-4o,0,no,large,89.067278,89.181957
9,GPT-4o mini,0,no,small,81.5,91.0


In [119]:
#df_input_output.to_csv("data/input_output_consistency.csv", index=False)

In [108]:
# Corpus-wide totals: each pair is (gold count, count found by the model),
# summed over the per-verse rows of df_results.

# All clauses
total_clauses, total_found = (
    df_results[column].sum() for column in ("total_gold", "matched_count")
)

# Embedded clauses
total_embedded, found_embedded = (
    df_results[column].sum() for column in ("embedded_count", "embedded_found_count")
)

# Non-embedded clauses
total_non_embedded, found_non_embedded = (
    df_results[column].sum() for column in ("non_embedded_count", "non_embedded_found")
)

# Cut clauses
total_cut_clauses, found_cut_clauses = (
    df_results[column].sum() for column in ("cut_clauses_count", "cut_clauses_found")
)

# Other clauses (neither cut nor embedded)
total_other_clauses, found_other_clauses = (
    df_results[column].sum() for column in ("other_clauses_count", "other_clauses_found")
)

# Non-cut clauses
total_non_cut_clauses, found_non_cut_clauses = (
    df_results[column].sum() for column in ("non_cut_clauses_count", "non_cut_clauses_found")
)

# Verses whose predicted clause list equals the gold list
total_verses = len(df_results)
correct_verses = df_results["complete_verse_correct"].sum()

# Split by complexity: "simple" verses have no embedded clause, "complex" ones have at least one.
simple_df = df_results[df_results["embedded_count"].eq(0)]
complex_df = df_results[df_results["embedded_count"].ne(0)]

simple_correct = simple_df["complete_verse_correct"].sum()
complex_correct = complex_df["complete_verse_correct"].sum()

total_simple_clauses, total_simple_found = (
    simple_df[column].sum() for column in ("total_gold", "matched_count")
)
total_complex_clauses, total_complex_found = (
    complex_df[column].sum() for column in ("total_gold", "matched_count")
)

In [109]:
# One record with the run metadata and every aggregate computed above;
# the key order matches the column order of the metrics table.
stats_dict = dict(
    model=model,
    dataset_size=training_dataset_size,
    temperature=temperature,
    fine_tuned=fine_tuned_status,
    total_clauses=total_clauses,
    total_found=total_found,
    total_cut_clauses=total_cut_clauses,
    total_embedded=total_embedded,
    found_embedded=found_embedded,
    total_non_embedded=total_non_embedded,
    found_non_embedded=found_non_embedded,
    found_cut_clauses=found_cut_clauses,
    total_other_clauses=total_other_clauses,
    found_other_clauses=found_other_clauses,
    total_non_cut_clauses=total_non_cut_clauses,
    found_non_cut_clauses=found_non_cut_clauses,
    total_verses=total_verses,
    correct_verses=correct_verses,
    simple_correct=simple_correct,
    complex_correct=complex_correct,
    total_simple_clauses=total_simple_clauses,
    total_simple_found=total_simple_found,
    total_complex_clauses=total_complex_clauses,
    total_complex_found=total_complex_found,
)

# Create df_stats from the dictionary
#df_stats = pd.DataFrame([stats_dict])

# Add new data as a new row to the df_stats
#df_stats = pd.concat([df_stats, pd.DataFrame([stats_dict])], ignore_index=True)

In [None]:
#df_stats.to_csv("data/all_models_metrics_for_paper.csv", index=False)

In [None]:
# Report both round-trip rates as "<label>: <count>/<total> <percentage> %".
for _label, _count in (
    ("Percentage of correctly returned verses", identical_verses),
    ("Percentage of correct set of words", same_as_original_words),
):
    print(f"{_label}: {_count}/{verses}", _count / verses * 100, "%")

In [None]:
# --- Print Results ---
# Every ratio is guarded against an empty denominator, so an empty category
# prints a warning line instead of "nan%".
print("📊 Accuracy Summary\n")

print(f"Total gold clauses: {total_clauses}")
print(f"Correctly predicted clauses: {total_found}")
print(f"✅ Overall clause accuracy: {total_found / total_clauses:.2%}\n" if total_clauses else "⚠️ No gold clauses.\n")

print(f"Total gold clauses in simple verses: {total_simple_clauses}")
print(f"Correctly predicted clauses in simple verses: {total_simple_found}")
print(f"✅ Clause accuracy in simple verses: {total_simple_found / total_simple_clauses:.2%}\n" if total_simple_clauses else "⚠️ No gold clauses in simple verses.\n")

print(f"Total gold clauses in complex verses: {total_complex_clauses}")
print(f"Correctly predicted clauses in complex verses: {total_complex_found}")
print(f"✅ Clause accuracy in complex verses: {total_complex_found / total_complex_clauses:.2%}\n" if total_complex_clauses else "⚠️ No gold clauses in complex verses.\n")

print(f"Embedded clauses in gold: {total_embedded}")
print(f"Found embedded clauses: {found_embedded}")
print(f"✅ Embedded clause accuracy: {found_embedded / total_embedded:.2%}" if total_embedded else "⚠️ No embedded clauses.\n")

print(f"\nCut clauses in gold: {total_cut_clauses}")
print(f"Found cut clauses: {found_cut_clauses}")
print(f"✅ Cut clause accuracy: {found_cut_clauses / total_cut_clauses:.2%}" if total_cut_clauses else "⚠️ No cut clauses.\n")

print(f"\nOther clauses in gold: {total_other_clauses}")
print(f"Found other clauses: {found_other_clauses}")
print(f"✅ Other clause accuracy: {found_other_clauses / total_other_clauses:.2%}" if total_other_clauses else "⚠️ No other clauses.\n")

print(f"\nComplete verse parses: {correct_verses} / {total_verses}")
print(f"✅ Full verse accuracy: {correct_verses / total_verses:.2%}" if total_verses else "⚠️ No verses evaluated.")

print(f"\nSimple verse parses correct: {simple_correct} / {len(simple_df)}")
print(f"✅ Accuracy on simple verses: {simple_correct / len(simple_df):.2%}" if len(simple_df) else "⚠️ No simple verses.")

print(f"\nComplex verse parses correct: {complex_correct} / {len(complex_df)}")
print(f"✅ Accuracy on complex verses: {complex_correct / len(complex_df):.2%}" if len(complex_df) else "⚠️ No complex verses.")

In [None]:
def full_clause_match(row):
    """Tell whether predicted and gold clauses agree, ignoring order and repeats.

    ``row`` must provide ``"predicted_clauses"`` and ``"gold_clauses"``; both
    are turned into sets before being compared.
    """
    predicted_set = set(row["predicted_clauses"])
    gold_set = set(row["gold_clauses"])
    return predicted_set == gold_set

# Flag each verse whose predicted clause set equals the gold clause set.
df_results["full_clause_match"] = df_results.apply(full_clause_match, axis=1)

# Number of fully matching verses without / with embedded clauses.
simple_full_correct = df_results.loc[df_results["embedded_count"] == 0, "full_clause_match"].sum()
complex_full_correct = df_results.loc[df_results["embedded_count"] != 0, "full_clause_match"].sum()

print("\n📘 Fully Correct Clause Identification by Complexity (Order-insensitive)")
if len(simple_df):
    print(f"🔹 Simple: {simple_full_correct} / {len(simple_df)} → Accuracy: {simple_full_correct / len(simple_df):.2%}")
else:
    print("🔹 No simple verses.")
if len(complex_df):
    print(f"🔸 Complex: {complex_full_correct} / {len(complex_df)} → Accuracy: {complex_full_correct / len(complex_df):.2%}")
else:
    print("🔸 No complex verses.")

In [None]:
# Flag each verse whose predicted clause list equals the gold list, order included.
df_results["ordered_full_match"] = df_results.apply(
    lambda verse_row: verse_row["predicted_clauses"] == verse_row["gold_clauses"],
    axis=1,
)

# Number of such verses without / with embedded clauses.
simple_ordered_correct = df_results.loc[df_results["embedded_count"] == 0, "ordered_full_match"].sum()
complex_ordered_correct = df_results.loc[df_results["embedded_count"] != 0, "ordered_full_match"].sum()

print("\n📘 Fully Correct Clause Identification by Complexity (Order-sensitive)")
if len(simple_df):
    print(f"🔹 Simple: {simple_ordered_correct} / {len(simple_df)} → Accuracy: {simple_ordered_correct / len(simple_df):.2%}")
else:
    print("🔹 No simple verses.")
if len(complex_df):
    print(f"🔸 Complex: {complex_ordered_correct} / {len(complex_df)} → Accuracy: {complex_ordered_correct / len(complex_df):.2%}")
else:
    print("🔸 No complex verses.")

In [None]:
def compute_precision(row):
    """Return the share of predicted clauses that also occur in gold.

    ``row`` must provide ``"predicted_clauses"`` and ``"gold_clauses"``.
    Returns ``None`` when nothing was predicted; otherwise the number of
    predicted clauses found in gold divided by the number of predictions
    (repeated predictions are counted each time).
    """
    guesses = row["predicted_clauses"]
    if not guesses:
        return None
    reference = row["gold_clauses"]
    hits = sum(1 for clause in guesses if clause in reference)
    return hits / len(guesses)

# Per-verse precision; compute_precision returns None for verses with no predictions.
df_results["precision"] = df_results.apply(compute_precision, axis=1)

In [None]:
# Re-split after the "precision" column was added, so both subsets carry it.
simple_df = df_results[df_results["embedded_count"].eq(0)]
complex_df = df_results[df_results["embedded_count"].ne(0)]

# Mean per-verse precision, leaving out verses without any prediction.
simple_precision = simple_df["precision"].dropna().mean()
complex_precision = complex_df["precision"].dropna().mean()

In [None]:
def detailed_precision_summary(df, label):
    """Print a precision report for the verses in ``df``.

    Rows whose ``"precision"`` is missing are ignored. The report shows the
    number of remaining verses, the total number of predicted clauses (sum of
    the lengths of ``"predicted_clauses"``), the total of ``"matched_count"``,
    the mean of the per-verse precision and the pooled clause-level precision.

    Args:
        df: frame with ``"precision"``, ``"predicted_clauses"`` and
            ``"matched_count"`` columns; it is not modified.
        label: name of the subset, used in the heading.
    """
    scored = df[df["precision"].notna()]
    verse_total = len(scored)
    mean_precision = scored["precision"].mean()
    predicted_total = scored["predicted_clauses"].map(len).sum()  # list lengths
    matched_total = scored["matched_count"].sum()  # precomputed per-verse counts

    print(f"\n📘 Precision Summary – {label}")
    print(f"Verses evaluated: {verse_total}")
    print(f"Total predicted clauses: {predicted_total}")
    print(f"Total correct predicted clauses: {matched_total}")
    print(f"📐 Average per-verse precision: {mean_precision:.2%}")
    if predicted_total:
        print(f"📐 Overall clause-level precision: {matched_total / predicted_total:.2%}")
    else:
        print("⚠️ No predictions made.")

In [None]:
# Print the precision report separately for verses without and with embedded clauses.
detailed_precision_summary(simple_df, "Simple")
detailed_precision_summary(complex_df, "Complex")

In [None]:
# Clause-level precision: overall, and split by whether the predicted clause
# text is one of the verse's embedded clauses.
total_predicted = df_results["predicted_clauses"].apply(len).sum()
total_matched = df_results["matched_count"].sum()

predicted_embedded = 0
matched_embedded = 0
predicted_non_embedded = 0
matched_non_embedded = 0

for _, row in df_results.iterrows():
    gold = set(row["gold_clauses"])
    embedded = set(row["embedded_clauses"])
    # Walk the prediction list itself, not a set of it: repeated predictions
    # are then counted exactly as in total_predicted / matched_count, so the
    # embedded and non-embedded counts add up to the global ones.
    predicted = row["predicted_clauses"]

    for clause in predicted:
        if clause in embedded:
            predicted_embedded += 1
            if clause in gold:
                matched_embedded += 1
        else:
            predicted_non_embedded += 1
            if clause in gold:
                matched_non_embedded += 1

# Compute precisions (None when the category received no prediction)
global_precision = total_matched / total_predicted if total_predicted else None
embedded_precision = matched_embedded / predicted_embedded if predicted_embedded else None
non_embedded_precision = matched_non_embedded / predicted_non_embedded if predicted_non_embedded else None

# Print results
print("\n📘 Clause-Level Precision by Clause Type")

print(f"🌐 Global precision: {total_matched} / {total_predicted} → {global_precision:.2%}" if global_precision is not None else "🌐 No predictions made.")

print(f"🔸 Embedded precision: {matched_embedded} / {predicted_embedded} → {embedded_precision:.2%}" if embedded_precision is not None else "🔸 No embedded clauses predicted.")

print(f"🔹 Non-embedded precision: {matched_non_embedded} / {predicted_non_embedded} → {non_embedded_precision:.2%}" if non_embedded_precision is not None else "🔹 No non-embedded clauses predicted.")

In [None]:
def precision_by_clause_type(df, clause_type="embedded"):
    """Print the precision of the predictions of one clause type.

    A predicted clause is "embedded" when its text is in the row's
    ``"embedded_clauses"``, otherwise "non_embedded". It counts as correct
    when its text is in the row's ``"gold_clauses"``.

    Args:
        df: frame with ``"gold_clauses"``, ``"embedded_clauses"`` and
            ``"predicted_clauses"`` columns.
        clause_type: ``"embedded"`` (default) or ``"non_embedded"``.

    Raises:
        ValueError: if ``clause_type`` is neither of the two supported
            values (previously such a value was silently reported as
            "Non-embedded" with zero counts).
    """
    if clause_type not in ("embedded", "non_embedded"):
        raise ValueError(
            f"clause_type must be 'embedded' or 'non_embedded', got {clause_type!r}"
        )
    want_embedded = clause_type == "embedded"

    total_predicted = 0
    total_matched = 0

    for _, row in df.iterrows():
        gold_set = set(row["gold_clauses"])
        embedded_set = set(row["embedded_clauses"])

        for clause in row["predicted_clauses"]:
            # Only count predictions of the requested type.
            if (clause in embedded_set) != want_embedded:
                continue
            total_predicted += 1
            if clause in gold_set:
                total_matched += 1

    label = "Embedded" if want_embedded else "Non-embedded"
    print(f"\n📘 Precision Summary – {label} Clauses")
    print(f"Predicted {label.lower()} clauses: {total_predicted}")
    print(f"Correctly predicted {label.lower()} clauses: {total_matched}")
    if total_predicted:
        print(f"📐 Precision: {total_matched / total_predicted:.2%}")
    else:
        print(f"⚠️ No {label.lower()} clauses predicted.")
