In [1]:
import json
import pandas as pd

from tfob import TFOb,  get_dss, get_bhsa

In [2]:
# Corpus handle returned by the project helper `get_bhsa`; it is the object
# passed to `TFOb.all(...)` when the verses are selected further down.
BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [3]:
# Probe the current pandas display limits. These calls only read the options;
# in a notebook cell just the value of the LAST expression (max_colwidth) is
# echoed, the first two results are discarded.
pd.get_option("display.max_rows")
pd.get_option("display.max_columns")
pd.get_option("display.max_colwidth")

50

In [4]:
# Load the JSON payloads stored next to the notebook. The files are opened
# explicitly as UTF-8 (like the other loaders in this notebook) so decoding
# does not depend on the platform's default encoding.
# NOTE(review): presumably these hold Hebrew verse/clause strings - confirm.
with open("correct_verses.json", encoding="utf-8") as verses_file:
    other_correct_verses = json.load(verses_file)

with open("correct_clauses.json", encoding="utf-8") as clauses_file:
    other_correct_clauses = json.load(clauses_file)

In [5]:
# Book names used to restrict the verse selection (see the `filter_in` call
# that consumes this list). Order is kept exactly as originally listed.
prose_books = [
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy",
    "Joshua", "Judges",
    "1_Samuel", "2_Samuel", "1_Kings", "2_Kings",
    "Jonah", "Ruth", "Esther", "Daniel", "Ezra", "Nehemiah",
    "1_Chronicles", "2_Chronicles",
]

In [6]:
# Verse objects taken from the corpus handle and narrowed with `filter_in` to
# the books named in `prose_books`.
# NOTE(review): exact semantics of `TFOb.all` / `filter_in` are assumed from
# their names - confirm against the tfob module.
verses_bhsa = TFOb.all("verse", BHSA).filter_in(book=prose_books)

In [7]:
# The characters that survive cleaning: the Hebrew letters (final forms
# included) plus the plain space that separates words.
hebrew_alphabet = [
    "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט",
    "י", "כ", "ך", "ל", "מ", "ם", "נ", "ן", "ס",
    "ע", "פ", "ף", "צ", "ץ", "ק", "ר", "ש", "ת",
    " ",
]

# Set version of the list above, used for constant-time membership checks.
heb_alph_set = set(hebrew_alphabet)


def heb_without_diac(clause):
    """Return the text of ``clause`` stripped of diacritics.

    ``clause`` is any object exposing a ``text`` attribute. Its text is
    converted with ``str`` and every character that is not in
    ``heb_alph_set`` (a Hebrew letter or a space) is dropped, so vowel
    points, accents and punctuation all disappear. Leading/trailing spaces
    are kept; callers strip them if needed.
    """
    raw_text = str(clause.text)
    return "".join(char for char in raw_text if char in heb_alph_set)
    

def extract_result(line):
    """Split one model-output record into (verse text, raw clause JSON).

    Args:
        line: dict with a ``"verse"`` entry of the form ``"<prompt>: <verse>"``
            and a ``"parsed_clauses"`` entry holding the model's answer.

    Returns:
        A ``(verse, raw_clauses)`` tuple. ``verse`` is everything after the
        FIRST colon, stripped; splitting only once keeps verses that contain
        a colon themselves intact. ``raw_clauses`` is the answer with
        Markdown code fences, the word ``json`` and newlines removed, ready
        for ``json.loads``.

    Raises:
        IndexError: if ``line["verse"]`` contains no colon.
    """
    verse = line["verse"].split(":", 1)[1].strip()

    raw_clauses = line["parsed_clauses"]
    # Same removal order as before: fences first, then the language tag,
    # then line breaks. Note that "json" is removed wherever it occurs.
    for noise in ("```", "json", "\n"):
        raw_clauses = raw_clauses.replace(noise, "")

    return verse, raw_clauses

In [8]:
# Lookup table: cleaned (diacritic-free, stripped) verse text -> verse object.
# Because the cleaned text is the key, two verses whose cleaned text is
# identical collapse into one entry and the later verse in `verses_bhsa` wins.
verses_bhsa_dict = {heb_without_diac(verse).strip(): verse for verse in verses_bhsa}

In [313]:
# Input files for the evaluation: `gold_path` is read with `load_gold_json`
# and `model_path` with `load_model_output_jsonl` further down.
gold_path = "data/verses_clauses_dict.json"
model_path = "data/fine_tuning_datasets/trial_5/model_outputs/output_4o_validation_small_temp_0.jsonl"

In [314]:
# version 2

# --- 1. Load Data ---
def load_gold_json(path):
    """Read the UTF-8 JSON file at ``path`` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        gold = json.load(handle)
    return gold

def load_model_output_jsonl(path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (e.g. an empty line left at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        List of decoded records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# --- 2. Embedded clause detection --- (Olivier Lauzanne)

def is_sub_list(a, b):
    """Check if a is a sublist of b. O. Lauzanne

    ``a`` counts as a sublist when its items appear in ``b`` as one
    contiguous run, in the same order. The empty list is a sublist of every
    list, including the empty one.

    Args:
        a: candidate run of items.
        b: sequence to search in; slices of ``b`` are compared to ``a`` with
            ``==``, so both should be the same sequence type.

    Returns:
        True if ``a`` occurs contiguously in ``b``, else False.
    """
    width = len(a)
    if width == 0:
        return True
    # Only start positions that leave room for a full-width window can match.
    return any(b[start:start + width] == a for start in range(len(b) - width + 1))


def find_cut_clauses(verse):
    """Return the cleaned text of every "cut" clause of a verse.

    A clause is cut when its word ids do not occur as one contiguous run
    inside the verse's word ids (checked with ``is_sub_list``).

    Args:
        verse: cleaned verse text, used as a key into ``verses_bhsa_dict``.

    Returns:
        List of clause strings, each cleaned with ``heb_without_diac`` and
        stripped, in the order the clauses are yielded by the verse.

    Raises:
        KeyError: if ``verse`` is not a key of ``verses_bhsa_dict``.
    """
    # Get the TFOb object of the verse and its word ids once; the ids are the
    # same for every clause, so there is no need to rebuild them per clause.
    verse = verses_bhsa_dict[verse]
    verse_word_ids = verse.to_words.ids

    return [
        heb_without_diac(clause).strip()
        for clause in verse.to_clauses
        if not is_sub_list(clause.to_words.ids, verse_word_ids)
    ]
            
    
def find_embedded_clauses(verse, debug=False):
    """Return the cleaned text of every clause embedded inside a cut clause.

    A clause is "cut" when its word ids are not one contiguous run of the
    verse's word ids. Every word that lies strictly between the first and the
    last word of a cut clause, but does not belong to that clause, marks the
    clause owning that word as embedded.

    Args:
        verse: cleaned verse text, used as a key into ``verses_bhsa_dict``.
        debug: when True, print each cut clause object and its text.

    Returns:
        Set of clause strings, each cleaned with ``heb_without_diac`` and
        stripped. Empty when the verse has no cut clause.

    Raises:
        KeyError: if ``verse`` is not a key of ``verses_bhsa_dict``.
    """
    # Get the TFOb object of the verse and clauses
    verse = verses_bhsa_dict[verse]
    clauses = verse.to_clauses
    # Fetched once: these ids do not change while the clauses are scanned.
    verse_word_ids = verse.to_words.ids

    word_to_clause = {}   # word id -> clause object that contains it
    cut_clauses = []      # (clause object, its word ids) for every cut clause
    for clause in clauses:
        clause_word_ids = clause.to_words.ids
        for word in clause_word_ids:
            word_to_clause[word] = clause
        if not is_sub_list(clause_word_ids, verse_word_ids):
            cut_clauses.append((clause, clause_word_ids))

    if debug:
        for cut_clause, _ in cut_clauses:
            print(cut_clause)
            print(cut_clause.text)

    embedded_clauses = set()

    for cut_clause, cut_word_ids in cut_clauses:
        first_word_index = verse_word_ids.index(cut_word_ids[0])
        last_word_index = verse_word_ids.index(cut_word_ids[-1])

        # Set lookup instead of scanning the id list for every inner word.
        own_words = set(cut_word_ids)

        # Words strictly inside the span of the cut clause.
        for word in verse_word_ids[first_word_index + 1:last_word_index]:
            if word not in own_words:
                embedded_clauses.add(heb_without_diac(word_to_clause[word]).strip())

    return embedded_clauses

# --- 3. Comparison ---

# Module-level accumulators filled as a side effect of `compare_model_to_gold`:
# verses whose prediction equals the gold clause list, and every predicted
# clause that is also a gold clause. They are only reset when this cell is
# re-run, so repeated calls to `compare_model_to_gold` keep appending.
correct_verses_local = []
correct_clauses_local = []

def _parse_predicted_clauses(raw_clauses):
    """Decode a raw model answer into a list of stripped clause strings.

    Returns None when the answer is not valid JSON or is valid JSON that is
    not an array of strings (e.g. an object, a bare string or a number).
    """
    try:
        predicted = json.loads(raw_clauses)
    except json.JSONDecodeError:
        return None
    if not isinstance(predicted, list):
        return None
    if not all(isinstance(clause, str) for clause in predicted):
        return None
    return [clause.strip() for clause in predicted]


def compare_model_to_gold(gold_dict, model_output_list):
    """Score the model's clause segmentation against the gold segmentation.

    Args:
        gold_dict: mapping of cleaned verse text to a dict with the entries
            ``"clauses"`` (list of gold clause strings) and ``"complexity"``.
        model_output_list: model records as accepted by ``extract_result``
            (dicts with ``"verse"`` and ``"parsed_clauses"``). If a verse
            occurs more than once, the last record wins.

    Returns:
        ``pandas.DataFrame`` with one row per gold verse that has a model
        record. Gold verses without a record are skipped before any corpus
        lookup is made. ``row`` is the index of the record in
        ``model_output_list``.

    Side effects:
        * Prints the number of answers that could not be used as a JSON list
          of strings; such answers are scored as an empty prediction.
        * Extends the module-level ``correct_clauses_local`` and
          ``correct_verses_local`` lists; they are not cleared here, so they
          grow with every call.

    Notes:
        ``matched_clauses`` is built by scanning the predictions, so a gold
        clause that the model outputs twice is counted twice in
        ``matched_count``.
    """
    results = []

    # Index model output by cleaned verse
    n_bad_json = 0

    model_map = {}
    for i, entry in enumerate(model_output_list):
        verse, raw_clauses = extract_result(entry)

        predicted = _parse_predicted_clauses(raw_clauses)
        if predicted is None:
            predicted = []
            n_bad_json += 1
        model_map[verse] = (i, predicted)

    print("Number of bad JSON: ", n_bad_json)

    for verse, info in gold_dict.items():
        if verse not in model_map:
            continue  # model never predicted on this verse

        row_idx, predicted_clauses = model_map[verse]

        gold_clauses = [c.strip() for c in info["clauses"]]
        complexity = info["complexity"]
        embedded = find_embedded_clauses(verse)
        cut_clauses = find_cut_clauses(verse)

        matched = [c for c in predicted_clauses if c in gold_clauses]

        correct_clauses_local.extend(matched)

        missed = [c for c in gold_clauses if c not in predicted_clauses]
        embedded_found = [c for c in predicted_clauses if c in embedded]
        embedded_missed = [c for c in embedded if c not in predicted_clauses]
        cut_clauses_found = [c for c in predicted_clauses if c in cut_clauses]
        cut_clauses_missed = [c for c in cut_clauses if c not in predicted_clauses]
        non_embedded = [c for c in gold_clauses if c not in embedded]
        non_embedded_found = [c for c in predicted_clauses if c in non_embedded]
        non_cut_clauses = [c for c in gold_clauses if c not in cut_clauses]
        non_cut_clauses_found = [c for c in predicted_clauses if c in non_cut_clauses]

        # "Other" = gold clauses that are neither cut nor embedded.
        other_clauses = [c for c in gold_clauses if c not in cut_clauses and c not in embedded]
        other_clauses_found = [c for c in predicted_clauses if c in other_clauses]

        # Order-sensitive, exact equality of the two clause lists.
        complete_verse_correct = int(predicted_clauses == gold_clauses)
        if complete_verse_correct:
            correct_verses_local.append(verse)

        results.append({
            "row": row_idx,
            "verse": verse,
            "complexity": complexity,

            # Clause info
            "gold_clauses": gold_clauses,
            "predicted_clauses": predicted_clauses,
            "matched_clauses": matched,
            "missed_clauses": missed,

            # Counts
            "total_gold": len(gold_clauses),
            "matched_count": len(matched),
            "missed_count": len(missed),
            "embedded_count": len(embedded),
            "embedded_found_count": len(embedded_found),
            "embedded_missed_count": len(embedded_missed),
            "cut_clauses_count": len(cut_clauses),
            "cut_clauses_found": len(cut_clauses_found),
            "cut_clauses_missed": len(cut_clauses_missed),
            "non_embedded_count": len(non_embedded),
            "non_embedded_found": len(non_embedded_found),
            "non_cut_clauses_count": len(non_cut_clauses),
            "non_cut_clauses_found": len(non_cut_clauses_found),
            "other_clauses_count": len(other_clauses),
            "other_clauses_found": len(other_clauses_found),
            "complete_verse_correct": complete_verse_correct,

            # Raw lists kept for reference/debugging; later cells read
            # "embedded_clauses" (a set) from the frame.
            "embedded_clauses": embedded,
            "embedded_found_list": embedded_found,
            "embedded_missed_list": embedded_missed
        })

    return pd.DataFrame(results)

In [315]:
# Read both evaluation inputs from disk: the gold JSON and the model's JSONL
# output; both are handed to `compare_model_to_gold` below.
gold_dict = load_gold_json(gold_path)
model_output = load_model_output_jsonl(model_path)

In [316]:
# Peek at the first few records only; echoing the whole list floods the cell
# output without adding information.
model_output[:5]

[{'verse': 'Parse this verse: ויאמר שמואל אלשאול אתי שלח יהוה למשחך למלך עלעמו עלישראל ועתה שמע לקול דברי יהוה ס',
  'parsed_clauses': '["ויאמר שמואל אלשאול", "אתי שלח יהוה", "למשחך למלך עלעמו עלישראל", "ועתה שמע לקול דברי יהוה ס"]'},
 {'verse': 'Parse this verse: וימת חזאל מלךארם וימלך בןהדד בנו תחתיו',
  'parsed_clauses': '["וימת חזאל מלךארם", "וימלך בןהדד בנו תחתיו"]'},
 {'verse': 'Parse this verse: ועל ראש העמודים מעשה שושן ותתם מלאכת העמודים',
  'parsed_clauses': '["ועל ראש העמודים מעשה שושן", "ותתם מלאכת העמודים"]'},
 {'verse': 'Parse this verse: וישב ארצו ברכוש גדול ולבבו עלברית קדש ועשה ושב לארצו',
  'parsed_clauses': '["וישב ארצו ברכוש גדול", "ולבבו עלברית קדש", "ועשה", "ושב לארצו"]'},
 {'verse': 'Parse this verse: ורחצו ממנו משה ואהרן ובניו אתידיהם ואתרגליהם',
  'parsed_clauses': '["ורחצו ממנו משה ואהרן ובניו אתידיהם ואתרגליהם"]'},
 {'verse': 'Parse this verse: וזה הדבר אשר תעשו כלזכר וכלאשה ידעת משכבזכר תחרימו',
  'parsed_clauses': '["וזה הדבר", "אשר תעשו", "כלזכר וכלאשה", "

In [317]:
# One row per gold verse that has a model prediction. The call also prints
# the bad-JSON count and appends to `correct_verses_local` /
# `correct_clauses_local`, so re-running this cell alone grows those lists.
df_results = compare_model_to_gold(gold_dict, model_output)

Number of bad JSON:  0


In [318]:
# Number of scored verses, i.e. gold verses for which a model record exists.
len(df_results)

200

In [319]:
#df_results[(df_results.embedded_count != 0) & (df_results.complexity == "simple")]

In [320]:
# Overall counts
total_clauses = df_results["total_gold"].sum()
total_found = df_results["matched_count"].sum()

# Embedded
total_embedded = df_results["embedded_count"].sum()
found_embedded = df_results["embedded_found_count"].sum()

# Non-embedded
total_non_embedded = df_results["non_embedded_count"].sum()
found_non_embedded = df_results["non_embedded_found"].sum()

# Cut clauses
total_cut_clauses = df_results["cut_clauses_count"].sum()
found_cut_clauses = df_results["cut_clauses_found"].sum()

# Other clauses (gold clauses that are neither cut nor embedded)
total_other_clauses = df_results["other_clauses_count"].sum()
found_other_clauses = df_results["other_clauses_found"].sum()

# Non-cut clauses
total_non_cut_clauses = df_results["non_cut_clauses_count"].sum()
found_non_cut_clauses = df_results["non_cut_clauses_found"].sum()

# Full verse match
total_verses = len(df_results)
correct_verses = df_results["complete_verse_correct"].sum()

# By complexity. Note: "simple"/"complex" is decided here by whether the
# verse has embedded clauses (embedded_count), not by the "complexity" column.
simple_df = df_results[df_results["embedded_count"] == 0]
complex_df = df_results[df_results["embedded_count"] != 0]

simple_correct = simple_df["complete_verse_correct"].sum()
complex_correct = complex_df["complete_verse_correct"].sum()

total_simple_clauses = simple_df["total_gold"].sum()
total_simple_found = simple_df["matched_count"].sum()

total_complex_clauses = complex_df["total_gold"].sum()
total_complex_found = complex_df["matched_count"].sum()

# --- Print Results ---
# Every ratio is guarded against a zero denominator so an empty category
# prints a warning line instead of a meaningless "nan%".
print("📊 Accuracy Summary\n")

print(f"Total gold clauses: {total_clauses}")
print(f"Correctly predicted clauses: {total_found}")
print(f"✅ Overall clause accuracy: {total_found / total_clauses:.2%}\n" if total_clauses else "⚠️ No gold clauses.\n")

print(f"Total gold clauses in simple verses: {total_simple_clauses}")
print(f"Correctly predicted clauses in simple verses: {total_simple_found}")
print(f"✅ Clause accuracy in simple verses: {total_simple_found / total_simple_clauses:.2%}\n" if total_simple_clauses else "⚠️ No gold clauses in simple verses.\n")

print(f"Total gold clauses in complex verses: {total_complex_clauses}")
print(f"Correctly predicted clauses in complex verses: {total_complex_found}")
print(f"✅ Clause accuracy in complex verses: {total_complex_found / total_complex_clauses:.2%}\n" if total_complex_clauses else "⚠️ No gold clauses in complex verses.\n")

print(f"Embedded clauses in gold: {total_embedded}")
print(f"Found embedded clauses: {found_embedded}")
print(f"✅ Embedded clause accuracy: {found_embedded / total_embedded:.2%}" if total_embedded else "⚠️ No embedded clauses.\n")

#print(f"\nNon-embedded clauses in gold: {total_non_embedded}")
#print(f"Found non-embedded clauses: {found_non_embedded}")
#print(f"✅ Non-embedded clause accuracy: {found_non_embedded / total_non_embedded:.2%}" if total_non_embedded else "⚠️ No non-embedded clauses.\n")

print(f"\nCut clauses in gold: {total_cut_clauses}")
print(f"Found cut clauses: {found_cut_clauses}")
print(f"✅ Cut clause accuracy: {found_cut_clauses / total_cut_clauses:.2%}" if total_cut_clauses else "⚠️ No cut clauses.\n")

print(f"\nOther clauses in gold: {total_other_clauses}")
print(f"Found other clauses: {found_other_clauses}")
print(f"✅ Other clause accuracy: {found_other_clauses / total_other_clauses:.2%}" if total_other_clauses else "⚠️ No other clauses.\n")

#print(f"\nNon cut clauses in gold: {total_non_cut_clauses}")
#print(f"Found non cut clauses: {found_non_cut_clauses}")
#print(f"✅ Non cut clause accuracy: {found_non_cut_clauses / total_non_cut_clauses:.2%}" if total_non_cut_clauses else "⚠️ No non cut clauses.\n")

print(f"\nComplete verse parses: {correct_verses} / {total_verses}")
print(f"✅ Full verse accuracy: {correct_verses / total_verses:.2%}" if total_verses else "⚠️ No verses evaluated.")

print(f"\nSimple verse parses correct: {simple_correct} / {len(simple_df)}")
print(f"✅ Accuracy on simple verses: {simple_correct / len(simple_df):.2%}" if len(simple_df) else "⚠️ No simple verses.")

print(f"\nComplex verse parses correct: {complex_correct} / {len(complex_df)}")
print(f"✅ Accuracy on complex verses: {complex_correct / len(complex_df):.2%}" if len(complex_df) else "⚠️ No complex verses.")

📊 Accuracy Summary

Total gold clauses: 779
Correctly predicted clauses: 644
✅ Overall clause accuracy: 82.67%

Total gold clauses in simple verses: 685
Correctly predicted clauses in simple verses: 595
✅ Clause accuracy in simple verses: 86.86%

Total gold clauses in complex verses: 94
Correctly predicted clauses in complex verses: 49
✅ Clause accuracy in complex verses: 52.13%

Embedded clauses in gold: 24
Found embedded clauses: 8
✅ Embedded clause accuracy: 33.33%

Cut clauses in gold: 22
Found cut clauses: 6
✅ Cut clause accuracy: 27.27%

Other clauses in gold: 733
Found other clauses: 630
✅ Other clause accuracy: 85.95%

Complete verse parses: 132 / 200
✅ Full verse accuracy: 66.00%

Simple verse parses correct: 128 / 178
✅ Accuracy on simple verses: 71.91%

Complex verse parses correct: 4 / 22
✅ Accuracy on complex verses: 18.18%


In [321]:
def full_clause_match(row):
    """Tell whether prediction and gold contain the same clauses.

    Both ``row["predicted_clauses"]`` and ``row["gold_clauses"]`` are turned
    into sets first, so neither the order of the clauses nor repeated clauses
    influence the result.
    """
    predicted_set = set(row["predicted_clauses"])
    gold_set = set(row["gold_clauses"])
    return predicted_set == gold_set

# Flag every verse whose predicted clause set equals the gold clause set.
df_results["full_clause_match"] = df_results.apply(full_clause_match, axis=1)

# Count the flagged verses separately for verses without / with embedded clauses.
simple_full_correct = df_results.loc[df_results["embedded_count"] == 0, "full_clause_match"].sum()
complex_full_correct = df_results.loc[df_results["embedded_count"] != 0, "full_clause_match"].sum()

print("\n📘 Fully Correct Clause Identification by Complexity (Order-insensitive)")
if len(simple_df):
    print(f"🔹 Simple: {simple_full_correct} / {len(simple_df)} → Accuracy: {simple_full_correct / len(simple_df):.2%}")
else:
    print("🔹 No simple verses.")
if len(complex_df):
    print(f"🔸 Complex: {complex_full_correct} / {len(complex_df)} → Accuracy: {complex_full_correct / len(complex_df):.2%}")
else:
    print("🔸 No complex verses.")


📘 Fully Correct Clause Identification by Complexity (Order-insensitive)
🔹 Simple: 128 / 178 → Accuracy: 71.91%
🔸 Complex: 4 / 22 → Accuracy: 18.18%


In [322]:
# Flag every verse whose predicted clause list equals the gold list exactly,
# i.e. same clauses in the same order.
df_results["ordered_full_match"] = df_results.apply(
    lambda record: record["predicted_clauses"] == record["gold_clauses"],
    axis=1,
)

# Count the flagged verses separately for verses without / with embedded clauses.
simple_ordered_correct = df_results.loc[df_results["embedded_count"] == 0, "ordered_full_match"].sum()
complex_ordered_correct = df_results.loc[df_results["embedded_count"] != 0, "ordered_full_match"].sum()

print("\n📘 Fully Correct Clause Identification by Complexity (Order-sensitive)")
if len(simple_df):
    print(f"🔹 Simple: {simple_ordered_correct} / {len(simple_df)} → Accuracy: {simple_ordered_correct / len(simple_df):.2%}")
else:
    print("🔹 No simple verses.")
if len(complex_df):
    print(f"🔸 Complex: {complex_ordered_correct} / {len(complex_df)} → Accuracy: {complex_ordered_correct / len(complex_df):.2%}")
else:
    print("🔸 No complex verses.")


📘 Fully Correct Clause Identification by Complexity (Order-sensitive)
🔹 Simple: 128 / 178 → Accuracy: 71.91%
🔸 Complex: 4 / 22 → Accuracy: 18.18%


In [323]:
def compute_precision(row):
    """Per-verse precision: share of predicted clauses that are gold clauses.

    Returns None when ``row["predicted_clauses"]`` is empty, because the
    ratio is undefined without predictions. Every predicted item is counted,
    so a repeated prediction contributes once per occurrence.
    """
    guesses = row["predicted_clauses"]
    if not guesses:
        return None
    reference = row["gold_clauses"]
    hits = sum(1 for guess in guesses if guess in reference)
    return hits / len(guesses)

# Add the per-verse precision as a new column (mutates df_results in place).
# Verses without any prediction get a missing value from compute_precision.
df_results["precision"] = df_results.apply(compute_precision, axis=1)

In [324]:
# Slice again: the "precision" column was added to df_results after the
# earlier simple/complex slices were taken, so those lack it.
simple_df = df_results.loc[df_results["embedded_count"] == 0]
complex_df = df_results.loc[df_results["embedded_count"] != 0]

# Mean per-verse precision, ignoring verses where it is undefined.
simple_precision = simple_df["precision"].dropna().mean()
complex_precision = complex_df["precision"].dropna().mean()

In [325]:
def detailed_precision_summary(df, label):
    """Print a precision report for the verses in ``df``.

    Rows whose ``precision`` is missing are left out. The report shows the
    number of remaining verses, the summed length of ``predicted_clauses``,
    the summed ``matched_count``, the mean of the per-verse precision and the
    pooled ratio matched / predicted (or a warning when nothing was
    predicted). ``df`` itself is not modified; ``label`` only appears in the
    heading.
    """
    scored = df.loc[df["precision"].notna()]

    verse_count = len(scored)
    mean_precision = scored["precision"].mean()
    predicted_total = sum(len(clauses) for clauses in scored["predicted_clauses"])
    matched_total = scored["matched_count"].sum()

    print(f"\n📘 Precision Summary – {label}")
    print(f"Verses evaluated: {verse_count}")
    print(f"Total predicted clauses: {predicted_total}")
    print(f"Total correct predicted clauses: {matched_total}")
    print(f"📐 Average per-verse precision: {mean_precision:.2%}")
    if predicted_total:
        print(f"📐 Overall clause-level precision: {matched_total / predicted_total:.2%}")
    else:
        print("⚠️ No predictions made.")

In [326]:
# Print the precision report once for the verses without embedded clauses
# and once for the verses with embedded clauses.
detailed_precision_summary(simple_df, "Simple")
detailed_precision_summary(complex_df, "Complex")


📘 Precision Summary – Simple
Verses evaluated: 178
Total predicted clauses: 670
Total correct predicted clauses: 595
📐 Average per-verse precision: 87.19%
📐 Overall clause-level precision: 88.81%

📘 Precision Summary – Complex
Verses evaluated: 22
Total predicted clauses: 81
Total correct predicted clauses: 49
📐 Average per-verse precision: 53.55%
📐 Overall clause-level precision: 60.49%


In [327]:
# Initialize counters
total_predicted = df_results["predicted_clauses"].apply(len).sum()
total_matched = df_results["matched_count"].sum()

predicted_embedded = 0
matched_embedded = 0
predicted_non_embedded = 0
matched_non_embedded = 0

for _, row in df_results.iterrows():
    gold = set(row["gold_clauses"])
    embedded = set(row["embedded_clauses"])
    predicted = set(row["predicted_clauses"])

    for clause in predicted:
        if clause in embedded:
            predicted_embedded += 1
            if clause in gold:
                matched_embedded += 1
        else:
            predicted_non_embedded += 1
            if clause in gold:
                matched_non_embedded += 1

# Compute precisions
global_precision = total_matched / total_predicted if total_predicted else None
embedded_precision = matched_embedded / predicted_embedded if predicted_embedded else None
non_embedded_precision = matched_non_embedded / predicted_non_embedded if predicted_non_embedded else None

# Print results
print("\n📘 Clause-Level Precision by Clause Type")

print(f"🌐 Global precision: {total_matched} / {total_predicted} → {global_precision:.2%}" if global_precision is not None else "🌐 No predictions made.")

print(f"🔸 Embedded precision: {matched_embedded} / {predicted_embedded} → {embedded_precision:.2%}" if embedded_precision is not None else "🔸 No embedded clauses predicted.")

print(f"🔹 Non-embedded precision: {matched_non_embedded} / {predicted_non_embedded} → {non_embedded_precision:.2%}" if non_embedded_precision is not None else "🔹 No non-embedded clauses predicted.")


📘 Clause-Level Precision by Clause Type
🌐 Global precision: 644 / 751 → 85.75%
🔸 Embedded precision: 8 / 8 → 100.00%
🔹 Non-embedded precision: 635 / 742 → 85.58%


In [328]:
def precision_by_clause_type(df, clause_type="embedded"):
    """Print clause-level precision for one kind of predicted clause.

    For every row the predictions are split by membership in
    ``row["embedded_clauses"]``. With ``clause_type="embedded"`` only the
    predictions found in that set are counted, with
    ``clause_type="non_embedded"`` only the others. A counted prediction is
    correct when it also occurs in ``row["gold_clauses"]``. Any other
    ``clause_type`` counts nothing and is reported under the
    "Non-embedded" heading. Nothing is returned.
    """
    want_embedded = clause_type == "embedded"
    want_non_embedded = clause_type == "non_embedded"

    n_predicted = 0
    n_correct = 0

    for _, record in df.iterrows():
        gold_lookup = set(record["gold_clauses"])
        embedded_lookup = set(record["embedded_clauses"])

        for candidate in record["predicted_clauses"]:
            is_embedded = candidate in embedded_lookup
            selected = (want_embedded and is_embedded) or (want_non_embedded and not is_embedded)
            if not selected:
                continue
            n_predicted += 1
            if candidate in gold_lookup:
                n_correct += 1

    if want_embedded:
        label = "Embedded"
    else:
        label = "Non-embedded"

    print(f"\n📘 Precision Summary – {label} Clauses")
    print(f"Predicted {label.lower()} clauses: {n_predicted}")
    print(f"Correctly predicted {label.lower()} clauses: {n_correct}")
    if not n_predicted:
        print(f"⚠️ No {label.lower()} clauses predicted.")
    else:
        print(f"📐 Precision: {n_correct / n_predicted:.2%}")


In [329]:
# Print the clause-level precision report for predictions that match an
# embedded clause, then for all remaining predictions.
precision_by_clause_type(df_results, clause_type="embedded")
precision_by_clause_type(df_results, clause_type="non_embedded")


📘 Precision Summary – Embedded Clauses
Predicted embedded clauses: 8
Correctly predicted embedded clauses: 8
📐 Precision: 100.00%

📘 Precision Summary – Non-embedded Clauses
Predicted non-embedded clauses: 743
Correctly predicted non-embedded clauses: 636
📐 Precision: 85.60%
