In [44]:
import json
import pandas as pd

from tfob import TFOb,  get_dss, get_bhsa

In [45]:
BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [2]:
pd.get_option("display.max_rows")
pd.get_option("display.max_columns")
pd.get_option("display.max_colwidth")

50

In [3]:
gold_path = "data/verses_clauses_dict.json"
model_path = "data/fine_tuning_datasets/basic_models_outputs/output_validation_large_gpt4o.jsonl"

In [40]:
# --- 1. Load Data ---
def load_gold_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def load_model_output_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]

# --- 2. Embedded clause detection --- (Olivier Lauzanne)

def is_in_non_cut(i, verse, clauses):


def find_embedded_clauses(verse, clauses):

    cut_clauses = [clause for clause in clauses if clause not in verse]
    embedded_clauses = set()
    
    for cut_clause in cut_clauses:
        for i in range(len(cut_clause)):
            first_part = cut_clause[:i]
            second_part = cut_clause[i:]
            
            if first_part in verse and second_part in verse:
                j = verse.index(first_part) + len(first_part)
                k = verse.index(second_part, j)
                print("first part: ", repr(first_part), "index: ", j)
                print("second part: ", repr(second_part), "index: ", k)
                print(verse[j:k])
                for clause in clauses:
                    if clause in verse:
                        l = verse.index(clause)
                        if j <= l < k:
                            embedded_clauses.add(clause)   
    return embedded_clauses


# --- 3. Comparison ---
def compare_model_to_gold(gold_dict, model_output_list):
    results = []

    # Index model output by cleaned verse
    model_map = {}
    for i, entry in enumerate(model_output_list):
        verse = entry["verse"].replace("Parse this verse:", "").strip()
        try:
            predicted = json.loads(entry["parsed_clauses"])
            model_map[verse] = (i, [c.strip() for c in predicted])
        except json.JSONDecodeError:
            continue

    for verse, info in gold_dict.items():
        gold_clauses = [c.strip() for c in info["clauses"]]
        #print(gold_clauses)
        complexity = info["complexity"]
        embedded = find_embedded_clauses(verse, gold_clauses)

        if verse not in model_map:
            continue  # model never predicted on this verse

        row_idx, predicted_clauses = model_map[verse]

        matched = [c for c in gold_clauses if c in predicted_clauses]
        missed = [c for c in gold_clauses if c not in predicted_clauses]
        embedded_found = [c for c in embedded if c in predicted_clauses]
        embedded_missed = [c for c in embedded if c not in predicted_clauses]
        non_embedded = [c for c in gold_clauses if c not in embedded]
        non_embedded_found = [c for c in non_embedded if c in predicted_clauses]

        complete_verse_correct = int(predicted_clauses == gold_clauses)
        
        results.append({
            "row": row_idx,
            "verse": verse,
            "complexity": complexity,

            # Clause info

            "gold_clauses": gold_clauses,
            "predicted_clauses": predicted_clauses,
            "matched_clauses": matched,
            "missed_clauses": missed,
        
            # Counts
            "total_gold": len(gold_clauses),
            "matched_count": len(matched),
            "missed_count": len(missed),
            "embedded_count": len(embedded),
            "embedded_found_count": len(embedded_found),
            "embedded_missed_count": len(embedded_missed),
            "non_embedded_count": len(non_embedded),
            "non_embedded_found": len(non_embedded_found),
            "complete_verse_correct": complete_verse_correct,
        
            # Optional: keep raw lists for reference/debugging
            # Comment these out if not needed

            
            "embedded_clauses": embedded,
            "embedded_found_list": embedded_found,
            "embedded_missed_list": embedded_missed
        })


    return pd.DataFrame(results)

In [29]:
gold_dict = load_gold_json(gold_path)
model_output = load_model_output_jsonl(model_path)

In [27]:
#df_results = compare_model_to_gold(gold_dict, model_output)
#len(df_results)

In [16]:
df_results[(df_results.embedded_found_count != 0) & (df_results.complexity == "simple")]

Unnamed: 0,row,verse,complexity,gold_clauses,predicted_clauses,matched_clauses,missed_clauses,total_gold,matched_count,missed_count,embedded_count,embedded_found_count,embedded_missed_count,non_embedded_count,non_embedded_found,complete_verse_correct,embedded_clauses,embedded_found_list,embedded_missed_list
359,670,ויאמר אלהם הוא אשר דבר יהוה שבתון שבתקדש ליהוה...,simple,"[ויאמר אלהם, הוא, אשר דבר יהוה, שבתון שבתקדש ל...","[ויאמר אלהם הוא אשר דבר יהוה, שבתון שבתקדש ליה...","[שבתון שבתקדש ליהוה מחר, ואת כלהעדף הניחו לכם ...","[ויאמר אלהם, הוא, אשר דבר יהוה, את אשרתאפו, אפ...",9,2,7,6,1,5,3,1,0,"{אשר דבר יהוה, אפו, הוא, את אשרתאפו, את אשרתבש...",[שבתון שבתקדש ליהוה מחר],"[אשר דבר יהוה, אפו, הוא, את אשרתאפו, את אשרתבשלו]"
2021,1171,ויפתח עזרא הספר לעיני כלהעם כימעל כלהעם היה וכ...,simple,"[ויפתח עזרא הספר לעיני כלהעם, כימעל כלהעם היה,...","[ויפתח עזרא הספר לעיני כלהעם, כימעל כלהעם היה,...","[ויפתח עזרא הספר לעיני כלהעם, כימעל כלהעם היה]","[ועמדו כלהעם, כפתחו]",4,2,2,2,1,1,2,1,0,"{כימעל כלהעם היה, כפתחו}",[כימעל כלהעם היה],[כפתחו]


In [33]:
verse = df_results.loc[359].verse
clauses = df_results.loc[359].gold_clauses

In [34]:
print(verse)
print(clauses)

ויאמר אלהם הוא אשר דבר יהוה שבתון שבתקדש ליהוה מחר את אשרתאפו אפו ואת אשרתבשלו בשלו ואת כלהעדף הניחו לכם למשמרת עדהבקר
['ויאמר אלהם', 'הוא', 'אשר דבר יהוה', 'שבתון שבתקדש ליהוה מחר', 'את אשרתאפו', 'אפו', 'ובשלו', 'את אשרתבשלו', 'ואת כלהעדף הניחו לכם למשמרת עדהבקר']


In [41]:
find_embedded_clauses(verse, clauses)

first part:  'ו' index:  1
second part:  'בשלו' index:  74
יאמר אלהם הוא אשר דבר יהוה שבתון שבתקדש ליהוה מחר את אשרתאפו אפו ואת אשרת


{'אפו',
 'אשר דבר יהוה',
 'את אשרתאפו',
 'את אשרתבשלו',
 'הוא',
 'שבתון שבתקדש ליהוה מחר'}

In [43]:
verse[1]

'י'

In [None]:
df_complex = df_results[df_results.complexity == "complex"]
len(df_complex)
df_complex

In [None]:
# Overall counts
total_clauses = df_results["total_gold"].sum()
total_found = df_results["matched_count"].sum()

# Embedded
total_embedded = df_results["embedded_count"].sum()
found_embedded = df_results["embedded_found_count"].sum()

# Non-embedded
total_non_embedded = df_results["non_embedded_count"].sum()
found_non_embedded = df_results["non_embedded_found"].sum()

# Full verse match
total_verses = len(df_results)
correct_verses = df_results["complete_verse_correct"].sum()

# By complexity
simple_df = df_results[df_results["complexity"] == "simple"]
complex_df = df_results[df_results["complexity"] == "complex"]

simple_correct = simple_df["complete_verse_correct"].sum()
complex_correct = complex_df["complete_verse_correct"].sum()

# --- Print Results ---
print("📊 Accuracy Summary\n")

print(f"Total gold clauses: {total_clauses}")
print(f"Correctly predicted clauses: {total_found}")
print(f"✅ Overall clause accuracy: {total_found / total_clauses:.2%}\n")

print(f"Embedded clauses in gold: {total_embedded}")
print(f"Found embedded clauses: {found_embedded}")
print(f"✅ Embedded clause accuracy: {found_embedded / total_embedded:.2%}" if total_embedded else "⚠️ No embedded clauses.\n")

print(f"\nNon-embedded clauses in gold: {total_non_embedded}")
print(f"Found non-embedded clauses: {found_non_embedded}")
print(f"✅ Non-embedded clause accuracy: {found_non_embedded / total_non_embedded:.2%}" if total_non_embedded else "⚠️ No non-embedded clauses.\n")

print(f"\nComplete verse parses: {correct_verses} / {total_verses}")
print(f"✅ Full verse accuracy: {correct_verses / total_verses:.2%}")

print(f"\nSimple verse parses correct: {simple_correct} / {len(simple_df)}")
print(f"✅ Accuracy on simple verses: {simple_correct / len(simple_df):.2%}" if len(simple_df) else "⚠️ No simple verses.")

print(f"\nComplex verse parses correct: {complex_correct} / {len(complex_df)}")
print(f"✅ Accuracy on complex verses: {complex_correct / len(complex_df):.2%}" if len(complex_df) else "⚠️ No complex verses.")


In [None]:
def full_clause_match(row):
    return set(row["predicted_clauses"]) == set(row["gold_clauses"])

df_results["full_clause_match"] = df_results.apply(full_clause_match, axis=1)

simple_full_correct = df_results[df_results["complexity"] == "simple"]["full_clause_match"].sum()
complex_full_correct = df_results[df_results["complexity"] == "complex"]["full_clause_match"].sum()

print("\n📘 Fully Correct Clause Identification by Complexity (Order-insensitive)")
print(f"🔹 Simple: {simple_full_correct} / {len(simple_df)} → Accuracy: {simple_full_correct / len(simple_df):.2%}" if len(simple_df) else "🔹 No simple verses.")
print(f"🔸 Complex: {complex_full_correct} / {len(complex_df)} → Accuracy: {complex_full_correct / len(complex_df):.2%}" if len(complex_df) else "🔸 No complex verses.")

In [None]:
df_results["ordered_full_match"] = df_results.apply(
    lambda row: row["predicted_clauses"] == row["gold_clauses"], axis=1
)

simple_ordered_correct = df_results[df_results["complexity"] == "simple"]["ordered_full_match"].sum()
complex_ordered_correct = df_results[df_results["complexity"] == "complex"]["ordered_full_match"].sum()

print("\n📘 Fully Correct Clause Identification by Complexity (Order-sensitive)")
print(f"🔹 Simple: {simple_ordered_correct} / {len(simple_df)} → Accuracy: {simple_ordered_correct / len(simple_df):.2%}" if len(simple_df) else "🔹 No simple verses.")
print(f"🔸 Complex: {complex_ordered_correct} / {len(complex_df)} → Accuracy: {complex_ordered_correct / len(complex_df):.2%}" if len(complex_df) else "🔸 No complex verses.")

In [None]:
def compute_precision(row):
    predicted = row["predicted_clauses"]
    if not predicted:
        return None
    gold = row["gold_clauses"]
    matched = [cl for cl in predicted if cl in gold]
    return len(matched) / len(predicted)

df_results["precision"] = df_results.apply(compute_precision, axis=1)

In [None]:
simple_df = df_results[df_results["complexity"] == "simple"]
complex_df = df_results[df_results["complexity"] == "complex"]

simple_precision = simple_df["precision"].dropna().mean()
complex_precision = complex_df["precision"].dropna().mean()

In [None]:
def detailed_precision_summary(df, label):
    df = df.copy()
    df = df[~df["precision"].isna()]
    n_verses = len(df)
    avg_precision = df["precision"].mean()
    total_predicted = df["predicted_clauses"].apply(len).sum()  # still list-based
    total_matched = df["matched_count"].sum()  # now scalar-based

    print(f"\n📘 Precision Summary – {label}")
    print(f"Verses evaluated: {n_verses}")
    print(f"Total predicted clauses: {total_predicted}")
    print(f"Total correct predicted clauses: {total_matched}")
    print(f"📐 Average per-verse precision: {avg_precision:.2%}")
    print(f"📐 Overall clause-level precision: {total_matched / total_predicted:.2%}" if total_predicted else "⚠️ No predictions made.")

In [None]:
detailed_precision_summary(simple_df, "Simple")
detailed_precision_summary(complex_df, "Complex")

In [None]:
# Initialize counters
total_predicted = df_results["predicted_clauses"].apply(len).sum()
total_matched = df_results["matched_count"].sum()

predicted_embedded = 0
matched_embedded = 0
predicted_non_embedded = 0
matched_non_embedded = 0

for _, row in df_results.iterrows():
    gold = set(row["gold_clauses"])
    embedded = set(row["embedded_clauses"])
    predicted = set(row["predicted_clauses"])

    for clause in predicted:
        if clause in embedded:
            predicted_embedded += 1
            if clause in gold:
                matched_embedded += 1
        else:
            predicted_non_embedded += 1
            if clause in gold:
                matched_non_embedded += 1

# Compute precisions
global_precision = total_matched / total_predicted if total_predicted else None
embedded_precision = matched_embedded / predicted_embedded if predicted_embedded else None
non_embedded_precision = matched_non_embedded / predicted_non_embedded if predicted_non_embedded else None

# Print results
print("\n📘 Clause-Level Precision by Clause Type")

print(f"🌐 Global precision: {total_matched} / {total_predicted} → {global_precision:.2%}" if global_precision is not None else "🌐 No predictions made.")

print(f"🔸 Embedded precision: {matched_embedded} / {predicted_embedded} → {embedded_precision:.2%}" if embedded_precision is not None else "🔸 No embedded clauses predicted.")

print(f"🔹 Non-embedded precision: {matched_non_embedded} / {predicted_non_embedded} → {non_embedded_precision:.2%}" if non_embedded_precision is not None else "🔹 No non-embedded clauses predicted.")

In [None]:
def precision_by_clause_type(df, clause_type="embedded"):
    total_predicted = 0
    total_matched = 0

    for _, row in df.iterrows():
        gold_set = set(row["gold_clauses"])
        embedded_set = set(row["embedded_clauses"])
        predicted = row["predicted_clauses"]

        for clause in predicted:
            if clause_type == "embedded" and clause in embedded_set:
                total_predicted += 1
                if clause in gold_set:
                    total_matched += 1
            elif clause_type == "non_embedded" and clause not in embedded_set:
                total_predicted += 1
                if clause in gold_set:
                    total_matched += 1

    label = "Embedded" if clause_type == "embedded" else "Non-embedded"
    print(f"\n📘 Precision Summary – {label} Clauses")
    print(f"Predicted {label.lower()} clauses: {total_predicted}")
    print(f"Correctly predicted {label.lower()} clauses: {total_matched}")
    if total_predicted:
        print(f"📐 Precision: {total_matched / total_predicted:.2%}")
    else:
        print(f"⚠️ No {label.lower()} clauses predicted.")


In [None]:
precision_by_clause_type(df_results, clause_type="embedded")
precision_by_clause_type(df_results, clause_type="non_embedded")
