# Test

In [None]:
import json
from collections import Counter
import math
from typing import List, Dict, Any

def load_jsonl(path: str) -> List[dict]:
    """Read a JSON Lines file and return its records in file order.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Every remaining line is parsed with
    ``json.loads``.

    Args:
        path: Location of the JSONL file; it is opened as UTF-8 text.

    Returns:
        The parsed value of every non-blank line, in order of appearance.

    Raises:
        ValueError: If a line is not valid JSON. The message names the file
            and the 1-based line number, and the underlying
            ``json.JSONDecodeError`` is chained as the cause.
    """
    records: List[dict] = []
    with open(path, "r", encoding="utf-8") as handle:
        for ln, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    parsed = json.loads(text)
                except json.JSONDecodeError as e:
                    raise ValueError(f"JSON decode error in {path} at line {ln}: {e}") from e
                else:
                    records.append(parsed)
    return records

def split_by_corpus(rows: List[dict], key: str = "corpus") -> Dict[str, List[dict]]:
    """Group rows by the value stored under *key*.

    Rows whose value is missing or falsy (``None``, ``""``, ...) are filed
    under the label ``"UNKNOWN"``. Groups appear in first-seen order and
    rows keep their input order within each group.

    Args:
        rows: Records to partition.
        key: Field whose value selects the group (default ``"corpus"``).

    Returns:
        A dict mapping each group label to the list of rows carrying it.
    """
    grouped: Dict[str, List[dict]] = {}
    for row in rows:
        label = row.get(key)
        if not label:
            label = "UNKNOWN"
        if label in grouped:
            grouped[label].append(row)
        else:
            grouped[label] = [row]
    return grouped

def frame_counter(rows: List[dict]) -> Counter:
    """Count how often each frame name occurs across *rows*.

    Every row may carry a ``"frames"`` list of dicts; a missing or falsy
    ``"frames"`` value is treated as an empty list. Only frames whose
    ``"name"`` is a non-empty string are counted.

    Args:
        rows: Records to scan.

    Returns:
        A ``Counter`` mapping frame name to its number of occurrences.
    """
    return Counter(
        name
        for row in rows
        for name in (frame.get("name") for frame in (row.get("frames") or []))
        if isinstance(name, str) and name
    )

def jsd_from_counters(c1: Counter, c2: Counter, base: float = 2.0) -> float:
    """Compute the Jensen-Shannon divergence between two count distributions.

    Each counter is normalised to a probability distribution over the union
    of keys, and the result is ``0.5 * KL(P || M) + 0.5 * KL(Q || M)`` with
    ``M = (P + Q) / 2``. With logarithms taken in *base*, the divergence
    lies in ``[0, log_base(2)]``.

    Args:
        c1: Counts for the first distribution.
        c2: Counts for the second distribution.
        base: Logarithm base (default 2, giving a value in ``[0, 1]``).

    Returns:
        ``0.0`` when both counters are empty; the maximal value
        ``log_base(2)`` when exactly one is empty; otherwise the divergence.
    """
    s1 = sum(c1.values())
    s2 = sum(c2.values())
    if s1 == 0 and s2 == 0:
        return 0.0

    # Loop-invariant: every logarithm below is rescaled by the same factor.
    log_base = math.log(base)

    if s1 == 0 or s2 == 0:
        # Maximal divergence when one distribution is empty. The upper bound
        # is log_base(2), i.e. exactly 1.0 for base 2, so this stays
        # consistent with the value computed for fully disjoint supports.
        return math.log(2.0) / log_base

    keys = set(c1) | set(c2)
    p = {k: c1.get(k, 0) / s1 for k in keys}
    q = {k: c2.get(k, 0) / s2 for k in keys}
    m = {k: 0.5 * (p[k] + q[k]) for k in keys}

    def kl(a, b):
        """Kullback-Leibler divergence of *a* from *b*, skipping zero terms."""
        total = 0.0
        for k, av in a.items():
            if av == 0:
                continue  # 0 * log(0 / x) is taken as 0
            bv = b[k]
            if bv == 0:
                continue  # defensive: keeps the division well-defined
            total += av * (math.log(av / bv) / log_base)
        return total

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)


In [None]:
# Lemma identifiers to process. Each entry is interpolated into the per-lemma
# input file names by the loop that iterates over this list.
# NOTE(review): the `_nn` / `_vb` suffixes look like part-of-speech tags
# (presumably noun / verb) -- confirm against the data source.
selected_lemmas = [
    "attack_nn",
    "bag_nn",
    "ball_nn",
    "bit_nn",
    "chairman_nn",
    "circle_vb",
    "contemplation_nn",
    "donkey_nn",
    "edge_nn",
    "face_nn",
    "fiction_nn",
    "gas_nn",
    "graft_nn",
    "head_nn",
    "land_nn",
    "lane_nn",
    "lass_nn",
    "multitude_nn",
    "ounce_nn",
    "part_nn",
    "pin_vb",
    "plane_nn",
    "player_nn",
    "prop_nn",
    "quilt_nn",
    "rag_nn",
    "record_nn",
    "relationship_nn",
    "risk_nn",
    "savage_nn",
    "stab_nn",
    "stroke_vb",
    "thump_nn",
    "tip_vb",
    "tree_nn",
    "twist_nn",
    "word_nn",
]

In [None]:
# Corpora for which a lemma-level vs. token-level divergence is reported.
# The report keys below are derived from this list, so editing it is enough
# to add or remove a corpus.
TARGET_CORPORA = ["corpus_1", "corpus_2"]

for lemma in selected_lemmas:
    # Paths to the parsed FrameNet data files
    lemma_path = f"... /{lemma}_lemma_FrameNet_parsed.jsonl" # <-- Replace with the actual path
    token_path = f"... /{lemma}_token_FrameNet_parsed.jsonl" # <-- Replace with the actual path

    lemma_rows = load_jsonl(lemma_path)
    token_rows = load_jsonl(token_path)

    # Group both parses by the "corpus" field of each row.
    lemma_by = split_by_corpus(lemma_rows, key="corpus")
    token_by = split_by_corpus(token_rows, key="corpus")

    # Per corpus: divergence (base 2) between the frame-name frequencies of
    # the lemma-level and token-level parses. A corpus absent from a file
    # contributes an empty counter.
    jsds = {}
    for corp in TARGET_CORPORA:
        l_counts = frame_counter(lemma_by.get(corp, []))
        t_counts = frame_counter(token_by.get(corp, []))
        jsds[corp] = jsd_from_counters(l_counts, t_counts, base=2.0)

    # One report row per lemma; keys follow TARGET_CORPORA instead of being
    # hard-coded, so the two can never drift apart.
    print({
        "lemma": lemma,
        **{f"JSD_{corp}": round(jsds[corp], 6) for corp in TARGET_CORPORA},
    })
