# English

In [None]:
import pandas as pd
from frame_semantic_transformer import FrameSemanticTransformer

In [None]:
# Instantiate two pretrained English parsers: the 'base' checkpoint is the
# primary model, the 'small' checkpoint is used further down as a fallback
# for sentences where the primary model detects no frames.
# Each model is moved to its own GPU after construction.
# NOTE(review): 'cuda:2' presumably requires at least three visible CUDA
# devices -- adjust the device strings to the local machine.
frame_transformer_base = FrameSemanticTransformer('base', batch_size=50)
frame_transformer_base.model.to('cuda:0')
frame_transformer_small = FrameSemanticTransformer('small', batch_size=16)
frame_transformer_small.model.to('cuda:2')

In [None]:
# NOTE(review): this rebinds `frame_transformer_base`, replacing the instance
# created in the setup cell ('base', batch_size=50, moved to cuda:0) with a
# default-checkpoint instance using batch_size=16. Every later cell therefore
# uses THIS instance -- confirm that is intended.
frame_transformer_base = FrameSemanticTransformer(batch_size=16)

# Short everyday sentences used as a quick sanity check of the English parser.
en_test_sentences = [
    'I saw a movie yesterday',
    'We own a house',
    'We borrow money',
    'Mom sends a letter',
    'The child drinks milk',
    'She tells the truth',
    'I know the answer',
    'The car stops here',
    'He opens the window',
    'We go to the park',
    'The train arrives now',
    'The book is on the shelf',
    'I think of you',
    'He loves his dog',
    'She understands the question',
    'We forget the name',
    'They talk about the weather',
    'The teacher answers the student',
    'Dad promises a present',
    'She calls her friend',
    'Ice melts quickly',
    'The sun shines today',
    'He slams the door',
    'He writes an email',
    'The meeting starts at nine oclock',
    'I turn on the TV',
    'It is raining outside',
    'The child sleeps deeply',
    'She comes up with the answer',
    'She gives up',
]

# Parse all test sentences in one bulk call.
result = frame_transformer_base.detect_frames_bulk(en_test_sentences)

In [None]:
# Display the bulk-parse output of the English smoke-test sentences.
result

In [None]:
# Target words to process, written as <lemma>_<pos-tag> ("nn" / "vb" suffixes).
# These names are interpolated into the corpus CSV file names below.
selected_lemmas = [
    "attack_nn", "bag_nn", "ball_nn", "bit_nn", "chairman_nn",
    "circle_vb", "contemplation_nn", "donkey_nn", "edge_nn", "face_nn",
    "fiction_nn", "gas_nn", "graft_nn", "head_nn", "land_nn",
    "lane_nn", "lass_nn", "multitude_nn", "ounce_nn", "part_nn",
    "pin_vb", "plane_nn", "player_nn", "prop_nn", "quilt_nn",
    "rag_nn", "record_nn", "relationship_nn", "risk_nn", "savage_nn",
    "stab_nn", "stroke_vb", "thump_nn", "tip_vb", "tree_nn",
    "twist_nn", "word_nn",
]

# Which corpus variant to read: 'token' or 'lemma' (used as a path component).
mode = "token"

In [None]:
def _load_sentences(csv_path):
    """Return the non-empty, whitespace-stripped values of the "sent" column of a corpus CSV."""
    corpus_df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    sentences = corpus_df["sent"].astype(str).str.strip()
    return sentences[sentences != ""].tolist()


def _parse_with_fallback(sentences):
    """Parse `sentences` with the base model, retrying frame-less ones with the small model.

    Returns the list of parse results that contain at least one frame.
    Sentences for which neither model detects a frame are reported and dropped.
    """
    kept = []
    for base_result in frame_transformer_base.detect_frames_bulk(sentences):  # sentences are list[str]
        if base_result.frames:
            kept.append(base_result)
            continue

        org_sent = base_result.sentence
        print(f"No frames detected for {org_sent}.\nParse again with small model.")
        small_result = frame_transformer_small.detect_frames(org_sent)
        if small_result.frames:
            # The small model found something the base model missed: use its result.
            kept.append(small_result)
        else:
            print(f"Still no frames detected for {org_sent}.\n")
    return kept


for selected_lemma in selected_lemmas:
    print(f"Processing lemma: {selected_lemma}")

    # Load corpus data
    corpus_1_path = f'... /SemEval_en_split/corpus1/{mode}/ccoha1_{selected_lemma}.csv' # <-- Replace with actual path
    corpus_2_path = f'... /SemEval_en_split/corpus2/{mode}/ccoha2_{selected_lemma}.csv' # <-- Replace with actual path
    corpora = {
        "corpus_1": _load_sentences(corpus_1_path),
        "corpus_2": _load_sentences(corpus_2_path),
    }

    # Parse the data
    total_results = {
        name: _parse_with_fallback(sentences)
        for name, sentences in corpora.items()
    }

    # Save results to txt.
    # The encoding is pinned to UTF-8 so the output does not depend on the
    # platform's locale default (which can fail on non-ASCII characters).
    output_path = f'... /{selected_lemma}_{mode}_FrameNet_parsed.txt' # <-- Replace with actual path
    with open(output_path, 'w', encoding='utf-8') as f:
        for name, results in total_results.items():
            f.write(f"=== Results for {name} ===\n\n")
            # Loop variable deliberately not called `result`, so the notebook-level
            # `result` from the smoke-test cell is not overwritten.
            for parsed_sentence in results:
                f.write(str(parsed_sentence))
                f.write("\n\n")
    print(f"Finished processing lemma: {selected_lemma}\n")

In [None]:
# Ad-hoc check: load both corpora for a single target word ("multitude_nn",
# token mode) using the same cleaning steps as the per-lemma loop.
import pandas as pd

corpus_1_path = '... corpus1/token/ccoha1_multitude_nn.csv'
corpus_2_path = '... /corpus2/token/ccoha2_multitude_nn.csv'

# Corpus 1: keep only non-empty, stripped sentences from the "sent" column.
df = pd.read_csv(corpus_1_path, dtype=str, keep_default_na=False)
corpus_1 = [sent for sent in df["sent"].astype(str).str.strip() if sent != ""]

# Corpus 2: same treatment.
df = pd.read_csv(corpus_2_path, dtype=str, keep_default_na=False)
corpus_2 = [sent for sent in df["sent"].astype(str).str.strip() if sent != ""]

In [None]:
# Parse the two ad-hoc corpora loaded above. The base model is tried first;
# any sentence it returns without frames is re-parsed with the small model,
# and dropped if that also yields no frames.
corpora = {"corpus_1": corpus_1, "corpus_2": corpus_2}
total_results = {name: [] for name in corpora}

for name, sentences in corpora.items():
    kept = total_results[name]
    base_results = frame_transformer_base.detect_frames_bulk(sentences)  # sentences are list[str]
    for base_result in base_results:
        org_sent = base_result.sentence

        # Common case: the base model found at least one frame.
        if base_result.frames:
            kept.append(base_result)
            continue

        print(f"No frames detected for {org_sent}.\nParse again with small model.")
        small_result = frame_transformer_small.detect_frames(org_sent)
        if small_result.frames:
            # The small model succeeded where the base model did not.
            kept.append(small_result)
        else:
            print(f"Still no frames detected for {org_sent}.\n")

In [None]:
# Display the kept parse results, keyed by corpus name.
total_results

## Compute skipped sentences

In [None]:
from __future__ import annotations

import json
import re
from pathlib import Path
import pandas as pd


def count_skipped_in_jsonl(
    jsonl_path: Path,
    corpus_field: str = "corpus",
    trigger_field: str = "trigger_locations",
) -> dict[str, dict[str, int]]:
    """
    Tally records in a JSONL file per corpus.

    Each non-blank line is decoded as one JSON object. The object's
    `corpus_field` value selects the bucket (a missing or falsy value goes to
    "MISSING_CORPUS_FIELD"). A record counts as skipped when its
    `trigger_field` value is exactly the empty list.

    Returns a mapping such as:
      {
        "corpus_1": {"total": N, "skipped": K},
        "corpus_2": {"total": N, "skipped": K},
      }

    Raises ValueError (naming the line number and file) on a line that is not
    valid JSON.
    """
    stats: dict[str, dict[str, int]] = {}

    with jsonl_path.open("r", encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            payload = raw_line.strip()
            if payload:
                try:
                    record = json.loads(payload)
                except json.JSONDecodeError as e:
                    raise ValueError(f"Invalid JSON on line {line_no} in {jsonl_path}: {e}") from e

                bucket_name = record.get(corpus_field) or "MISSING_CORPUS_FIELD"
                bucket = stats.setdefault(bucket_name, {"total": 0, "skipped": 0})
                bucket["total"] += 1
                # A missing trigger field yields None, which is not == [] and so is not skipped.
                bucket["skipped"] += int(record.get(trigger_field, None) == [])

    return stats


def lemma_from_filename(p: Path) -> str:
    """
    Recover the lemma id from a parsed-output file name.

    Expected pattern: <lemma>_<mode>_FrameNet_parsed.jsonl, with mode being
    "lemma" or "token".
    Example: attack_nn_lemma_FrameNet_parsed.jsonl -> attack_nn

    File names that do not follow the pattern fall back to the file stem.
    """
    hit = re.match(r"(.+?)_(lemma|token)_FrameNet_parsed\.jsonl$", p.name)
    if hit is None:
        return p.stem
    return hit.group(1)


def build_skipped_table(
    base_dir: str | Path,
    modes: list[str] | None = None,
    corpus_names: list[str] | None = None,
) -> pd.DataFrame:
    """
    Build one row per (mode, lemma, corpus) with total / skipped sentence counts.

    Scans:
      {base_dir}/{mode}/*_FrameNet_parsed.jsonl

    Parameters:
      base_dir:     directory containing one sub-directory per mode.
      modes:        sub-directory names to scan; defaults to ["lemma", "token"].
      corpus_names: corpus labels to report for every file; defaults to
                    ["corpus_1", "corpus_2"]. A corpus absent from a file is
                    reported with total=0, skipped=0, skipped_rate=0.0.

    Returns a DataFrame with columns
      mode, lemma, corpus, total, skipped, skipped_rate.
    These columns are present even when no file is found, so callers can sort
    or group the result without special-casing the empty case.
    """
    columns = ["mode", "lemma", "corpus", "total", "skipped", "skipped_rate"]
    # None sentinels instead of mutable list defaults shared between calls.
    modes = ["lemma", "token"] if modes is None else modes
    corpus_names = ["corpus_1", "corpus_2"] if corpus_names is None else corpus_names

    base_dir = Path(base_dir)
    rows = []

    for mode in modes:
        mode_dir = base_dir / mode
        if not mode_dir.is_dir():
            continue

        # Sorted so the row order is deterministic across file systems.
        for jsonl_path in sorted(mode_dir.glob("*_FrameNet_parsed.jsonl")):
            lemma = lemma_from_filename(jsonl_path)
            stats = count_skipped_in_jsonl(jsonl_path)

            for corpus in corpus_names:
                counts = stats.get(corpus, {})
                total = counts.get("total", 0)
                skipped = counts.get("skipped", 0)

                rows.append({
                    "mode": mode,
                    "lemma": lemma,
                    "corpus": corpus,
                    "total": total,
                    "skipped": skipped,
                    # Guard against division by zero for corpora with no records.
                    "skipped_rate": (skipped / total) if total else 0.0,
                })

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


def summary_counts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate the per-lemma table into counts only (no averages).

    The result stacks two groups of rows:
    - one row per (mode, corpus): n_lemmas, missed_total, total_sentences
    - one row per mode with corpus == "ALL": the same sums taken across
      corpora, and n_lemmas as the number of unique lemmas in that mode

    An empty input yields an empty frame that still has the result columns.
    """
    result_columns = ["mode", "corpus", "n_lemmas", "missed_total", "total_sentences"]
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # Shared named-aggregation spec: output column -> (input column, reducer).
    named_aggs = {
        "n_lemmas": ("lemma", "nunique"),
        "missed_total": ("skipped", "sum"),
        "total_sentences": ("total", "sum"),
    }

    by_mode_and_corpus = df.groupby(["mode", "corpus"], as_index=False).agg(**named_aggs)

    by_mode = df.groupby(["mode"], as_index=False).agg(**named_aggs)
    by_mode = by_mode.assign(corpus="ALL")[result_columns]

    return pd.concat([by_mode_and_corpus, by_mode], ignore_index=True)

if __name__ == "__main__":
    BASE = "..." # <-- Path to the parsed data

    # Per-lemma table and its aggregated summary, both in a stable sorted order.
    df = build_skipped_table(
        BASE, modes=["lemma", "token"], corpus_names=["corpus_1", "corpus_2"]
    ).sort_values(["mode", "lemma", "corpus"], kind="stable")
    summary = summary_counts(df).sort_values(["mode", "corpus"], kind="stable")

    # Print every row (do not truncate to the first 20).
    for option, value in (
        ("display.max_rows", None),
        ("display.max_columns", None),
        ("display.width", 200),
    ):
        pd.set_option(option, value)

    print("=== Per-lemma skipped table (ALL rows) ===")
    print(df.to_string(index=False))

    print("\n=== Total miss (skipped_rate) ===")
    print(summary.to_string(index=False))

    # Persist both tables as CSV under <BASE>/_reports.
    out_dir = Path(BASE) / "_reports"
    out_dir.mkdir(parents=True, exist_ok=True)
    for frame, filename in ((df, "skipped_table.csv"), (summary, "skipped_summary.csv")):
        frame.to_csv(out_dir / filename, index=False)


# Swedish

In [None]:
# Clone the SweFN-SRL repository into the working directory. The Swedish model
# cell further down reads swefn.xml from a path inside this checkout.
!git clone https://github.com/lucyYB/SweFN-SRL.git

In [None]:
import pandas as pd
from frame_semantic_transformer import FrameSemanticTransformer
import sys

In [None]:
# Implement custom Training and Inference loaders for the Swedish FrameNet data.
# This is the core step necessary to get FrameSemanticTransformer to work with other languages/FrameNets.

import xml.etree.ElementTree as ET
import random
import re
from typing import List
from frame_semantic_transformer.data.loaders.loader import TrainingLoader, InferenceLoader
from frame_semantic_transformer.data.frame_types import Frame, FrameAnnotatedSentence, FrameAnnotation, FrameElementAnnotation
from frame_semantic_transformer.data.augmentations import (
    DataAugmentation,
    LowercaseAugmentation,
    RemoveEndPunctuationAugmentation,
)
from nltk.stem import SnowballStemmer

# Module-level Swedish Snowball stemmer, used by
# SwedishInferenceLoader.normalize_lexical_unit_text to stem lexical-unit words.
swedish_stemmer = SnowballStemmer("swedish")


def extract_frame(xml_frame) -> Frame:
    """
    Build a Frame from one frame element of the Swedish FrameNet XML.

    The frame name is the element's "id" attribute with the 'swefn--' prefix
    text removed. Core elements, non-core elements and lexical units are the
    "val" attributes of the descendant <feat> elements whose "att" attribute
    is 'coreElement', 'peripheralElement' and 'LU' respectively.
    """

    def feat_values(xpath):
        # Collect the "val" attribute of every element matching `xpath`.
        return [feat.attrib["val"] for feat in xml_frame.findall(xpath)]

    return Frame(
        name=xml_frame.attrib["id"].replace('swefn--', ''),
        core_elements=feat_values(".//feat[@att='coreElement']"),
        non_core_elements=feat_values(".//feat[@att='peripheralElement']"),
        # Some examples have triggers outside the listed 'LU' entries; those are
        # usually registered as 'suggestionForLU', so they are appended too.
        lexical_units=(
            feat_values(".//feat[@att='LU']")
            + feat_values(".//feat[@att='suggestionForLU']")
        ),
    )


def extract_example(example_xml, frame_name) -> FrameAnnotatedSentence:
    """
    Turn one Swedish FrameNet example element into an annotated training sentence.

    Leaf nodes are visited in document order. Each leaf's text (whitespace runs
    collapsed to single spaces) is appended to the sentence, separated by one
    space. A leaf whose "name" attribute is "LU" records a trigger location at
    its start offset; a leaf with any other "name" becomes a frame element
    spanning its text.

    NOTE: This isn't ideal since only 1 frame is tagged in each example. This may
    cause the Swedish FrameSemanticTransformer to only ever tag 1 frame per sentence.
    """
    # Explicit stack for a depth-first walk; children are pushed reversed so that
    # popping from the end yields them in document order.
    pending = list(example_xml)[::-1]
    chunks = []
    offset = 0  # character offset in the sentence where the next chunk starts
    trigger_locs = []
    frame_elements = []

    while pending:
        node = pending.pop()

        # Compound annotation in SweFN: a node containing nodes. Descend into
        # its children instead of reading the node itself.
        if len(node) > 0:
            pending.extend(list(node)[::-1])
            continue

        if not node.text:
            continue

        node_text = re.sub(r"\s+", ' ', node.text)
        if node.attrib.get("name") == "LU":
            trigger_locs.append(offset)
        elif "name" in node.attrib:
            frame_elements.append(
                FrameElementAnnotation(
                    name=node.attrib["name"],
                    start_loc=offset,
                    end_loc=offset + len(node_text),
                )
            )

        chunks.append(node_text)
        offset += len(node_text) + 1  # +1 for the separating space

    # NOTE(review): stripping would shift the recorded offsets if the first
    # chunk started with whitespace -- presumably that does not occur in the data.
    text = " ".join(chunks).strip()

    annotation = FrameAnnotation(
        frame=frame_name,
        trigger_locs=trigger_locs,
        frame_elements=frame_elements,
    )
    return FrameAnnotatedSentence(text=text, annotations=[annotation])


class SwedishTrainingLoader(TrainingLoader):
    """
    Training loader for Swedish.

    Tells FrameSemanticTransformer how to load the Swedish FrameNet training
    data: all annotated examples are parsed from the XML file at construction
    time, shuffled with a fixed seed and split into test / validation / train.
    """
    train_sentences: List[FrameAnnotatedSentence]
    test_sentences: List[FrameAnnotatedSentence]
    val_sentences: List[FrameAnnotatedSentence]


    def __init__(self, swedish_framenet_xml_file, test_portion=0.1, val_portion=0.1, seed=42):
        """
        Parse `swedish_framenet_xml_file` and split its examples.

        `test_portion` and `val_portion` are fractions of all examples (rounded
        down); the remainder is the training set. `seed` makes the shuffle, and
        therefore the split, reproducible.
        """
        root = ET.parse(swedish_framenet_xml_file).getroot()

        # Every child of a Sense element whose tag contains 'example' is one
        # annotated sentence for that frame.
        annotated_sentences = []
        for xml_frame in root.findall(".//Sense"):
            frame_name = extract_frame(xml_frame).name
            annotated_sentences.extend(
                extract_example(child, frame_name)
                for child in xml_frame
                if 'example' in child.tag
            )

        # Deterministic shuffle, then slice: [test | val | train].
        random.Random(seed).shuffle(annotated_sentences)
        n_sentences = len(annotated_sentences)
        test_end = int(test_portion * n_sentences)
        val_end = test_end + int(val_portion * n_sentences)

        self.test_sentences = annotated_sentences[:test_end]
        self.val_sentences = annotated_sentences[test_end:val_end]
        self.train_sentences = annotated_sentences[val_end:]

    def load_training_data(self):
        """Return the training split."""
        return self.train_sentences

    def load_validation_data(self):
        """Return the validation split."""
        return self.val_sentences

    def load_test_data(self):
        """Return the test split."""
        return self.test_sentences

    def get_augmentations(self) -> List[DataAugmentation]:
        """
        Augmentations that enlarge the training data with safe tweaks to the
        text, e.g. removing the punctuation at the end or lowercasing the
        whole sentence.
        """
        drop_end_punctuation = RemoveEndPunctuationAugmentation(0.3)
        lowercase = LowercaseAugmentation(0.2)
        return [drop_end_punctuation, lowercase]


class SwedishInferenceLoader(InferenceLoader):
    """
    Inference loader for Swedish.

    Tells FrameSemanticTransformer which frames and lexical units (LUs) are
    available during inference, and how LU text is normalized.
    """

    frames: List[Frame]

    def __init__(self, swedish_framenet_xml_file, test_portion=0.1, val_portion=0.1, seed=42):
        """
        Load every frame definition from `swedish_framenet_xml_file`.

        `test_portion`, `val_portion` and `seed` are accepted but not used here.
        """
        root = ET.parse(swedish_framenet_xml_file).getroot()
        self.frames = [extract_frame(xml_frame) for xml_frame in root.findall(".//Sense")]

    def load_frames(self):
        """Return the frames parsed at construction time."""
        return self.frames

    def normalize_lexical_unit_text(self, lu):
        """
        Normalize lexical unit text for Swedish during inference.
        Lexical units help give hints to the model about what frames are likely.

        Steps: lowercase, drop everything from the first "." onwards, replace
        characters outside the kept set with spaces, then stem each word and
        join the stems with "_".
        """
        lowered = lu.lower()
        without_suffix = re.sub(r"\..+$", "", lowered)
        # NOTE(review): the class [^a-ö ] keeps every code point from 'a' through
        # 'ö', which is a wider range than the Swedish letters -- confirm that is
        # intended before narrowing it, since it affects LU matching.
        cleaned = re.sub(r"[^a-ö ]", " ", without_suffix)

        stems = [swedish_stemmer.stem(word) for word in cleaned.split()]
        return "_".join(stems)

In [None]:
from frame_semantic_transformer import FrameSemanticTransformer

# Path to the SweFN XML file (required)
SWEFN_XML = ("... /SweFN-SRL/Model1/frame-semantic-transformer/swefn.xml")   # <-- Replace with actual path

# The inference loader supplies the Swedish frames and LU normalization.
loader = SwedishInferenceLoader(SWEFN_XML)

# Load the fine-tuned Swedish checkpoint together with the custom inference loader
custom_transformer = FrameSemanticTransformer(
    "... /models/frame_swe_m1_base", # <-- Replace with actual path
    inference_loader=loader,
    batch_size=32
)

# Move model to GPU
custom_transformer.model.to("cuda:0")


In [None]:
# Quick test: Swedish counterparts of the English smoke-test sentences,
# each followed by its English gloss.
swe_test_sentences = ['Jag såg en film igår', # I watched a movie yesterday
                      'Vi äger ett hus', # We own a house
                      'Vi lånar pengar', # We borrow money
                      'Mamma skickar ett brev', # Mom sends a letter
                      'Barnet dricker mjölk', # The child drinks milk
                      'Hon säger sanningen', # She tells the truth
                      'Jag vet svaret', # I know the answer
                      'Bilen stannar här', # The car stops here
                      'Han öppnar fönstret', # He opens the window
                      'Vi går till parken', # We go to the park
                      'Tåget anländer nu', # The train arrives now
                      'Boken ligger på hyllan', # The book is on the shelf
                      'Jag tänker på dig ', # I think of you
                      'Han älskar sin hund', # He loves his dog
                      'Hon förstår frågan', # She understands the question
                      'Vi glömmer namnet', # We forget the name
                      'De pratar om vädret', # They talk about the weather
                      'Läraren svarar eleven', # The teacher answers the student
                      'Pappa lovar en present', # Dad promises a present
                      'Hon ringer sin vän', # She calls her friend
                      'Isen smälter snabbt', # The ice melts quickly
                      'Solen skiner idag', # The sun is shining today
                      'Han slår igen dörren', # He slams the door shut
                      'Han skriver ett mejl', # He writes an email
                      'Mötet börjar klockan nio', # The meeting starts at nine o'clock
                      'Jag slår på TV:n', # I turn on the TV
                      'Det regnar ute', # It's raining outside
                      'Barnet sover djupt', # The child is sleeping deeply
                      'Hon kommer på svaret', # She comes up with the answer
                      'Hon ger upp' # She gives up
]

# Parse all sentences in one bulk call and display the result as the cell output.
res = custom_transformer.detect_frames_bulk(swe_test_sentences)
res
