# Frame Semantic Transformer: Swedish Training

This notebook trains and evaluates a Swedish SRL model based on the [Frame Semantic Transformer library](https://github.com/chanind/frame-semantic-transformer)

In [1]:
import torch
print(f'built: {torch.backends.cuda.is_built()}')
print(f'CUDNN version: {torch.backends.cudnn.version()}')
torch.cuda.is_available()
# torch.cuda.device_count()

# torch.__version__
# # verify that your graphics card and driver both support the required CUDA version 
# torch.zeros(1).cuda()

# !nvcc --version

built: True
CUDNN version: 8500


True

In [2]:
!nvidia-smi

import os
print(f"num cpus: {os.cpu_count()}")

Tue May 30 22:44:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.01       Driver Version: 516.01       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:07:00.0  On |                  N/A |
|  0%   46C    P8    10W / 166W |    647MiB /  8192MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Clone the git repo and switch to the multilingual-training-refactor branch
# NOTE: Once this code is merged, you can just do a normal pip install

# !rm -rf /frame-semantic-transformer
# !git clone https://github.com/chanind/frame-semantic-transformer.git
# !git checkout multilingual-training-refactor

In [3]:
import sys
sys.path.append('./frame-semantic-transformer')
%cd ./frame-semantic-transformer

c:\Users\lucyy\Documents\ADS_thesis\Model1_wSuggLU_base\frame-semantic-transformer


In [4]:
# check package version
# import transformers
# transformers.__version__

'4.24.0'

In [3]:
# !pip install pytorch_lightning sentencepiece
# Download swedish Framenet XML
# !wget https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml

--2023-02-24 11:15:07--  https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml
Resolving svn.spraakdata.gu.se (svn.spraakdata.gu.se)... 130.241.135.158
Connecting to svn.spraakdata.gu.se (svn.spraakdata.gu.se)|130.241.135.158|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7342238 (7,0M) [text/xml]
Saving to: 'swefn.xml'

     0K .......... .......... .......... .......... ..........  0% 1,59M 4s
    50K .......... .......... .......... .......... ..........  1% 3,16M 3s
   100K .......... .......... .......... .......... ..........  2%  109M 2s
   150K .......... .......... .......... .......... ..........  2% 3,27M 2s
   200K .......... .......... .......... .......... ..........  3%  121M 2s
   250K .......... .......... .......... .......... ..........  4%  113M 1s
   300K .......... .......... .......... .......... ..........  4%  118M 1s
   350K .......... .......... .......... .......... ..........  5% 96,1M 1s
   400K .......... .......... ....

In [4]:
# Implement custom Traning and Inference loaders for the Swedish Framenet data
# This is the core step necessary to get FrameSemanticTransformer to work with different languages/framenets

import xml.etree.ElementTree as ET
import random
import re
from typing import List
from frame_semantic_transformer.data.loaders.loader import TrainingLoader, InferenceLoader
from frame_semantic_transformer.data.frame_types import Frame, FrameAnnotatedSentence, FrameAnnotation, FrameElementAnnotation
from frame_semantic_transformer.data.augmentations import (
    DataAugmentation,
    LowercaseAugmentation,
    RemoveEndPunctuationAugmentation,
)
from nltk.stem import SnowballStemmer

swedish_stemmer = SnowballStemmer("swedish")


def extract_frame(xml_frame) -> Frame:
    """
    Extract a Frame element from the Swedish XML for a frame
    """
    name = xml_frame.attrib["id"].replace('swefn--', '')
    core_elms = [
        ft.attrib["val"] for ft in xml_frame.findall(".//feat[@att='coreElement']")
    ]
    non_core_elms = [
        ft.attrib["val"]
        for ft in xml_frame.findall(".//feat[@att='peripheralElement']")
    ]
    lus = [ft.attrib["val"] for ft in xml_frame.findall(".//feat[@att='LU']")]
    # some examples have triggers outside the listed 'LU', but they are usually registerred as 'suggestionForLU'
    lus += [ft.attrib["val"] for ft in xml_frame.findall(".//feat[@att='suggestionForLU']")]

    return Frame(
        name=name,
        core_elements=core_elms,
        non_core_elements=non_core_elms,
        lexical_units=lus,
    )


def extract_example(example_xml, frame_name) -> FrameAnnotatedSentence:
    """
    Extract an annotated training sentence from a Swedish FrameNet Example XML
    NOTE: This isn't ideal since only 1 frame is tagged in each example. This may
    cause the Swedish FrameSemanticTransformer to only ever tag 1 frame per sentence.
    """
    nodes_to_extract = [n for n in example_xml]
    text = ""
    trigger_locs = []
    frame_elements = []
    while len(nodes_to_extract) > 0:
        node = nodes_to_extract.pop(0)
        # sometimes there's nodes in nodes, compound annotation in SweFN
        # in this case, push the children of this node to the front of the queue and keep going
        if len(node) > 0:
            nodes_to_extract = [n for n in node] + nodes_to_extract
        else:
            cur_index = len(text)
            if not node.text:
                continue
            node_text = re.sub(r"\s+", ' ', node.text)
            if node.attrib.get("name") == "LU":
                trigger_locs.append(cur_index)
            elif "name" in node.attrib:
                frame_elements.append(
                    FrameElementAnnotation(
                        name=node.attrib["name"],
                        start_loc=cur_index,
                        end_loc=cur_index + len(node_text),
                    )
                )
            text += node_text + " "
    text = text.strip()
    return FrameAnnotatedSentence(
        text=text,
        annotations=[
            FrameAnnotation(
                frame=frame_name,
                trigger_locs=trigger_locs,
                frame_elements=frame_elements,
            )
        ],
    )


class SwedishTrainingLoader(TrainingLoader):
    """
    Training Loader for Swedish
    This class tells FrameSemanticTransformer how to load the Swedish FrameNet training data
    """
    train_sentences: List[FrameAnnotatedSentence]
    test_sentences: List[FrameAnnotatedSentence]
    val_sentences: List[FrameAnnotatedSentence]


    def __init__(self, swedish_framenet_xml_file, test_portion=0.1, val_portion=0.1, seed=42):
        # parse annotated sentences from XML
        annotated_sentences = []
        tree = ET.parse(swedish_framenet_xml_file)
        root = tree.getroot()
        for xml_frame in root.findall(".//Sense"):
            frame = extract_frame(xml_frame)
            for child in xml_frame:
                if 'example' in child.tag:
                    annotated_sentences.append(extract_example(child, frame.name))
        # split into train/test/val 
        random.Random(seed).shuffle(annotated_sentences)
        num_test = int(test_portion * len(annotated_sentences))
        num_val = int(val_portion * len(annotated_sentences))

        self.test_sentences = annotated_sentences[0:num_test]
        self.val_sentences = annotated_sentences[num_test:num_test + num_val]
        self.train_sentences = annotated_sentences[num_test + num_val:]
    
    def load_training_data(self):
        return self.train_sentences
    
    def load_validation_data(self):
        return self.val_sentences
    
    def load_test_data(self):
        return self.test_sentences
    
    def get_augmentations(self) -> List[DataAugmentation]:
        """
        These augmentations try to increase the training data by making safe tweaks to the text
        For instance, removing the punctuation at the end, or lowercasing the whole sentence
        """
        return [
            RemoveEndPunctuationAugmentation(0.3),
            LowercaseAugmentation(0.2),
        ]


class SwedishInferenceLoader(InferenceLoader):
    """
    Inference loader for Swedish
    This class tells FrameSemanticTransformer which frames and LUs are available during inference
    """

    frames: List[Frame]

    def __init__(self, swedish_framenet_xml_file, test_portion=0.1, val_portion=0.1, seed=42):
        # parse annotated sentences from XML
        self.frames = []
        tree = ET.parse(swedish_framenet_xml_file)
        root = tree.getroot()
        for xml_frame in root.findall(".//Sense"):
            frame = extract_frame(xml_frame)
            self.frames.append(frame)

    def load_frames(self):
        return self.frames
    
    def normalize_lexical_unit_text(self, lu):
        """
        This method normalizes lexical unit text for Swedish during inference
        Lexical Units help give hints to the model about what frames are likely 
        """
        normalized_lu = lu.lower()
        normalized_lu = re.sub(r"\..+$", "", normalized_lu)
        normalized_lu = re.sub(r"[^a-ö ]", " ", normalized_lu)
        
        ##### try 2
        return "_".join([swedish_stemmer.stem(word) for word in normalized_lu.split()])

In [5]:
%reload_ext tensorboard

In [7]:
%tensorboard --logdir ./lightning_logs --host localhost --port 1005

In [7]:
MODEL_TYPE = 'base'

from frame_semantic_transformer.training.train import train
# !rm -rf ./outputs/
# !rm -rf ./lightning_logs/*

In [8]:
# Training slowly
model, tokenizer = train(
    base_model=f"chanind/frame-semantic-transformer-{MODEL_TYPE}",
    # base_model = "./tuned_model1_base",
    batch_size=4,
    max_epochs=45,
    lr=5e-5,
    skip_initial_epochs_validation=0,
    num_workers=0, ### due to python multiprocessing NOT working with pytorch
    
    # Here is where we pass in our shiny new Swedish loaders!
    training_loader=SwedishTrainingLoader('./swefn.xml'),
    inference_loader=SwedishInferenceLoader('./swefn.xml'),
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

The Final Best model is saved as "tuned_model_..."

In [None]:
# !rm -rf tuned_model_{MODEL_TYPE}
# model.save_pretrained(f'tuned_model1_{MODEL_TYPE}')
# tokenizer.save_pretrained(f'tuned_model1_{MODEL_TYPE}')

Manual evaluation on one sentence

In [5]:
def manual_inference_sentence(chosen_model):
    # Let's experiment with the model we just trained
    from frame_semantic_transformer import FrameSemanticTransformer

    # Need to pass in the Swedish inference loader when creating the FrameSemanticTransformer instance, otherwise it will use the Framenet 1.7 loader
    inference_loader = SwedishInferenceLoader('./swefn.xml')
    ##### Adjust this path to use the best-performing model from the "outputs" folder
    transformer = FrameSemanticTransformer(chosen_model, inference_loader=inference_loader)

    text = "Axel fick parkerade på Odengatan , detta gjorde honom pepp ."
    result = transformer.detect_frames(text)

    print(f"\n Results found in: {result.sentence}")
    print(f"\n Trigger Location: {result.trigger_locations}" )
    locs = result.trigger_locations
    for loc in locs:
        print(f"    starting with: {result.sentence[loc]}")
    
    for frame in result.frames:
        print(f"FRAME: {frame.name}")
        for element in frame.frame_elements:
            print(f"  {element.name}: {element.text}")

In [6]:
# the Best model
model_path = "./tuned_model1_base"
manual_inference_sentence(model_path)


 Results found in: Axel fick parkerade p Odengatan, detta gjorde honom pepp.

 Trigger Location: [10]
    starting with: p
FRAME: Placing
  Agent: Axel
  Goal: p Odengatan


#### on Test set

In [5]:
import os
os.getcwd()
# %cd ./frame-semantic-transformer

'c:\\Users\\lucyy\\Documents\\ADS_thesis\\Model1_wSuggLU_base\\frame-semantic-transformer'

In [5]:
# from frame_semantic_transformer.training.evaluate_model import evaluate_model

# "test_model_frame" is created from func "evaluate_model" to speed up
from frame_semantic_transformer.training.test_model_frame import evaluate_model

from frame_semantic_transformer.data.LoaderDataCache import LoaderDataCache
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from frame_semantic_transformer.constants import MODEL_MAX_LENGTH

chosen_model_path = "./tuned_model1_base"
chosen_model = T5ForConditionalGeneration.from_pretrained(chosen_model_path)
tokenizer = T5TokenizerFast.from_pretrained(chosen_model_path, 
                                            model_max_length=MODEL_MAX_LENGTH)

tl = SwedishTrainingLoader('./swefn.xml')
il = SwedishInferenceLoader('./swefn.xml')
lc = LoaderDataCache(il)

test_results = evaluate_model(
    chosen_model,
    tokenizer,
    loader_cache = lc,
    training_loader =tl,
    batch_size = 4,
    num_workers = 0,
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Evaluating on test set


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         Test metric                   DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   test_args_extraction_f1          0.5829798579216003
 test_frame_classification_f1       0.6295525431632996
          test_loss                 0.1851096898317337
test_trigger_identification_f1      0.4715789556503296
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
