# Stroll SRL Test

Test the eScience tool Stroll for semantic role labeling of Dutch: https://github.com/Filter-Bubble/stroll

Note: also take a look at AllenNLP: https://allennlp.org/

In [1]:
import stanza
import stroll.stanza
import pandas as pd

Using backend: pytorch


In [3]:
def nlp_analysis_to_table(nlp_analysis):
    """Flatten a parsed document into a DataFrame with one row per word.

    Parameters
    ----------
    nlp_analysis : object
        Anything exposing ``sentences``; each sentence exposes ``words`` and
        each word exposes the attributes named in ``columns`` below.

    Returns
    -------
    pandas.DataFrame
        One row per word in document order (index 0..n-1), with the columns
        id, text, lemma, upos, xpos, feats, head, deprel, deps, misc,
        start_char, end_char, parent, sent, srl, frame.
        A document without any words yields an empty frame with the same
        columns instead of failing.
    """
    columns = ["id", "text", "lemma", "upos", "xpos", "feats", "head", "deprel",
               "deps", "misc", "start_char", "end_char", "parent", "sent",
               "srl", "frame"]
    # Gather plain Python lists and build the frame once: appending rows with
    # .loc re-allocates the frame for every word.
    values = {column: [] for column in columns}
    for sentence in nlp_analysis.sentences:
        for word in sentence.words:
            for column in columns:
                values[column].append(getattr(word, column))
    return pd.DataFrame(values, columns=columns)

In [70]:
# Columns of the SRL table, in output order: bookkeeping fields first,
# then the numbered arguments, then the ArgM-* modifier roles.
SRL_FIELDS = [
    "sent_id", "head_id", "head", "nsubj", "rel",
    "Arg0", "Arg1", "Arg2",
    "ArgM-ADV", "ArgM-CAU", "ArgM-DIS", "ArgM-LOC", "ArgM-MNR",
    "ArgM-MOD", "ArgM-NEG", "ArgM-REC", "ArgM-TMP",
]


def srl_dict_to_srl_list(srl_dict):
    """Lay out one phrase dict as a row following the SRL_FIELDS column order.

    Fields missing from ``srl_dict`` become empty strings; keys of
    ``srl_dict`` that are not listed in SRL_FIELDS are ignored.
    """
    return [srl_dict[field] if field in srl_dict else "" for field in SRL_FIELDS]


def add_srl_data_to_srl_table(srl_table_df, srl_data, sentence):
    """Complete the "head" text of each collected phrase and append it as a row.

    Parameters
    ----------
    srl_table_df : pandas.DataFrame
        Table that receives the rows; it is modified in place.
    srl_data : dict
        Maps a head token id to the dict of fields collected for it; each
        inner dict gets its "head" entry set or extended in place.
    sentence : dict
        Maps token id to lemma for the sentence the phrases belong to.
    """
    for head_id, phrase in srl_data.items():
        if 'head' in phrase:
            # Text stored earlier (e.g. verb particles) is followed by the head lemma.
            head_text = phrase["head"] + " " + sentence[head_id]
        else:
            # Id 0 is not a real token (it is the parent of the root), so
            # there is no lemma to look up for it.
            head_text = sentence[head_id] if head_id > 0 else "FILLER"
        phrase["head"] = head_text
        next_row = len(srl_table_df)
        srl_table_df.loc[next_row] = srl_dict_to_srl_list(phrase)


def nlp_table_to_srl_table(nlp_table_df):
    """Condense a token table into an SRL table with one row per head token.

    The rows of ``nlp_table_df`` are scanned in order. A token id that does
    not increase marks the start of a new sentence; at that point the phrases
    collected for the previous sentence are written to the result table.
    For each token the following is recorded:

    * a role in the "srl" column other than "_" -> stored under the token's
      head (a repeated role for the same head is reported and overwritten);
    * "frame" equal to "rel" -> stored under the token itself;
    * "deprel" equal to "nsubj" -> lemma appended to the head's "nsubj";
    * "deprel" equal to "compound:prt" -> lemma appended to the head's "head".

    Returns a DataFrame whose columns are SRL_FIELDS.
    """
    srl_table_df = pd.DataFrame({ field: [] for field in SRL_FIELDS })
    phrases = {}        # head token id -> fields collected for that head
    lemma_by_id = {}    # token id -> lemma, for the sentence being scanned
    previous_id = 0
    sent_id = 1

    def append_lemma(entry, key, lemma):
        # Space-join the lemma onto an existing value, or start the value.
        if key in entry:
            entry[key] += " " + lemma
        else:
            entry[key] = lemma

    for i, row in nlp_table_df.iterrows():
        token_id = row['id']
        if token_id <= previous_id:
            # Token ids restarted: flush the finished sentence and start over.
            if phrases:
                add_srl_data_to_srl_table(srl_table_df, phrases, lemma_by_id)
            sent_id += 1
            phrases, lemma_by_id = {}, {}

        role = row['srl']
        if role != "_":
            head_id = row['head']
            entry = phrases.setdefault(head_id, { "sent_id": sent_id, "head_id": head_id })
            if role in entry:
                print(f"duplicate role for {role} [{i}]: {entry[role]} and {row['lemma']}")
            entry[role] = row['lemma']

        if row['frame'] == "rel":
            entry = phrases.setdefault(token_id, { "sent_id": sent_id, "head_id": token_id })
            entry[row['frame']] = row['lemma']

        if row['deprel'] == "nsubj":
            head_id = row['head']
            entry = phrases.setdefault(head_id, { "sent_id": sent_id, "head_id": head_id })
            append_lemma(entry, "nsubj", row['lemma'])

        if row['deprel'] == "compound:prt":
            head_id = row['head']
            entry = phrases.setdefault(head_id, { "sent_id": sent_id, "head_id": head_id })
            append_lemma(entry, "head", row['lemma'])

        previous_id = token_id
        lemma_by_id[token_id] = row['lemma']

    # Flush the phrases of the last sentence.
    if phrases:
        add_srl_data_to_srl_table(srl_table_df, phrases, lemma_by_id)
    return srl_table_df

In [40]:
# Default directory (including the trailing separator) holding the letter files.
FILE_DIR = "../letters/"

def read_file(in_file_id, file_dir=None):
    """Read ``<file_dir><in_file_id>.txt`` and return it as one line of text.

    Every line is stripped of surrounding whitespace, the lines are joined
    with single spaces, and the result is stripped once more.

    Parameters
    ----------
    in_file_id : str or int
        File name without the ".txt" extension.
    file_dir : str, optional
        Directory prefix, concatenated as-is (so it must end with a path
        separator). Defaults to the module-level FILE_DIR.

    Raises
    ------
    SystemExit
        With the message "cannot open <path>" when the file cannot be opened.
    """
    # Local import: the notebook's import cell does not import sys, so the
    # error path previously failed with NameError instead of exiting.
    import sys

    if file_dir is None:
        file_dir = FILE_DIR
    file_name = f"{file_dir}{in_file_id}.txt"
    try:
        # NOTE(review): uses the platform default encoding, as before;
        # consider passing encoding= explicitly once the file encoding is confirmed.
        in_file = open(file_name, "r")
    except OSError:
        sys.exit(f"cannot open {file_name}")
    # The with-block closes the file even if reading raises.
    with in_file:
        return " ".join(line.strip() for line in in_file).strip()

In [12]:
# Build the Dutch (lang='nl') Stanza pipeline: tokenization, lemmatization,
# POS tagging, dependency parsing and semantic role labeling.
# NOTE(review): "srl" is presumably made available by `import stroll.stanza` — confirm.
run_nlp = stanza.Pipeline(lang='nl', processors='tokenize,lemma,pos,depparse,srl')

2021-12-13 16:52:57 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |
| srl       | default |

2021-12-13 16:52:57 INFO: Use device: cpu
2021-12-13 16:52:57 INFO: Loading: tokenize
2021-12-13 16:52:57 INFO: Loading: pos
2021-12-13 16:52:57 INFO: Loading: lemma
2021-12-13 16:52:57 INFO: Loading: depparse
2021-12-13 16:52:58 INFO: Loading: srl
2021-12-13 16:52:59 INFO: Done loading processors!


In [71]:
# End-to-end run on letter 10: read the text, parse it, flatten the parse
# into a per-word table and display the resulting SRL table.
text = read_file(10)
nlp_analysis = run_nlp(text)
nlp_table_df = nlp_analysis_to_table(nlp_analysis)
nlp_table_to_srl_table(nlp_table_df)



duplicate role for ArgM-CAU [101]: corona and daarom
duplicate role for Arg1 [110]: toekomst and er
duplicate role for Arg1 [198]: succes and leven


Unnamed: 0,sent_id,head_id,head,nsubj,rel,Arg0,Arg1,Arg2,ArgM-ADV,ArgM-CAU,ArgM-DIS,ArgM-LOC,ArgM-MNR,ArgM-MOD,ArgM-NEG,ArgM-REC,ArgM-TMP
0,2.0,4.0,J.,naam,,,,,,,,,,,,,
1,2.0,11.0,gaan,ik,gaan,,ik,tijd_machine,,,,,,,,,
2,3.0,5.0,laat,het,,,,,,,,,,,,,toekomst
3,3.0,14.0,worden,het,,,,anders,,,,,,,,,
4,4.0,2.0,zijn,technologie,,,technologie,,,,,,,,,,
5,4.0,10.0,worden,het,,,,,,,,,,,,,
6,5.0,4.0,druk,straat,,,,,,,,,,,,,
7,5.0,7.0,rijden,auto,rijden,,auto,,,,,,,,,,
8,6.0,5.0,auto,het,,,,,wennen,,,,,,,,
9,6.0,9.0,wennen,ik,,,,,,,,,,,,,


In [21]:
# Display the raw text that was analysed.
text

' Vrolijk en schoon Mijn naam is J. en ik ben in een tijdmachine gegaan. Het is 40 jaar later in de toekomst en het is zo anders geworden. Er is veel meer technologie en het is vrolijker geworden. De straten zijn drukker, er rijden meer auto’s. Maar het zijn andere auto’s dan wat ik gewend ben. Auto’s rijden op schoon water in plaats van benzine wat vroeger zo was. De stad lijkt gezelliger geworden dan vroeger en mensen hebben meer contact met elkaar. Sinds de corona virus van 40 jaar geleden trekken mensen daarom meer met elkaar op. Mijn toekomst ziet er goed uit, ik heb een gezin met 3 kinderen. Ik woon in Amsterdam en heb een erg mooie huis. Ik ben zakenman geworden en heb aardig wat geld opgespaard. Ik wil al dit geld uiteindelijk gaan geven aan mn kinderen, ze zijn namelijk alles voor mij. Ik ben ook erg goed met mijn familie, we zien elkaar elke week minimaal 3x. Zoon, deze brief is voor jou en ik wil jou nog heel veel succes wensen met je leven. Maak er het beste van jongen!  '

In [72]:
# Inspect the first 30 rows of the per-word table.
nlp_table_df[:30]

Unnamed: 0,id,text,lemma,upos,xpos,feats,head,deprel,deps,misc,start_char,end_char,parent,sent,srl,frame
0,1,Vrolijk,vrolijk,ADJ,ADJ|vrij|basis|zonder,Degree=Pos,0,root,,,0,7,"[\n {\n ""id"": 1,\n ""text"": ""Vrolijk"",\n...","[\n {\n ""id"": 1,\n ""text"": ""Vrolijk"",\n...",_,_
1,2,en,en,CCONJ,VG|neven,,3,cc,,,8,10,"[\n {\n ""id"": 2,\n ""text"": ""en"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""Vrolijk"",\n...",_,_
2,3,schoon,schoon,ADJ,ADJ|vrij|basis|zonder,Degree=Pos,1,conj,,,11,17,"[\n {\n ""id"": 3,\n ""text"": ""schoon"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""Vrolijk"",\n...",_,_
3,1,Mijn,mijn,PRON,VNW|bez|det|stan|vol|1|ev|prenom|zonder|agr,Person=1|Poss=Yes|PronType=Prs,2,nmod:poss,,,18,22,"[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",_,_
4,2,naam,naam,NOUN,N|soort|ev|basis|zijd|stan,Gender=Com|Number=Sing,4,nsubj,,,23,27,"[\n {\n ""id"": 2,\n ""text"": ""naam"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",_,_
5,3,is,zijn,AUX,WW|pv|tgw|ev,Number=Sing|Tense=Pres|VerbForm=Fin,4,cop,,,28,30,"[\n {\n ""id"": 3,\n ""text"": ""is"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",_,_
6,4,J.,J.,PROPN,N|eigen|ev|basis|zijd|stan,Gender=Com|Number=Sing,0,root,,,31,33,"[\n {\n ""id"": 4,\n ""text"": ""J."",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",_,_
7,5,en,en,CCONJ,VG|neven,,11,cc,,,34,36,"[\n {\n ""id"": 5,\n ""text"": ""en"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",_,_
8,6,ik,ik,PRON,VNW|pers|pron|nomin|vol|1|ev,Case=Nom|Person=1|PronType=Prs,11,nsubj,,,37,39,"[\n {\n ""id"": 6,\n ""text"": ""ik"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",Arg1,_
9,7,ben,zijn,AUX,WW|pv|tgw|ev,Number=Sing|Tense=Pres|VerbForm=Fin,11,aux,,,40,43,"[\n {\n ""id"": 7,\n ""text"": ""ben"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""Mijn"",\n ...",_,_
