# Stroll SRL Test

Test the eScience tool Stroll for semantic role labeling of Dutch: https://github.com/Filter-Bubble/stroll

Note: also take a look at AllenNLP: https://allennlp.org/

In [1]:
import stanza
import stroll.stanza
import pandas as pd

Using backend: pytorch


In [17]:
# Dutch sample text (three sentences) used as input for the analysis below.
PARAGRAPH ="""
   De Sinterklaasintocht in Utrecht gaat vanwege de opleving van het coronavirus niet door. 
   De tocht,die gepland stond voor komende zondag, is daarom niet veilig te organiseren, concludeert de organisatie.
   Ook alternatieve tochten bleken geen veilige optie.
"""

In [3]:
def nlp_analysis_to_table(nlp_analysis):
    """Flatten a parsed document into a DataFrame with one row per word.

    Parameters
    ----------
    nlp_analysis
        Object exposing ``sentences``; every sentence exposes ``words`` and
        every word exposes the attributes named in ``fields`` below.

    Returns
    -------
    pandas.DataFrame
        One row per word, in document order, with one column per entry of
        ``fields`` and a 0-based integer index. A document without any words
        yields an empty frame that still carries all columns.
    """
    fields = [
        "id", "text", "lemma", "upos", "xpos", "feats", "head", "deprel",
        "deps", "misc", "start_char", "end_char", "parent", "sent",
        "srl", "frame",
    ]
    # Collect every row first and build the frame once: appending with
    # ``.loc`` re-allocates the frame for each word, and leaves the result
    # undefined when there are no words at all.
    records = [
        {field: getattr(word, field) for field in fields}
        for sentence in nlp_analysis.sentences
        for word in sentence.words
    ]
    # Passing ``columns`` fixes the column order and keeps the columns present
    # for an empty document.
    return pd.DataFrame(records, columns=fields)

In [34]:
# Column order of the SRL summary table.
SRL_FIELDS = [
    "sent_id", "head_id", "head", "nsubj", "rel",
    "Arg0", "Arg1", "ArgM-CAU", "ArgM-MNR", "ArgM-NEG",
]


def srl_dict_to_srl_list(srl_dict):
    """Return the values of ``srl_dict`` laid out in ``SRL_FIELDS`` order.

    Fields missing from ``srl_dict`` are filled with an empty string; keys of
    ``srl_dict`` that are not listed in ``SRL_FIELDS`` are ignored.
    """
    return [srl_dict[field] if field in srl_dict else "" for field in SRL_FIELDS]


def add_srl_data_to_srl_table(srl_table_df, srl_data, sentence):
    """Append one row per entry of ``srl_data`` to ``srl_table_df`` (in place).

    For each key of ``srl_data`` the lemma stored under that key in
    ``sentence`` becomes the entry's "head"; when the entry already has a
    "head" value, the lemma is appended to it after a space. The entries of
    ``srl_data`` are updated in place and nothing is returned.
    """
    for word_id, roles in srl_data.items():
        if 'head' not in roles:
            roles["head"] = sentence[word_id]
        else:
            roles["head"] = roles["head"] + " " + sentence[word_id]
        next_row = len(srl_table_df)
        srl_table_df.loc[next_row] = srl_dict_to_srl_list(roles)


def _srl_entry(srl_data, word_id, sent_id):
    """Return the SRL record stored under ``word_id``, creating it on first use."""
    return srl_data.setdefault(word_id, {"sent_id": sent_id, "head_id": word_id})


def nlp_table_to_srl_table(nlp_table_df):
    """Summarise a word-level table into one row per word that collected SRL data.

    Parameters
    ----------
    nlp_table_df : pandas.DataFrame
        One row per word with at least the columns ``id``, ``head``,
        ``lemma``, ``deprel``, ``srl`` and ``frame``. A new sentence is
        assumed to start whenever a row's ``id`` does not exceed the ``id`` of
        the previous row.

    Returns
    -------
    pandas.DataFrame
        Table with the columns listed in ``SRL_FIELDS``; rows are appended per
        sentence through ``add_srl_data_to_srl_table``.

    Notes
    -----
    A message is printed when two words of one sentence carry the same role
    for the same head; the later lemma overwrites the earlier one.
    """
    srl_table_df = pd.DataFrame({ field: [] for field in SRL_FIELDS })
    srl_data = {}
    sentence = {}
    last_id = 0
    sent_id = 1
    for _, row in nlp_table_df.iterrows():
        word_id, head_id, lemma = row['id'], row['head'], row['lemma']

        if word_id <= last_id:
            # Sentence boundary. The counter and the per-sentence state are
            # advanced for every sentence, including those that produced no
            # SRL data; otherwise all later sentences get a wrong sent_id.
            if srl_data:
                add_srl_data_to_srl_table(srl_table_df, srl_data, sentence)
            sent_id += 1
            srl_data = {}
            sentence = {}

        role = row['srl']
        if role != "_":
            # The role is recorded on the record of the word's head.
            entry = _srl_entry(srl_data, head_id, sent_id)
            if role in entry:
                print(f"duplicate role for {role}: {entry[role]} and {lemma}")
            entry[role] = lemma

        if row['frame'] == "rel":
            # The word itself is marked with frame "rel": store its lemma
            # under its own id.
            _srl_entry(srl_data, word_id, sent_id)[row['frame']] = lemma

        if row['deprel'] == "nsubj":
            entry = _srl_entry(srl_data, head_id, sent_id)
            # Several subjects of the same head are joined with spaces.
            entry["nsubj"] = entry["nsubj"] + " " + lemma if 'nsubj' in entry else lemma

        if row['deprel'] == "compound:prt":
            # The particle is stored as the start of "head"; the head word's
            # own lemma is appended when the sentence is flushed.
            entry = _srl_entry(srl_data, head_id, sent_id)
            entry["head"] = entry["head"] + " " + lemma if 'head' in entry else lemma

        last_id = word_id
        sentence[word_id] = lemma

    # Flush the data collected for the final sentence.
    if srl_data:
        add_srl_data_to_srl_table(srl_table_df, srl_data, sentence)
    return srl_table_df

In [18]:
# Build a Dutch Stanza pipeline that also runs an 'srl' processor
# (presumably provided by the `stroll.stanza` import — TODO confirm).
run_nlp = stanza.Pipeline(lang='nl', processors='tokenize,lemma,pos,depparse,srl')
# Analyse the sample text and convert the result into a word-level table.
nlp_analysis = run_nlp(PARAGRAPH)
nlp_table_df = nlp_analysis_to_table(nlp_analysis)

2021-11-11 09:35:23 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |
| srl       | default |

2021-11-11 09:35:23 INFO: Use device: cpu
2021-11-11 09:35:23 INFO: Loading: tokenize
2021-11-11 09:35:23 INFO: Loading: pos
2021-11-11 09:35:24 INFO: Loading: lemma
2021-11-11 09:35:24 INFO: Loading: depparse
2021-11-11 09:35:24 INFO: Loading: srl
2021-11-11 09:35:25 INFO: Done loading processors!


In [35]:
# Derive the SRL summary table from the word-level table and display it.
nlp_table_to_srl_table(nlp_table_df)

Unnamed: 0,sent_id,head_id,head,nsubj,rel,Arg0,Arg1,ArgM-CAU,ArgM-MNR,ArgM-NEG
0,1.0,5.0,door gaan,sinterklaasintocht,gaan,,sinterklaasintocht,opleving,,niet
1,2.0,16.0,organiseren,tocht,organiseren,,tocht,daarom,veilig,niet
2,2.0,6.0,staan,die,,,die,,,
3,2.0,18.0,concluderen,organisatie,concluderen,organisatie,,,,
4,3.0,4.0,blijken,tocht,,,,,,optie


In [33]:
# Display the raw input text.
PARAGRAPH

'\n   De Sinterklaasintocht in Utrecht gaat vanwege de opleving van het coronavirus niet door. \n   De tocht,die gepland stond voor komende zondag, is daarom niet veilig te organiseren, concludeert de organisatie.\n   Ook alternatieve tochten bleken geen veilige optie.\n'

In [20]:
# Display the word-level table produced by the pipeline.
nlp_table_df

Unnamed: 0,id,text,lemma,upos,xpos,feats,head,deprel,deps,misc,start_char,end_char,parent,sent,srl,frame
0,1,De,de,DET,LID|bep|stan|rest,Definite=Def,2,det,,,4,6,"[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
1,2,Sinterklaasintocht,sinterklaasintocht,NOUN,N|soort|ev|basis|zijd|stan,Gender=Com|Number=Sing,5,nsubj,,,7,25,"[\n {\n ""id"": 2,\n ""text"": ""Sinterklaas...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",Arg1,_
2,3,in,in,ADP,VZ|init,,4,case,,,26,28,"[\n {\n ""id"": 3,\n ""text"": ""in"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
3,4,Utrecht,Utrecht,PROPN,N|eigen|ev|basis|onz|stan,Gender=Neut|Number=Sing,2,nmod,,,29,36,"[\n {\n ""id"": 4,\n ""text"": ""Utrecht"",\n...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
4,5,gaat,gaan,VERB,WW|pv|tgw|met-t,Number=Sing|Tense=Pres|VerbForm=Fin,0,root,,,37,41,"[\n {\n ""id"": 5,\n ""text"": ""gaat"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,rel
5,6,vanwege,vanwege,ADP,VZ|init,,8,case,,,42,49,"[\n {\n ""id"": 6,\n ""text"": ""vanwege"",\n...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
6,7,de,de,DET,LID|bep|stan|rest,Definite=Def,8,det,,,50,52,"[\n {\n ""id"": 7,\n ""text"": ""de"",\n ""...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
7,8,opleving,opleving,NOUN,N|soort|ev|basis|zijd|stan,Gender=Com|Number=Sing,5,obl,,,53,61,"[\n {\n ""id"": 8,\n ""text"": ""opleving"",\...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",ArgM-CAU,_
8,9,van,van,ADP,VZ|init,,11,case,,,62,65,"[\n {\n ""id"": 9,\n ""text"": ""van"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
9,10,het,het,DET,LID|bep|stan|evon,Definite=Def,11,det,,,66,69,"[\n {\n ""id"": 10,\n ""text"": ""het"",\n ...","[\n {\n ""id"": 1,\n ""text"": ""De"",\n ""...",_,_
