# Stroll Twitter

Use Stroll semantic role labeling for information extraction from tweets.

## 1. Reading the data

In [1]:
import os
import pandas as pd
import re

from IPython.display import clear_output

In [2]:
DATADIR = "../../puregome/data/text/"
TEXT = "text"
USER = "user"

In [3]:
def squeal(text=None):
    clear_output(wait=True)
    if not text is None: print(text)

In [4]:
def remove_urls(tweet_text):
    return(re.sub(r"\bhttps?://\S*","",tweet_text,flags=re.IGNORECASE))

In [5]:
def restore_newlines(tweet_text):
    return re.sub(r'\\n', '\n', tweet_text)

In [6]:
def read_tweets(file_pattern):
    file_names = sorted(os.listdir(DATADIR))
    texts = []
    for file_name in file_names:
        if re.search(file_pattern, file_name):
            df = pd.read_csv(DATADIR + file_name)
            texts.extend(list(df[TEXT]))
            squeal(file_name)
    return texts

## 2. Stroll

Functions copied from stroll_srl_test.ipynb

In [7]:
import stanza
import stroll.stanza

Using backend: pytorch


In [16]:
def analyze_text(text):
    nlp_analysis = run_nlp(text)
    nlp_table_df = nlp_analysis_to_table(nlp_analysis)
    nlp_table_df = correct_attachments_table(nlp_table_df)
    srl_table_df = nlp_table_to_srl_table(nlp_table_df)
    return text, nlp_table_df, srl_table_df

In [8]:
run_nlp = stanza.Pipeline(lang='nl', processors='tokenize,lemma,pos,depparse,srl')

2022-01-18 11:13:22 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |
| srl       | default |

2022-01-18 11:13:22 INFO: Use device: cpu
2022-01-18 11:13:22 INFO: Loading: tokenize
2022-01-18 11:13:22 INFO: Loading: pos
2022-01-18 11:13:22 INFO: Loading: lemma
2022-01-18 11:13:22 INFO: Loading: depparse
2022-01-18 11:13:23 INFO: Loading: srl
2022-01-18 11:13:24 INFO: Done loading processors!


In [9]:
def nlp_analysis_to_table(nlp_analysis):
    nbr_of_words = 0
    for s in nlp_analysis.sentences:
        for w in s.words:
            if nbr_of_words == 0:
                nlp_table_df = pd.DataFrame({"id": [w.id], 
                                             "text": [w.text], 
                                             "lemma": [w.lemma],
                                             "upos": [w.upos],
                                             "xpos": [w.xpos],
                                             "feats": [w.feats],
                                             "head": [w.head],
                                             "deprel": [w.deprel],
                                             "deps": [w.deps],
                                             "misc": [w.misc],
                                             "start_char": [w.start_char],
                                             "end_char": [w.end_char],
                                             "parent": [w.parent],
                                             "sent": [w.sent],
                                             "srl": [w.srl],
                                             "frame": [w.frame],
                                            })
            else:
                nlp_table_df.loc[len(nlp_table_df.index)] = [ w.id, w.text, w.lemma, w.upos, w.xpos, w.feats, 
                                                              w.head, w.deprel, w.deps, w.misc, w.start_char, w.end_char, 
                                                              w.parent, w.sent, w.srl, w.frame, ]
            nbr_of_words += 1
    return nlp_table_df

In [10]:
SRL_FIELDS = [ "sent_id", "head_id", "head", "nsubj", "rel", "Arg0", "Arg1", "Arg2", 
               "ArgM-ADV", "ArgM-CAU", "ArgM-DIS", "ArgM-LOC", "ArgM-MNR", "ArgM-MOD", "ArgM-NEG", "ArgM-REC", "ArgM-TMP", ]


def srl_dict_to_srl_list(srl_dict, nlp_dict):
    srl_list = len(SRL_FIELDS) * [ "" ]
    for i in range(0, len(SRL_FIELDS)):
        if SRL_FIELDS[i] in srl_dict:
            srl_list[i] = srl_dict[SRL_FIELDS[i]]
        if SRL_FIELDS[i] in nlp_dict:
            srl_list[i] = nlp_dict[SRL_FIELDS[i]]
    return srl_list

In [11]:
def add_srl_data_to_srl_table(srl_table_df, srl_data, nlp_data, sentence):
    print(srl_data)
    for phrase_key in srl_data:
        if 'head' in srl_data[phrase_key]:
            srl_data[phrase_key]["head"] += " " + sentence[phrase_key]
        elif phrase_key > 0:
            srl_data[phrase_key]["head"] = sentence[phrase_key]
        else:
            srl_data[phrase_key]["head"] = "FILLER"
        if phrase_key in nlp_data:
            srl_table_df.loc[len(srl_table_df)] = srl_dict_to_srl_list(srl_data[phrase_key], nlp_data[phrase_key])
        else:
            srl_table_df.loc[len(srl_table_df)] = srl_dict_to_srl_list(srl_data[phrase_key], {})

In [12]:
def swap_aux_head(sentence_df, child, head, heads_head):
    for i in range(0, len(sentence_df)):
        if sentence_df.at[i, "id"] == head:
            sentence_df.at[i, "head"] = child
        elif sentence_df.at[i, "id"] == child:
            sentence_df.at[i, "head"] = heads_head
        elif sentence_df.at[i, "head"] == head:
            sentence_df.at[i, "head"] = child
    return sentence_df

In [13]:
def correct_attachments_sentence(sentence_df):
    children = {}
    xpos = {}
    upos = {}
    text = {}
    heads = {}
    for i, row in sentence_df.iterrows():
        child = row["id"]
        head = row["head"]
        if head not in children:
            children[head] = []
        children[head].append(child)
        xpos[child] = row["xpos"]
        upos[child] = row["upos"]
        text[child] = row["text"]
        heads[child] = head
    for head in children:
        if head != 0 and not re.search("^WW", xpos[head]):
            for child in children[head]:
                if re.search("^WW", xpos[child]) and upos[child] == "AUX":
                    sentence_df = swap_aux_head(sentence_df, child, head, heads[head])
    return sentence_df

In [14]:
def correct_attachments_table(nlp_table_df):
    sentence_df = pd.DataFrame([])
    nlp_table_df_out = pd.DataFrame([])
    last_id = -1
    for i, row in nlp_table_df.iterrows():
        if row["id"] < last_id:
            new_sentence_df = correct_attachments_sentence(sentence_df)
            if len(nlp_table_df_out) == 0:
                nlp_table_df_out = new_sentence_df
            else:
                nlp_table_df_out = pd.concat([nlp_table_df_out, new_sentence_df])
            sentence_df = pd.DataFrame([])
        sentence_df = sentence_df.append(pd.DataFrame([row]), ignore_index = True)
        last_id = row["id"]
    if len(sentence_df) > 0:
        new_sentence_df = correct_attachments_sentence(sentence_df)
        if len(nlp_table_df_out) == 0:
            nlp_table_df_out = new_sentence_df
        else:
            nlp_table_df_out = pd.concat([nlp_table_df_out, new_sentence_df])
    return nlp_table_df_out

In [15]:
def nlp_table_to_srl_table(nlp_table_df):
    srl_table_df = pd.DataFrame({ field: [] for field in SRL_FIELDS })
    srl_data = {}
    nlp_data = {}
    sentence = {}
    last_id = 0
    sent_id = 1
    for i, row in nlp_table_df.iterrows():
        if row['id'] <= last_id:
            if len(srl_data) > 0:
                add_srl_data_to_srl_table(srl_table_df, srl_data, nlp_data, sentence)
            sent_id += 1
            srl_data = {}
            nlp_data = {}
            sentence = {}
        if row['srl'] != "_":
            if row['head'] not in srl_data:
                srl_data[row['head']] = { "sent_id": sent_id, "head_id": row["head"] }
            if row['srl'] in srl_data[row['head']]:
                print(f"duplicate role for {row['srl']} [{i}]: {srl_data[row['head']][row['srl']]} and {row['lemma']}")
                srl_data[row['head']][row['srl']] += " " + row['lemma']
            else:
                srl_data[row['head']][row['srl']] = row['lemma']
        if row['frame'] == "rel":
            if row['id'] not in srl_data:
                srl_data[row['id']] = { "sent_id": sent_id, "head_id": row["id"] }
            if row['frame'] not in srl_data[row['id']]:
                srl_data[row['id']][row['frame']] = row['lemma']
            else:
                srl_data[row['id']][row['frame']] += " " + row['lemma']
        if row['deprel'] == "nsubj":
            if row['head'] not in nlp_data:
                nlp_data[row['head']] = { "sent_id": sent_id, "head_id": row["head"] }
            if 'nsubj' in nlp_data[row['head']]:
                nlp_data[row['head']]["nsubj"] += " " + row['lemma']
            else:
                nlp_data[row['head']]["nsubj"] = row['lemma']
        if row['deprel'] == "compound:prt":
            if row['head'] not in nlp_data:
                nlp_data[row['head']] = { "sent_id": sent_id, "head_id": row["head"] }
            if 'head' in nlp_data[row['head']]:
                nlp_data[row['head']]["head"] += " " + row['lemma']
            else:
                nlp_data[row['head']]["head"] = row['lemma']
        last_id = row['id']
        sentence[row['id']] = row['lemma'] 
    if len(srl_data) > 0:
        add_srl_data_to_srl_table(srl_table_df, srl_data, nlp_data, sentence)
    return srl_table_df

## 3. Who, where, when, what, why and how?

In [17]:
def show_results(results_dict):
    results_df = pd.DataFrame([{ "key": key, "count": results_dict[key], "best value": ""} 
                               for key in sorted(results_dict, key=lambda k: results_dict[k], reverse=True)])
    best_keys = get_best_keys(results_df)
    for i, row in results_df.iterrows():
        if row["key"] in best_keys:
            results_df.at[i, "best value"] = "yes"
    return results_df

In [18]:
def get_best_keys(results_df):
    best_count = -1
    best_keys = []
    for i, row in results_df.iterrows():
        if row["count"] > best_count:
            best_count = row["count"]
            best_keys = [row["key"]]
        elif row["count"] == best_count:
            best_keys.append(row["key"])
    case_is_upper = []
    for key in best_keys:
        case_is_upper.append(re.search(r"^[A-Z]", key) != None)
    if True in case_is_upper:
        best_keys = [ best_keys[i] for i in range(0, len(best_keys)) if case_is_upper[i] ] 
    return best_keys

In [19]:
def get_actors(srl_table_df):
    actors = {}
    for i, row in srl_table_df.iterrows():
        if row["Arg0"] != "":
            actor = row["Arg0"]
        elif row["nsubj"] != "":
            actor = row["nsubj"]
        else:
            actor = ""
        if actor != "":
            if actor in actors:
                actors[actor] += 1
            else:
                actors[actor] = 1
    return actors

In [20]:
def get_locations(srl_table_df):
    locations = {}
    for i, row in srl_table_df.iterrows():
        if row["ArgM-LOC"] != "":
            location = row["ArgM-LOC"]
        else:
            location = ""
        if location != "":
            if location in locations:
                locations[location] += 1
            else:
                locations[location] = 1
    return locations

In [21]:
def get_times(srl_table_df):
    times = {}
    for i, row in srl_table_df.iterrows():
        if row["ArgM-TMP"] != "":
            time = row["ArgM-TMP"]
        else:
            time = ""
        if time != "":
            if time in times:
                times[time] += 1
            else:
                times[time] = 1
    return times

In [22]:
def get_causes(srl_table_df):
    causes = {}
    for i, row in srl_table_df.iterrows():
        if row["ArgM-CAU"] != "":
            cause = row["ArgM-CAU"]
        else:
            cause = ""
        if cause != "":
            if cause in causes:
                causes[cause] += 1
            else:
                causes[cause] = 1
    return causes

In [23]:
def get_manners(srl_table_df):
    manners = {}
    for i, row in srl_table_df.iterrows():
        if row["ArgM-MNR"] != "":
            manner = row["ArgM-MNR"]
        else:
            manner = ""
        if manner != "":
            if manner in manners:
                manners[manner] += 1
            else:
                manners[manner] = 1
    return manners

In [24]:
def get_actions(srl_table_df):
    actions = {}
    for i, row in srl_table_df.iterrows():
        if row["rel"] != "":
            action = row["rel"]
        elif row["head"] != "":
            action = row["head"]
        else:
            action = ""
        if action != "":
            if action in actions:
                actions[action] += 1
            else:
                actions[action] = 1
    return actions

In [36]:
def get_arguments(srl_table_df, number=1):
    argument_name = "Arg" + str(number)
    arguments = {}
    for i, row in srl_table_df.iterrows():
        if row[argument_name] != "":
            argument = row[argument_name]
        else:
            argument = ""
        if argument != "":
            if argument in arguments:
                arguments[argument] += 1
            else:
                arguments[argument] = 1
    return arguments

## 3. Application

In [42]:
for hour in [ "20211231-05", "20211231-06", "20211231-07",]:
    tweet_texts = read_tweets(hour)
    nlp_table_df_all = pd.DataFrame([])
    srl_table_df_all = pd.DataFrame([])
    counter = 0
    for tweet_text in tweet_texts:
        try:
            counter += 1
            if counter % 10 == 0:
                squeal(f"{hour} {counter}/{len(tweet_texts)} " + tweet_text)
            text, new_nlp_table_df, new_srl_table_df = analyze_text(restore_newlines(remove_urls(tweet_text)))
            nlp_table_df_all = pd.concat([nlp_table_df_all, new_nlp_table_df])
            srl_table_df_all = pd.concat([srl_table_df_all, new_srl_table_df])
        except: 
            pass
    srl_table_df_all.to_csv(f"../data{hour}_srl_table_df_all.csv.gz", compression="gzip")

20211231-07 16820/16820 @Dapperdikkerdje Dankjewel jij ook een gezond en gelukkig 2022




duplicate role for Arg1 [5]: @Dapperdikkerdje and gezond
{2: {'sent_id': 1, 'head_id': 2, 'Arg1': '@Dapperdikkerdje gezond', 'rel': 'dankjewen', 'Arg0': 'jij', 'ArgM-DIS': 'ook'}}


In [43]:
len(nlp_table_df_all), len(srl_table_df_all)

(376978, 35804)

In [33]:
show_results(get_actions(srl_table_df_all))[:10]

Unnamed: 0,key,count,best value
0,zijn,715,yes
1,hebben,339,
2,gaan,330,
3,doen,185,
4,komen,177,
5,FILLER,169,
6,zeggen,148,
7,zien,141,
8,weten,130,
9,worden,122,


In [41]:
show_results(get_arguments(srl_table_df_all, number=0))[:10]

Unnamed: 0,key,count,best value
0,ik,775,yes
1,je,462,
2,we,254,
3,ze,193,
4,die,186,
5,hij,82,
6,jij,55,
7,mens,49,
8,u,48,
9,jullie,47,


In [None]:
data = []
for i, row in nlp_table_df.iterrows():
    if len(data) > 0 and row["id"] < data[-1]["id"]:
        check(data)
        data = []
    data.append({"id": row["id"], "pos": row["upos"], "lemma": row["lemma"], "head": row["head"]})
if len(data) > 0:
    check(data)

## 4. Read and process data

In [None]:
srl_table_df_all = pd.DataFrame([])
for hour in [ "05", "06", "07" ]:
    srl_table_df = pd.read_csv(f"20211231-{hour}_srl_table_df_all.csv.gz", compression="gzip", index_col=0).fillna("")
    srl_table_df_all = pd.concat([srl_table_df_all, srl_table_df])

In [None]:
srl_table_df_all[:10]

In [None]:
len(srl_table_df_all)

In [None]:
srl_table_df_all[srl_table_df_all["Arg0"]=="Rutte"]

In [None]:
analysis = show_results(get_actors(srl_table_df_all))
[ (row["key"], row["count"]) for i, row in analysis.iterrows() if re.search("^[A-Z]", row["key"]) ][:10]