# Stroll Twitter

Use Stroll semantic role labeling for information extraction from tweets.

## 1. Reading the data

In [None]:
import os
import pandas as pd
import re

from IPython.display import clear_output

In [None]:
# Directory that read_tweets() lists and reads the tweet CSV files from.
DATADIR = "../../puregome/data/text/"
# Name of the CSV column from which read_tweets() takes the tweet texts.
TEXT = "text"
# Presumably the name of the user column; not used in the visible cells.
USER = "user"

In [None]:
def squeal(text=None):
    """Replace the current cell output with an optional status text.

    The cell output is always cleared (deferred until new output arrives,
    because of wait=True); the text is printed only when it is not None.
    """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

In [None]:
def remove_urls(tweet_text):
    """Return tweet_text with every http:// or https:// URL deleted.

    A URL starts at a word boundary and runs up to the next whitespace;
    the scheme is matched case-insensitively.
    """
    url_pattern = r"\bhttps?://\S*"
    return re.sub(url_pattern, "", tweet_text, flags=re.IGNORECASE)

In [None]:
def restore_newlines(tweet_text):
    """Turn every literal backslash-n sequence in tweet_text into a newline."""
    escaped_newline = r'\\n'
    return re.sub(escaped_newline, '\n', tweet_text)

In [None]:
def read_tweets(file_pattern):
    """Collect the tweet texts of all files in DATADIR matching file_pattern.

    Args:
        file_pattern: regular expression searched for in each file name.

    Returns:
        List with the values of the TEXT column of every matching CSV
        file, files being processed in sorted file-name order.
    """
    matching_names = [name
                      for name in sorted(os.listdir(DATADIR))
                      if re.search(file_pattern, name)]
    texts = []
    for name in matching_names:
        tweets_df = pd.read_csv(DATADIR + name)
        texts += list(tweets_df[TEXT])
        # show which file was read last as a progress indication
        squeal(name)
    return texts

## 2. Stroll

Functions copied from stroll_srl_test.ipynb

In [None]:
import stanza
import stroll.stanza

In [None]:
def nlp_analysis_to_table(nlp_analysis):
    """Flatten an NLP analysis into a table with one row per word.

    Args:
        nlp_analysis: object with a `sentences` iterable whose items each
            have a `words` iterable (what `run_nlp` returns for a text).
            Every word must provide the attributes listed in `fields`.

    Returns:
        DataFrame with one row per word in document order, a 0-based
        integer index and one column per attribute in `fields`. An
        analysis without any words yields an empty DataFrame with these
        columns (instead of failing on an unbound variable).
    """
    fields = ("id", "text", "lemma", "upos", "xpos", "feats", "head",
              "deprel", "deps", "misc", "start_char", "end_char",
              "parent", "sent", "srl", "frame")
    words = [w for s in nlp_analysis.sentences for w in s.words]
    # Build all columns in one go: appending row by row with .loc copies
    # the growing table for every word.
    return pd.DataFrame({field: [getattr(w, field) for w in words] for field in fields},
                        columns=list(fields))

In [None]:
# Column order of the SRL table; srl_dict_to_srl_list() emits values in this order.
SRL_FIELDS = [
    "sent_id", "head_id", "head", "nsubj", "rel", "Arg0", "Arg1", "Arg2",
    "ArgM-ADV", "ArgM-CAU", "ArgM-DIS", "ArgM-LOC", "ArgM-MNR", "ArgM-MOD",
    "ArgM-NEG", "ArgM-REC", "ArgM-TMP",
]


def srl_dict_to_srl_list(srl_dict):
    """Convert a frame dict to a list of values ordered like SRL_FIELDS.

    Fields missing from srl_dict become empty strings; keys of srl_dict
    that are not in SRL_FIELDS are ignored.
    """
    return [srl_dict.get(field, "") for field in SRL_FIELDS]

In [None]:
def add_srl_data_to_srl_table(srl_table_df, srl_data, sentence):
    """Finish the frames of one sentence and append them to the SRL table.

    Args:
        srl_table_df: table that receives one new row per frame (changed
            in place).
        srl_data: dict mapping a head word id to its frame dict; every
            frame gets its "head" entry set or extended (changed in place).
        sentence: dict mapping word ids to lemmas.
    """
    for head_id, frame in srl_data.items():
        if 'head' not in frame:
            # ids that are not positive have no word in the sentence
            frame["head"] = sentence[head_id] if head_id > 0 else "FILLER"
        else:
            # text collected earlier comes first, followed by the head lemma
            frame["head"] = frame["head"] + " " + sentence[head_id]
        table_row = srl_dict_to_srl_list(frame)
        srl_table_df.loc[len(srl_table_df)] = table_row

In [None]:
def nlp_table_to_srl_table(nlp_table_df):
    """Derive a table of semantic frames from a word-level NLP table.

    The rows of nlp_table_df are read in order. A row whose id is not
    larger than the id of the previous row starts a new sentence. Within a
    sentence, frames are collected per head word id:

    * a word with an srl label other than "_" stores its lemma under that
      label in the frame of its head (a repeated label is reported with
      print and overwritten);
    * a word with frame "rel" stores its lemma under "rel" in its own frame;
    * "nsubj" dependents are joined with spaces under "nsubj" of their head;
    * "compound:prt" dependents are joined with spaces under "head" of
      their head.

    Returns:
        DataFrame with the SRL_FIELDS columns and one row per frame.
    """
    srl_table_df = pd.DataFrame({ field: [] for field in SRL_FIELDS })
    # deprel value -> frame key that accumulates the lemmas of such dependents
    accumulated_relations = {"nsubj": "nsubj", "compound:prt": "head"}
    frames = {}
    lemmas = {}
    previous_id = 0
    sentence_nbr = 1
    for row_index, word in nlp_table_df.iterrows():
        word_id, head_id, lemma = word['id'], word['head'], word['lemma']
        if word_id <= previous_id:
            # word ids restarted: flush the finished sentence
            if frames:
                add_srl_data_to_srl_table(srl_table_df, frames, lemmas)
            sentence_nbr += 1
            frames, lemmas = {}, {}
        role = word['srl']
        if role != "_":
            frame = frames.setdefault(head_id, { "sent_id": sentence_nbr, "head_id": head_id })
            if role in frame:
                print(f"duplicate role for {role} [{row_index}]: {frame[role]} and {lemma}")
            frame[role] = lemma
        if word['frame'] == "rel":
            frame = frames.setdefault(word_id, { "sent_id": sentence_nbr, "head_id": word_id })
            frame[word['frame']] = lemma
        target_key = accumulated_relations.get(word['deprel'])
        if target_key is not None:
            frame = frames.setdefault(head_id, { "sent_id": sentence_nbr, "head_id": head_id })
            if target_key in frame:
                frame[target_key] = frame[target_key] + " " + lemma
            else:
                frame[target_key] = lemma
        previous_id = word_id
        lemmas[word_id] = lemma
    # flush the last sentence
    if frames:
        add_srl_data_to_srl_table(srl_table_df, frames, lemmas)
    return srl_table_df

In [None]:
def analyze_text(text):
    """Run the NLP pipeline on text and return (word table, SRL table)."""
    nlp_table_df = nlp_analysis_to_table(run_nlp(text))
    return nlp_table_df, nlp_table_to_srl_table(nlp_table_df)

In [None]:
# Dutch Stanza pipeline, built once and called by analyze_text() for every text.
# NOTE(review): the 'srl' processor is presumably provided by the
# `import stroll.stanza` earlier in this notebook - confirm.
run_nlp = stanza.Pipeline(lang='nl', processors='tokenize,lemma,pos,depparse,srl')

## 3. Application

In [None]:
def show_results(results_dict):
    """Turn a key -> count dict into a table sorted by descending count.

    Returns:
        DataFrame with the columns "key", "count" and "best value"; the
        last column holds "yes" for the keys selected by get_best_keys()
        and an empty string for all other keys.
    """
    ranked = sorted(results_dict.items(), key=lambda item: item[1], reverse=True)
    results_df = pd.DataFrame([{ "key": key, "count": count, "best value": ""}
                               for key, count in ranked])
    best_keys = get_best_keys(results_df)
    for row_index, entry in results_df.iterrows():
        if entry["key"] not in best_keys:
            continue
        results_df.at[row_index, "best value"] = "yes"
    return results_df

In [None]:
def get_best_keys(results_df):
    """Return the keys with the highest count, preferring capitalized ones.

    Args:
        results_df: table with the columns "key" (strings) and "count".

    Returns:
        List of the keys whose count equals the maximum count. When at
        least one of them starts with an ASCII capital letter, only those
        capitalized keys are returned. An empty table gives an empty list.
    """
    top_count = -1
    top_keys = []
    for _, entry in results_df.iterrows():
        count = entry["count"]
        if count > top_count:
            top_count, top_keys = count, [entry["key"]]
        elif count == top_count:
            top_keys.append(entry["key"])
    capitalized_keys = [key for key in top_keys if re.search(r"^[A-Z]", key) is not None]
    return capitalized_keys if capitalized_keys else top_keys

In [None]:
def get_actions(srl_table_df):
    """Count the actions found in an SRL table.

    The action of a row is its "rel" value, or its "head" value when "rel"
    is an empty string; rows where both are empty are not counted.

    Returns:
        Dict mapping each action to the number of rows it occurs in.
    """
    actions = {}
    for _, frame in srl_table_df.iterrows():
        action = frame["rel"]
        if action == "":
            action = frame["head"]
        if action == "":
            continue
        actions[action] = actions.get(action, 0) + 1
    return actions

In [None]:
def get_actors(srl_table_df):
    """Count the actors found in an SRL table.

    The actor of a row is its "Arg0" value, or its "head" value when
    "Arg0" is an empty string; rows where both are empty are not counted.

    Returns:
        Dict mapping each actor to the number of rows it occurs in.
    """
    actors = {}
    for _, frame in srl_table_df.iterrows():
        actor = frame["Arg0"]
        if actor == "":
            actor = frame["head"]
        if actor == "":
            continue
        actors[actor] = actors.get(actor, 0) + 1
    return actors

In [None]:
def get_arguments(srl_table_df, number=1):
    """Count the non-empty values of one argument column of an SRL table.

    Args:
        srl_table_df: SRL table to read.
        number: argument number; the column read is "Arg" followed by it.

    Returns:
        Dict mapping each non-empty column value to its number of rows.
    """
    column = "Arg" + str(number)
    arguments = {}
    for _, frame in srl_table_df.iterrows():
        argument = frame[column]
        if argument != "":
            arguments[argument] = arguments.get(argument, 0) + 1
    return arguments

In [None]:
def check(data):
    """Print every adjective of a sentence next to the lemma of its head word.

    Args:
        data: list of word dicts in sentence order, each with the keys
            "pos", "lemma" and "head"; "head" is the 1-based position of
            the head word within the list.

    Adjectives whose head is smaller than 1 are skipped: such a head does
    not refer to a word of the sentence, and looking it up with
    `head - 1` would wrap around to the last word of the list.
    """
    for word_data in data:
        if word_data["pos"] != "ADJ":
            continue
        head_position = word_data["head"]
        if head_position < 1:
            continue
        print(word_data["lemma"], data[head_position - 1]["lemma"])

In [None]:
# Analyze all tweets of each hour and store the SRL table of that hour as a
# gzip-compressed CSV file. The tables are rebuilt for every hour, so after
# the loop nlp_table_df_all and srl_table_df_all hold the last hour only.
for hour in [ "20211231-06", "20211231-07" ]:
    tweet_texts = read_tweets(hour)
    nlp_tables = []
    srl_tables = []
    failed = 0
    counter = 0
    for tweet_text in tweet_texts:
        counter += 1
        try:
            squeal(f"{hour} {counter}/{len(tweet_texts)} " + tweet_text)
            new_nlp_table_df, new_srl_table_df = analyze_text(restore_newlines(remove_urls(tweet_text)))
        except Exception:
            # Best effort: skip tweets that cannot be analyzed, but count
            # them. Catching Exception (not everything) keeps the loop
            # interruptible with KeyboardInterrupt.
            failed += 1
            continue
        nlp_tables.append(new_nlp_table_df)
        srl_tables.append(new_srl_table_df)
    # Concatenate once per hour instead of once per tweet, which would copy
    # the growing tables over and over.
    nlp_table_df_all = pd.concat(nlp_tables) if nlp_tables else pd.DataFrame([])
    srl_table_df_all = pd.concat(srl_tables) if srl_tables else pd.DataFrame([])
    if failed > 0:
        print(f"{hour}: skipped {failed} of {len(tweet_texts)} tweets that could not be analyzed")
    srl_table_df_all.to_csv(hour + "_srl_table_df_all.csv.gz", compression="gzip")

In [None]:
# Show the number of rows of the word table and of the SRL table.
len(nlp_table_df_all), len(srl_table_df_all)

In [None]:
# Store the word table as a gzip-compressed CSV file.
# NOTE(review): the file name starts with "2022123105" while the hours
# processed in this notebook are named "20211231-06"/"20211231-07" - confirm.
nlp_table_df_all.to_csv("2022123105_nlp_table_df_all.csv.gz", compression="gzip")

In [None]:
# Store the SRL table as a gzip-compressed CSV file. The compression
# argument belongs inside the to_csv() call; with the closing parenthesis
# placed before it the line was not valid Python.
srl_table_df_all.to_csv("2022123105_srl_table_df_all.csv.gz", compression="gzip")

In [None]:
# Show the ten most frequent actions.
# NOTE(review): srl_table_df is only assigned further down in this notebook
# (when the stored tables are read back) - confirm the intended table.
show_results(get_actions(srl_table_df))[:10]

In [None]:
# Show the ten most frequent Arg1 values.
# NOTE(review): srl_table_df is only assigned further down in this notebook
# (when the stored tables are read back) - confirm the intended table.
show_results(get_arguments(srl_table_df, number=1))[:10]

In [None]:
# Run check() on every sentence of the word table: words are collected until
# the word id drops below the id of the previous word, which marks the start
# of the next sentence.
# NOTE(review): nlp_table_df is not assigned at the top level of the visible
# cells (only nlp_table_df_all is) - confirm the intended table.
data = []
for i, row in nlp_table_df.iterrows():
    if len(data) > 0 and row["id"] < data[-1]["id"]:
        check(data)
        data = []
    data.append({"id": row["id"], "pos": row["upos"], "lemma": row["lemma"], "head": row["head"]})
# process the words of the last sentence
if len(data) > 0:
    check(data)

## 4. Read and process data

In [None]:
# Read the stored SRL tables of three hours back and stack them into one
# table; missing values are replaced by empty strings.
srl_table_df_all = pd.DataFrame([])
for hour in [ "05", "06", "07" ]:
    # the first CSV column is the row index written by to_csv()
    srl_table_df = pd.read_csv(f"20211231-{hour}_srl_table_df_all.csv.gz", compression="gzip", index_col=0).fillna("")
    srl_table_df_all = pd.concat([srl_table_df_all, srl_table_df])

In [None]:
# Show the first ten rows of the combined SRL table.
srl_table_df_all[:10]

In [None]:
# Show the number of rows of the combined SRL table.
len(srl_table_df_all)

In [None]:
# Show the rows whose Arg0 value is exactly "Rutte".
srl_table_df_all[srl_table_df_all["Arg0"]=="Rutte"]

In [None]:
# Count the actors of the combined SRL table and show the first ten
# (key, count) pairs whose key starts with an ASCII capital letter;
# show_results() returns the rows sorted by descending count.
analysis = show_results(get_actors(srl_table_df_all))
[ (row["key"], row["count"]) for i, row in analysis.iterrows() if re.search("^[A-Z]", row["key"]) ][:10]