# AllenNLP tests

Use AllenNLP's semantic role labeling in combination with manually written rules for extracting causal relations from political speeches. We found two limitations: only relations within a single sentence were found, and complex relations that require a deeper understanding of the sentence were hard to identify.

Links:

* software installation: https://github.com/allenai/allennlp (do not forget to install NLTK popular models)
* software usage: https://demo.allennlp.org/semantic-role-labeling (tab: Model Usage)

## 1. Language processing modules

In [1]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import spacy

We use AllenNLP for semantic role labeling and spaCy for lemmatization (of verbs) and tokenization.

In [2]:
# Load spaCy's small English pipeline once; the cells below use it for
# lemmatization, tokenization and sentence splitting.
spacy_analyze = spacy.load('en_core_web_sm')

In [3]:
# Build the semantic-role-labeling predictor from the published BERT SRL
# model archive; its predict() method is called by srl_analyze.
srl_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

  "AllenNLP Tango is an experimental API and parts of it might change or disappear "
2021-12-09 17:28:13,972 - INFO - allennlp.common.plugins - Plugin allennlp_models available
2021-12-09 17:28:14,213 - INFO - cached_path - cache of https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz is up-to-date
2021-12-09 17:28:14,217 - INFO - allennlp.models.archival - loading archive file https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz from cache at /home/erikt/.allennlp/cache/b5f1db011cc85691a5fa2bf29e055a712261a2e5d74a74edd7da2fffc98d4ab8.4c4ac7e06ec3d85631bd26b839f90b5a375d3ceeb43e3c74f1cf4758dcee2bb3
2021-12-09 17:28:14,221 - INFO - allennlp.models.archival - extracting archive file /home/erikt/.allennlp/cache/b5f1db011cc85691a5fa2bf29e055a712261a2e5d74a74edd7da2fffc98d4ab8.4c4ac7e06ec3d85631bd26b839f90b5a375d3ceeb43e3c74f1cf4758dcee2bb3 to temp dir /tmp/tmp3tdvewco
2021-12-09 17:28:17,750

## 2. Causal relation functions

In [4]:
import re

In [5]:
# Colour digits inserted after "3" in the ANSI foreground code emitted by
# print_with_color: 1 -> red (31), 4 -> blue (34), 0 -> black (30).
RED = 1
BLUE = 4
BLACK = 0
# Sentinel meaning "print this token without colour".
NO_COLOR = -1
COLOR_CONCEPT_1 = RED
COLOR_CONCEPT_2 = BLUE
COLOR_CONTENT_RELATION_EXPLANATION = BLACK
# Label names of the three parts of a causal relation.
CONCEPT_1 = "Content_Concept_1"
CONCEPT_2 = "Content_Concept_2"
CONTENT_RELATION_EXPLANATION = "Content_Relation_Explanation"
# Label of tokens that are not part of any relation.
NO_CONCEPT = ""
COLOR_CODES = {
    CONCEPT_1: COLOR_CONCEPT_1,
    CONCEPT_2: COLOR_CONCEPT_2,
    CONTENT_RELATION_EXPLANATION: COLOR_CONTENT_RELATION_EXPLANATION,
    NO_CONCEPT: NO_COLOR,
}
# 20211206 before expansion: 34 verbs; after: 78 verbs.
# Later corrections: "strenghten" was a misspelling of "strengthen", and the
# lemma "compel" was added next to the stem "compell" (which only matches
# "compelled"/"compelling" after suffix stripping), giving 79 entries.
# Verbs whose ARG0 is tagged as concept 1 and whose ARG1/ARG2 as concept 2.
RELATION_VERBS = [ "activate", "achieve", "affect", "aggravate", "allow", "attribute", "avoid", "base", "balance", "boost",
                   "bring", "brought", "cause", "change", "compel", "compell", "comply", "compromise", "consolidate", "contain", "contribute",
                   "control", "create", "deceive", "deduce", "depend", "elicit", "eliminate", "enable", "enact", "endanger",
                   "enforce", "entail", "ensure", "erode", "fail", "flow", "force", "found", "increase", "generate",
                   "ignite", "implicate", "imply", "induce", "infer", "influence", "initiate", "intend", "justify", "launch",
                   "lead", "make", "manipulate", "mislead", "motivate", "obey", "open", "originate", "permit", "pick",
                   "preserve", "produce", "protect", "provoke", "reduce", "reinforce", "restore", "result", "safeguard", "secure",
                   "solve", "stimulate", "strengthen", "support", "tackle", "trigger", "undermine", "weaken", ]
# Verbs for which the concept roles are swapped (ARG0 is concept 2).
REVERSE_VERBS = [ "arise", "need" ]
# Discourse markers (lower case) that activate the ARGM-DIS rule.
ARGM_DIS_ROLES = [ "furthermore", "instead", "moreover", "nevertheless", "otherwise", "therefore", "thus" ]
# Pronouns (lower case) that are not accepted as a concept when they fill ARG0.
NO_ARG0_ROLES = [ "he", "i", "she", "they", "we", "you" ]

In [6]:
def strip_tag(token):
    """Remove SRL bracket markup from a single whitespace-separated token.

    A token that starts with "[" loses that bracket and a trailing ":"
    (so "[ARG0:" becomes "ARG0"); any token longer than one character
    loses a trailing "]" (so "John]" becomes "John"). One-character
    tokens are returned unchanged, which keeps a bare "[" or "]" intact.
    """
    if len(token) <= 1:
        return token
    if token.startswith("["):
        # Raw strings avoid the invalid "\[" / "\]" escape sequences of the
        # original pattern literals.
        token = re.sub(r":$", "", token[1:])
    return re.sub(r"\]$", "", token)

In [7]:
def print_with_color(string, color_code):
    """Print `string` in bold with foreground colour 3<color_code> on background 47.

    Nothing is appended after the ANSI reset sequence, so consecutive
    calls continue on the same output line.
    """
    start_sequence = f"\x1b[1;3{color_code};47m"
    reset_sequence = "\x1b[m"
    print(start_sequence + f"{string}" + reset_sequence, end="")

In [8]:
def get_concept_tag_argm_prp(argument):
    """Map a role label to a concept label under the ARGM-PRP (purpose) rule.

    The purpose argument becomes concept 2; the verb and ARG0-ARG2 become
    concept 1; every other role gets no concept.
    """
    if argument == "ARGM-PRP":
        return CONCEPT_2
    return CONCEPT_1 if argument in ("V", "ARG0", "ARG1", "ARG2") else NO_CONCEPT

In [9]:
def get_concept_tag_argm_dis(argument):
    """Map a role label to a concept label under the ARGM-DIS (discourse) rule.

    The discourse marker itself is the relation explanation, every other
    labelled role is concept 2, and tokens without a role (empty label)
    get no concept.
    """
    # The unused local `color_code` of the original was dropped.
    if argument == "ARGM-DIS":
        return CONTENT_RELATION_EXPLANATION
    if argument != "":
        return CONCEPT_2
    return NO_CONCEPT

In [10]:
def get_concept_tag_v(argument, roles):
    """Map a role label to a concept label for a causal relation verb.

    The verb is the relation explanation, ARGM-PRD/ARGM-MNR are concept 1,
    ARG1/ARG2 are concept 2. ARG0 is concept 1 unless its full text
    (roles["ARG0"], lower-cased) is one of the pronouns in NO_ARG0_ROLES,
    in which case it gets no concept.
    """
    if argument == "V":
        return CONTENT_RELATION_EXPLANATION
    if argument in ("ARGM-PRD", "ARGM-MNR"):
        return CONCEPT_1
    if argument == "ARG0":
        # roles is only consulted for ARG0, exactly as in the original.
        agent_is_pronoun = roles["ARG0"].lower() in NO_ARG0_ROLES
        return NO_CONCEPT if agent_is_pronoun else CONCEPT_1
    if argument in ("ARG1", "ARG2"):
        return CONCEPT_2
    return NO_CONCEPT

In [11]:
def get_concept_tag_reverse_v(argument, roles):
    """Map a role label to a concept label for a reverse relation verb.

    Same rule as get_concept_tag_v with the two concepts swapped: the verb
    is the relation explanation, ARGM-PRD/ARGM-MNR and a non-pronoun ARG0
    are concept 2, ARG1/ARG2 are concept 1.
    """
    if argument == "V":
        return CONTENT_RELATION_EXPLANATION
    if argument in ("ARGM-PRD", "ARGM-MNR"):
        return CONCEPT_2
    if argument == "ARG0":
        # roles is only consulted for ARG0, exactly as in the original.
        agent_is_pronoun = roles["ARG0"].lower() in NO_ARG0_ROLES
        return NO_CONCEPT if agent_is_pronoun else CONCEPT_2
    if argument in ("ARG1", "ARG2"):
        return CONCEPT_1
    return NO_CONCEPT

In [12]:
def check_verb(verb, RELATION_VERBS):
    """Return True if `verb`, or a crudely de-inflected form of it, is listed.

    Besides the verb itself, the forms obtained by dropping a final "d"/"s",
    a final "ed", a final "ing", or by replacing a final "ing" with "e" are
    looked up in RELATION_VERBS.
    """
    if verb in RELATION_VERBS:
        return True
    suffix_rewrites = (("[ds]$", ""), ("ed$", ""), ("ing$", ""), ("ing$", "e"))
    return any(
        re.sub(pattern, replacement, verb) in RELATION_VERBS
        for pattern, replacement in suffix_rewrites
    )

In [13]:
def print_token(token, argument, first_token, last_token, color_code):
    """Print one token, coloured unless `color_code` is NO_COLOR."""
    if color_code != NO_COLOR:
        print_token_color(token, argument, first_token, last_token, color_code)
        return
    print_token_no_color(token, argument, first_token, last_token)

In [14]:
def print_token_no_color(token, argument, first_token, last_token):
    """Print a token without colour, followed by one space and no newline.

    The first token of a role span is preceded by "[<argument> " and the
    last token of a span is followed by "]".
    """
    pieces = []
    if first_token:
        pieces.append(f"[{argument} ")
    pieces.append(f"{token}")
    if last_token:
        pieces.append("]")
    pieces.append(" ")
    print("".join(pieces), end="")

In [15]:
def print_token_color(token, argument, first_token, last_token, color_code):
    """Print a token in colour, with span brackets where a role span starts or ends.

    Inside a span the separating space is coloured as well; after the
    closing "]" of a span the separating space is printed uncoloured.
    """
    if first_token:
        print_with_color(f"[{argument} ", color_code)
    print_with_color(f"{token}", color_code)
    if not last_token:
        print_with_color(" ", color_code)
        return
    print_with_color("]", color_code)
    print(" ", end="")

In [16]:
def find_verb(tokens, arguments, lemmas):
    """Return the lemma at the position of the last token labelled "V".

    Returns "" when no token carries the "V" label. A warning is printed
    (in red) when `tokens` and `lemmas` differ in length, because the
    positional lookup is then unreliable.
    NOTE(review): if `lemmas` is shorter than `arguments`, the lookup can
    still raise IndexError after the warning.
    """
    if len(tokens) != len(lemmas):
        print_with_color(f"find_verb: error: different lengths for tokens ({len(tokens)}) and lemmas ({len(lemmas)})", 1)
    verb = ""
    for position, argument in enumerate(arguments):
        if argument == "V":
            verb = lemmas[position]
    return verb

In [17]:
def convert_analysis(analyzed_sentence):
    """Parse a bracketed SRL description into parallel per-token lists.

    The description is split on whitespace. A piece starting with "[" opens
    a role span and names its role (for example "[ARG0:"); it is not a
    token itself. A piece ending with "]" closes the current span.

    Returns a list [tokens, arguments, first_tokens, last_tokens, roles]:
    tokens are the words without markup, arguments holds the role label of
    each token ("" outside a span), first_tokens/last_tokens flag the first
    and last token of each span, and roles maps each role label to the
    space-joined text of all tokens carrying it.
    """
    tokens_out = []
    arguments = []
    first_tokens = []
    last_tokens = []
    roles = {}
    current_argument = ""
    first_token = False
    for raw_token in analyzed_sentence.split():
        # Pieces produced by split() contain no whitespace, so plain prefix
        # and suffix tests are equivalent to the original "^\[" and "\]$"
        # searches, without the invalid escape sequences.
        if raw_token.startswith("["):
            current_argument = strip_tag(raw_token)
            first_token = True
            continue
        token = strip_tag(raw_token)
        closes_span = raw_token.endswith("]")
        arguments.append(current_argument)
        tokens_out.append(token)
        first_tokens.append(first_token)
        last_tokens.append(closes_span)
        if current_argument != "":
            if current_argument in roles:
                roles[current_argument] += " " + token
            else:
                roles[current_argument] = token
        if closes_span:
            current_argument = ""
        first_token = False
    return [tokens_out, arguments, first_tokens, last_tokens, roles]

In [18]:
def get_lemmas(sentence):
    """Return the lemma of every token that spaCy finds in `sentence`, in order."""
    return [token.lemma_ for token in spacy_analyze(sentence)]

In [19]:
def find_causal_relations(analyzed_sentence, lemmas):
    """Assign a concept label to every token of one SRL verb analysis.

    Exactly one rule is applied to the whole analysis, chosen in this order:
    1. an ARGM-PRP (purpose) role is present;
    2. an ARGM-DIS role is present whose lower-cased text is in ARGM_DIS_ROLES;
    3. the verb is a causal relation verb (RELATION_VERBS);
    4. the verb is a reverse relation verb (REVERSE_VERBS);
    otherwise no token gets a concept.

    Returns a dict with the keys "tokens", "arguments", "first_tokens",
    "last_tokens", "roles" (as produced by convert_analysis) and "concepts"
    (one concept label per token).
    """
    tokens, arguments, first_tokens, last_tokens, roles = convert_analysis(analyzed_sentence)

    # The applicable rule depends on the analysis as a whole, not on the
    # individual token, so it is selected once. This also means the verb is
    # looked up once, and a length-mismatch warning from find_verb appears
    # once per analysis instead of once or twice per token.
    tagger = None
    tagger_needs_roles = False
    if "ARGM-PRP" in arguments:
        tagger = get_concept_tag_argm_prp
    elif "ARGM-DIS" in arguments and roles["ARGM-DIS"].lower() in ARGM_DIS_ROLES:
        tagger = get_concept_tag_argm_dis
    elif "V" in arguments:
        verb = find_verb(tokens, arguments, lemmas)
        if check_verb(verb, RELATION_VERBS):
            tagger, tagger_needs_roles = get_concept_tag_v, True
        elif check_verb(verb, REVERSE_VERBS):
            tagger, tagger_needs_roles = get_concept_tag_reverse_v, True

    concepts = []
    for argument in arguments:
        if tagger is None:
            concepts.append(NO_CONCEPT)
        elif tagger_needs_roles:
            concepts.append(tagger(argument, roles))
        else:
            concepts.append(tagger(argument))
    return { "tokens": tokens, "arguments": arguments, "first_tokens": first_tokens, "last_tokens": last_tokens, "roles": roles, "concepts": concepts }

In [20]:
def pretty_print(causal_relation_data):
    """Print one analysed sentence on a single line, coloured by concept.

    Each token is printed with its role brackets in the colour that
    COLOR_CODES assigns to its concept label; the line ends with a newline.
    """
    for index, token in enumerate(causal_relation_data["tokens"]):
        color_code = COLOR_CODES[causal_relation_data["concepts"][index]]
        print_token(
            token,
            causal_relation_data["arguments"][index],
            causal_relation_data["first_tokens"][index],
            causal_relation_data["last_tokens"][index],
            color_code,
        )
    print("")

In [21]:
def srl_analyze(sentence, filter="", print_flag=True):
    """Run semantic role labeling on a sentence and tag causal relations.

    One result dict (see find_causal_relations) is produced per verb that
    the SRL predictor reports. When `print_flag` is true the results are
    pretty-printed; a non-empty `filter` is a regular expression and limits
    the printing to results whose space-joined tokens and role labels match
    it. All results are returned regardless of the filter.
    """
    srl_analysis = srl_predictor.predict(sentence=sentence)
    # Whitespace runs (such as line breaks inside multi-line strings) are
    # collapsed before lemmatizing. The raw string avoids the invalid "\s"
    # escape sequence of the original literal.
    lemmas = get_lemmas(re.sub(r"\s+", " ", sentence))
    causal_relations = [
        find_causal_relations(verb_data['description'], lemmas)
        for verb_data in srl_analysis['verbs']
    ]
    if print_flag:
        for causal_relation_data in causal_relations:
            if filter == "" or re.search(filter, " ".join(causal_relation_data["tokens"] + causal_relation_data["arguments"])):
                pretty_print(causal_relation_data)
    return causal_relations

## 3. Test cases

In [None]:
srl_analyze("John sees the mountain with the snow")

In [None]:
srl_analyze("""You, in Greece, with our support, need to rebuild your country, your structures, your administration, 
               your economy to increase the competitiveness of Greece.""")

In [None]:
srl_analyze("This opened the way to welfare gains from stronger economic and financial integration.")

In [None]:
srl_analyze("One risk is the temptation for governments to overborrow because the economic costs of excessive public debt")

In [None]:
srl_analyze("A resulting loosening of fiscal discipline in individual member states can endanger the stability-oriented monetary policy.")

In [None]:
srl_analyze("Therefore, the markets did not properly perform their expected policing function.")

In [None]:
srl_analyze("I hope that following the lessons of interdependence not only at global, but also at European level given by the crisis")

In [None]:
srl_analyze("European economic strategy needs the full commitment of the European political community.")

In [None]:
srl_analyze("And a European economic system whose resilience flows from its single market")

In [None]:
srl_analyze("there are of course considerable budgetary challenges arising from the recent exceptional measures")

In [None]:
srl_analyze("And the best hope of a return to growth and job creation is inside the euro area.")

In [None]:
srl_analyze("To conclude, let me say a few words on the euro area more generally.")

In [None]:
srl_analyze("""We have taken important, fundamental decisions over the last couple of months to safeguard the stability 
               of the euro area, and indeed we are now in the phase of implementation.""")

In [None]:
srl_analyze("""A number of governments have embarked on a path of reform and fiscal consolidation that was unthinkable only 
               very recently, and they have taken important decisions and I encourage them to keep this determination.""")

In [None]:
srl_analyze("""These reforms are now being implemented and this effort must continue with credibility, with consistency, 
               with coherence over time.""")

In [None]:
srl_analyze("As we said there will not be magic solutions.")

In [None]:
srl_analyze("We need sustained efforts and determination.")

In [None]:
srl_analyze("At the same time, the existing financial backstops are being used as necessary.")

In [None]:
srl_analyze("""Most recently, the financial assistance to the recapitalisation of Spanish banks has been agreed 
               and is ready for implementation.""")

In [None]:
srl_analyze("""Giving to the ECB the ultimate responsibility for supervision of banks in the euro area 
               will decisively contribute to increase confidence between the banks 
               and in this way increase the financial stability in the euro area.""")

In [None]:
srl_analyze("Second, Germany is acutely aware of the need to tackle the root causes and not just the symptoms of the crisis.")

In [None]:
srl_analyze("""This is why it is pressing strongly for institutional reforms of the EMU framework plus structural reforms 
               and budgetary discipline in the member states.""")

In [None]:
srl_analyze("The introduction of the euro eliminated exchange rate risks.")

In [None]:
srl_analyze("""Another implication of the euro area's single monetary policy is that the key interest rates 
               are set for the currency bloc as a whole.""")

In [None]:
srl_analyze("""One risk is the temptation for governments to overborrow because the economic costs of excessive public debt, 
               for example higher interest rates, can be more easily shifted to other member states.""")

In [None]:
srl_analyze("""Even Germany ran up excessive deficits for a few years and, even worse, championed a reform of the SGP 
               which ultimately further weakened the application of the fiscal rules.""")

In [None]:
srl_analyze("""However, the EMU framework not only failed to avoid excessive deficits, it was also unable to prevent 
               the build-up of macroeconomic imbalances within the euro area.""")

In [None]:
srl_analyze("""The resulting increase in domestic inflation and wages eroded the competitiveness of the countries concerned and 
               increased their dependence on capital imports.""")

In [None]:
srl_analyze("The task of implementing the reforms and regaining competitiveness entailed significant political and social costs.")

In [None]:
srl_analyze("""However, these efforts, supported by a strong expansion in the global economy, allowed German growth to rebound 
               after 2005.""")

In [None]:
srl_analyze("""In order to achieve a turnaround and allow further assistance, it is now essential for Greece to deliver 
               on the promises that have been made.""")

## 4. Propbank role explanation

Modifiers in Propbank (source: http://clear.colorado.edu/compsem/documents/propbank_guidelines.pdf )
* ADJ: Adjectival
* ADV: Adverbials
* CAU: Cause
* COM: Comitative
* DIR: Directional
* DIS: Discourse
* DSP: Direct Speech
* EXT: Extent
* GOL: Goal
* LOC: Locative
* LVB: Light Verb
* MNR: Manner
* MOD: Modal
* NEG: Negation
* PRD: Secondary Predication
* PRP: Purpose
* REC: Reciprocals
* SLC: Relative Clause

## 5. Process texts

In [22]:
import json

In [23]:
def read_data(file_name):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank lines (for
    example a trailing empty line) are skipped instead of raising a
    JSONDecodeError. The file is read as UTF-8 and is closed even when
    parsing fails.
    """
    with open(file_name, "r", encoding="utf-8") as infile:
        return [json.loads(line) for line in infile if line.strip()]

In [24]:
def get_concepts_json(json_data_element):
    """Collect the annotated phrases of one record, grouped by label.

    Every entry of the record's "label" list is read as
    (start, end, label); the phrase is the slice data[start:end] of the
    record's "data" text. Phrases that share a label are joined with a
    space. Returns (label, phrase) pairs in order of first appearance.
    """
    text = json_data_element["data"]
    phrase_by_label = {}
    for span in json_data_element["label"]:
        label = span[2]
        phrase = text[span[0]: span[1]]
        if label in phrase_by_label:
            phrase_by_label[label] = phrase_by_label[label] + " " + phrase
        else:
            phrase_by_label[label] = phrase
    return [(label, phrase) for label, phrase in phrase_by_label.items()]

In [25]:
def get_concepts_srl(srl_data):
    """Collect the phrases that the SRL rules found, grouped by concept label.

    Tokens with the same concept label are concatenated in sentence order.
    A token is appended with a separating space unless it is a single
    non-word character (such as a punctuation mark), which is attached
    directly. Returns (label, phrase) pairs in order of first appearance.
    """
    concepts = {}
    for label, phrase in zip(srl_data["concepts"], srl_data["tokens"]):
        if label == NO_CONCEPT:
            continue
        if label not in concepts:
            concepts[label] = phrase
        # Raw string: the original "\w" literal is an invalid escape sequence.
        elif len(phrase) > 1 or re.search(r"\w", phrase):
            concepts[label] += " " + phrase
        else:
            concepts[label] += phrase
    return list(concepts.items())

In [26]:
def show_precision_recall(present, found, correct, correct_phrases):
    """Print one line with the counts, precision, recall and matched phrases.

    Precision (correct/found) is left out when nothing was found. Recall
    (correct/present) is printed as "n/a" when `present` is zero; the
    original raised ZeroDivisionError in that case. The list of correct
    phrases is appended only when it is non-empty.
    """
    line = f"found: {found}; present: {present}; correct: {correct}; "
    if found > 0:
        line += f"precision: {correct/found:.3f}; "
    if present != 0:
        line += f"recall: {correct/present:.3f}"
    else:
        line += "recall: n/a"
    if len(correct_phrases) > 0:
        line += f"; correct phrases: {correct_phrases}"
    print(line)

In [27]:
def evaluate_concepts(concepts_json, concepts_srl):
    """Count the annotated concepts that the SRL rules reproduced exactly.

    An annotated (label, phrase) pair counts as correct when some SRL pair
    has the same label and the same phrase. Returns the tuple
    (present, found, correct, correct_phrases), where present and found are
    the numbers of annotated and SRL pairs.
    """
    already_matched = [False] * len(concepts_srl)
    matched_phrases = []
    for gold in concepts_json:
        for position, candidate in enumerate(concepts_srl):
            if already_matched[position]:
                continue
            if candidate[0] != gold[0] or candidate[1] != gold[1]:
                continue
            # Left at False on purpose, so one SRL pair may satisfy several
            # annotated pairs; change to True to accept each pair only once.
            already_matched[position] = False
            matched_phrases.append(candidate[1])
            break
    return len(concepts_json), len(concepts_srl), len(matched_phrases), matched_phrases

In [28]:
def escape_characters(json_data):
    """Replace every "-", "[" and "]" in each record's "data" text by "_".

    The records are modified in place and the same list is returned. Each
    character is replaced by exactly one character, so the character
    offsets stored in the records' labels stay valid.
    """
    for data in json_data:
        # One pass with a raw-string character class replaces the three
        # passes whose "\[" and "\]" literals were invalid escape sequences.
        data["data"] = re.sub(r"[-\[\]]", "_", data["data"])
    return json_data

In [29]:
# Load the annotated records and replace "-", "[" and "]" in their texts by "_".
# NOTE(review): presumably done so these characters cannot be confused with the
# bracketed role markup parsed by convert_analysis - confirm.
json_data = escape_characters(read_data("../data/femke-20211012.jsonl"))

In [30]:
# Inspect the annotated (label, phrase) pairs of the record at index 20.
get_concepts_json(json_data[20])

[('Content_Concept_1', 'introduction of the euro'),
 ('Content_Relation_Explanation', 'eliminated'),
 ('Content_Concept_2', 'exchange rate risks')]

In [31]:
def flatten_list(data):
    """Return the non-list leaves of an arbitrarily nested list as a flat list.

    A value that is not a list is returned wrapped in a one-element list;
    empty lists contribute nothing. The result is always a new list.

    The traversal is iterative. The original recursed once per list element
    and copied the tail with data[1:] at every step, so a long flat list
    raised RecursionError and the running time was quadratic.
    """
    # Exact type test kept from the original: instances of list subclasses
    # are treated as leaves, not flattened.
    if type(data) is not list:
        return [data]
    flat = []
    pending = [iter(data)]
    while pending:
        for item in pending[-1]:
            if type(item) is list:
                # Descend; the parent iterator keeps its position.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Current list is exhausted: resume its parent.
            pending.pop()
    return flat

In [38]:
def save_paragraph_data(paragraph_data, out_file_name="data.jsonl"):
    """Write one JSON line per found relation to `out_file_name`.

    Each paragraph holds its text under "text" and, under "labels", one list
    per sentence containing one list of label entries per relation. For
    every relation, in sentence order, a record is written with a running
    "id" (starting at 1), the paragraph "text", the relation's entries as
    "label", and all other keys of the paragraph copied over.

    Unlike the original, the input is not modified (the original emptied the
    label lists with pop(0)), an empty relation is skipped instead of
    silently discarding the paragraph's remaining relations, and the output
    file is closed even if writing fails.
    """
    out_data = []
    for paragraph in paragraph_data:
        extra_fields = {
            key: value
            for key, value in paragraph.items()
            if key not in ("text", "labels")
        }
        for sentence_relations in paragraph["labels"]:
            for relation_labels in sentence_relations:
                if len(relation_labels) == 0:
                    continue
                out_data_item = {
                    "id": len(out_data) + 1,
                    "text": paragraph["text"],
                    "label": list(relation_labels),
                }
                out_data_item.update(extra_fields)
                out_data.append(out_data_item)
    with open(out_file_name, "w", encoding="utf-8") as out_file:
        for data in out_data:
            print(json.dumps(data), file=out_file)

In [33]:
def tokenize(string):
    """Return the spaCy tokens of `string`, joined by single spaces."""
    token_texts = map(str, spacy_analyze(string))
    return " ".join(token_texts)

In [76]:
def add_meta_data(paragraph_data, json_data):
    """Copy identifiers and annotated phrases from matching records into each paragraph.

    For every paragraph, the records of `json_data` are scanned in order. A
    record matches when its "data" text has not been marked as seen and its
    tokenized form equals the tokenized paragraph text. From a matching
    record the keys "paragraph_id", "source_id" and "speech_id" are copied,
    and every annotated phrase is appended to a list stored under the key
    "[<match number>] <label>".

    The paragraphs are modified in place and the same list is returned.
    Progress is printed as "<paragraph count> (<number of seen texts>)".

    NOTE(review): raises IndexError when `json_data` is empty (json_data[-1]).
    NOTE(review): tokenize() is called up to twice per record and paragraph;
    caching the tokenized records would avoid repeated spaCy runs - confirm
    before changing, the scan is order-dependent.
    """
    counter = 0
    # Raw record texts that were visited already; shared across paragraphs so
    # that later paragraphs skip records consumed by earlier ones.
    text_seen_dict = {}
    for paragraph in paragraph_data:
        tokenized_paragraph = tokenize(paragraph["text"])
        # Number of records matched to this paragraph so far.
        text_seen_count = 0
        for i in range(0, len(json_data)):
            if json_data[i]["data"] not in text_seen_dict and tokenize(json_data[i]["data"]) == tokenized_paragraph:
                text_seen_count += 1
                for key in ["paragraph_id", "source_id", "speech_id" ]:
                    paragraph[key] = json_data[i][key]
                for label_data in json_data[i]["label"]:
                    # Key combines the match number with the label name.
                    label = f"[{text_seen_count}] " + label_data[2]
                    # label_data is read as (start offset, end offset, label name).
                    phrase = json_data[i]["data"][label_data[0]: label_data[1]]
                    if label in paragraph:
                        paragraph[label].append(phrase)
                    else:
                        paragraph[label] = [phrase]
            # Stop at the first non-matching record after a match; in every
            # other case the record's text is marked as seen.
            if text_seen_count > 0 and tokenize(json_data[i]["data"]) != tokenized_paragraph:
                break
            else:
                text_seen_dict[json_data[i]["data"]] = True
        # The last record being marked as seen means the scan reached the end
        # of the data: start over with an empty seen set and print the last
        # visited record text next to the tokenized paragraph.
        if json_data[-1]["data"] in text_seen_dict:
            text_seen_dict = {}
            print(f"\n{json_data[i]['data']}\n{tokenized_paragraph}")
        counter += 1
        print(f"{counter} ({len(text_seen_dict)}) ", end="")
    print("")
    return paragraph_data

In [59]:
def save_doccano_format(srl_analysis_paragraphs, json_data):
    """Convert SRL results to text plus character-offset labels and save them.

    `srl_analysis_paragraphs` is a list of paragraphs, each a list of
    sentences, each a list of result dicts as returned by srl_analyze (one
    per verb). For every paragraph the text is rebuilt by writing each token
    followed by a space, and every run of adjacent tokens with the same
    non-empty concept label becomes one (start, end, label) entry with
    offsets into that rebuilt paragraph text. The result is enriched by
    add_meta_data and written by save_paragraph_data to its default file.

    NOTE(review): a sentence without any verb analysis contributes no text,
    because the sentence text is taken from its analyses - confirm that
    this is acceptable for the offsets of later sentences.
    """
    paragraph_data = []
    for srl_analysis_paragraph in srl_analysis_paragraphs:
        text_paragraph = ""
        labels_paragraph = []
        for srl_analysis_sentence in srl_analysis_paragraph:
            labels_sentence = []
            text_sentence = ""
            for srl_analysis in srl_analysis_sentence:
                text = ""
                labels = []
                
                for i in range(0, len(srl_analysis["tokens"])):
                    token = srl_analysis["tokens"][i]
                    label = srl_analysis["concepts"][i]
                    if label != "":
                        # Offset of this token in the paragraph text is
                        # len(text_paragraph) + len(text). The end offset
                        # includes the space written after the token (+ 1),
                        # so it equals the start offset of the next token.
                        if len(labels) > 0 and labels[-1][2] == label and labels[-1][1] == len(text_paragraph) + len(text):
                            # Same label as the directly preceding token: extend that entry.
                            labels[-1] = (labels[-1][0], len(text_paragraph) + len(text) + len(token) + 1, label)
                        else:
                            labels.append((len(text_paragraph) + len(text), len(text_paragraph) + len(text) + len(token) + 1, label))
                    text += token + " "
                if len(labels) > 0:
                    labels_sentence.append(labels)
                # Each analysis rebuilds the sentence text; the text of the
                # last analysis is the one that is kept.
                text_sentence = text
            text_paragraph += text_sentence
            labels_paragraph.append(labels_sentence)
        paragraph_data.append({ "text": text_paragraph, "labels": labels_paragraph })
    paragraph_data = add_meta_data(paragraph_data, json_data)
    save_paragraph_data(paragraph_data)

In [54]:
def process_concepts(concepts_json, paragraph_text, paragraph_id, print_flag=True):
    """Analyse one paragraph sentence by sentence and score it against the annotation.

    The paragraph is split into sentences with spaCy, each sentence is run
    through srl_analyze, and the concepts found are compared with
    `concepts_json`. One line with the paragraph id and the scores is always
    printed; the separator line and the per-sentence output only when
    `print_flag` is true.

    Returns (present, found, correct, srl_analysis_paragraph), the last item
    being the list of per-sentence srl_analyze results.
    """
    if print_flag:
        print("####################")
    srl_analysis_paragraph = []
    concepts_srl = []
    for sentence in spacy_analyze(paragraph_text).sents:
        sentence_relations = srl_analyze(str(sentence), filter="", print_flag=print_flag)
        for relation in sentence_relations:
            concepts_srl += get_concepts_srl(relation)
        if print_flag:
            print("")
        srl_analysis_paragraph.append(sentence_relations)
    scores = evaluate_concepts(concepts_json, concepts_srl)
    present, found, correct, correct_phrases = scores
    print(f"id {paragraph_id}: ", end="")
    show_precision_recall(present, found, correct, correct_phrases)
    return present, found, correct, srl_analysis_paragraph

In [53]:
def analyze_paragraph(json_data, start_paragraph_id=0, last_paragraph_id = 100, show_only_one = True, print_flag=True):
    """Evaluate the SRL rules on a range of records and export the results.

    Records with indices start_paragraph_id .. last_paragraph_id - 1 are
    read in order. A record whose text was seen before adds its annotated
    concepts to the paragraph currently being collected; a record with a
    new text closes that paragraph (it is processed with process_concepts)
    and starts a new one. With `show_only_one` the loop stops at the first
    new text after the first paragraph. The paragraph still being collected
    when the loop ends is processed afterwards, the totals are printed, and
    all analyses are passed to save_doccano_format.

    NOTE(review): an empty index range leaves `paragraph_id` unassigned, so
    the code after the loop fails - confirm callers never pass one.
    NOTE(review): `nbr_of_paragraphs` is reset for every new text, so the
    number printed after "ALL" counts the records of the last paragraph
    only, not all processed records - confirm this is intended.
    """
    # Texts encountered so far (over the whole range, not only the previous record).
    seen = {}
    present_total = 0
    found_total = 0
    correct_total = 0
    # Text and merged annotated concepts of the paragraph being collected.
    previous_paragraph_text = ""
    previous_concepts_json = []
    nbr_of_paragraphs = 0
    srl_analysis_paragraphs = []
    for paragraph_id in range(start_paragraph_id, last_paragraph_id):
        new_paragraph_text = json_data[paragraph_id]["data"]
        new_concepts_json = get_concepts_json(json_data[paragraph_id])
        if new_paragraph_text in seen:
            # Another annotation of a known text: merge its concepts.
            previous_concepts_json.extend(new_concepts_json)
            nbr_of_paragraphs += 1
        elif show_only_one and previous_paragraph_text != "":
            break
        else:
            if len(previous_paragraph_text) > 0:
                # The id passed on is the index of the record that starts the next paragraph.
                present, found, correct, srl_analysis_paragraph = process_concepts(previous_concepts_json, previous_paragraph_text, paragraph_id, print_flag=print_flag)
                present_total += present
                found_total += found
                correct_total += correct
                srl_analysis_paragraphs.append(srl_analysis_paragraph)
            previous_paragraph_text = new_paragraph_text
            previous_concepts_json = new_concepts_json
            seen[previous_paragraph_text] = True
            nbr_of_paragraphs = 1

    # Process the paragraph that was still being collected when the loop ended.
    present, found, correct, srl_analysis_paragraph = process_concepts(previous_concepts_json, previous_paragraph_text, paragraph_id + 1, print_flag=print_flag)
    srl_analysis_paragraphs.append(srl_analysis_paragraph)
    present_total += present
    found_total += found
    correct_total += correct
    print(f"ALL({nbr_of_paragraphs}): ", end="")
    show_precision_recall(present_total, found_total, correct_total, [])
    save_doccano_format(srl_analysis_paragraphs, json_data)

New paragraphs can be found on the ids: 0, 12, 14, 15, 20, 24, 25, 29, 32, 41, 47, 48, 51

In [86]:
# Evaluate all records from index 1295 to the end of the data without the
# per-sentence output; the results are exported by save_doccano_format.
analyze_paragraph(json_data, start_paragraph_id=1295, last_paragraph_id=len(json_data), show_only_one=False, print_flag=False)

id 1296: found: 3; present: 3; correct: 0; precision: 0.000; recall: 0.000
id 1298: found: 8; present: 6; correct: 0; precision: 0.000; recall: 0.000
id 1299: found: 9; present: 3; correct: 2; precision: 0.222; recall: 0.667; correct phrases: ['reduce', 'actions by the ECB']
id 1301: found: 6; present: 6; correct: 1; precision: 0.167; recall: 0.167; correct phrases: ['price stability']
id 1303: found: 5; present: 6; correct: 0; precision: 0.000; recall: 0.000
id 1307: found: 7; present: 12; correct: 3; precision: 0.429; recall: 0.250; correct phrases: ['monetary policy', 'monetary policy', 'monetary policy']
id 1312: found: 0; present: 15; correct: 0; recall: 0.000
id 1314: found: 10; present: 6; correct: 0; precision: 0.000; recall: 0.000
id 1318: found: 3; present: 12; correct: 2; precision: 0.667; recall: 0.167; correct phrases: ['achieved', 'achieved']
id 1322: found: 5; present: 12; correct: 0; precision: 0.000; recall: 0.000
id 1325: found: 5; present: 9; correct: 0; precision: 0

In [83]:
# One-off patch of the record at index 1480: insert a space before a trailing "2".
# NOTE(review): presumably needed so that the tokenized record text matches the
# rebuilt paragraph text in add_meta_data - confirm.
json_data[1480]["data"] = re.sub("2$", " 2", json_data[1480]["data"])

## 6. Visualize data

Copied from: https://github.com/eriktks/cognitive_mapping/blob/main/phrases.ipynb 

In [None]:
def make_ids(labels, text):
    """Map character offsets of `text` to the start and end markers of the labels.

    Each entry of `labels` is read as (start, end, label). Spaces and the
    punctuation marks . , ? ! : ; are trimmed from both ends of the span
    before the markers are stored. Returns a dict that maps the trimmed
    start offset to {"type": "start", "label": label} and the trimmed end
    offset to {"type": "end", "label": label}. A message is printed when an
    offset is already taken by another span; the later span overwrites it.

    The offsets are assumed to lie within `text`.

    Changes from the original: the overlap message referenced the undefined
    name `data_item` and therefore raised NameError; it now prints `labels`.
    Trimming works on local copies instead of modifying the caller's label
    entries, and cannot move past the other end of the span.
    """
    trim_characters = " .,?!:;"
    ids = {}
    for label_data in labels:
        start, end, label = label_data[0], label_data[1], label_data[2]
        while start < end and text[start] in trim_characters:
            start += 1
        while end > start and text[end - 1] in trim_characters:
            end -= 1
        if start in ids or end in ids:
            print(f"overlapping relation parts! {labels}")
        ids[start] = { "type": "start", "label": label }
        ids[end] = { "type": "end", "label": label }
    return ids

In [None]:
def add_colors_to_text(text, ids):
    """Insert ANSI colour sequences into `text` at the offsets given by `ids`.

    `ids` maps offsets to markers as produced by make_ids. The markers are
    applied from the highest offset down, so earlier insertions do not shift
    the offsets still to be handled. A start marker inserts a colour
    sequence at its offset (red, blue or black by label; colour 7 plus a
    printed warning for an unknown label); an end marker inserts the reset
    sequence one character after its offset.
    """
    known_colors = {
        "Content_Concept_1": 1,
        "Content_Concept_2": 4,
        "Content_Relation_Explanation": 0,
    }
    for position in sorted(ids, reverse=True):
        marker = ids[position]
        if marker["type"] == "end":
            cut = position + 1
            text = text[:cut] + "\x1b[m" + text[cut:]
            continue
        label = marker["label"]
        if label in known_colors:
            color_code = known_colors[label]
        else:
            color_code = 7
            print(f"unknown relation part label! ({label})")
        text = text[:position] + f"\x1b[1;3{color_code};47m" + text[position:]
    return text

Note that the parameter `i` of the function `visualize` is 1-based: `i=1` shows the first record (`json_data[0]`).

In [None]:
def visualize(json_data, i):
    """Print the text of record number `i` (counting from 1) with its labels coloured.

    The output is followed by an empty line.
    """
    record = json_data[i - 1]
    text = record["data"]
    markers = make_ids(record['label'], text)
    print(add_colors_to_text(text, markers) + "\n")

In [None]:
# Show the first annotated record (visualize counts records from 1).
for i in range(1, 2):
    visualize(json_data, i)