## ORLConverter

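This notebook converts files from the SSA format (PERIN format) into the input format of the Opinion Role Labeller (ORL). For every sentence it writes a masked version, in which the holder, the target and the polar expression are replaced by placeholder tokens, followed by the original sentence.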

In [1]:
# imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import os
import pandas as pd
import json
import uuid

In [2]:
# makedirs if not exist
os.makedirs("../../etl/data/processed/ORLConverter", exist_ok=True)

In [3]:
# create the raw-data directory for the ORLConverter as well (no error if it already exists)
os.makedirs("../../etl/data/raw/ORLConverter", exist_ok=True)

In [4]:
# parameters
# Input data in SSA format (in JSON)
INPUT_DATA_SSA_TRAIN="../../etl/data/processed/Perin_Preprocessing/01_train.json"
INPUT_DATA_SSA_VAL="../../etl/data/processed/Perin_Preprocessing/01_val.json"
INPUT_DATA_SSA_TEST="../../etl/data/processed/Perin_Preprocessing/01_test.json"

# Output paths of the converted ORL text files (one file per split)
ORL_OUTPUT_PATH_TRAIN="../../etl/data/processed/ORLConverter/01_train_orl.txt"
ORL_OUTPUT_PATH_VAL="../../etl/data/processed/ORLConverter/01_val_orl.txt"
ORL_OUTPUT_PATH_TEST="../../etl/data/processed/ORLConverter/01_test_orl.txt"

# NOTE(review): RANDOM_STATE is not referenced by any of the cells shown in this notebook - confirm it is still needed
RANDOM_STATE=42

In [5]:
def read_json(INP_FILE):
    """Load a UTF-8 encoded JSON file and return the parsed content.

    Args:
        INP_FILE: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces).
    """
    with open(INP_FILE, encoding="utf-8") as handle:
        return json.load(handle)

# load the train / validation / test splits in SSA format
ssa_data_train = read_json(INPUT_DATA_SSA_TRAIN)
ssa_data_val = read_json(INPUT_DATA_SSA_VAL)
ssa_data_test = read_json(INPUT_DATA_SSA_TEST)

In [6]:
def list_to_file(sents, filepath):
    """Write every string of ``sents`` on its own line to ``filepath``.

    The file is written as UTF-8 explicitly, so the output does not depend on
    the platform's default encoding and matches the UTF-8 input that
    ``read_json`` reads (the sentences contain German umlauts).

    Args:
        sents: Iterable of strings; each one becomes one line of the file.
        filepath: Destination path. An existing file is overwritten.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(item + "\n" for item in sents)

In [7]:
# validate the correctness of the parsing
ssa_data_train[0]['opinions'][0]['Source'][0][0]

'Oberländische'

In [8]:
# check input format
ssa_data_train

[{'sent_id': '0',
  'text': 'Im Kanton Bern etwa haben die Organisatoren des Emmentalischen bereits versichert , das Fest auch unter Ausschluss der Öffentlichkeit durchzuführen , das Oberländische findet ohne Zuschauer auf dem Brünigpass statt .',
  'opinions': [{'Source': [['Oberländische'], ['154:167']],
    'Target': [['Brünigpass'], ['198:208']],
    'Polar_expression': [['findet'], ['168:174']],
    'Polarity': 'Neutral',
    'Intensity': 'Average'}]},
 {'sent_id': '1',
  'text': 'Ende 2016 hatten die SBB das Projekt angekündigt , mit einem Drittel der Wohnungen im gemeinnützigen Segment .',
  'opinions': [{'Source': [['SBB'], ['21:24']],
    'Target': [['Projekt'], ['29:36']],
    'Polar_expression': [['angekündigt'], ['37:48']],
    'Polarity': 'Positive',
    'Intensity': 'Average'}]},
 {'sent_id': '2',
  'text': 'Knapp ein Vierteljahrhundert später wirken diese Vorstellungen naiv .',
  'opinions': [{'Source': [['Vorstellungen'], ['49:62']],
    'Target': [['naiv'], ['63:67']],

In [9]:
# Opinion roles that get masked, paired with the placeholder written in their place.
_ORL_ROLE_TOKENS = (
    ("Source", "HOLDER"),
    ("Target", "TARGET"),
    ("Polar_expression", "PEXP"),
)


def _parse_span(opinion: dict, role: str) -> tuple:
    """Return the (start, end) character offsets of the first span annotated for ``role``."""
    parts = opinion[role][1][0].split(":")
    return int(parts[0]), int(parts[1])


def _mask_opinion(sent: str, opinion: dict) -> str:
    """Replace the holder, target and polar-expression spans of ``sent`` by their placeholders."""
    spans = sorted((*_parse_span(opinion, role), token) for role, token in _ORL_ROLE_TOKENS)

    # Every span has to be non-empty, lie inside the sentence and must not
    # overlap its predecessor; otherwise the replacement would be ambiguous.
    prev_end = 0
    for start, end, _ in spans:
        if not (prev_end <= start < end <= len(sent)):
            raise ValueError("Skipping, because inconclusive start/end.")
        prev_end = end

    # Replace from right to left: the offsets of the spans that are still to
    # be replaced stay valid, so no length bookkeeping is needed.
    masked = sent
    for start, end, token in reversed(spans):
        masked = masked[:start] + token + masked[end:]
    return masked


def convert_ssa_to_orl(data: list) -> list:
    """Convert SSA-formatted sentences into the ORL text format.

    For every convertible sentence two strings are appended to the result:
    first the masked sentence (holder -> ``HOLDER``, target -> ``TARGET``,
    polar expression -> ``PEXP``), then the original sentence. Both are
    wrapped in ``[CLS] ... [SEP]``.

    Only the first span of each role is used (``opinion[role][1][0]``).
    A sentence is skipped, with a message printed, when
    * it does not contain exactly one opinion,
    * a span is empty, reversed, out of range or overlaps another span,
    * a placeholder does not end up surrounded by spaces (the span was not
      aligned to whitespace-separated tokens),
    * the opinion cannot be parsed at all (missing key, malformed offset, ...).

    Args:
        data: List of sentence dicts with the keys ``"text"`` and ``"opinions"``.

    Returns:
        Flat list of strings: masked sentence, original sentence, masked, ...
    """
    output = []
    for i, sent_dict in enumerate(data):
        sent = sent_dict["text"]
        try:
            opinions = sent_dict["opinions"]
            if len(opinions) != 1:
                raise ValueError("Sentences for annotation can currently only contain a single opinion.")

            # add CLS and SEP tokens to both sides
            masked = "[CLS] " + _mask_opinion(sent, opinions[0]) + " [SEP]"

            # verify correctness: each placeholder must be a stand-alone token
            if any(f" {token} " not in masked for _, token in _ORL_ROLE_TOKENS):
                raise ValueError("Something went wrong with HTP-replacement.")
        except Exception as e:
            # Deliberately best-effort: report the problematic example and carry on.
            print(f"Skipped converting to ORL, {e}, example {i}")
            continue
        output.append(masked)
        output.append("[CLS] " + sent + " [SEP]")
    return output

In [10]:
# sanity check: str.find returns -1 when the substring is not found
"Hello World...".find("kllk")

-1

In [11]:
# convert the training split; every skipped example is reported with its index
orl_data_train = convert_ssa_to_orl(ssa_data_train)

Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 91
Skipped converting to ORL, Something went wrong with HTP-replacement., example 120
Skipped converting to ORL, Something went wrong with HTP-replacement., example 166
Skipped converting to ORL, Something went wrong with HTP-replacement., example 180
Skipped converting to ORL, Something went wrong with HTP-replacement., example 244
Skipped converting to ORL, Something went wrong with HTP-replacement., example 342
Skipped converting to ORL, Something went wrong with HTP-replacement., example 398
Skipped converting to ORL, Something went wrong with HTP-replacement., example 405
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 524
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 683
Skipped converting to ORL, Sentences for annotation can currently only contain a single op

In [12]:
# convert the validation split; every skipped example is reported with its index
orl_data_val = convert_ssa_to_orl(ssa_data_val)

Skipped converting to ORL, Something went wrong with HTP-replacement., example 39
Skipped converting to ORL, Something went wrong with HTP-replacement., example 89
Skipped converting to ORL, Something went wrong with HTP-replacement., example 349
Skipped converting to ORL, Something went wrong with HTP-replacement., example 415
Skipped converting to ORL, Something went wrong with HTP-replacement., example 737
Skipped converting to ORL, Something went wrong with HTP-replacement., example 874
Skipped converting to ORL, Something went wrong with HTP-replacement., example 936
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 958
Skipped converting to ORL, Something went wrong with HTP-replacement., example 1121
Skipped converting to ORL, Something went wrong with HTP-replacement., example 1506
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 1677
Skipped converting to ORL, Somet

In [13]:
# convert the test split; every skipped example is reported with its index
orl_data_test = convert_ssa_to_orl(ssa_data_test)

Skipped converting to ORL, Something went wrong with HTP-replacement., example 13
Skipped converting to ORL, Something went wrong with HTP-replacement., example 205
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 455
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 460
Skipped converting to ORL, Something went wrong with HTP-replacement., example 774
Skipped converting to ORL, Something went wrong with HTP-replacement., example 1103
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 1269
Skipped converting to ORL, Something went wrong with HTP-replacement., example 1336
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 1684
Skipped converting to ORL, Something went wrong with HTP-replacement., example 1984
Skipped converting to ORL, Something went wrong with HTP-r

In [14]:
# inspect the converted training data: entries alternate between masked and original sentence
orl_data_train

['[CLS] Im Kanton Bern etwa haben die Organisatoren des Emmentalischen bereits versichert , das Fest auch unter Ausschluss der Öffentlichkeit durchzuführen , das HOLDER PEXP ohne Zuschauer auf dem TARGET statt . [SEP]',
 '[CLS] Im Kanton Bern etwa haben die Organisatoren des Emmentalischen bereits versichert , das Fest auch unter Ausschluss der Öffentlichkeit durchzuführen , das Oberländische findet ohne Zuschauer auf dem Brünigpass statt . [SEP]',
 '[CLS] Ende 2016 hatten die HOLDER das TARGET PEXP , mit einem Drittel der Wohnungen im gemeinnützigen Segment . [SEP]',
 '[CLS] Ende 2016 hatten die SBB das Projekt angekündigt , mit einem Drittel der Wohnungen im gemeinnützigen Segment . [SEP]',
 '[CLS] Knapp ein Vierteljahrhundert später PEXP diese HOLDER TARGET . [SEP]',
 '[CLS] Knapp ein Vierteljahrhundert später wirken diese Vorstellungen naiv . [SEP]',
 '[CLS] Der HOLDER PEXP ja keine TARGET aus . [SEP]',
 '[CLS] Der Grinch raubt ja keine Getränkeladen aus . [SEP]',
 '[CLS] Die HOLDE

In [15]:
# save as txt file
list_to_file(orl_data_train, ORL_OUTPUT_PATH_TRAIN)
list_to_file(orl_data_val, ORL_OUTPUT_PATH_VAL)
list_to_file(orl_data_test, ORL_OUTPUT_PATH_TEST)