## ORLConverter

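This notebook converts the train, validation and test splits from the SSA format (PERIN format, JSON) into the plain-text format expected by the Opinion-Role-Labeller (ORL). For every sentence that contains exactly one opinion, two lines are written: first the sentence in which the holder, the target and the polar expression are replaced by the placeholders `HOLDER`, `TARGET` and `PEXP`, then the original sentence. Both lines are wrapped in `[CLS] … [SEP]`. Sentences that cannot be converted are skipped and reported.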

In [1]:
# imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import os
import pandas as pd
import json
import uuid

In [2]:
# Create the output directory for the converted ORL files if it does not exist yet.
os.makedirs("../../etl/data/processed/ORLConverter", exist_ok=True)

In [3]:
# Create the raw-data directory for this converter if it does not exist yet.
os.makedirs("../../etl/data/raw/ORLConverter", exist_ok=True)

In [4]:
# parameters
# Input data in SSA format (JSON), one file per split
INPUT_DATA_SSA_TRAIN="../../etl/data/processed/Perin_Preprocessing/01_train.json"
INPUT_DATA_SSA_VAL="../../etl/data/processed/Perin_Preprocessing/01_val.json"
INPUT_DATA_SSA_TEST="../../etl/data/processed/Perin_Preprocessing/01_test.json"

# Output text files in ORL format, one file per split
ORL_OUTPUT_PATH_TRAIN="../../etl/data/processed/ORLConverter/01_train_orl.txt"
ORL_OUTPUT_PATH_VAL="../../etl/data/processed/ORLConverter/01_val_orl.txt"
ORL_OUTPUT_PATH_TEST="../../etl/data/processed/ORLConverter/01_test_orl.txt"

# NOTE(review): not referenced by any cell visible in this notebook - confirm whether it is still needed.
RANDOM_STATE=42

In [5]:
def read_json(INP_FILE):
    """Load a UTF-8 encoded JSON file and return its parsed content.

    Args:
        INP_FILE: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever `json.load` yields for it).
    """
    with open(INP_FILE, encoding="utf-8") as json_file:
        return json.load(json_file)

# Load the three SSA splits from the paths configured in the parameters cell.
ssa_data_train = read_json(INPUT_DATA_SSA_TRAIN)
ssa_data_val = read_json(INPUT_DATA_SSA_VAL)
ssa_data_test = read_json(INPUT_DATA_SSA_TEST)

In [6]:
def list_to_file(sents, filepath):
    """Write every string in `sents` to `filepath`, one string per line.

    The file is written as UTF-8 so that the output does not depend on the
    platform's locale encoding; this matches `read_json`, which reads the
    input as UTF-8 (the sentences contain non-ASCII characters).

    Args:
        sents: Iterable of strings; each one becomes a line of the file.
        filepath: Destination path; an existing file is overwritten.
    """
    with open(filepath, 'w', encoding="utf-8") as f:
        f.writelines(item + "\n" for item in sents)

In [7]:
# Spot check of the parsed structure: text of the first Source span
# of the first opinion in the first training sentence.
ssa_data_train[0]['opinions'][0]['Source'][0][0]

'Justizminister'

In [8]:
# check input format (show only the first few examples instead of dumping the whole split)
ssa_data_train[:3]

[{'sent_id': '0',
  'text': 'Weil BLICK exklusiv erfuhr : Justizminister Arnold Koller will an der Bundesratssitzung Lösungsvorschläge einbringen , um der Lage Herr zu werden .',
  'opinions': [{'Source': [['Justizminister'], ['29:43']],
    'Target': [['Lösungsvorschläge'], ['88:105']],
    'Polar_expression': [['einbringen'], ['106:116']],
    'Polarity': 'Positive',
    'Intensity': 'Average'}]},
 {'sent_id': '1',
  'text': '2018 gewann Huggingface auch noch den ersten Platz in einem Wettbewerb der prestigeträchtigen Neurips-Konferenz für maschinelles Lernen .',
  'opinions': [{'Source': [['Huggingface'], ['12:23']],
    'Target': [['Lernen'], ['129:135']],
    'Polar_expression': [['gewann'], ['5:11']],
    'Polarity': 'Positive',
    'Intensity': 'Average'}]},
 {'sent_id': '2',
  'text': 'In den Städten , Kantonen und auf Bundesebene versuchen Lobbygruppen , die Gesetze im Bereich der Ernährung und des Tierschutzes zu verschärfen .',
  'opinions': [{'Source': [['Lobbygruppen'], ['

In [9]:
def _first_span(opinion: dict, role: str) -> tuple:
    """Return the (start, end) character offsets of the first span annotated for `role`."""
    bounds = opinion[role][1][0].split(":")
    return int(bounds[0]), int(bounds[1])


def convert_ssa_to_orl(data: list):
    """Convert SSA-format sentences into masked/original sentence pairs for the ORL.

    For every sentence with exactly one opinion, the first annotated span of
    the holder ("Source"), the target ("Target") and the polar expression
    ("Polar_expression") is replaced by the placeholder HOLDER, TARGET and
    PEXP respectively. Additional spans of a role are ignored.

    Args:
        data: List of sentence dicts with the keys "text" and "opinions".
            Each opinion maps a role name to `[[texts...], ["start:end", ...]]`
            with character offsets into "text".

    Returns:
        A flat list of strings in which each converted sentence contributes
        two consecutive entries: the masked sentence and the original
        sentence, both wrapped in "[CLS] ... [SEP]".

    Sentences are skipped (and reported on stdout) when they do not contain
    exactly one opinion, when an annotation cannot be parsed, when two spans
    share a start offset or overlap, or when a placeholder does not end up
    as a stand-alone, space-delimited token.
    """
    # (annotation key, placeholder) pairs, in the order used for reporting.
    placeholders = (("Source", "HOLDER"), ("Target", "TARGET"), ("Polar_expression", "PEXP"))
    output = []
    for i, sent_dict in enumerate(data):
        sent = sent_dict["text"]
        try:
            assert len(sent_dict['opinions']) == 1, "Sentences for annotation can currently only contain a single opinion."
            opinion = sent_dict['opinions'][0]

            # Order the spans from the rightmost to the leftmost start offset.
            spans = sorted(
                (_first_span(opinion, role) + (placeholder,) for role, placeholder in placeholders),
                reverse=True,
            )

            # Spans must have distinct starts and must not overlap; otherwise
            # the replacements would overwrite each other.
            for (right_start, _, _), (left_start, left_end, _) in zip(spans, spans[1:]):
                assert left_start != right_start and left_end <= right_start, "Skipping, because inconclusive start/end."

            # Replacing right-to-left keeps the offsets of the spans that are
            # still to be replaced valid, so no length bookkeeping is needed.
            masked = sent
            for start, end, placeholder in spans:
                masked = "".join((masked[:start], placeholder, masked[end:]))

            # add CLS and SEP tokens to both sides
            masked = "[CLS] " + masked + " [SEP]"

            # verify correctness: every placeholder must be its own token
            assert all(f" {placeholder} " in masked for _, placeholder in placeholders), "Something went wrong with HTP-replacement."
        except Exception as e:
            print(f"Skipped converting to ORL, {e}, example {i}")
            continue
        output.append(masked)
        output.append("[CLS] " + sent + " [SEP]")
    return output

In [10]:
# Scratch check: str.find returns -1 when the substring does not occur.
"Hello World...".find("kllk")

-1

In [11]:
# Convert the training split; examples that cannot be converted are reported below.
orl_data_train = convert_ssa_to_orl(ssa_data_train)

Skipped converting to ORL, Something went wrong with HTP-replacement., example 94
Skipped converting to ORL, Something went wrong with HTP-replacement., example 110
Skipped converting to ORL, Something went wrong with HTP-replacement., example 240
Skipped converting to ORL, Something went wrong with HTP-replacement., example 244
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 324
Skipped converting to ORL, Something went wrong with HTP-replacement., example 333
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 500
Skipped converting to ORL, Something went wrong with HTP-replacement., example 542
Skipped converting to ORL, Something went wrong with HTP-replacement., example 560
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 1066
Skipped converting to ORL, Something went wrong with HTP-replacement., example 1305
Skipp

In [12]:
# Convert the validation split; examples that cannot be converted are reported below.
orl_data_val = convert_ssa_to_orl(ssa_data_val)

Skipped converting to ORL, Something went wrong with HTP-replacement., example 64
Skipped converting to ORL, Something went wrong with HTP-replacement., example 372
Skipped converting to ORL, Something went wrong with HTP-replacement., example 429
Skipped converting to ORL, Something went wrong with HTP-replacement., example 534
Skipped converting to ORL, Something went wrong with HTP-replacement., example 658
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 925
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 927


In [13]:
# Convert the test split; examples that cannot be converted are reported below.
orl_data_test = convert_ssa_to_orl(ssa_data_test)

Skipped converting to ORL, Something went wrong with HTP-replacement., example 129
Skipped converting to ORL, Sentences for annotation can currently only contain a single opinion., example 899


In [14]:
# Preview the first converted examples (each masked sentence is followed by its original).
orl_data_train[:6]

['[CLS] Weil BLICK exklusiv erfuhr : HOLDER Arnold Koller will an der Bundesratssitzung TARGET PEXP , um der Lage Herr zu werden . [SEP]',
 '[CLS] Weil BLICK exklusiv erfuhr : Justizminister Arnold Koller will an der Bundesratssitzung Lösungsvorschläge einbringen , um der Lage Herr zu werden . [SEP]',
 '[CLS] 2018 PEXP HOLDER auch noch den ersten Platz in einem Wettbewerb der prestigeträchtigen Neurips-Konferenz für maschinelles TARGET . [SEP]',
 '[CLS] 2018 gewann Huggingface auch noch den ersten Platz in einem Wettbewerb der prestigeträchtigen Neurips-Konferenz für maschinelles Lernen . [SEP]',
 '[CLS] In den Städten , Kantonen und auf Bundesebene versuchen HOLDER , die TARGET im Bereich der Ernährung und des Tierschutzes zu PEXP . [SEP]',
 '[CLS] In den Städten , Kantonen und auf Bundesebene versuchen Lobbygruppen , die Gesetze im Bereich der Ernährung und des Tierschutzes zu verschärfen . [SEP]',
 '[CLS] Am Sonntag PEXP sich die HOLDER in sozialen Medien für die Unterstützung bei d

In [15]:
# Save each converted split as a text file (one sentence per line) at its configured output path.
list_to_file(orl_data_train, ORL_OUTPUT_PATH_TRAIN)
list_to_file(orl_data_val, ORL_OUTPUT_PATH_VAL)
list_to_file(orl_data_test, ORL_OUTPUT_PATH_TEST)