## UDTConverter

This converts files from the SSA-Format (PERIN-Format) to the format for annotation in the [Universal Data Tool](https://udt.dev/).

Should also be able to convert the UDT format back to the SSA-Format for evaluation.

In [1]:
# imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import os
import pandas as pd
import json
import uuid

In [2]:
# makedirs if not exist
os.makedirs("../../etl/data/processed/UDTConverter", exist_ok=True)

In [3]:
os.makedirs("../../etl/data/raw/UDTConverter", exist_ok=True)

In [4]:
# parameters
# Input data in SSA format (in JSON)
INPUT_DATA_SSA="../../etl/data/processed/Perin_Preprocessing/manual_annotation.json"
UDT_OUTPUT_PATH="../../etl/data/processed/UDTConverter/manual_annotation_udt.json"

UDT_INPUT_PATH="../../etl/data/raw/UDTConverter/manual_annotation_udt_annotated.json"
OUTPUT_DATA_SSA="../../etl/data/processed/UDTConverter/manual_annotation_ssa_annotated.json"
RANDOM_STATE=42

In [5]:
def read_json(INP_FILE):
    with open(INP_FILE, encoding="utf-8") as f:
        data = json.load(f)
        return data
    
ssa_data = read_json(INPUT_DATA_SSA)

In [6]:
def save_dict_as_json(dict, filename):
    with open(filename, 'w') as f:
        json.dump(dict, f)

In [7]:
# validate the correctness of the parsing
ssa_data[0]['opinions'][0]['Source'][0][0]

'Oshkosh'

In [8]:
# check input format
ssa_data

[{'sent_id': '0',
  'text': 'Oshkosh geniesst den Ausblick auf den Genfersee .',
  'opinions': [{'Source': [['Oshkosh'], ['0:7']],
    'Target': [['Ausblick'], ['21:29']],
    'Polar_expression': [['geniesst'], ['8:16']],
    'Polarity': 'Neutral',
    'Intensity': 'Average'}]},
 {'sent_id': '1',
  'text': 'Die Opposition will den Unmut ausnutzen und am nächsten Samstag eine weitere Kundgebung durchführen .',
  'opinions': [{'Source': [['Opposition'], ['4:14']],
    'Target': [['Unmut'], ['24:29']],
    'Polar_expression': [['ausnutzen'], ['30:39']],
    'Polarity': 'Negative',
    'Intensity': 'Average'}]},
 {'sent_id': '2',
  'text': 'Deutsches Laisser-faire dagegen verträgt sich nicht mit der politischen und ökonomischen Realität .',
  'opinions': [{'Source': [['Laisser-faire'], ['10:23']],
    'Target': [['Realität'], ['89:97']],
    'Polar_expression': [['verträgt'], ['32:40']],
    'Polarity': 'Negative',
    'Intensity': 'Average'}]},
 {'sent_id': '3',
  'text': 'Ihr geistiger V

In [9]:
def convert_ssa_to_udt(data: list, interface):
    for sent in data:
        # validate that...
        # has 1 opinion
        sample = {
            "_id": sent["sent_id"],
            "document": sent["text"],
        }
        try:
            len(sent['opinions']) == 1, "Sentences for annotation can currently only contain a single opinion."
            for opinion in sent['opinions']:
                for k, v in opinion.items():
                    print(k, v)
                # from IPython.core.debugger import Pdb; Pdb().set_trace()
                sample["annotation"] = {
                    "entities": [
                    {
                        "text": opinion["Polar_expression"][0][0],
                        "textId": str(uuid.uuid4().hex),
                        "label": "PolEx",
                        "start": int(opinion["Polar_expression"][1][0].split(":")[0]),
                        "end": int(opinion["Polar_expression"][1][0].split(":")[1])
                      },
                    ]
                }
        except Exception as e:
            print(f"Skipped converting to UDT, {e}")
            continue
        interface["samples"].append(sample)
    return interface

In [10]:
udt_dict = {
    "name": "Sentiment Dataset",
    "interface": {
    "type": "text_entity_relations",
    "entityLabels": [
      {
        "id": "Source",
        "displayName": "Source",
        "description": "The source of a sentiment or action."
      },
      {
        "id": "Target",
        "displayName": "Target",
        "description": "The target of a sentiment or action."
      },
      {
        "id": "PolEx",
        "displayName": "PolEx",
        "description": "A verb indicating a polar expression between a holder and a target?"
      }
    ],
    "relationLabels": [
      {
        "id": "Positive",
        "displayName": "Pro",
        "description": "A positive relation between a holder and a target."
      },
      {
        "id": "Neutral",
        "displayName": "Neutral",
        "description": "A neutral relation between a holder and a target."
      },
      {
        "id": "Negative",
        "displayName": "Con",
        "description": "A contra relation between a holder and a target."
      },
    ]
  },
  "samples": [],
}

In [11]:
udt_data = convert_ssa_to_udt(ssa_data, udt_dict)

Source [['Oshkosh'], ['0:7']]
Target [['Ausblick'], ['21:29']]
Polar_expression [['geniesst'], ['8:16']]
Polarity Neutral
Intensity Average
Source [['Opposition'], ['4:14']]
Target [['Unmut'], ['24:29']]
Polar_expression [['ausnutzen'], ['30:39']]
Polarity Negative
Intensity Average
Source [['Laisser-faire'], ['10:23']]
Target [['Realität'], ['89:97']]
Polar_expression [['verträgt'], ['32:40']]
Polarity Negative
Intensity Average
Source [['Grünliberalen'], ['168:181']]
Target [['Ideen'], ['195:200']]
Polar_expression [['geklaut'], ['201:208']]
Polarity Neutral
Intensity Average
Source [['Julian'], ['60:66']]
Target [['Bildung'], ['129:136']]
Polar_expression [['plädieren'], ['50:59']]
Polarity Positive
Intensity Average
Source [['Frage'], ['56:61']]
Target [['Sicht'], ['4:9']]
Polar_expression [['stellt'], ['26:32']]
Polarity Positive
Intensity Average
Source [['Stadtrat'], ['51:59']]
Target [['Hauptanliegen'], ['91:104']]
Polar_expression [['aufnehmen'], ['120:129']]
Polarity Positive

In [12]:
# save as json file
save_dict_as_json(udt_data, UDT_OUTPUT_PATH)
udt_data

{'name': 'Sentiment Dataset',
 'interface': {'type': 'text_entity_relations',
  'entityLabels': [{'id': 'Source',
    'displayName': 'Source',
    'description': 'The source of a sentiment or action.'},
   {'id': 'Target',
    'displayName': 'Target',
    'description': 'The target of a sentiment or action.'},
   {'id': 'PolEx',
    'displayName': 'PolEx',
    'description': 'A verb indicating a polar expression between a holder and a target?'}],
  'relationLabels': [{'id': 'Positive',
    'displayName': 'Pro',
    'description': 'A positive relation between a holder and a target.'},
   {'id': 'Neutral',
    'displayName': 'Neutral',
    'description': 'A neutral relation between a holder and a target.'},
   {'id': 'Negative',
    'displayName': 'Con',
    'description': 'A contra relation between a holder and a target.'}]},
 'samples': [{'_id': '0',
   'document': 'Oshkosh geniesst den Ausblick auf den Genfersee .',
   'annotation': {'entities': [{'text': 'geniesst',
      'textId': '

### Output-Format / Input-Format

</br>

<details>
    <summary><it>Show demo code</it></summary>
  
```json
{
  "name": "New undefined Dataset",
  "interface": {
    "type": "text_entity_relations",
    "entityLabels": [
      {
        "id": "food",
        "displayName": "Food",
        "description": "Edible item."
      },
      {
        "id": "hat",
        "displayName": "Hat",
        "description": "Something worn on the head."
      }
    ],
    "relationLabels": [
      {
        "id": "subject",
        "displayName": "Subject"
      }
    ]
  },
  "samples": [
    {
      "_id": "seqapblyl",
      "document": "I love you.",
      "annotation": {
        "entities": [
          {
            "text": "love",
            "textId": "yruukn",
            "start": 2,
            "end": 5
          }
        ],
        "relations": [
          {
            "from": "yruukn",
            "to": "yruukn",
            "label": "subject",
            "color": "#d32f2f"
          }
        ]
      },
      "brush": "complete"
    },
    {
      "_id": "scec120d4",
      "document": "I don't love you.",
      "annotation": {
        "entities": [
          {
            "text": "I don",
            "textId": "d3xn6k",
            "label": "food",
            "start": 0,
            "end": 4
          },
          {
            "text": "love you",
            "textId": "xkw8je",
            "label": "hat",
            "start": 8,
            "end": 15
          }
        ],
        "relations": [
          {
            "from": "d3xn6k",
            "to": "xkw8je",
            "label": "subject",
            "color": "#d32f2f"
          }
        ]
      },
      "brush": "complete"
    },
```
    
</details>


In [13]:
def convert_udt_to_ssa(input_list):
    output_list = []
    for dic in input_list:
        sent_id = dic['_id']
        text = dic['document']
        annotations = dic['annotation']
        entities = annotations['entities']
        relations = annotations['relations']
        opinions = []
        for rel in relations:
            opinion = {}
            for ent in entities:
                if ent['label'] == "PolEx":
                    opinion['Polar_expression'] = [[ent['text']], [f"{ent['start']}:{ent['end'] + 1}"]]
                if ent['textId'] == rel['from']:
                    opinion['Source'] = [[ent['text']], [f"{ent['start']}:{ent['end'] + 1}"]]
                if ent['textId'] == rel['to']:
                    opinion['Target'] = [[ent['text']], [f"{ent['start']}:{ent['end'] + 1}"]]
            opinion['Polarity'] = rel['label']
            opinion['Intensity'] = "Average"
            opinions.append(opinion)
        output_dic = {'sent_id': sent_id, 'text': text, 'opinions': opinions}
        output_list.append(output_dic)
    print(output_list)
    return output_list

In [14]:
# only the samples dict!
udt_data = {
  "name": "Sentiment Dataset",
  "interface": {
    "type": "text_entity_relations",
    "entityLabels": [
      {
        "id": "Source",
        "displayName": "Source",
        "description": "The source of a sentiment or action."
      },
      {
        "id": "Target",
        "displayName": "Target",
        "description": "The target of a sentiment or action."
      },
      {
        "id": "Polar_expression",
        "displayName": "PolEx",
        "description": "A verb indicating a polar expression between a holder and a target?"
      }
    ],
    "relationLabels": [
      {
        "id": "Positive",
        "displayName": "Pro",
        "description": "A positive relation between a holder and a target."
      },
      {
        "id": "Neutral",
        "displayName": "Neutral",
        "description": "A neutral relation between a holder and a target."
      },
      {
        "id": "Negative",
        "displayName": "Con",
        "description": "A contra relation between a holder and a target."
      }
    ]
  },
  "samples": [
    {
      "_id": "0",
      "document": "Ein Mann schiesst am Weihnachtstag in einer Wohnung auf seine Freundin .",
      "annotation": {
        "entities": [
          {
            "text": "Mann",
            "textId": "h2d5vl",
            "label": "Source",
            "start": 4,
            "end": 7
          },
          {
            "text": "schiesst",
            "textId": "7a443915c23c414bb1c4e7bd5b2c02cb",
            "label": "PolEx",
            "start": 9,
            "end": 16
          },
          {
            "text": "Freundin",
            "textId": "9m8apf",
            "label": "Target",
            "start": 62,
            "end": 69
          }
        ],
        "relations": [
          {
            "from": "h2d5vl",
            "to": "9m8apf",
            "label": "Negative",
            "color": "#7b1fa2"
          }
        ]
      },
      "brush": "complete"
    },
    {
      "_id": "1",
      "document": "Zudem will sich die FPÖ stärker der Familienpolitik und hier insbesondere den Bedürfnissen von Alleinerziehenden widmen .",
      "annotation": {
        "entities": [
          {
            "text": "FPÖ",
            "textId": "mqc6yl",
            "label": "Source",
            "start": 20,
            "end": 22
          },
          {
            "text": "Bedürfnissen",
            "textId": "w87h93",
            "label": "Target",
            "start": 78,
            "end": 89
          },
          {
            "text": "widmen",
            "textId": "be3067d496e943408d5dda49e7898081",
            "label": "PolEx",
            "start": 113,
            "end": 118
          }
        ],
        "relations": [
          {
            "from": "mqc6yl",
            "to": "w87h93",
            "label": "Positive",
            "color": "#d32f2f"
          }
        ]
      },
      "brush": "complete"
    },
    {
      "_id": "2",
      "document": "Die frühere Fraktionschefin und heutige Nationalrätin wird für die Stadionvorlage des Stadtrats eintreten .",
      "annotation": {
        "entities": [
          {
            "text": "Fraktionschefin",
            "textId": "ryz5wk",
            "label": "Source",
            "start": 12,
            "end": 26
          },
          {
            "text": "Stadionvorlage",
            "textId": "w3we1e",
            "label": "Target",
            "start": 67,
            "end": 80
          },
          {
            "text": "eintreten",
            "textId": "f0bf40c9a5914de093461890807e756c",
            "label": "PolEx",
            "start": 96,
            "end": 104
          }
        ],
        "relations": [
          {
            "from": "ryz5wk",
            "to": "w3we1e",
            "label": "Positive",
            "color": "#d32f2f"
          }
        ]
      },
      "brush": "complete"
    },
  ]
}

udt_data = read_json(UDT_INPUT_PATH)

In [15]:
udt_data = udt_data["samples"]
ssa_data = convert_udt_to_ssa(udt_data)

[{'sent_id': '0', 'text': 'Oshkosh geniesst den Ausblick auf den Genfersee .', 'opinions': [{'Source': [['Oshkosh'], ['0:7']], 'Polar_expression': [['geniesst'], ['8:16']], 'Target': [['Ausblick'], ['21:29']], 'Polarity': 'Positive', 'Intensity': 'Average'}]}, {'sent_id': '1', 'text': 'Die Opposition will den Unmut ausnutzen und am nächsten Samstag eine weitere Kundgebung durchführen .', 'opinions': [{'Source': [['Opposition'], ['4:14']], 'Target': [['Unmut'], ['24:29']], 'Polar_expression': [['ausnutzen'], ['30:39']], 'Polarity': 'Negative', 'Intensity': 'Average'}]}, {'sent_id': '2', 'text': 'Deutsches Laisser-faire dagegen verträgt sich nicht mit der politischen und ökonomischen Realität .', 'opinions': [{'Source': [['Laisser-faire'], ['10:23']], 'Polar_expression': [['verträgt'], ['32:40']], 'Target': [['Realität'], ['89:97']], 'Polarity': 'Negative', 'Intensity': 'Average'}]}, {'sent_id': '3', 'text': 'Ihr geistiger Vater Valentin Oehen tritt heute tatsächlich an Versammlungen der

In [16]:
save_dict_as_json(ssa_data, OUTPUT_DATA_SSA)
ssa_data

[{'sent_id': '0',
  'text': 'Oshkosh geniesst den Ausblick auf den Genfersee .',
  'opinions': [{'Source': [['Oshkosh'], ['0:7']],
    'Polar_expression': [['geniesst'], ['8:16']],
    'Target': [['Ausblick'], ['21:29']],
    'Polarity': 'Positive',
    'Intensity': 'Average'}]},
 {'sent_id': '1',
  'text': 'Die Opposition will den Unmut ausnutzen und am nächsten Samstag eine weitere Kundgebung durchführen .',
  'opinions': [{'Source': [['Opposition'], ['4:14']],
    'Target': [['Unmut'], ['24:29']],
    'Polar_expression': [['ausnutzen'], ['30:39']],
    'Polarity': 'Negative',
    'Intensity': 'Average'}]},
 {'sent_id': '2',
  'text': 'Deutsches Laisser-faire dagegen verträgt sich nicht mit der politischen und ökonomischen Realität .',
  'opinions': [{'Source': [['Laisser-faire'], ['10:23']],
    'Polar_expression': [['verträgt'], ['32:40']],
    'Target': [['Realität'], ['89:97']],
    'Polarity': 'Negative',
    'Intensity': 'Average'}]},
 {'sent_id': '3',
  'text': 'Ihr geistiger 

In [17]:

!(source ~/envs/perin-venv/bin/activate && cd ../../external_repos/direct_parsing_to_sent_graph/evaluation && python evaluate_single_dataset.py ../../../etl/data/processed/UDTConverter/manual_annotation_ssa_annotated.json ../../../etl/data/processed/Perin_Preprocessing/manual_annotation.json)


Source F1: 0.800
Target F1: 0.867
Expression F1: 1.000
Unlabeled Sentiment Tuple F1: 0.800
Sentiment Tuple F1: 0.733
{
  "source/f1": 0.7999994466669827,
  "target/f1": 0.8666661088891813,
  "expression/f1": 0.9999994333335878,
  "sentiment_tuple/unlabeled_f1": 0.8000000000000002,
  "sentiment_tuple/precision": 0.7333333333333333,
  "sentiment_tuple/recall": 0.7333333333333333,
  "sentiment_tuple/f1": 0.7333333333333333
}

[0.7999994466669827, 0.8666661088891813, 0.9999994333335878, 0.8000000000000002, 0.7333333333333333, 0.7333333333333333, 0.7333333333333333]
