In [1]:
import json
from dataclasses import dataclass

In [2]:
# Load the raw dev split. The encoding is pinned to UTF-8 (the standard JSON
# encoding) instead of the platform default: the label spans are later used as
# character offsets into 'info', so a locale-dependent mis-decode would shift
# or break every slice.
with open("../data/raw/event_dataset_dev.json", encoding="utf-8") as f:
    data_json = json.load(f)
data_json[0:3]

[{'tid': 2771,
  'info': "If one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers as a result.",
  'extraInfo': None,
  'labelData': [{'type': 'cause',
    'reason': [[3, 76]],
    'result': [[78, 149]]}]},
 {'tid': 3940,
  'info': "As grocery customers regularly visit the store, they are continually exposed to the firm's higher margin offerings, spurring lucrative general merchandise sales.",
  'extraInfo': None,
  'labelData': [{'type': 'cause',
    'reason': [[49, 114]],
    'result': [[116, 160]]}]},
 {'tid': 710,
  'info': 'Nevertheless, with voices amplified through structural shifts like the rise of digital media, consumers have more agency than ever: if they want LaCroix (or any other National Beverage brand), retailers eventually have to oblige.',
  'extraInfo': None,
  'labelData': [{'type': 'cause',
    'reason': [[18, 61]],
    'result': [[93, 120]]}]}]

In [3]:
# Take the first raw record on its own to inspect the structure of one example.
ex = data_json[0]
ex

{'tid': 2771,
 'info': "If one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers as a result.",
 'extraInfo': None,
 'labelData': [{'type': 'cause', 'reason': [[3, 76]], 'result': [[78, 149]]}]}

In [4]:
# The sentence of the sample record, stored under its 'info' key.
text = ex["info"]
text

"If one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers as a result."

In [5]:
@dataclass
class Relation:
    """One labelled relation of an example, with its clauses stored as text.

    In this notebook instances are built from the entries of a record's
    ``labelData`` list: ``causes`` holds the substrings sliced out of the
    sentence by the entry's ``reason`` spans, ``effects`` those sliced out by
    its ``result`` spans. The original offsets are not kept.
    """

    # Relation label, copied from the entry's "type" key (e.g. 'cause').
    type: str
    # Clause texts taken from the "reason" spans.
    causes: list[str]
    # Clause texts taken from the "result" spans.
    effects: list[str]


@dataclass
class Example:
    """A dataset record in processed form: its id, sentence and relations."""

    # Record identifier, taken from the raw record's "tid" key.
    id: int
    # Full sentence, taken from the raw record's "info" key.
    text: str
    # One Relation per entry of the raw record's "labelData" list.
    relations: list[Relation]


def build_examples(raw_examples: list[dict]) -> list[Example]:
    """Convert raw dataset records into ``Example`` objects.

    Each record must provide ``tid``, ``info`` and ``labelData``; every
    ``labelData`` entry must provide ``type``, ``reason`` and ``result``, where
    ``reason`` and ``result`` are lists of ``[start, end]`` pairs used as slice
    bounds into ``info``.

    The conversion lives in a function so that its temporaries stay local:
    binding names such as ``id`` and ``type`` at notebook top level would hide
    the builtins from every later cell, and rebinding ``text`` would silently
    change a variable that an earlier cell displays.

    Args:
        raw_examples: Records as loaded from the dataset JSON file.

    Returns:
        One ``Example`` per record, in input order.

    Raises:
        KeyError: If a record or label entry lacks one of the keys above.
    """
    examples = []
    for record in raw_examples:
        example_id = record["tid"]
        sentence = record["info"]
        relations = [
            Relation(
                type=label["type"],
                # Spans are applied as plain Python slices (end exclusive) and
                # taken exactly as annotated, without trimming whitespace.
                causes=[sentence[start:end] for start, end in label["reason"]],
                effects=[sentence[start:end] for start, end in label["result"]],
            )
            for label in record["labelData"]
        ]
        examples.append(
            Example(id=example_id, text=sentence, relations=relations)
        )
    return examples


processed = build_examples(data_json)
# Sanity check: every raw record produced exactly one processed example.
assert len(processed) == len(data_json)

processed[:3]

[Example(id=2771, text="If one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers as a result.", relations=[Relation(type='cause', causes=["one or more of Ecolab's customers were to experience a disastrous outcome"], effects=["the firm's reputation could suffer and it could lose multiple customers"])]),
 Example(id=3940, text="As grocery customers regularly visit the store, they are continually exposed to the firm's higher margin offerings, spurring lucrative general merchandise sales.", relations=[Relation(type='cause', causes=["hey are continually exposed to the firm's higher margin offerings"], effects=['spurring lucrative general merchandise sales'])]),
 Example(id=710, text='Nevertheless, with voices amplified through structural shifts like the rise of digital media, consumers have more agency than ever: if they want LaCroix (or any other National Beverage brand), retailers eventually have 

In [9]:
def process_relation(relation: Relation, text: str) -> str:
    """Return the smallest stretch of ``text`` covering all clauses of a relation.

    Every non-empty string in ``relation.causes`` and ``relation.effects`` is
    located in ``text``; the result runs from the earliest clause start to the
    latest clause end, with surrounding whitespace stripped.

    Clauses are located with ``str.index``, i.e. at their FIRST occurrence in
    ``text``. If a clause's wording also appears earlier in the sentence, that
    earlier position is used.

    Args:
        relation: Relation whose cause and effect clauses should be covered.
        text: The sentence the clauses were taken from.

    Returns:
        The covering substring of ``text``, stripped.

    Raises:
        ValueError: If a clause does not occur in ``text``, or if the relation
            has no non-empty clause at all.
    """
    spans = []
    for clause in relation.causes + relation.effects:
        if not clause:
            # "" is found at index 0 of any string, so an empty clause would
            # silently stretch the result back to the start of the text.
            continue
        start = text.index(clause)
        spans.append((start, start + len(clause)))

    if not spans:
        # Explicit error instead of an assert, which disappears under -O.
        raise ValueError("relation has no non-empty cause or effect clause")

    start_index = min(start for start, _ in spans)
    end_index = max(end for _, end in spans)
    return text[start_index:end_index].strip()


# Eyeball the result for the relations of the first ten examples: each
# relation's repr is followed by the text process_relation returns for it.
for example in processed[:10]:
    for relation in example.relations:
        print(relation)
        print(process_relation(relation, example.text))
        print()
    print()

Relation(type='cause', causes=["one or more of Ecolab's customers were to experience a disastrous outcome"], effects=["the firm's reputation could suffer and it could lose multiple customers"])
one or more of Ecolab's customers were to experience a disastrous outcome, the firm's reputation could suffer and it could lose multiple customers


Relation(type='cause', causes=["hey are continually exposed to the firm's higher margin offerings"], effects=['spurring lucrative general merchandise sales'])
hey are continually exposed to the firm's higher margin offerings, spurring lucrative general merchandise sales


Relation(type='cause', causes=[' voices amplified through structural shifts'], effects=[' consumers have more agency'])
voices amplified through structural shifts like the rise of digital media, consumers have more agency


Relation(type='cause', causes=['scale and growth driven by strong drug price inflation and increases in therapies'], effects=[' the company has historically gen