## PERIN

Preprocess the relation extract produced by the stancer pipeline and convert it into the canonical (SemEval structured-sentiment style) JSON form so that it can be used by the PERIN sentiment graph parser.

In [91]:
# IMPORTS
import pandas as pd
import json

# Input CSV read by this notebook (located under the stancer setup's data directory).
DATASET_PATH = "../../external_repos/stancer_setup/data/extract.csv"
# Destination of the JSON file written by the last cell of this notebook.
DATASET_OUTPUT_PATH = "../../etl/data/processed/prototype_data_perin.json"
# Span mode: True takes the Source/Target spans from the full argument phrases
# (arg1_* / arg2_* columns), False takes them from the argument heads
# (arg1_head_* / arg2_head_* columns).
FULL_PHRASE = False

In [92]:
# Load the input CSV into a DataFrame; the bare `df` on the last line renders
# the frame as this cell's output.
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,doc_num,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text
0,0,öffnen,193,199,öffnen,Der Gegenentwurf ist der richtige Weg,0,38,$.,.,...,weitere Werbeverbote,169,189,N,Werbeverbote,177,189,neutral,"Predicate(type='neutral', args=(Head(sentence=...","Der Gegenentwurf ist der richtige Weg , um den..."
1,0,hinkt,29,34,hinken,die Schweiz,35,46,N,Schweiz,...,der Tabakprävention,51,70,N,Tabakprävention,55,70,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Im internationalen Vergleich hinkt die Schweiz...
2,0,ist,101,104,sein,politischer Sekretär Centre Patronal,19,55,N,Sekretär,...,Der Zusammenhang zwischen Werbung und Konsum,56,100,N,Zusammenhang,60,72,neutral,"Predicate(type='neutral', args=(Head(sentence=...",""" Patrick Eperon , politischer Sekretär Centre..."
3,0,abgestimmt,130,140,abstimmen,Die von Gesundheitsorganisationen,0,34,$.,.,...,13. Februar,118,129,N,Februar,122,129,neutral,"Predicate(type='neutral', args=(Head(sentence=...","Die von Gesundheitsorganisationen , Prävention..."
4,0,verschärft,17,27,verschärfen,Der Gegenentwurf,0,16,N,Gegenentwurf,...,die Gesetzgebung,28,44,N,Gesetzgebung,32,44,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Der Gegenentwurf verschärft die Gesetzgebung e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4485,246,sind,113,117,sein,Die Zahlen der auf Tabakkonsum zurückzuführend...,37,112,N,Zahlen,...,viel zu hoch,128,140,ADV,hoch,136,140,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Geld in Krebsforschung investieren « Die Zahle...
4486,246,starben,5,12,sterben,000 Menschen,35,47,N,Menschen,...,den Folgen von Tabakkonsum,51,77,N,Folgen,55,61,neutral,"Predicate(type='neutral', args=(Head(sentence=...",2007 starben demnach mehr als 80 ' 000 Mensche...
4487,246,sank,41,45,sinken,die Zahl der Raucher,46,66,N,Zahl,...,rund ein Viertel,120,136,N,Viertel,129,136,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Nach Angaben des Gesundheitsministeriums sank ...
4488,246,plant,24,29,planen,Die britische Regierung,0,23,N,Regierung,...,ein Verbot bunter Zigarettenschachteln,30,68,N,Verbot,34,40,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Die britische Regierung plant ein Verbot bunte...


In [93]:
# Class balance of the relation labels; rows labelled "neutral" are dropped by the transform below.
df["rel_type"].value_counts()

neutral    4130
pro         211
con         149
Name: rel_type, dtype: int64

In [94]:
def _text_span(row, text_col, start_col, end_col):
    """Build one ``[[text], ["start:end"]]`` entry from the named columns of ``row``."""
    return [
        [getattr(row, text_col)],
        [f"{getattr(row, start_col)}:{getattr(row, end_col)}"],
    ]


def transform_to_semeval_ssa_format(df, full_phrase=None):
    """Convert relation rows into a list of per-sentence opinion records.

    Rows are grouped by ``full_sentence_text``. Every row whose ``rel_type``
    is not ``"neutral"`` becomes one opinion with ``Source`` (arg1),
    ``Target`` (arg2) and ``Polar_expression`` (verb form), each encoded as
    ``[[text], ["start:end"]]``. Sentences without any such row are omitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``full_sentence_text``, ``rel_type``, the ``verb_form``
        text/start/end columns and the ``arg1``/``arg2`` text/start/end
        columns (plus their ``*_head`` variants for head mode).
    full_phrase : bool, optional
        ``True`` uses the full argument phrases, ``False`` only their heads.
        When omitted, the module-level ``FULL_PHRASE`` setting is used.

    Returns
    -------
    list[dict]
        One ``{"sent_id", "text", "opinions"}`` dict per retained sentence,
        in order of the sentence's first appearance in ``df``. ``sent_id`` is
        the position of the sentence among all unique sentences (including
        the omitted ones), so ids are stable across runs but may have gaps.
    """
    use_full_phrase = FULL_PHRASE if full_phrase is None else full_phrase
    # Text and offsets are always taken from the same column family so that
    # the emitted text matches the emitted character span.
    if use_full_phrase:
        source_prefix, target_prefix = "arg1", "arg2"
    else:
        source_prefix, target_prefix = "arg1_head", "arg2_head"

    # Any label other than pro/con (neutral is skipped below) maps to None.
    polarity_by_rel_type = {"pro": "Positive", "con": "Negative"}

    sent_list = []
    # sort=False keeps first-appearance order, which makes sent_id
    # deterministic (a set of strings iterates in a per-process random order)
    # and avoids re-filtering the whole frame once per sentence.
    grouped = df.groupby("full_sentence_text", sort=False)
    for i, (sentence, sub_df) in enumerate(grouped):
        opinions_list = []
        for row in sub_df.itertuples():
            # skip neutral case, not needed here
            if row.rel_type == "neutral":
                continue

            opinions_list.append({
                "Source": _text_span(
                    row, source_prefix, f"{source_prefix}_start", f"{source_prefix}_end"
                ),
                "Target": _text_span(
                    row, target_prefix, f"{target_prefix}_start", f"{target_prefix}_end"
                ),
                "Polar_expression": _text_span(
                    row, "verb_form", "verb_form_start", "verb_form_end"
                ),
                "Polarity": polarity_by_rel_type.get(row.rel_type),
                # we currently do not care about intensity
                "Intensity": "Average",
            })

        if opinions_list:
            sent_list.append({
                "sent_id": str(i),
                "text": sentence,
                "opinions": opinions_list,
            })
    return sent_list

In [95]:
# Convert the loaded extract in a single call.
# NOTE(review): the original comment mentions separate dev/test/train JSON files;
# no such split is visible in this notebook. If splits are introduced, call the
# function once per split.
data = transform_to_semeval_ssa_format(df)

In [96]:
# Serialize the converted sentences to JSON.
# ensure_ascii=False writes non-ASCII characters (e.g. umlauts) unescaped, so the
# file encoding is pinned to UTF-8 instead of relying on the platform default.
with open(DATASET_OUTPUT_PATH, mode="w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False)