## TrainTestSplit

Create a train-test-split for the datasets found in the pipeline

In [121]:
# imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import os
import pandas as pd
import math
import fasttext

In [122]:
# Create the directory that the split CSV files (train/test/val) are written to.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs("../../etl/data/intermediate/TrainTestSplit", exist_ok=True)

In [123]:
# parameters
# Input: the full extracted dataset. Outputs: the three split files written by the save cell.
FULL_DATA_PATH="../../etl/data/raw/01_extract.csv"
TRAIN_DATA_PATH="../../etl/data/intermediate/TrainTestSplit/01_train.csv"
TEST_DATA_PATH="../../etl/data/intermediate/TrainTestSplit/01_test.csv"
VAL_DATA_PATH="../../etl/data/intermediate/TrainTestSplit/01_val.csv"

# SPLIT_MODE selects which splitting cell defines train/test/val.
# The cells in this notebook check for "CRITERIA_FREE", "DOC_SPLIT", "LEMMA_SPLIT" and "SIMILARITY_SPLIT".
# NOTE(review): "SYN_SPLIT" is not handled by any cell, and the "SIMILARITY_SPLIT" cell only
# prints a distance matrix without defining train/test/val, so the save cell fails for it.
SPLIT_MODE="DOC_SPLIT"
# Upper bound on a verb's balance score (0 = one label only, 1 = labels evenly mixed);
# only verbs with a score at or below this value are kept.
VERB_AGREEMENT_LEVEL = .6
# Allowed numbers of PAS rows per (doc_id, sentence); sentences with any other count are removed.
INCLUDE_SENTS_N_PAS = [1,2,3,4,5]
# Seed passed to the sampling and splitting calls that take a random_state.
RANDOM_STATE=42

# When True, a per-class sample of MANUAL_ANNOTATION_SIZE_PER_GROUP rows is written
# to MANUAL_ANNOTATION_PATH near the end of the notebook.
INCLUDE_MANUAL_ANNOTATION=False
MANUAL_ANNOTATION_SIZE_PER_GROUP=70
MANUAL_ANNOTATION_PATH="../../etl/data/intermediate/TrainTestSplit/manual_annotation.csv"

# fasttext embeddings path (only referenced by the commented-out model loading cell)
FASTTEXT_MODEL_BIN_PATH="../../external_repos/stancer_setup/models/cc.de.300.bin"

In [124]:
# Load the full extracted PAS table. The file carries an extra "Unnamed: 0" column
# (presumably an index saved by an earlier step - TODO confirm); it is kept as a regular column.
df = pd.read_csv(FULL_DATA_PATH)

In [125]:
# group by sentences
df

Unnamed: 0,doc_id,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text
0,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,abgestraft,26,36,abstrafen,Alexis Tsipras,5,19,N,Alexis,...,Dass Alexis Tsipras jetzt abgestraft wurde,0,43,$.,.,143,144,neutral,"Predicate(type='neutral', args=(Head(sentence=6, token=1), Head(sentence=6, token=-1)), strength=0, verb=4)","Dass Alexis Tsipras jetzt abgestraft wurde , hat viel mit der angestauten Unzufriedenheit über die langen Jahre des Sparens und Darbens zu tun ."
1,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,enttäuschen,140,151,enttäuschen,die neue Regierung,75,93,N,Regierung,...,die Hoffnungen auf einen spürbaren Aufschwung,94,139,N,Hoffnungen,98,108,neutral,"Predicate(type='neutral', args=(Head(sentence=18, token=13), Head(sentence=18, token=15)), strength=0, verb=20)","Wenn die Kreditgeber Athen nicht zusätzlichen Spielraum öffnen , wird auch die neue Regierung die Hoffnungen auf einen spürbaren Aufschwung enttäuschen ."
2,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,beenden,119,126,beenden,Will er dem Land etwas Gutes tun,0,33,$.,.,...,die politische Polarisierung beenden,90,127,N,Polarisierung,105,118,neutral,"Predicate(type='neutral', args=(Head(sentence=10, token=-1), Head(sentence=10, token=20)), strength=0, verb=21)","Will er dem Land etwas Gutes tun , dann sollte er nicht nur Steuern senken , sondern auch die politische Polarisierung beenden , die das Klima in Griechenland zuletzt so vergiftet hat ."
3,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb9508b4f1acc63a35ac,akzeptieren,69,80,akzeptieren,Pajtim Kasami,0,14,N,Pajtim,...,die Kurzarbeit nun doch,81,105,N,Kurzarbeit,85,95,pro,"Predicate(type='pro', args=(Head(sentence=12, token=0), Head(sentence=12, token=13)), strength=0, verb=11)","Pajtim Kasami , Ermir Lenjani , Birama Ndoye und Mickael Facchinetti akzeptieren die Kurzarbeit nun doch , weshalb Sion-Präsident Christian Constantin die Entlassungen zurückzieht ."
4,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb9508b4f1acc63a35ac,entliess,30,38,entlassen,der FC Sion,39,50,N,FC,...,Fussball Neun Spieler,8,29,N,Fussball,8,16,neutral,"Predicate(type='neutral', args=(Head(sentence=10, token=8), Head(sentence=10, token=3)), strength=0, verb=6)","( dpa ) Fussball Neun Spieler entliess der FC Sion Mitte März , als die Corona-Krise den Schweizer Fussball traf ."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832168,9701f0c776430a365f0619836f34658459be788b9a6646e86c252c477ef565a4,äussert,5,12,äussern,Frau A,13,19,N,Frau,...,massive Vorwürfe gegen ihre Vorgesetzten,20,60,N,Vorwürfe,28,36,neutral,"Predicate(type='neutral', args=(Head(sentence=13, token=2), Head(sentence=13, token=5)), strength=0, verb=1)",Dann äussert Frau A massive Vorwürfe gegen ihre Vorgesetzten .
832169,9701f0c776430a365f0619836f34658459be788b9a6646e86c252c477ef565a4,stelle,184,190,stellen,das Kommando,171,183,N,Kommando,...,einen Antrag auf ihren Ausschluss,191,224,N,Antrag,197,203,neutral,"Predicate(type='neutral', args=(Head(sentence=28, token=31), Head(sentence=28, token=34)), strength=0, verb=32)","Am 5. Dezember stellt der Chef der Abteilung Milizfeuerwehr und Zivilschutz Frau A vor die Wahl : Entweder trete sie per Ende des Jahres aus der Milizfeuerwehr aus , oder das Kommando stelle einen Antrag auf ihren Ausschluss ."
832170,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a744651af0e992064bf,ausgewiesen,77,88,ausweisen,Der Konzern,0,11,N,Konzern,...,eine Liquidität,32,47,N,Liquidität,37,47,con,"Predicate(type='con', args=(Head(sentence=19, token=1), Head(sentence=19, token=7)), strength=0, verb=13)",Der Konzern hatte im April noch eine Liquidität von gut vier Milliarden Euro ausgewiesen .
832171,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a744651af0e992064bf,belebt,8,14,beleben,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,...,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,24,41,con,"Predicate(type='con', args=(Head(sentence=25, token=4), Head(sentence=25, token=4)), strength=0, verb=1)",Seitdem belebt sich das Verkehrsaufkommen auf niedriger Basis jedoch wieder .


**How big is the problem with multi-PAS per sentence?**

Sentences with multiple PAS should not be split across the dataset splits.

In [126]:
# Count the PAS rows of every (document, sentence) pair, attach that count to
# each row, then tabulate how often each count occurs per relation type.
sentence_keys = ["doc_id", "full_sentence_text"]
pas_per_sentence = df.groupby(sentence_keys).size().reset_index(name="counts")
df_occ = df.copy(deep=True).merge(pas_per_sentence, on=sentence_keys)
df_occ_freq = df_occ.groupby(["rel_type", "counts"]).size()
df_occ_freq

rel_type  counts
con       1          80668
          2           8149
          3            785
          4             68
          5              3
neutral   1         569679
          2          51984
          3           3348
          4            223
          5             27
          7              7
          8              8
pro       1         107311
          2           9187
          3            676
          4             45
          5              5
dtype: int64

In [127]:
# filter needed pas per sentence
# 0 - exclude sentences whose number of PAS is not listed in INCLUDE_SENTS_N_PAS.
# The counts in df_occ are computed per (doc_id, full_sentence_text), so the
# exclusion is keyed on that same pair. Matching on the sentence text alone would
# also drop identical sentences from other documents whose own count is allowed.
mask = ~(df_occ["counts"].isin(INCLUDE_SENTS_N_PAS))
df_occ = df_occ[mask]
n_pas_sents = df_occ["full_sentence_text"].to_list()  # texts of the excluded sentences, kept for inspection

excluded_keys = set(zip(df_occ["doc_id"], df_occ["full_sentence_text"]))
keep = pd.Series(
    [key not in excluded_keys for key in zip(df["doc_id"], df["full_sentence_text"])],
    index=df.index,
    dtype=bool,
)
df = df[keep]

In [128]:
INCLUDE_SENTS_N_PAS

[1, 2, 3, 4, 5]

### Complex sentences analysis

**How do the multi-PAS sentences look**?

Complexity and sentence analysis.

In [129]:
# Attach the per-(document, sentence) PAS count to every row of the filtered data.
df_occ = df.copy(deep=True)
df_occ = df_occ.merge(df_occ.groupby(["doc_id", "full_sentence_text"]).size().reset_index(name="counts"), on=["doc_id", "full_sentence_text"])
pd.set_option('display.max_colwidth', None)
# Pick one multi-PAS row and show every row that shares its sentence text.
# The draw is seeded with RANDOM_STATE so a re-run shows the same example
# (sorting before a random draw has no effect and is therefore omitted).
s1 = df_occ[df_occ["counts"] >= 2].sample(n=1, random_state=RANDOM_STATE)
s1_val = s1["full_sentence_text"].to_list()[0]
df_occ[df_occ["full_sentence_text"] == s1_val]

Unnamed: 0,doc_id,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text,counts
484618,0b7a6650825ab759a34b8c17b1659cf6688f252ca725a4821968f0ed0b301f3b,ausgegangen,95,106,ausgehen,Bei beiden,0,11,$.,.,...,89,94,PROAV,davon,89,94,neutral,"Predicate(type='neutral', args=(Head(sentence=22, token=-1), Head(sentence=22, token=17)), strength=0, verb=18)","Bei beiden , bei der « europäischen Normalität » und beim « skandinavischen Weg » , wird davon ausgegangen , dass sich die effektiven Souveränitätsverluste als geringer herausstellen würden denn befürchtet .",2
484619,0b7a6650825ab759a34b8c17b1659cf6688f252ca725a4821968f0ed0b301f3b,befürchtet,195,205,befürchten,die effektiven Souveränitätsverluste,119,155,N,Souveränitätsverluste,...,0,11,$.,.,206,207,neutral,"Predicate(type='neutral', args=(Head(sentence=22, token=24), Head(sentence=22, token=-1)), strength=0, verb=30)","Bei beiden , bei der « europäischen Normalität » und beim « skandinavischen Weg » , wird davon ausgegangen , dass sich die effektiven Souveränitätsverluste als geringer herausstellen würden denn befürchtet .",2


**Seeking for specific multi-PAS sentences:**

Ones where:
$MP^{arg_1}_1=MP^{arg_1}_2$ or  $MP^{arg_2}_1=MP^{arg_2}_2$

In [130]:
# get all docIDs and sents where this hold and then filter by them.

**Verb mixing**

How likely is it that the same verb mediates a positive relation in one sentence and a negative relation in another?

Potentially: try to remove this ambiguity.

***Agreement metric:*** What are particularly ambiguous verbs, and can "entity" type restrictions be learned around them? Which verbs are not very ambiguous?

=> Using a measure of [balance](https://stats.stackexchange.com/questions/239973/a-general-measure-of-data-set-imbalance)

In [131]:
def balance(seq):
    """
        Provides a measure of balancedness (Shannon entropy of the class
        proportions, normalised by the log of the number of classes).

        input: sequence of class counts
        returns:
            0 means unbalanced, which is better! more agreement!
            1 means balanced

        Classes with a zero count contribute nothing to the entropy
        (0 * log(0) is treated as 0 instead of producing NaN). An empty
        sequence, or one with at most one non-empty class, counts as fully
        unbalanced and returns 0.0.
    """
    from numpy import log

    # number of classes the caller distinguishes (empty classes included)
    k = len(seq)
    # only non-empty classes enter the entropy sum; log(0) is undefined
    observed = [count for count in seq if count > 0]

    # define this as a high agreement:
    # zero or one (non-empty) class is treated as highly unbalanced
    if k <= 1 or len(observed) <= 1:
        return 0.0

    n = sum(observed)

    H = -sum((count / n) * log(count / n) for count in observed)  # shannon entropy
    return H / log(k)

In [132]:
balance([500, 500])

1.0

In [133]:
# Per verb lemma: count the rows of each relation type, then score how evenly
# those counts are spread using balance() (0 = one label only, 1 = evenly mixed).
label_counts_per_verb = (
    df.copy(deep=True)
    .groupby(["verb_lemma", "rel_type"])
    .size()
    .reset_index(name="counts")
)
df_mixing = (
    label_counts_per_verb
    .groupby(["verb_lemma"])
    .apply(lambda verb_rows: balance(verb_rows["counts"].to_list()))
    .reset_index(name="balance")
    .dropna()
)
df_mixing.sort_values("balance")

Unnamed: 0,verb_lemma,balance
735,rechtfertigen,0.0
935,verhungern,0.0
680,nachtrauern,0.0
943,verkraften,0.0
944,verkümmern,0.0
...,...,...
543,herunterstufen,1.0
537,herabwürdigen,1.0
538,herbeizitieren,1.0
981,verschliessen,1.0


In [134]:
# what is the distribution of the balancedness?
# (this is not the last expression of the cell, so its result is not displayed)
df_mixing.balance.value_counts()
# keep every verb whose balance score is at most VERB_AGREEMENT_LEVEL
# (not only the verbs with balance 0.0)
agreeable_verbs = df_mixing[df_mixing["balance"] <= VERB_AGREEMENT_LEVEL]["verb_lemma"].to_list()
len(agreeable_verbs)

623

In [135]:
# Row counts per (verb lemma, relation type); one lemma is inspected as a spot check.
df_analysis = (
    df.groupby(["verb_lemma", "rel_type"])
    .size()
    .reset_index(name="counts")
)
df_analysis.loc[df_analysis["verb_lemma"] == "nachgeben"]

Unnamed: 0,verb_lemma,rel_type,counts
1470,nachgeben,neutral,467


In [136]:
df

Unnamed: 0,doc_id,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text
0,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,abgestraft,26,36,abstrafen,Alexis Tsipras,5,19,N,Alexis,...,Dass Alexis Tsipras jetzt abgestraft wurde,0,43,$.,.,143,144,neutral,"Predicate(type='neutral', args=(Head(sentence=6, token=1), Head(sentence=6, token=-1)), strength=0, verb=4)","Dass Alexis Tsipras jetzt abgestraft wurde , hat viel mit der angestauten Unzufriedenheit über die langen Jahre des Sparens und Darbens zu tun ."
1,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,enttäuschen,140,151,enttäuschen,die neue Regierung,75,93,N,Regierung,...,die Hoffnungen auf einen spürbaren Aufschwung,94,139,N,Hoffnungen,98,108,neutral,"Predicate(type='neutral', args=(Head(sentence=18, token=13), Head(sentence=18, token=15)), strength=0, verb=20)","Wenn die Kreditgeber Athen nicht zusätzlichen Spielraum öffnen , wird auch die neue Regierung die Hoffnungen auf einen spürbaren Aufschwung enttäuschen ."
2,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,beenden,119,126,beenden,Will er dem Land etwas Gutes tun,0,33,$.,.,...,die politische Polarisierung beenden,90,127,N,Polarisierung,105,118,neutral,"Predicate(type='neutral', args=(Head(sentence=10, token=-1), Head(sentence=10, token=20)), strength=0, verb=21)","Will er dem Land etwas Gutes tun , dann sollte er nicht nur Steuern senken , sondern auch die politische Polarisierung beenden , die das Klima in Griechenland zuletzt so vergiftet hat ."
3,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb9508b4f1acc63a35ac,akzeptieren,69,80,akzeptieren,Pajtim Kasami,0,14,N,Pajtim,...,die Kurzarbeit nun doch,81,105,N,Kurzarbeit,85,95,pro,"Predicate(type='pro', args=(Head(sentence=12, token=0), Head(sentence=12, token=13)), strength=0, verb=11)","Pajtim Kasami , Ermir Lenjani , Birama Ndoye und Mickael Facchinetti akzeptieren die Kurzarbeit nun doch , weshalb Sion-Präsident Christian Constantin die Entlassungen zurückzieht ."
4,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb9508b4f1acc63a35ac,entliess,30,38,entlassen,der FC Sion,39,50,N,FC,...,Fussball Neun Spieler,8,29,N,Fussball,8,16,neutral,"Predicate(type='neutral', args=(Head(sentence=10, token=8), Head(sentence=10, token=3)), strength=0, verb=6)","( dpa ) Fussball Neun Spieler entliess der FC Sion Mitte März , als die Corona-Krise den Schweizer Fussball traf ."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832168,9701f0c776430a365f0619836f34658459be788b9a6646e86c252c477ef565a4,äussert,5,12,äussern,Frau A,13,19,N,Frau,...,massive Vorwürfe gegen ihre Vorgesetzten,20,60,N,Vorwürfe,28,36,neutral,"Predicate(type='neutral', args=(Head(sentence=13, token=2), Head(sentence=13, token=5)), strength=0, verb=1)",Dann äussert Frau A massive Vorwürfe gegen ihre Vorgesetzten .
832169,9701f0c776430a365f0619836f34658459be788b9a6646e86c252c477ef565a4,stelle,184,190,stellen,das Kommando,171,183,N,Kommando,...,einen Antrag auf ihren Ausschluss,191,224,N,Antrag,197,203,neutral,"Predicate(type='neutral', args=(Head(sentence=28, token=31), Head(sentence=28, token=34)), strength=0, verb=32)","Am 5. Dezember stellt der Chef der Abteilung Milizfeuerwehr und Zivilschutz Frau A vor die Wahl : Entweder trete sie per Ende des Jahres aus der Milizfeuerwehr aus , oder das Kommando stelle einen Antrag auf ihren Ausschluss ."
832170,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a744651af0e992064bf,ausgewiesen,77,88,ausweisen,Der Konzern,0,11,N,Konzern,...,eine Liquidität,32,47,N,Liquidität,37,47,con,"Predicate(type='con', args=(Head(sentence=19, token=1), Head(sentence=19, token=7)), strength=0, verb=13)",Der Konzern hatte im April noch eine Liquidität von gut vier Milliarden Euro ausgewiesen .
832171,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a744651af0e992064bf,belebt,8,14,beleben,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,...,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,24,41,con,"Predicate(type='con', args=(Head(sentence=25, token=4), Head(sentence=25, token=4)), strength=0, verb=1)",Seitdem belebt sich das Verkehrsaufkommen auf niedriger Basis jedoch wieder .


**How big is the problem of -1 in first span or a "." in the data?**

The -1 does not exist. No problem currently.

The "." is quite prevalent and, strangely, only occurs in the neutral case. That is why a cleaning step was introduced below (see solution).

**Clean dataset**

1. As seen below, we must make sure that no argument or argument head of the neutrals is a ".".
2. We also want to remove the reflexive cases for the pronouns, i.e. rows in which both arguments are identical.
3. Only include verbs that fulfil the balancedness criterion. The cells in the "Verb mixing" section that define `balance` and compute `agreeable_verbs` must be executed first.

In [137]:
# Exploration only: count, per relation type, the rows that match the active
# condition below. The commented-out alternatives probe for "." arguments/heads
# and for -1 offsets. df itself is not modified in this cell.
df_dirty = df.copy(deep=True)
df_dirty[
#(df_dirty["arg1_head"] == ".")
#|(df_dirty["arg2_head"] == ".")
#|(df_dirty["arg2"] == ".")
#|(df_dirty["arg2"] == ".")
#(df_dirty["arg1_start"] == -1)
#|(df_dirty["arg2_start"] == -1)
#|(df_dirty["arg2_head_start"] == -1)
#|(df_dirty["arg2_head_start"] == -1)
(df_dirty["arg1"] == df_dirty["arg2"])
& (df_dirty["full_sentence_text"].str.contains("sich")) # identical arguments in a sentence containing "sich" (reflexive verbs) - should these be ignored?
].rel_type.value_counts()

pro        6611
neutral    5343
con        3666
Name: rel_type, dtype: int64

In [138]:
# 1 - exclude rows in which an argument or an argument head is just "."
#     (most likely there is no real target; possibly these are the -1 cases).
#     Both arguments are checked: arg1 and arg2, plus their heads.
mask = ~((df["arg1_head"] == ".")
|(df["arg2_head"] == ".")
|(df["arg1"] == ".")
|(df["arg2"] == "."))

df = df[mask]


# 2 - exclude reflexive cases (both arguments have the same text)
mask = ~(df["arg1"] == df["arg2"])

df = df[mask]

# 3 - exclude balanced = ambiguous verbs (keep only verbs listed in agreeable_verbs).
mask = (df["verb_lemma"].isin(agreeable_verbs))

df = df[mask]

# 4 - exclude sents, where the arg1_head_start and arg2_head_start overlap (due to stancer data extraction)
mask = ~(df["arg1_head_start"] == df["arg2_head_start"])

df = df[mask]

In [139]:
print(len(agreeable_verbs))
df.rel_type.value_counts()

623


neutral    304564
pro         15049
con         13386
Name: rel_type, dtype: int64

**Balance the dataset**

Make sure that each class is represented equally.

In [140]:
# Downsample every relation type to the size of the smallest one.
# The GroupBy object gets its own name so that `df` always stays a DataFrame
# (a failure halfway through no longer leaves `df` bound to a GroupBy), and the
# target size is computed once instead of once per group.
rel_type_groups = df.groupby('rel_type')
min_class_size = rel_type_groups.size().min()
df = rel_type_groups.apply(lambda x: x.sample(min_class_size, random_state=RANDOM_STATE).reset_index(drop=True))
df.rel_type.value_counts()

con        13386
neutral    13386
pro        13386
Name: rel_type, dtype: int64

In [141]:
# Drop the outermost index level (presumably the "rel_type" group key that the
# groupby/apply of the balancing cell put there); the rel_type column itself is kept.
df = df.reset_index(level=0, drop=True)
df.rel_type.value_counts()

con        13386
neutral    13386
pro        13386
Name: rel_type, dtype: int64

**Analyzing the verb distribution**

In [142]:
!pip install plotly

Defaulting to user installation because normal site-packages is not writeable


In [143]:
import plotly.express as px

def analyse_verb_frequency(df, show_plot=False):
    """
    Print how often each verb lemma occurs in `df`, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "verb_lemma" column; it is not modified.
    show_plot : bool, default False
        If True, additionally render a histogram of the per-verb frequencies
        (bins roughly 10 occurrences wide). It is off by default, which matches
        the previous behaviour where the figure was built but never shown.

    Returns
    -------
    None
    """
    df_analysis = df.groupby(["verb_lemma"]).size().reset_index(name="counts")

    if show_plot:
        # generate a sufficient bin width
        bin_width = 10
        # here you can choose your rounding method, I've chosen math.ceil.
        nbins = math.ceil((df_analysis["counts"].max() - df_analysis["counts"].min()) / bin_width)

        fig = px.histogram(df_analysis,
                           x="counts",
                           nbins=nbins,
                           title="Verb Frequency over the number of verbs",
                           labels={'counts': 'Frequency of verbs'}
                          ).update_layout(yaxis_title='Number of verbs')
        fig.show()

    print(df_analysis.sort_values(by="counts", ascending=False))
    
analyse_verb_frequency(df)

        verb_lemma  counts
327        stellen    4213
210         freuen    1758
50       aufnehmen    1497
213         gelten    1151
322         sorgen    1003
..             ...     ...
316       schäumen       1
289   phantasieren       1
276      mutmassen       1
273   missionieren       1
238  ideologisiert       1

[477 rows x 2 columns]


**Option 1: Train-test-splitting**

Only problem: sentences from the same document that contain multiple PAS may be split across the sets.

In [144]:
if SPLIT_MODE == "CRITERIA_FREE":
    # Plain random split, stratified on rel_type and without any grouping:
    # 70% train, then the remaining 30% is halved into test and validation.
    train, test_val = train_test_split(df, test_size=0.3, stratify=df["rel_type"], random_state=RANDOM_STATE)
    test, val = train_test_split(test_val, test_size=0.5, stratify=test_val["rel_type"],random_state=RANDOM_STATE)

**Option 2: Train-test-splitting**

This split respects the group distribution: all sentences $S_{1..N}$ from a document $A$ end up together, either all in the test set, all in the validation set or all in the training set. The reason is that multiple PAS can be detected within the same sentence; without grouping, the system could be trained on one PAS of a sentence and evaluated on a completely different PAS of that same sentence, which would be unfair.

In [145]:
if SPLIT_MODE=="DOC_SPLIT":
    # Stage 1: hold out 30% of the documents; grouping by doc_id keeps every
    # document's rows on one side of the split.
    outer_splitter = GroupShuffleSplit(test_size=.30, n_splits=1, random_state=RANDOM_STATE)
    train_inds, test_val_inds = next(outer_splitter.split(df, groups=df['doc_id']))
    train, test_val = df.iloc[train_inds], df.iloc[test_val_inds]

    # Stage 2: divide the held-out documents in half, again grouped by doc_id.
    inner_splitter = GroupShuffleSplit(test_size=.5, n_splits=1, random_state=RANDOM_STATE)
    test_inds, val_inds = next(inner_splitter.split(test_val, groups=test_val['doc_id']))
    test, val = test_val.iloc[test_inds], test_val.iloc[val_inds]

In [146]:
# Toy check that GroupShuffleSplit (option no 2) keeps each group on one side of the split.
groups = np.array([1, 1, 2, 2, 2, 3, 3, 3, 8, 8])
X = np.ones(shape=(10, 2))
y = np.ones(shape=(10, 1))
print(groups.shape)

gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)

for train_idx, test_idx in gss.split(X, y, groups):
    train_groups = list(groups[train_idx])
    test_groups = list(groups[test_idx])
    print("TRAIN:", train_groups, "TEST:", test_groups)

(10,)
TRAIN: [1, 1, 3, 3, 3, 8, 8] TEST: [2, 2, 2]


## Hard-verb splitting

To test generalisability we may want to split verbs by train and test set.

**Solution**: We can simply use the group splitting argument on the verbs.

**Option 3: Train-test split with verb-splitting**

This means that given a verb $V$, all sentences which contain that verb will either be in $T_{RAIN}$, $V_{ALID}$ or $T_{EST}$.

In [147]:
if SPLIT_MODE == "LEMMA_SPLIT":
    # preserve groups between sentences (in this case verbs):
    # whole verb lemmas are held out, so no lemma occurs both in train and in the test/val pool.
    splitter = GroupShuffleSplit(test_size=.30, n_splits=1, random_state=RANDOM_STATE)
    split = splitter.split(df, groups=df['verb_lemma'])
    train_inds, test_val_inds = next(split)

    train = df.iloc[train_inds]
    test_val = df.iloc[test_val_inds]

    # test and val are separated by a plain stratified split, i.e. these two sets
    # may share verb lemmas with each other (but not with train).
    test, val = train_test_split(test_val, test_size=0.5, stratify=test_val["rel_type"],random_state=RANDOM_STATE)

    # Alternative kept for reference: a grouped test/val split that would also
    # keep test and val disjoint with respect to verb lemmas.
    #splitter = GroupShuffleSplit(test_size=.5, n_splits=1, random_state=RANDOM_STATE)
    #split = splitter.split(test_val, groups=test_val['verb_lemma'])
    #test_inds, val_inds = next(split)

    #test = test_val.iloc[test_inds]
    #val = test_val.iloc[val_inds]

**Option 4: Split by synonymy**

Possible idea: Maximally similar (homogenous) sets.
E.g. [this](https://www.sciencedirect.com/science/article/abs/pii/S0925231209000046)


In [148]:
# model = fasttext.load_model(FASTTEXT_MODEL_BIN_PATH)

In [149]:
import fasttext
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances

def get_fasttext_word_embeddings(verbs):
    """
    Look up the embedding vector of every verb in `verbs`.

    Reads the notebook-level `model` and indexes it by word.
    NOTE(review): the cell that loads `model` is currently commented out, so
    the model has to be loaded before this function is called.

    Returns a list with one embedding per verb, in the order of `verbs`.
    """
    # verb embeddings
    vEmbs = [model[w] for w in verbs]
    return vEmbs

if SPLIT_MODE == "SIMILARITY_SPLIT":
    # Experimental: only prints the verb-to-verb distance matrix of one
    # relation type; train/test/val are not defined by this branch.
    rel_types = list(set(df.rel_type.to_list()))
    for rt in rel_types[:1]:
        sub_df = df[df.rel_type == rt].copy(deep=True)
        print(len(sub_df))
        verbs = list(set(sub_df.verb_lemma.to_list()))
        verb_embeddings = get_fasttext_word_embeddings(verbs)
        # One row per verb, one column per embedding dimension. Flattening this
        # to a single column would compare individual vector components
        # instead of verbs and blow the matrix up to (n_verbs * dim)^2 entries.
        wd_arr = np.array(verb_embeddings)
        distances = pairwise_distances(wd_arr, wd_arr, metric="euclidean")  # (n_verbs, n_verbs) distance matrix
        print(distances)

In [150]:
# verbs

In [151]:
# del model

### Save to file

Save the respective datasets to file

In [152]:
# Report the class distribution of every split and write it to its CSV file.
# All three (header, frame, path) triples are resolved before anything is
# written, so a missing split (e.g. an unsupported SPLIT_MODE) fails before any
# partial output is produced.
for header, split_df, split_path in (
    ("VALUE COUNTS: train", train, TRAIN_DATA_PATH),
    ("VALUE COUNTS: test", test, TEST_DATA_PATH),
    ("VALUE COUNTS: val", val, VAL_DATA_PATH),
):
    print(header)
    print(split_df.rel_type.value_counts())
    split_df.to_csv(split_path, index=False)

VALUE COUNTS: train
neutral    9407
con        9385
pro        9280
Name: rel_type, dtype: int64
VALUE COUNTS: test
pro        2071
con        1967
neutral    1966
Name: rel_type, dtype: int64
VALUE COUNTS: val
pro        2035
con        2034
neutral    2013
Name: rel_type, dtype: int64


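We verify that the verb-based split worked by computing the overlap between the verbs in the training set and the verbs in the validation and test sets. For `LEMMA_SPLIT` the lemma overlap must be empty; for the other split modes (such as the `DOC_SPLIT` run shown here) a non-empty overlap is expected.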

In [153]:
# overlap /w regards to verbform
train_verbs = train.verb_form.to_list()
val_test_verbs = val.verb_form.to_list() + test.verb_form.to_list()
list(set(train_verbs) & set(val_test_verbs))

['fehlen',
 'überschätzen',
 'sträubte',
 'abgewählt',
 'widersetzte',
 'bestraften',
 'aufhören',
 'kürte',
 'versprechen',
 'ausgetrickst',
 'klaut',
 'bittet',
 'freuten',
 'freut',
 'benutzten',
 'entschuldigten',
 'verlieben',
 'glänzen',
 'geachtet',
 'einhält',
 'ermordet',
 'tötet',
 'erwerben',
 'stöhnen',
 'abzusehen',
 'überzeugt',
 'irritieren',
 'beweise',
 'weiss',
 'verschärft',
 'geworben',
 'geniessen',
 'haut',
 'versprochen',
 'überfiel',
 'behinderte',
 'gebrochen',
 'aufhebt',
 'verstand',
 'korrigiert',
 'raubte',
 'brechen',
 'bestraft',
 'belügen',
 'erntete',
 'bedankt',
 'darstelle',
 'einschränke',
 'behandeln',
 'eingetreten',
 'abzustrafen',
 'umgeht',
 'hetzte',
 'übergehen',
 'brillierte',
 'bewirken',
 'lügen',
 'gequält',
 'brilliert',
 'aufnahm',
 'propagiert',
 'geschwärmt',
 'loszuwerden',
 'angepriesen',
 'versucht',
 'plädieren',
 'steigern',
 'vorschreiben',
 'wehren',
 'geniesst',
 'rühmt',
 'anpreist',
 'benutze',
 'verantworten',
 'ergötzen',
 

In [154]:
# overlap /w regards to lemma
train_verbs = train.verb_lemma.to_list()
val_test_verbs = val.verb_lemma.to_list() + test.verb_lemma.to_list()
list(set(train_verbs) & set(val_test_verbs))

['fehlen',
 'überschätzen',
 'aufhören',
 'misslingen',
 'versprechen',
 'rentieren',
 'verlieben',
 'glänzen',
 'erwerben',
 'stöhnen',
 'irritieren',
 'entheben',
 'durchhalten',
 'brechen',
 'belügen',
 'benachteiligen',
 'behandeln',
 'übergehen',
 'bewirken',
 'lügen',
 'umkommen',
 'plädieren',
 'infizieren',
 'steigern',
 'vorschreiben',
 'wehren',
 'verwechseln',
 'reklamieren',
 'verantworten',
 'ergötzen',
 'zittern',
 'hinwegsehen',
 'entschuldigen',
 'erzwingen',
 'solidarisieren',
 'umsorgen',
 'offenbaren',
 'triumphieren',
 'bestreiten',
 'beherrschen',
 'avancieren',
 'abwählen',
 'erzielen',
 'klagen',
 'besänftigen',
 'verhaften',
 'darben',
 'verschleiern',
 'überfallen',
 'entschädigen',
 'warnen',
 'bewahren',
 'vorhalten',
 'bürgen',
 'feuern',
 'verwunden',
 'verbürgen',
 'gängeln',
 'deportieren',
 'unterliegen',
 'heimsuchen',
 'wettmachen',
 'qualifizieren',
 'zurückweisen',
 'schlechtmachen',
 'belächeln',
 'stürzen',
 'verpassen',
 'dienen',
 'foltern',
 'ja

In [155]:
analyse_verb_frequency(train)

    verb_lemma  counts
317    stellen    2885
204     freuen    1194
47   aufnehmen    1051
206     gelten     828
312     sorgen     711
..         ...     ...
326     tilgen       1
322   stänkern       1
153   entsagen       1
313    spinnen       1
52   ausbluten       1

[460 rows x 2 columns]


In [156]:
analyse_verb_frequency(val)

      verb_lemma  counts
244      stellen     660
162       freuen     274
40     aufnehmen     225
165       gelten     164
240       sorgen     158
..           ...     ...
229    schiessen       1
21   anherrschen       1
28    anstrengen       1
38     aufklären       1
273     verenden       1

[356 rows x 2 columns]


In [157]:
analyse_verb_frequency(test)

      verb_lemma  counts
249      stellen     668
158       freuen     290
33     aufnehmen     221
161       gelten     159
219    plädieren     136
..           ...     ...
12     abstrafen       1
198   langweilen       1
199      lechzen       1
286    verhetzen       1
178  hinwegsehen       1

[356 rows x 2 columns]


In [158]:
if INCLUDE_MANUAL_ANNOTATION:
    # Draw MANUAL_ANNOTATION_SIZE_PER_GROUP rows from every relation type for
    # manual annotation. The draw is seeded with RANDOM_STATE so that re-running
    # the notebook writes the same rows.
    df_sample = df.copy(deep=True)
    res = df_sample.groupby('rel_type', group_keys=False).apply(lambda x: x.sample(MANUAL_ANNOTATION_SIZE_PER_GROUP, random_state=RANDOM_STATE))
    res.to_csv(MANUAL_ANNOTATION_PATH, index=False)

In [159]:
df.rel_type.value_counts()

con        13386
neutral    13386
pro        13386
Name: rel_type, dtype: int64

In [160]:
# how many multi-pas are still sustained?
df_occ = df.copy(deep=True)
print(f"All sentences in DS: { len(list(set(df_occ['full_sentence_text'].to_list()))) }")
df_occ = df_occ.merge(df_occ.groupby(["doc_id", "full_sentence_text"]).size().reset_index(name="counts"), on=["doc_id", "full_sentence_text"])
pd.set_option('display.max_colwidth', None)
s1 = df_occ[df_occ["counts"] >= 2] \
    .sort_values(["full_sentence_text"])

print(f" All multipas sentences: { len(list(set(s1['full_sentence_text'].to_list()))) }")

All sentences in DS: 39793
 All multipas sentences: 139
