## TrainTestSplit

Create a train-test-split for the datasets found in the pipeline

In [1]:
# imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
import os
import pandas as pd
import math

In [2]:
# Create the output directory for the split CSVs if it does not exist yet
# (exist_ok=True makes re-running this cell harmless).
os.makedirs("../../etl/data/intermediate/TrainTestSplit", exist_ok=True)

In [3]:
# parameters
# input: the full extracted dataset; outputs: one CSV per split
FULL_DATA_PATH="../../etl/data/raw/01_extract.csv"
TRAIN_DATA_PATH="../../etl/data/intermediate/TrainTestSplit/01_train.csv"
TEST_DATA_PATH="../../etl/data/intermediate/TrainTestSplit/01_test.csv"
VAL_DATA_PATH="../../etl/data/intermediate/TrainTestSplit/01_val.csv"

# mode can be either "CRITERIA_FREE", "DOC_SPLIT", "LEMMA_SPLIT"
SPLIT_MODE="LEMMA_SPLIT"
# verbs are kept only if their balance score is <= this threshold
VERB_AGREEMENT_LEVEL = .6
# sentences are kept only if their number of PAS is contained in this list
INCLUDE_SENTS_N_PAS = [1]
# seed passed to the class-balancing sample and to the dataset splitters
RANDOM_STATE=42

In [4]:
# load the full extracted dataset from FULL_DATA_PATH
df = pd.read_csv(FULL_DATA_PATH)

In [5]:
# preview the raw data as loaded (no grouping is applied in this cell)
df

Unnamed: 0,doc_id,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text
0,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e3...,abgestraft,26,36,abstrafen,Alexis Tsipras,5,19,N,Alexis,...,Dass Alexis Tsipras jetzt abgestraft wurde,0,43,$.,.,143,144,neutral,"Predicate(type='neutral', args=(Head(sentence=...","Dass Alexis Tsipras jetzt abgestraft wurde , h..."
1,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e3...,enttäuschen,140,151,enttäuschen,die neue Regierung,75,93,N,Regierung,...,die Hoffnungen auf einen spürbaren Aufschwung,94,139,N,Hoffnungen,98,108,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Wenn die Kreditgeber Athen nicht zusätzlichen ...
2,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e3...,beenden,119,126,beenden,Will er dem Land etwas Gutes tun,0,33,$.,.,...,die politische Polarisierung beenden,90,127,N,Polarisierung,105,118,neutral,"Predicate(type='neutral', args=(Head(sentence=...","Will er dem Land etwas Gutes tun , dann sollte..."
3,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb...,akzeptieren,69,80,akzeptieren,Pajtim Kasami,0,14,N,Pajtim,...,die Kurzarbeit nun doch,81,105,N,Kurzarbeit,85,95,pro,"Predicate(type='pro', args=(Head(sentence=12, ...","Pajtim Kasami , Ermir Lenjani , Birama Ndoye u..."
4,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb...,entliess,30,38,entlassen,der FC Sion,39,50,N,FC,...,Fussball Neun Spieler,8,29,N,Fussball,8,16,neutral,"Predicate(type='neutral', args=(Head(sentence=...",( dpa ) Fussball Neun Spieler entliess der FC ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832168,9701f0c776430a365f0619836f34658459be788b9a6646...,äussert,5,12,äussern,Frau A,13,19,N,Frau,...,massive Vorwürfe gegen ihre Vorgesetzten,20,60,N,Vorwürfe,28,36,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Dann äussert Frau A massive Vorwürfe gegen ihr...
832169,9701f0c776430a365f0619836f34658459be788b9a6646...,stelle,184,190,stellen,das Kommando,171,183,N,Kommando,...,einen Antrag auf ihren Ausschluss,191,224,N,Antrag,197,203,neutral,"Predicate(type='neutral', args=(Head(sentence=...",Am 5. Dezember stellt der Chef der Abteilung M...
832170,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a...,ausgewiesen,77,88,ausweisen,Der Konzern,0,11,N,Konzern,...,eine Liquidität,32,47,N,Liquidität,37,47,con,"Predicate(type='con', args=(Head(sentence=19, ...",Der Konzern hatte im April noch eine Liquiditä...
832171,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a...,belebt,8,14,beleben,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,...,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,24,41,con,"Predicate(type='con', args=(Head(sentence=25, ...",Seitdem belebt sich das Verkehrsaufkommen auf ...


**How big is the problem with multi-PAS per sentence?**

PAS from the same sentence should not be split across the dataset splits.

In [6]:
# A sentence is identified by its document id plus its text; count how many
# PAS rows each sentence has and attach that number to every row as "counts".
sentence_key = ["doc_id", "full_sentence_text"]
pas_per_sentence = df.groupby(sentence_key).size().reset_index(name="counts")
df_occ = df.copy(deep=True).merge(pas_per_sentence, on=sentence_key)

# how many rows fall into each (relation type, PAS-per-sentence) bucket
df_occ_freq = df_occ.groupby(["rel_type", "counts"]).size()
df_occ_freq

rel_type  counts
con       1          80668
          2           8149
          3            785
          4             68
          5              3
neutral   1         569679
          2          51984
          3           3348
          4            223
          5             27
          7              7
          8              8
pro       1         107311
          2           9187
          3            676
          4             45
          5              5
dtype: int64

In [7]:
# 0 - keep only sentences whose PAS count is listed in INCLUDE_SENTS_N_PAS.
# df_occ is narrowed to the rows that get excluded; their sentence texts are
# then used to drop the matching rows from df.
has_wanted_count = df_occ["counts"].isin(INCLUDE_SENTS_N_PAS)
df_occ = df_occ[~has_wanted_count]
n_pas_sents = df_occ["full_sentence_text"].to_list()

df = df[~df["full_sentence_text"].isin(n_pas_sents)]

### Complex sentences analysis

**How do the multi-PAS sentences look**?

Complexity and sentence analysis.

In [8]:
# show full cell contents so whole sentences are readable
pd.set_option('display.max_colwidth', None)

# draw one random row among the sentences with two or more PAS ...
multi_pas_rows = df_occ[df_occ["counts"] >= 2].sort_values(["full_sentence_text"])
s1 = multi_pas_rows.sample(n=1)
s1_val = s1["full_sentence_text"].iloc[0]

# ... and display every PAS row that shares this sentence text
df_occ[df_occ["full_sentence_text"] == s1_val]

Unnamed: 0,doc_id,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text,counts
616300,086fbc038e6331ebb08719cfe0862c559619745be62c158b962269d0a3cd6d67,schützen,53,61,schützen,Israel,0,6,N,Israel,...,13,52,N,Religionsfreiheit,17,34,pro,"Predicate(type='pro', args=(Head(sentence=19, token=0), Head(sentence=19, token=3)), strength=0, verb=6)","Israel müsse die Religionsfreiheit der Palästinenser schützen und dürfe die Heiligkeit der Al-Aksa-Moschee nicht verletzen , erklärte der emiratische Staatsminister für auswärtige Angelegenheiten , Khalifa al-Marar .",2
616301,086fbc038e6331ebb08719cfe0862c559619745be62c158b962269d0a3cd6d67,verletzen,113,122,verletzen,Israel müsse die Religionsfreiheit der Palästinenser schützen und dürfe die Heiligkeit der Al-Aksa-Moschee nicht verletzen,0,123,$.,.,...,72,106,N,Heiligkeit,76,86,neutral,"Predicate(type='neutral', args=(Head(sentence=19, token=-1), Head(sentence=19, token=10)), strength=0, verb=14)","Israel müsse die Religionsfreiheit der Palästinenser schützen und dürfe die Heiligkeit der Al-Aksa-Moschee nicht verletzen , erklärte der emiratische Staatsminister für auswärtige Angelegenheiten , Khalifa al-Marar .",2


**Seeking for specific multi-PAS sentences:**

Ones where:
$MP^{arg_1}_1=MP^{arg_1}_2$ or  $MP^{arg_2}_1=MP^{arg_2}_2$

In [9]:
# TODO: get all doc IDs and sentences where this holds, and then filter by them.

**Verb mixing**

How likely is it that the same verb mediates a positive relation in one sentence and a negative relation in another?

Potentially: try to remove this ambiguity.

***Agreement metric:*** Which verbs are particularly ambiguous, and can "entity" type restrictions be learned around them? Which words are not very ambiguous?

=> Using a measure of [balance](https://stats.stackexchange.com/questions/239973/a-general-measure-of-data-set-imbalance)

In [10]:
def balance(seq):
    """
        Provides a measure of balancedness of a class distribution.

        The Shannon entropy of the class proportions is divided by log(k),
        where k is the number of classes, so the result lies in [0, 1].

        input: sequence of class counts (one non-negative number per class)
        returns:
            0 means unbalanced, which is better! more agreement!
            1 means balanced
            nan if there are several classes but no observations at all

        Classes with a count of 0 contribute nothing to the entropy
        (0 * log(0) is taken as 0) but still count towards k.
    """
    from numpy import log

    k = len(seq)

    # zero or one class: define this as a high agreement (highly unbalanced)
    if k <= 1:
        return 0.0

    n = sum(seq)
    if n == 0:
        # several classes but not a single observation: balance is undefined
        return float("nan")

    # shannon entropy; empty classes are skipped so that log(0) is never taken
    H = sum(-(count / n) * log(count / n) for count in seq if count > 0)
    return H / log(k)

In [11]:
# sanity check: two equally frequent classes give the maximum balance
balance([500, 500])

1.0

In [12]:
# number of rows per (verb lemma, relation type) ...
lemma_label_counts = (
    df.groupby(["verb_lemma", "rel_type"])
    .size()
    .reset_index(name="counts")
)

# ... turned into one balance score per verb lemma
df_mixing = (
    lemma_label_counts
    .groupby(["verb_lemma"])
    .apply(lambda per_lemma: balance(per_lemma["counts"].to_list()))
    .reset_index(name="balance")
    .dropna()
)
df_mixing.sort_values("balance")

Unnamed: 0,verb_lemma,balance
578,irren,0.0
674,mühen,0.0
295,bewirken,0.0
676,nachgeben,0.0
293,beweisen,0.0
...,...,...
748,runterputzen,1.0
106,anwidern,1.0
860,unterjubeln,1.0
367,einlullen,1.0


In [13]:
# A verb counts as "agreeable" when its balance score does not exceed
# VERB_AGREEMENT_LEVEL; collect those lemmas and report how many there are.
within_threshold = df_mixing["balance"] <= VERB_AGREEMENT_LEVEL
agreeable_verbs = df_mixing.loc[within_threshold, "verb_lemma"].to_list()
len(agreeable_verbs)

629

In [14]:
# label distribution per verb lemma; inspect a single lemma as an example
df_analysis = (
    df.groupby(["verb_lemma", "rel_type"])
    .size()
    .reset_index(name="counts")
)
is_example_lemma = df_analysis["verb_lemma"] == "nachgeben"
df_analysis[is_example_lemma]

Unnamed: 0,verb_lemma,rel_type,counts
1451,nachgeben,neutral,422


In [15]:
# display the current working frame
df

Unnamed: 0,doc_id,verb_form,verb_form_start,verb_form_end,verb_lemma,arg1,arg1_start,arg1_end,arg1_pos,arg1_head,...,arg2,arg2_start,arg2_end,arg2_pos,arg2_head,arg2_head_start,arg2_head_end,rel_type,pred_serial,full_sentence_text
0,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,abgestraft,26,36,abstrafen,Alexis Tsipras,5,19,N,Alexis,...,Dass Alexis Tsipras jetzt abgestraft wurde,0,43,$.,.,143,144,neutral,"Predicate(type='neutral', args=(Head(sentence=6, token=1), Head(sentence=6, token=-1)), strength=0, verb=4)","Dass Alexis Tsipras jetzt abgestraft wurde , hat viel mit der angestauten Unzufriedenheit über die langen Jahre des Sparens und Darbens zu tun ."
1,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,enttäuschen,140,151,enttäuschen,die neue Regierung,75,93,N,Regierung,...,die Hoffnungen auf einen spürbaren Aufschwung,94,139,N,Hoffnungen,98,108,neutral,"Predicate(type='neutral', args=(Head(sentence=18, token=13), Head(sentence=18, token=15)), strength=0, verb=20)","Wenn die Kreditgeber Athen nicht zusätzlichen Spielraum öffnen , wird auch die neue Regierung die Hoffnungen auf einen spürbaren Aufschwung enttäuschen ."
2,4bc8c13ddaa028e64a34ce08397157b846fb4de3ad26e34759aa57fdbeb50bcc,beenden,119,126,beenden,Will er dem Land etwas Gutes tun,0,33,$.,.,...,die politische Polarisierung beenden,90,127,N,Polarisierung,105,118,neutral,"Predicate(type='neutral', args=(Head(sentence=10, token=-1), Head(sentence=10, token=20)), strength=0, verb=21)","Will er dem Land etwas Gutes tun , dann sollte er nicht nur Steuern senken , sondern auch die politische Polarisierung beenden , die das Klima in Griechenland zuletzt so vergiftet hat ."
3,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb9508b4f1acc63a35ac,akzeptieren,69,80,akzeptieren,Pajtim Kasami,0,14,N,Pajtim,...,die Kurzarbeit nun doch,81,105,N,Kurzarbeit,85,95,pro,"Predicate(type='pro', args=(Head(sentence=12, token=0), Head(sentence=12, token=13)), strength=0, verb=11)","Pajtim Kasami , Ermir Lenjani , Birama Ndoye und Mickael Facchinetti akzeptieren die Kurzarbeit nun doch , weshalb Sion-Präsident Christian Constantin die Entlassungen zurückzieht ."
4,290f3971010f6d9385e896208f328948f5fb3f9bc0caeb9508b4f1acc63a35ac,entliess,30,38,entlassen,der FC Sion,39,50,N,FC,...,Fussball Neun Spieler,8,29,N,Fussball,8,16,neutral,"Predicate(type='neutral', args=(Head(sentence=10, token=8), Head(sentence=10, token=3)), strength=0, verb=6)","( dpa ) Fussball Neun Spieler entliess der FC Sion Mitte März , als die Corona-Krise den Schweizer Fussball traf ."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832168,9701f0c776430a365f0619836f34658459be788b9a6646e86c252c477ef565a4,äussert,5,12,äussern,Frau A,13,19,N,Frau,...,massive Vorwürfe gegen ihre Vorgesetzten,20,60,N,Vorwürfe,28,36,neutral,"Predicate(type='neutral', args=(Head(sentence=13, token=2), Head(sentence=13, token=5)), strength=0, verb=1)",Dann äussert Frau A massive Vorwürfe gegen ihre Vorgesetzten .
832169,9701f0c776430a365f0619836f34658459be788b9a6646e86c252c477ef565a4,stelle,184,190,stellen,das Kommando,171,183,N,Kommando,...,einen Antrag auf ihren Ausschluss,191,224,N,Antrag,197,203,neutral,"Predicate(type='neutral', args=(Head(sentence=28, token=31), Head(sentence=28, token=34)), strength=0, verb=32)","Am 5. Dezember stellt der Chef der Abteilung Milizfeuerwehr und Zivilschutz Frau A vor die Wahl : Entweder trete sie per Ende des Jahres aus der Milizfeuerwehr aus , oder das Kommando stelle einen Antrag auf ihren Ausschluss ."
832170,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a744651af0e992064bf,ausgewiesen,77,88,ausweisen,Der Konzern,0,11,N,Konzern,...,eine Liquidität,32,47,N,Liquidität,37,47,con,"Predicate(type='con', args=(Head(sentence=19, token=1), Head(sentence=19, token=7)), strength=0, verb=13)",Der Konzern hatte im April noch eine Liquidität von gut vier Milliarden Euro ausgewiesen .
832171,d0fc434ce0021b0dff7b52157d352daaff8d1a65f4640a744651af0e992064bf,belebt,8,14,beleben,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,...,das Verkehrsaufkommen,20,41,N,Verkehrsaufkommen,24,41,con,"Predicate(type='con', args=(Head(sentence=25, token=4), Head(sentence=25, token=4)), strength=0, verb=1)",Seitdem belebt sich das Verkehrsaufkommen auf niedriger Basis jedoch wieder .


**How big is the problem of -1 in first span or a "." in the data?**

The -1 does not exist. No problem currently.

The "." is quite prevalent and, strangely, only occurs in the neutral case. That's why a cleaning step was introduced below (see solution).

**Clean dataset**

1. As seen below, we must make sure that no argument or argument head is a "." (this affects the neutrals).
2. We also want to remove the reflexive cases for the pronouns.
3. Only include verbs that fulfill balancedness criteria, execute cell 17 & 18 for this first.

In [16]:
# Probe: how many rows have identical arg1 and arg2 while the sentence contains
# "sich" (reflexive verbs - should they be ignored?), broken down by label.
# Other probes can be swapped in here, e.g. arg1_head / arg2_head / arg2 == "."
# or arg1_start / arg2_start / arg2_head_start == -1.
df_dirty = df.copy(deep=True)
same_arguments = df_dirty["arg1"] == df_dirty["arg2"]
mentions_sich = df_dirty["full_sentence_text"].str.contains("sich")
df_dirty[same_arguments & mentions_sich].rel_type.value_counts()

pro        6107
neutral    4871
con        3383
Name: rel_type, dtype: int64

In [17]:
# 1 - exclude rows where an argument, or the head of an argument, is a bare "."
#     (both arg1 and arg2 are checked, as well as both heads)
mask = ~(
    (df["arg1_head"] == ".")
    | (df["arg2_head"] == ".")
    | (df["arg1"] == ".")
    | (df["arg2"] == ".")
)

df = df[mask]


# 2 - exclude reflexive cases, i.e. rows whose two arguments have the same text
mask = ~(df["arg1"] == df["arg2"])

df = df[mask]

# 3 - keep only agreeable verbs; verbs not in agreeable_verbs (balanced = ambiguous) are dropped
mask = (df["verb_lemma"].isin(agreeable_verbs))

df = df[mask]

# 4 - exclude sents, where the arg1_head_start and arg2_head_start overlap (due to stancer data extraction)
mask = ~(df["arg1_head_start"] == df["arg2_head_start"])

df = df[mask]

In [18]:
# number of agreeable verbs, followed by the class distribution of the current frame
print(len(agreeable_verbs))
df.rel_type.value_counts()

629


neutral    280596
pro         15802
con         11753
Name: rel_type, dtype: int64

**Balance the dataset**

Make sure that each class is represented equally.

In [19]:
# Downsample every relation type to the size of the smallest one.
rows_by_label = df.groupby('rel_type')
smallest_class = rows_by_label.size().min()
df = rows_by_label.apply(
    lambda label_rows: label_rows
    .sample(smallest_class, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
df.rel_type.value_counts()

con        11753
neutral    11753
pro        11753
Name: rel_type, dtype: int64

In [20]:
# drop the outermost index level (level 0) so that only the inner index remains
df = df.reset_index(level=0, drop=True)
df.rel_type.value_counts()

con        11753
neutral    11753
pro        11753
Name: rel_type, dtype: int64

**Analyzing the verb distribution**

In [21]:
!pip install plotly


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [22]:
import plotly.express as px

def analyse_verb_frequency(df):
    """
    Print how often each verb lemma occurs in ``df``, most frequent first.

    A plotly histogram over these per-lemma counts (bin width 10) is built as
    well, but it is not displayed; call ``fig.show()`` inside the function to
    render it.

    df: frame with a "verb_lemma" column
    returns: None (the table is printed)
    """
    lemma_counts = (
        df.copy(deep=True)
        .groupby(["verb_lemma"])
        .size()
        .reset_index(name="counts")
    )

    # number of bins needed to cover the observed range with a fixed bin width
    bin_width = 10
    count_span = lemma_counts["counts"].max() - lemma_counts["counts"].min()
    nbins = math.ceil(count_span / bin_width)

    histogram = px.histogram(
        lemma_counts,
        x="counts",
        nbins=nbins,
        title="Verb Frequency over the number of verbs",
        labels={'counts': 'Frequency of verbs'},
    )
    fig = histogram.update_layout(yaxis_title='Number of verbs')

    ranked = lemma_counts.sort_values(by="counts", ascending=False)
    print(ranked)
    
# verb frequencies of the class-balanced data, before any splitting
analyse_verb_frequency(df)

        verb_lemma  counts
334        stellen    3385
28      ankündigen    1535
209         freuen    1431
47       aufnehmen    1171
212         gelten    1046
..             ...     ...
46       auflehnen       1
237        hungern       1
239  ideologisiert       1
450      zermürben       1
51       ausbeuten       1

[483 rows x 2 columns]


**Option 1: Train-test-splitting**

Only problem: We may have sentences within the same documents with multiple PAS that are split across the sets.

In [23]:
if SPLIT_MODE == "CRITERIA_FREE":
    # 70 % train / 30 % held out, keeping the rel_type proportions ...
    train, test_val = train_test_split(
        df,
        test_size=0.3,
        stratify=df["rel_type"],
        random_state=RANDOM_STATE,
    )
    # ... then the held-out part is halved into test and val the same way
    test, val = train_test_split(
        test_val,
        test_size=0.5,
        stratify=test_val["rel_type"],
        random_state=RANDOM_STATE,
    )

**Option 2: Train-test-splitting**

This split respects the group distribution: all sentences $S_{1..N}$ from a document $A$ will either all be in the test set, all be in the validation set, or all be in the training set. The reason is that multiple PAS may be detected within the same sentence; without grouping, the system could be trained on one PAS and evaluated on a completely different PAS from that sentence, which is unfair.

In [24]:
if SPLIT_MODE=="DOC_SPLIT":
    # Group by document so that all rows of one doc_id land on the same side.
    # Note: test_size is the share of groups (documents), not of rows.
    outer_splitter = GroupShuffleSplit(test_size=.30, n_splits=1, random_state=RANDOM_STATE)
    train_inds, test_val_inds = next(outer_splitter.split(df, groups=df['doc_id']))
    train, test_val = df.iloc[train_inds], df.iloc[test_val_inds]

    # Halve the held-out documents into test and val, again grouped by doc_id.
    inner_splitter = GroupShuffleSplit(test_size=.5, n_splits=1, random_state=RANDOM_STATE)
    test_inds, val_inds = next(inner_splitter.split(test_val, groups=test_val['doc_id']))
    test, val = test_val.iloc[test_inds], test_val.iloc[val_inds]

In [25]:
# Toy example to see whether option no 2 achieves our goals: after a grouped
# split, no group id should show up on both the TRAIN and the TEST side.
groups = np.array([1, 1, 2, 2, 2, 3, 3, 3, 8, 8])
X = np.ones(shape=(10, 2))
y = np.ones(shape=(10, 1))
print(groups.shape)

gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)

for train_idx, test_idx in gss.split(X, y, groups):
    train_groups = list(groups[train_idx])
    test_groups = list(groups[test_idx])
    print("TRAIN:", train_groups, "TEST:", test_groups)

(10,)
TRAIN: [1, 1, 3, 3, 3, 8, 8] TEST: [2, 2, 2]


## Hard-verb splitting

To test generalisability we may want to split verbs by train and test set.

**Solution**: We can simply use the group splitting argument on the verbs.

**Option 3: Train-test split with verb-splitting**

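This means that given a verb $V$, all sentences which contain that verb will either be in $T_{RAIN}$, $V_{ALID}$ or $T_{EST}$. Note that in the cell below only the boundary between $T_{RAIN}$ and the held-out data is grouped by verb; the held-out data is then divided into $V_{ALID}$ and $T_{EST}$ with a stratified random split, so these two sets can share verbs.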

In [26]:
if SPLIT_MODE == "LEMMA_SPLIT":
    # Group by verb lemma so that no lemma of the held-out part occurs in train.
    # Note: test_size is the share of groups (lemmas), not of rows.
    lemma_splitter = GroupShuffleSplit(test_size=.30, n_splits=1, random_state=RANDOM_STATE)
    train_inds, test_val_inds = next(lemma_splitter.split(df, groups=df['verb_lemma']))
    train, test_val = df.iloc[train_inds], df.iloc[test_val_inds]

    # The held-out part is halved with a stratified random split, so test and
    # val are NOT separated by lemma; only train is lemma-disjoint from them.
    test, val = train_test_split(
        test_val,
        test_size=0.5,
        stratify=test_val["rel_type"],
        random_state=RANDOM_STATE,
    )

    # Disabled alternative: split test/val by lemma as well.
    #splitter = GroupShuffleSplit(test_size=.5, n_splits=1, random_state=RANDOM_STATE)
    #split = splitter.split(test_val, groups=test_val['verb_lemma'])
    #test_inds, val_inds = next(split)

    #test = test_val.iloc[test_inds]
    #val = test_val.iloc[val_inds]

### Save to file

Save the respective datasets to file

In [27]:
# For every split: report its class distribution, then write it to its CSV
# (without the index column).
for heading, split_frame, target_path in (
    ("VALUE COUNTS: train", train, TRAIN_DATA_PATH),
    ("VALUE COUNTS: test", test, TEST_DATA_PATH),
    ("VALUE COUNTS: val", val, VAL_DATA_PATH),
):
    print(heading)
    print(split_frame.rel_type.value_counts())
    split_frame.to_csv(target_path, index=False)

VALUE COUNTS: train
neutral    8244
con        7301
pro        6248
Name: rel_type, dtype: int64
VALUE COUNTS: test
pro        2752
con        2226
neutral    1755
Name: rel_type, dtype: int64
VALUE COUNTS: val
pro        2753
con        2226
neutral    1754
Name: rel_type, dtype: int64


We verify that this worked with an overlap check: the set of verbs in the training set and the set of verbs in the combined validation and test sets must have an empty intersection.

In [28]:
# verb forms shared between train and the combined val + test sets
train_verbs = train["verb_form"].to_list()
val_test_verbs = [*val["verb_form"].to_list(), *test["verb_form"].to_list()]
list(set(train_verbs) & set(val_test_verbs))

[]

In [29]:
# verb lemmas shared between train and the combined val + test sets
train_verbs = train["verb_lemma"].to_list()
val_test_verbs = [*val["verb_lemma"].to_list(), *test["verb_lemma"].to_list()]
list(set(train_verbs) & set(val_test_verbs))

[]

In [30]:
# verb frequencies within the training split
analyse_verb_frequency(train)

         verb_lemma  counts
15       ankündigen    1535
27        aufnehmen    1171
147          gelten    1046
235          sorgen     791
138          finden     758
..              ...     ...
139       fingieren       1
145        fremdeln       1
304  vorverurteilen       1
148       geniessen       1
0         abdriften       1

[338 rows x 2 columns]


In [31]:
# verb frequencies within the validation split
analyse_verb_frequency(val)

       verb_lemma  counts
85        stellen    1693
56         freuen     728
58       gewinnen     348
125  zurückweisen     262
51      erreichen     229
..            ...     ...
114     vorrücken       1
121    zerreissen       1
120    zerbrechen       1
115    vorspielen       1
35   beschliessen       1

[134 rows x 2 columns]


In [32]:
# verb frequencies within the test split
analyse_verb_frequency(test)

       verb_lemma  counts
89        stellen    1692
59         freuen     703
61       gewinnen     338
125  zurückweisen     257
54      erreichen     216
..            ...     ...
48     entfremden       1
51       ermatten       1
83       schocken       1
17      auflehnen       1
11       anmahnen       1

[134 rows x 2 columns]
