In [1]:
import json
import re

In [2]:
train_fn = "../data/negation_train.json"
dev_fn = "../data/negation_dev.json"
test_fn = "../data/negation_test.json"

## What are the multi-word cues

In [3]:
mwcs = set()
for fn in [train_fn, dev_fn, test_fn]:
    with open(fn) as fh:
        for line in json.load(fh):
            if len(line["negations"]) > 0:
                for neg in line["negations"]:
                    if len(neg["Cue"][0]) > 1:
                        mwcs.add(tuple(neg["Cue"][0]))
mwcs

{('Verken', 'eller'),
 ('har', 'til gode'),
 ('hverken', 'eller'),
 ('verken', 'eller')}

## Affixal Cues

In [4]:
affixals = set()
affplus = set()
prefixes = set()
suffixes = set()
for fn in [train_fn, dev_fn, test_fn]:
    with open(fn) as fh:
        for line in json.load(fh):
            if len(line["negations"]) > 0:
                for neg in line["negations"]:
                    if neg["Affixal"]:
                        #if neg["Cue"][0][0] == "aldri" or neg["Cue"][0][0] == "ikke" or neg["Cue"][0][0] == "manglende":
                        #    print(line)
                        affixals.add(tuple(neg["Cue"][0]))
                        #print(neg["Cue"], neg["Scope"])
                        a,b = neg["Cue"][1][0].split(":")
                        a = int(a)
                        b = int(b)
                        # print(t[a-2:b+2])
                        if a - 1 > 0 and line["text"][a - 1] != " ":
                            c = a -1
                            # print(neg["Cue"], neg["Scope"])
                            while c > 0 and line["text"][c] != " ":
                                c -= 1
                            affplus.add((neg["Cue"][0][0], line["text"][c:b]))
                            suffixes.add(neg["Cue"][0][0])
                        else:
                            c = b
                            while c < len(line["text"]) and line["text"][c] != " ":
                                c += 1
                            affplus.add((neg["Cue"][0][0], line["text"][a:c]))
                            prefixes.add(neg["Cue"][0][0])
                        
affixals, affplus, prefixes, suffixes

({('U',),
  ('fri',),
  ('frie',),
  ('fritt',),
  ('ikke-',),
  ('in',),
  ('løs',),
  ('løse',),
  ('løst',),
  ('mis',),
  ('tomme',),
  ('u',),
  ('utenom',)},
 {('U', 'Ubeskjeden'),
  ('U', 'Uendelig'),
  ('U', 'Uengasjerende'),
  ('U', 'Ujevn'),
  ('U', 'Ujålete'),
  ('U', 'Ukomplisert'),
  ('U', 'Ulike'),
  ('U', 'Unyansert'),
  ('U', 'Uredigert'),
  ('U', 'Usedvanlig'),
  ('U', 'Uspennende'),
  ('U', 'Usynlige'),
  ('U', 'Utrettelig'),
  ('U', 'Utro'),
  ('U', 'Utrolig'),
  ('U', 'Utvilsomt'),
  ('U', 'Uvanlig'),
  ('U', 'Uverdig'),
  ('fri', ' bekymringsfri'),
  ('fri', ' feilfri'),
  ('fri', ' filterfri'),
  ('fri', ' nøkkelfri'),
  ('fri', ' rusfri'),
  ('fri', ' voldsfri'),
  ('frie', ' dialogfrie'),
  ('frie', ' rusfrie'),
  ('frie', ' vedlikeholdsfrie'),
  ('fritt', ' bugfritt'),
  ('ikke-', 'ikke-ideelle'),
  ('ikke-', 'ikke-innvidde'),
  ('ikke-', 'ikke-kosher-hovedretten'),
  ('ikke-', 'ikke-låt'),
  ('ikke-', 'ikke-produksjonsklart'),
  ('ikke-', 'ikke-tema'),
  ('ikk

In [5]:
prefixes, suffixes

({'U', 'ikke-', 'in', 'mis', 'u', 'utenom'},
 {'fri', 'frie', 'fritt', 'løs', 'løse', 'løst', 'tomme'})

In [6]:
_, full_affix_negs = zip(*affplus)

In [7]:
prefixes = ['U', 'ikke-', 'in', 'mis', 'u', 'utenom']
suffixes = ['fri', 'frie', 'fritt', 'løs', 'løse', 'løst', 'tomme']
"^" + "|^".join(prefixes)
("$|".join(suffixes) + "$")

'fri$|frie$|fritt$|løs$|løse$|løst$|tomme$'

### Given a list of affixes, can we safely get the root?

In [8]:
for aff, fan in affplus:
    #print(aff, fan)
    pre = re.search(re.compile("^" + "|^".join(prefixes)), fan)
    suf = re.search(re.compile("$|".join(suffixes) + "$"), fan)
    #if pre is not None and pre.group(0) == aff and suf is not None:
    #    print("true prefix but also suffix: ", pre.group(0), suf.group(0), fan)
    #if suf is not None and suf.group(0) == aff and pre is not None:
    #    print("true suffix but also prefix: ", pre.group(0), suf.group(0), fan)
    x = "_"
    y = "_"
    if pre:
        x = pre.group(0)
        span = pre.span()
        #print(fan, span, fan[span[1]:])
    if suf:
        y = suf.group(0)
    #if not (x == "_" or y == "_"):
    print(aff, fan, x, y)

løs  endeløs _ løs
ikke- ikke-vestlig ikke- _
in ineffektiv in _
u uforståelig u _
u usikre u _
u ubarmhjertige u _
u utrettelige u _
u ubekvemt u _
U Uengasjerende U _
løst  skamløst _ løst
u ufokusert u _
u ubestemmelige u _
u uelegant u _
u ubetalelig u _
u uoppfinnsomt u _
frie  rusfrie _ frie
u uinteresserte u _
u uklar u _
frie  dialogfrie _ frie
løst Trådløst _ løst
u ufattelige u _
u upåmeldt u _
u ukjente u _
u uvanleg u _
løse  håpløse _ løse
u uviktige u _
u ukarismatisk u _
u uungåelige u _
u uoversiktlig u _
u ubetenksomt u _
u urørt u _
u utydelige u _
løse  tankeløse _ løse
u uformelle u _
løse  trådløse _ løse
u utallige u _
u uutholdelige u _
u uforløst u løst
u upresis u _
u ugripelige u _
u ubrukt u _
u utilgjengelig u _
løst  hemningsløst _ løst
u ufrelste u _
u uskikkelige u _
løs  sømløs _ løs
fri  rusfri _ fri
u uavklarte u _
u umulig u _
u ufrivillige u _
ikke- ikke-tema ikke- _
u usynlig u _
u uavhengig u _
løs  temperamentsløs _ løs
løse  sjelløse _ løse
u uve

#### How many affixes are there in train, dev, test

In [9]:
from collections import Counter
for fn in [train_fn, dev_fn, test_fn]:
    cnt = Counter()
    with open(fn) as fh:
        for line in json.load(fh):
            next_sen = False
            if len(line["negations"]) > 0:
                for neg in line["negations"]: 
                    for cue in neg["Cue"][0]:
                        if cue in prefixes or cue in suffixes:
                            #print(fn, cue)
                            #print(line)
                            cnt[cue] += 1
                            next_sen = True
                            break
                    if next_sen:
                        break
    print(cnt)

Counter({'u': 349, 'løs': 33, 'løst': 26, 'løse': 25, 'U': 16, 'fri': 7, 'ikke-': 5, 'frie': 3, 'utenom': 2, 'in': 1, 'tomme': 1, 'mis': 1})
Counter({'u': 56, 'løse': 11, 'løs': 7, 'U': 5, 'løst': 3, 'ikke-': 1})
Counter({'u': 51, 'løs': 5, 'løse': 2, 'løst': 2, 'fri': 1, 'U': 1})


#### Count all tyypes of cues

In [10]:
from collections import Counter
cues = set()
cs = []
for fn in [train_fn, dev_fn, test_fn]:
    cnt = Counter()
    with open(fn) as fh:
        for line in json.load(fh):
            next_sen = False
            if len(line["negations"]) > 0:
                for neg in line["negations"]: 
                    for cue in neg["Cue"][0]:
                        cnt[(cue, neg["Affixal"])] += 1
                        next_sen = True
                        break
                    if next_sen:
                        break
    #print(set(cnt.keys()))
    cs.append(cnt.keys())
    cues.update(cnt.keys())

### When using pre-defined affixes these cues need to be saved

In [11]:
bad = []
for fan, is_aff in cues:
    pre = re.search(re.compile("^" + "|^".join(prefixes)), fan)
    suf = re.search(re.compile("$|".join(suffixes) + "$"), fan)
    x = "_"
    y = "_"
    if pre:
        x = pre.group(0)
        span = pre.span()
        #print(fan, span, fan[span[1]:])
    if suf:
        y = suf.group(0)
    #if not (x == "_" or y == "_"):
    if is_aff and (suf or pre):
        pass
        #print("good boy: ", fan, x, y)
    elif not is_aff and (suf or pre):
        print("bad boy: ", fan, x, y)
        bad.append(fan)

bad boy:  inget in _
bad boy:  utan u _
bad boy:  Unntaket U _
bad boy:  uten u _
bad boy:  fri _ fri
bad boy:  inga måter in _
bad boy:  ingen in _
bad boy:  Uten U _
bad boy:  ingenting in _
bad boy:  uteblir u _
bad boy:  unntak u _
bad boy:  mistet mis _
bad boy:  unntatt u _
bad boy:  mister mis _
bad boy:  uteble u _
bad boy:  unngår u _
bad boy:  unntaket u _
bad boy:  ingen måte in _
bad boy:  unngå u _
bad boy:  intet in _


### Which cues are present in Dev/Test but not Train

In [12]:
cs = [[x for x,_ in c] for c in cs]

In [13]:
tr, de, te = cs
tr = set(tr)
de = set(de)
te = set(te)

#### Dev but not train

In [14]:
de - tr

{'Mangelen', 'bort', 'mangle', 'motsetning', 'uteble'}

#### Test but not train

In [15]:
te - tr

{'fravær', 'fraværet', 'inget', 'mistet', 'null', 'savnet', 'strippet'}

## Bug hunt

Transforming the json- and point-to-root formats to starsem should yield the same results.
It does not but why?

In [16]:
def get_neg_cols(s):
    cols = []
    for t in s:
        nc = sorted([tuple(t[7:][i:i+3]) for i in range(0, len(t[7:]), 3)])
        # print(nc)
        cols.append(nc)
    return cols

In [17]:
def read_sen(lines):
    ss = []
    s = []
    for line in lines:
        line = line.strip().split("\t")
        if len(line) <= 1:
            ss.append(s)
            s = []
        else:
            s.append(line)
    return ss

In [18]:
def compare_starsems(ss1, ss2):
    for s1, s2 in zip(ss1, ss2):
        if len(s1[0]) > 8:
            n1 = get_neg_cols(s1)
            n2 = get_neg_cols(s2)
            # print(n1, n2)
            if n1 != n2:
                for c1, c2 in zip(n1, n2):
                    if c1 != c2:
                        print(s1[0][1],c1, c2)
                    pass

### Differences between json and ptr for train, dev & test

In [19]:
for x in ["train", "dev", "test"]:
    starsem = open(f"neg_{x}.starsem").readlines()
    ptr = open(f"ptr_{x}.starsem").readlines()
    ss1 = read_sen(starsem)
    ss2 = read_sen(ptr)
    print(x)
    compare_starsems(ss1, ss2)
    print()

train
101554-08-01 [('u', '_', '_')] [('u', 'trolig', '_')]
701363-10-07 [('u', '_', '_')] [('u', 'musikalsk', '_')]
003803-21-02 [('u', '_', '_')] [('u', 'påklagelig', '_')]
200099-11-01 [('_', 'knappeløs', '_'), ('løs', '_', '_')] [('_', 'knappeløs', '_'), ('løs', 'knappe', '_')]
202043-05-01 [('Ikke', '_', '_')] [('skjermIkke', '_', '_')]
202043-05-01 [('_', 'elegant', '_')] [('_', 'elegantKresent', '_')]
301338-06-01 [('_', '_', '_'), ('_', '_', '_'), ('u', '_', '_')] [('_', '_', '_'), ('_', '_', '_'), ('u', 'sikker', '_')]
111141-01-01 [('løst', '_', '_')] [('løst', 'håp', '_')]
107326-04-02 [('_', '_', '_'), ('mangler', 'mangler', '_')] [('_', '_', '_'), ('mangler', '_', '_')]
111170-03-03 [('_', '_', '_'), ('_', 'o', '_')] [('_', '_', '_'), ('_', 'ok', '_')]
100866-06-01 [('u', '_', '_')] [('u', 'ventede', '_')]
001670-05-03 [('u', '_', '_')] [('u', 'vanleg', '_')]
001965-06-02 [('_', '_', '_'), ('_', 'dop', '_')] [('_', '_', '_'), ('_', 'dop.', '_')]
500043-20-04 [('Ingen', 'In

`
train

**affix is handled regularly in ptr but has an empty scope according to json**

108182-03-05 [('u', '\_', '\_')] [('u', 'tvilsomt', '\_')]

101554-08-01 [('u', '\_', '\_')] [('u', 'trolig', '\_')]

701363-10-07 [('u', '\_', '\_')] [('u', 'musikalsk', '\_')]

003803-21-02 [('u', '\_', '\_')] [('u', 'påklagelig', '\_')]

200099-11-01 [('\_', 'knappeløs', '\_'), ('løs', '\_', '\_')] [('\_', 'knappeløs', '\_'), ('løs', 'knappe', '\_')]

**there is no whitespace in the string but the annotation in json fixes that (only in (cue/scope)**

202043-05-01 [('Ikke', '\_', '\_')] [('skjermIkke', '\_', '\_')]

202043-05-01 [('\_', 'elegant', '\_')] [('\_', 'elegantKresent', '\_')]

**affix is handled regularly in ptr but has an empty scope according to json**

301338-06-01 [('\_', '\_', '\_'), ('\_', '\_', '\_'), ('u', '\_', '\_')] [('\_', '\_', '\_'), ('\_', '\_', '\_'), ('u', 
'sikker', '\_')]

111141-01-01 [('løst', '\_', '\_')] [('løst', 'håp', '\_')]


dev

**there is no whitespace in the string but the annotation in json fixes that (only in (cue/scope)**

602282-02-03 [('\_', 'det', '\_')] [('\_', 'det.Her', '\_')]


test

**affix is handled regularly in ptr but has an empty scope according to json**

301662-05-03 [('løs', '\_', '\_')] [('løs', 'kompromiss', '\_')]`