In [32]:
import json
import os
from collections import Counter
from copy import deepcopy
from typing import Dict, List, Tuple

import UDLib as udlib

In [48]:
# Directory containing the input CoNLL-U treebank files read with udlib.conllu2trees.
CONLLU_PATH = '../conllu'
# Directory the transformed CoNLL-U files are written to.
OUT_PATH = '../conllu-processed'
# Directory containing the per-split annotation files read with get_annotations.
SNACS_PATH = '../snacs-output/ewt'

In [4]:
def get_annotations(path: str) -> List[Dict[str, List[str]]]:
    """Read a JSON Lines annotation file into a list of records.

    Every non-blank line of the file is parsed with ``json.loads`` and the
    parsed records are returned in file order. Blank or whitespace-only
    lines are skipped, so a stray empty line (for instance at the end of the
    file) does not raise ``json.JSONDecodeError``.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        The parsed records, one per non-blank line. The callers in this
        notebook read the ``'tokens'`` and ``'tags'`` keys of each record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as inp:
        return [json.loads(line) for line in inp if line.strip()]

In [54]:
def get_obliques_with_types(
        t: udlib.UDTree,
        annotation: Dict[str, List[str]]) -> List[Tuple[str, str]]:
    """Collect every ``obl`` dependent of a tree with the tag of its case marker.

    The tree is walked depth-first from the node attached to the artificial
    node ``'0'``. For each node whose relation is ``obl`` (subtypes such as
    ``obl:xyz`` are included, since only the part before ``:`` is compared),
    the first child whose relation is ``case`` is looked up.

    Args:
        t: The dependency tree to inspect.
        annotation: Record with parallel ``'tokens'`` and ``'tags'`` lists;
            ``'tags'`` is aligned positionally with ``t.keys``.

    Returns:
        One ``(form, tag)`` pair per oblique. With a case child, ``form`` is
        ``"<case form> <oblique form>"`` and ``tag`` is the unmodified tag of
        the case child. Without one, ``form`` is the oblique's own form and
        ``tag`` is ``'caseless'``. Pairs come in traversal order, which is
        not necessarily sentence order. A tree with no edge leaving ``'0'``
        yields an empty list.

    Raises:
        AssertionError: If the tree and the annotation have a different
            number of tokens.
    """
    assert len(t.keys) == len(annotation['tokens'])
    tag_dict = dict(zip(t.keys, annotation['tags']))
    result = []
    # The real root hangs off the artificial node '0'. Keep the last edge, as
    # before, but do not leave `root` unbound when there is no such edge.
    root = None
    for edge in t.graph['0']:
        root = edge.head
    if root is None:
        return result
    # Iterative DFS over the 'down' edges.
    stack = [root]
    while stack:
        current_node = stack.pop()
        children = [edge.head for edge in t.graph[current_node]
                    if edge.directionality == 'down']
        if t.nodes[current_node].DEPREL.split(':')[0] == 'obl':
            # Only the first case child counts.
            case_child = next(
                (child for child in children
                 if t.nodes[child].DEPREL.split(':')[0] == 'case'),
                None)
            if case_child is None:
                result.append((t.nodes[current_node].FORM, 'caseless'))
            else:
                result.append((t.nodes[case_child].FORM + ' ' + t.nodes[current_node].FORM,
                               tag_dict[case_child]))
        stack.extend(children)
    return result

In [28]:
# Tag names for which transform_obl relabels an `obl` dependent as `advmod`;
# an `obl` whose case-marker tag is not listed here is relabeled `iobj`.
ADVMOD_TAGS = {
    'Locus',
    'Time',
    'EndTime',
    'Goal',
    'Source',
    'Purpose',
    'Duration',
    'Circumstance',
    'ComparisonRef',
    'Manner',
    'Extent'
}

In [59]:
def transform_obl(t: udlib.UDTree,
                  annotation: Dict[str, List[str]]) -> udlib.UDTree:
    """Return a copy of the tree with every ``obl`` relabeled ``advmod`` or ``iobj``.

    The tree is walked depth-first from the node attached to the artificial
    node ``'0'``. For each node whose relation is ``obl`` (subtypes included,
    since only the part before ``:`` is compared), a tag is derived from its
    first child whose relation is ``case``: the part of that child's
    annotation tag before the first ``|`` is taken, then whatever follows its
    last ``.``. An oblique with no case child gets the tag ``'caseless'``.
    The node's DEPREL becomes ``'advmod'`` when the tag is in ``ADVMOD_TAGS``
    and ``'iobj'`` otherwise.

    Args:
        t: The dependency tree to transform; it is not modified.
        annotation: Record with parallel ``'tokens'`` and ``'tags'`` lists;
            ``'tags'`` is aligned positionally with ``t.keys``.

    Returns:
        A deep copy of ``t`` in which only the DEPREL attribute of ``obl``
        nodes has been rewritten. A tree with no edge leaving ``'0'`` is
        returned as an unchanged copy.

    Raises:
        AssertionError: If the tree and the annotation have a different
            number of tokens.
    """
    assert len(t.keys) == len(annotation['tokens'])
    tag_dict = dict(zip(t.keys, annotation['tags']))
    result = deepcopy(t)
    # The real root hangs off the artificial node '0'. Keep the last edge, as
    # before, but do not leave `root` unbound when there is no such edge.
    root = None
    for edge in t.graph['0']:
        root = edge.head
    if root is None:
        return result
    # Iterative DFS over the 'down' edges of the original tree; all writes go
    # to the copy.
    stack = [root]
    while stack:
        current_node = stack.pop()
        children = [edge.head for edge in t.graph[current_node]
                    if edge.directionality == 'down']
        if t.nodes[current_node].DEPREL.split(':')[0] == 'obl':
            # Only the first case child counts.
            case_child = next(
                (child for child in children
                 if t.nodes[child].DEPREL.split(':')[0] == 'case'),
                None)
            if case_child is None:
                tag = 'caseless'
            else:
                tag = tag_dict[case_child].split('|')[0].split('.')[-1]
            result.nodes[current_node].DEPREL = (
                'advmod' if tag in ADVMOD_TAGS else 'iobj')
        stack.extend(children)
    return result

In [20]:
# Load the training trees and the annotation records that go with them.
train_trees = udlib.conllu2trees(os.path.join(CONLLU_PATH, 'en_ewt-ud-train.conllu'))
train_annotations = get_annotations(os.path.join(SNACS_PATH, 'train'))
# The two sequences are paired positionally with zip() further down, and zip()
# silently drops the surplus items on a length mismatch, so fail loudly here.
assert len(train_trees) == len(train_annotations), (
    'train: %d trees but %d annotations' % (len(train_trees), len(train_annotations)))

In [46]:
# Count the case-marker tag of every oblique in the training set, and print
# each oblique without a case child next to the sentence it occurs in.
all_tags = Counter()
for tree, annotation in zip(train_trees, train_annotations):
    for form, tag in get_obliques_with_types(tree, annotation):
        if tag == 'caseless':
            sentence = ' '.join(tree.nodes[key].FORM for key in tree.keys)
            print(repr(form), sentence)
        all_tags[tag] += 1

'on' Speaking of Fallujah , we have only Al - Jazeera to rely on for our news from there .
'as' He could n't tell when things were becoming more unstable as opposed to less .
'on' The best option , now , under the present difficult circumstances is for Colombo to do its own dirty work , although New Delhi can always be counted on to render good neighborly help because of the shared belief that religion , ethnicity and language can not be the basis for secession .
'with' While most U.S. secretaries of state -- save perhaps Dean Rusk -- have gobbled up credit for outcomes that they had little to do with , few have been as brazen as Colin Powell .
'that' Unfortunately for them , the general has decided to take seriously Colin Powell 's frequent boasts that it was on his nudging that the Indians made conciliatory gestures toward Islamabad .
'that' Despite many wrinkles , India remains a part - democracy and merely signing on to a piece of paper that calls for an independent Kashmir does no

'from' The newly married couple living on the corner in front of the entrance of George 's house ( Friar Park ) refers to him as an overbearing person to stay away from .
'at' Like Iran , Syria 's military is nothing to scoff at .
'year' The Abramoff scandal threatens to subsume all the hard - fought GOP gains in Congress , and the 2006 midterms are less than a year away .
'for' 25 - Percentage of overall worldwide carbon dioxide emissions the United States is responsible for .
'on' The people abandoned their farms and religious sites and the ground they stood on became moorland .
'at' In Chicago , what is the best restaurant to dine at on a budget ?
'for' So I have a VERY long list of people I want to make Xmas gifts for and one of them is my best friend .
'way' took a bit of searching to discover that POP is probably PoP - plaster of Paris and yes plaster of Paris will slow way down if it is not exposed to air - in fact , if you are building up plaster ( " the way you would icing " )

In [47]:
# Show every collected tag with its count, most frequent first.
all_tags.most_common()

[('O-P-p.Locus', 1187),
 ('O-P-p.Time', 1041),
 ('O-P-p.Goal', 675),
 ('I_', 482),
 ('O-P-p.Topic', 429),
 ('O-P-p.Recipient|p.Goal', 344),
 ('O-P-p.Purpose', 302),
 ('B-DISC', 287),
 ('O-P-p.Agent', 279),
 ('O-P-p.Identity', 217),
 ('O-P-p.Duration', 206),
 ('caseless', 204),
 ('I~-P-p.Theme', 203),
 ('O-P-p.Source', 199),
 ('O-P-p.Circumstance', 184),
 ('O-P-p.Beneficiary', 173),
 ('O-P-p.ComparisonRef', 166),
 ('O-P-??', 151),
 ('O-P-p.Theme', 116),
 ('O-P-p.Ancillary', 101),
 ('O-P-p.Agent|p.Ancillary', 99),
 ('O-P-p.Stimulus|p.Topic', 95),
 ('O-P-p.Originator|p.Source', 93),
 ('O-P-p.Goal|p.Locus', 82),
 ('B-P-p.Explanation', 77),
 ('O-P-p.Manner', 68),
 ('O-P-p.Beneficiary|p.Goal', 59),
 ('O-P-p.Explanation', 57),
 ('O-P-p.Path', 53),
 ('B-PP-p.Extent', 45),
 ('O-P-p.Means', 41),
 ('B-P-p.Source', 40),
 ('O-P-p.StartTime', 40),
 ('B-PP-p.Manner|p.Locus', 38),
 ('B-P-p.Locus', 36),
 ('I~-P-p.Topic', 36),
 ('O-INF', 33),
 ('I~-P-p.Goal', 33),
 ('O-P-p.Cost', 33),
 ('I~-P-p.SocialRe

In [30]:
# Print the first training tree as loaded.
print(train_trees[0])

# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001
# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.
1	Al	Al	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
2	-	-	PUNCT	HYPH	_	1	punct	1:punct	SpaceAfter=No
3	Zaman	Zaman	PROPN	NNP	Number=Sing	1	flat	1:flat	_
4	:	:	PUNCT	:	_	1	punct	1:punct	_
5	American	american	ADJ	JJ	Degree=Pos	6	amod	6:amod	_
6	forces	force	NOUN	NNS	Number=Plur	7	nsubj	7:nsubj	_
7	killed	kill	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
8	Shaikh	Shaikh	PROPN	NNP	Number=Sing	7	obj	7:obj	_
9	Abdullah	Abdullah	PROPN	NNP	Number=Sing	8	flat	8:flat	_
10	al	al	PROPN	NNP	Number=Sing	8	flat	8:flat	SpaceAfter=No
11	-	-	PUNCT	HYPH	_	8	punct	8:punct	SpaceAfter=No
12	Ani	Ani	PROPN	NNP	Number=Sing	8	flat	8:flat	SpaceAfter=No
13	,	,	PUNCT	,	_	8	punct	8:punct	_
14	t

In [43]:
# Print the first training tree after transform_obl has relabeled its obliques.
print(transform_obl(train_trees[0], train_annotations[0]))

# newdoc id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001
# text = Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.
1	Al	Al	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
2	-	-	PUNCT	HYPH	_	1	punct	1:punct	SpaceAfter=No
3	Zaman	Zaman	PROPN	NNP	Number=Sing	1	flat	1:flat	_
4	:	:	PUNCT	:	_	1	punct	1:punct	_
5	American	american	ADJ	JJ	Degree=Pos	6	amod	6:amod	_
6	forces	force	NOUN	NNS	Number=Plur	7	nsubj	7:nsubj	_
7	killed	kill	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
8	Shaikh	Shaikh	PROPN	NNP	Number=Sing	7	obj	7:obj	_
9	Abdullah	Abdullah	PROPN	NNP	Number=Sing	8	flat	8:flat	_
10	al	al	PROPN	NNP	Number=Sing	8	flat	8:flat	SpaceAfter=No
11	-	-	PUNCT	HYPH	_	8	punct	8:punct	SpaceAfter=No
12	Ani	Ani	PROPN	NNP	Number=Sing	8	flat	8:flat	SpaceAfter=No
13	,	,	PUNCT	,	_	8	punct	8:punct	_
14	t

In [60]:
# Transform every training tree and write it out, followed by an empty line.
# zip() silently drops the surplus items on a length mismatch, so check first.
assert len(train_trees) == len(train_annotations), (
    'train: %d trees but %d annotations' % (len(train_trees), len(train_annotations)))
# open(..., 'w') does not create missing directories.
os.makedirs(OUT_PATH, exist_ok=True)
with open(os.path.join(OUT_PATH, 'en_ewt-ud-train.conllu'), 'w', encoding='utf-8') as out:
    for tree, annotation in zip(train_trees, train_annotations):
        print(transform_obl(tree, annotation), file=out)
        print('', file=out)

In [61]:
# Load the dev split, transform every tree and write it out, followed by an
# empty line.
dev_trees = udlib.conllu2trees(os.path.join(CONLLU_PATH, 'en_ewt-ud-dev.conllu'))
dev_annotations = get_annotations(os.path.join(SNACS_PATH, 'dev'))
# zip() silently drops the surplus items on a length mismatch, so check first.
assert len(dev_trees) == len(dev_annotations), (
    'dev: %d trees but %d annotations' % (len(dev_trees), len(dev_annotations)))
# open(..., 'w') does not create missing directories.
os.makedirs(OUT_PATH, exist_ok=True)
with open(os.path.join(OUT_PATH, 'en_ewt-ud-dev.conllu'), 'w', encoding='utf-8') as out:
    for tree, annotation in zip(dev_trees, dev_annotations):
        print(transform_obl(tree, annotation), file=out)
        print('', file=out)

In [62]:
# Load the test split, transform every tree and write it out, followed by an
# empty line.
test_trees = udlib.conllu2trees(os.path.join(CONLLU_PATH, 'en_ewt-ud-test.conllu'))
test_annotations = get_annotations(os.path.join(SNACS_PATH, 'test'))
# zip() silently drops the surplus items on a length mismatch, so check first.
assert len(test_trees) == len(test_annotations), (
    'test: %d trees but %d annotations' % (len(test_trees), len(test_annotations)))
# open(..., 'w') does not create missing directories.
os.makedirs(OUT_PATH, exist_ok=True)
with open(os.path.join(OUT_PATH, 'en_ewt-ud-test.conllu'), 'w', encoding='utf-8') as out:
    for tree, annotation in zip(test_trees, test_annotations):
        print(transform_obl(tree, annotation), file=out)
        print('', file=out)