In [2]:
import sqlite3
import pymorphy2
import json
from collections import Counter
from itertools import combinations as combs
from queue import SimpleQueue

In [1]:
def normalise_key(k):
    """Converts 0-based indexing to 1-based indexing."""
    return str(int(k)+1)

In [3]:
def conll2graph(record):
    """Converts sentences described using CoNLL-U format (http://universaldependencies.org/format.html)
    to graphs. Returns a dictionary of nodes (wordforms and POS tags indexed by line numbers)
    together with a graph of the dependencies encoded as adjacency lists of
    (node_key, relation_label, direction[up or down]) tuples."""
    graph = {}
    nodes = {}
    for line in record.splitlines():
        if line.startswith('#'):
            continue
        fields = line.strip('\n').split('\t')
        key = fields[0]
        # Ignore compound surface keys for aux, du, etc.
        if '-' in key:
            continue
        # lemma would be better, but there are no lemmas in Russian PUD
        # take care of this at a later stage
        wordform = fields[1] 
        pos = fields[3]
        parent = fields[6]
        relation = fields[7]
        nodes[key] = { 'wordform': wordform, 'pos': pos }
        if key not in graph:
            graph[key] = []
        if parent not in graph:
            graph[parent] = []
        graph[key].append((parent, relation, 'up'))
        graph[parent].append((key, relation, 'down'))
    return (nodes, graph)

In [4]:
def extract_raw_sentences(record):
    """Extracts target and source sentences from the target record."""
    lines = record[3].splitlines()
    for l in lines:
        if l.startswith('# text = '):
            target = l.strip('\n')[len('# text = '):]
            for l2 in lines:
                if l2.startswith('# text_en = '):
                    source = l2.strip('\n')[len('# text_en = '):]
                    return (source, target)
            else:
                raise ValueError('No source sentence found')
    else:
        raise ValueError('No target sentence found')

In [5]:
def preprocess_alignment(alignment_str):
    """Extracts unaligned words and one-to-many alignments.
    returns remaining edges as a list."""
    en_degrees = Counter()
    fr_degrees = Counter()
    unaligned_en = []
    unaligned_fr = []
    one_to_many_en = {}
    one_to_many_fr = {}
    alignment_edges = alignment_str.split()
    real_edges = []
    resulting_edges = []
    for edge in alignment_edges:
        en, fr = edge.split('-')
        if en == 'X':
            unaligned_fr.append(fr)
        elif fr == 'X':
            unaligned_en.append(en)
        else:
            en_degrees[en] += 1
            fr_degrees[fr] += 1
            real_edges.append((en, fr))
    for edge in real_edges:
        en, fr = edge
        if en_degrees[en] > 1:
            if en not in one_to_many_en:
                one_to_many_en[en] = []
            one_to_many_en[en].append(fr)
        elif fr_degrees[fr] > 1:
            if fr not in one_to_many_fr:
                one_to_many_fr[fr] = []
            one_to_many_fr[fr].append(en)
        else:
            resulting_edges.append(edge)
    return (
        unaligned_en,
        unaligned_fr,
        one_to_many_en,
        one_to_many_fr,
        resulting_edges
    )

In [6]:
def get_path(node1, node2, graph):
    if node1 == node2:
        return []
    
    # BFS with edge labels for paths
    q = SimpleQueue()
    # Remembers where we came from and the edge label
    sources = {}
    
    q.put(node1)
    visited = set()
    visited.add(node1)
    
    while not q.empty():
        current = q.get()
        for neighbour, relation, direction in graph[current]:
            if neighbour == node2:
                path = [relation+'_'+direction]
                source = current
                while source != node1:
                    prev_node, prev_relation, prev_direction = sources[source]
                    path.append(prev_relation+'_'+prev_direction)
                    source = prev_node
                return list(reversed(path))
            elif neighbour not in visited:
                sources[neighbour] = (current, relation, direction)
                q.put(neighbour)
            visited.add(neighbour)
            
    raise ValueError("UD graph is not connected.")

In [82]:
def strip_directions(path):
    return list(map(lambda x: x.split('_')[0], path))

In [5]:
def get_node_depth(node, graph):
    # BFS
    cur_depth = 0
    q = SimpleQueue()
    q.put(('0',0))
    visited = set()
    visited.add('0')
    while not q.empty():
        current_node, current_depth = q.get()
        for neighbour, *_ in graph[current_node]:
            if neighbour == node:
                return current_depth+1
            elif neighbour not in visited:
                q.put((neighbour, current_depth+1))
            visited.add(neighbour)
    raise IndexError("Target node unreachable")

In [12]:
def get_upgoing_relation(node, graph):
    rel = None
    for edge in graph[node]:
        if edge[-1] == 'up':
            rel = edge[1]
            break
    assert rel != None
    return rel

In [6]:
conn = sqlite3.connect('pud.db')
cursor = conn.cursor()

In [7]:
en_ru = [r for r in cursor.execute('select * from `en-ru` where `verified` = 1')]
en_fr = [r for r in cursor.execute('select * from `en-fr` where `verified` = 1')]
en_zh = [r for r in cursor.execute('select * from `en-zh` where `verified` = 1')]

In [55]:
fr_n, fr_g = conll2graph(en_fr[35][3])

In [8]:
en_zh[0]

('n01001',
 'n01001011',
 "# newdoc id = n01001\n# sent_id = n01001011\n# text = “While much of the digital transition is unprecedented in the United States, the peaceful transition of power is not,” Obama special assistant Kori Schulman wrote in a blog post Monday.\n1\t“\t“\tPUNCT\t``\t_\t20\tpunct\t20:punct\tSpaceAfter=No\n2\tWhile\twhile\tSCONJ\tIN\t_\t9\tmark\t9:mark\t_\n3\tmuch\tmuch\tADJ\tJJ\tDegree=Pos\t9\tnsubj\t9:nsubj\t_\n4\tof\tof\tADP\tIN\t_\t7\tcase\t7:case\t_\n5\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t7\tdet\t7:det\t_\n6\tdigital\tdigital\tADJ\tJJ\tDegree=Pos\t7\tamod\t7:amod\t_\n7\ttransition\ttransition\tNOUN\tNN\tNumber=Sing\t3\tnmod\t3:nmod:of\t_\n8\tis\tbe\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t9\tcop\t9:cop\t_\n9\tunprecedented\tunprecedented\tADJ\tJJ\tDegree=Pos\t20\tadvcl\t20:advcl:while\t_\n10\tin\tin\tADP\tIN\t_\t13\tcase\t13:case\t_\n11\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t13\tdet\t13:det\t_\n12\tUnited\tUnited\tPROPN

In [36]:
def none_stats(corpus):
    en_fr_POS_none = Counter()
    en_fr_POS = Counter()
    en_fr_edge_none = Counter()
    en_fr_edge = Counter()

    fr_POS_none = Counter()
    fr_POS = Counter()
    fr_edge_none = Counter()
    fr_edge = Counter()

    for record in corpus:
        _, _, en_conll, fr_conll, alignment_str, _ = record
        en_n, en_g = conll2graph(en_conll)
        fr_n, fr_g = conll2graph(fr_conll)
        align_dict = json.loads(alignment_str)
        for k, v in align_dict.items():
            if k == 'X':
                for el in v:
                    pos = fr_n[el]['pos']
                    edge = get_upgoing_relation(el, fr_g)
                    fr_POS_none[pos] += 1
                    fr_edge_none[edge] += 1
            else:
                pos = en_n[k]['pos']
                edge = get_upgoing_relation(k, en_g)
                if v == ['X']:
                    en_fr_POS_none[pos] += 1
                    en_fr_edge_none[edge] += 1
                else:
                    en_fr_POS[pos] += 1
                    en_fr_edge[edge] += 1
                    for el in v:
                        pos_fr = fr_n[el]['pos']
                        edge_fr = get_upgoing_relation(el, fr_g)
                        fr_POS[pos] += 1
                        fr_edge[edge] += 1

    anti_rating_POS = Counter()
    anti_rating_edge = Counter()

    for key in en_fr_POS:
        anti_rating_POS[key] = en_fr_POS_none[key] / (en_fr_POS_none[key] + en_fr_POS[key]) * 100
    for key in en_fr_edge:
        anti_rating_edge[key] = en_fr_edge_none[key] / (en_fr_edge_none[key] + en_fr_edge[key]) * 100

    print(anti_rating_POS.most_common(7))
    print()
    print(anti_rating_edge.most_common(7))
    
    print()
    print()
    
    rating_POS = Counter()
    rating_edge = Counter()

    for key in en_fr_POS:
        rating_POS[key] = fr_POS_none[key] / (fr_POS_none[key] + fr_POS[key]) * 100
    for key in en_fr_edge:
        rating_edge[key] = fr_edge_none[key] / (fr_edge_none[key] + fr_edge[key]) * 100

    print(rating_POS.most_common(7))
    print()
    print(rating_edge.most_common(7))

In [37]:
none_stats(en_fr)

[('DET', 13.91304347826087), ('ADV', 10.413885180240321), ('CCONJ', 10.265486725663717), ('AUX', 8.695652173913043), ('PRON', 8.183632734530939), ('VERB', 6.8679980516317585), ('ADJ', 6.217277486910995)]

[('cc:preconj', 71.42857142857143), ('aux:pass', 50.0), ('compound:prt', 45.45454545454545), ('fixed', 14.285714285714285), ('det:predet', 12.5), ('advmod', 10.419313850063533), ('acl', 10.16042780748663)]


[('X', 13.333333333333334), ('PRON', 10.795454545454545), ('CCONJ', 9.269162210338681), ('ADV', 8.089097303634233), ('VERB', 7.542147293700088), ('ADP', 5.008944543828265), ('DET', 4.385964912280701)]

[('discourse', 66.66666666666666), ('fixed', 62.5), ('goeswith', 50.0), ('iobj', 33.33333333333333), ('ccomp', 21.29032258064516), ('det:predet', 12.5), ('obl:tmod', 10.0)]


In [38]:
none_stats(en_ru)

[('ADV', 12.601626016260163), ('PRON', 8.641975308641975), ('CCONJ', 8.333333333333332), ('VERB', 7.184750733137831), ('SCONJ', 7.142857142857142), ('DET', 6.976744186046512), ('AUX', 6.779661016949152)]

[('cc:preconj', 33.33333333333333), ('expl', 20.0), ('fixed', 13.333333333333334), ('compound:prt', 12.068965517241379), ('advmod', 11.642949547218628), ('obl:npmod', 11.11111111111111), ('xcomp', 9.961685823754788)]


[('X', 50.0), ('PART', 28.723404255319153), ('DET', 23.03030303030303), ('CCONJ', 22.22222222222222), ('ADV', 10.638297872340425), ('PRON', 10.374149659863946), ('NOUN', 8.828213879408418)]

[('discourse', 80.0), ('iobj', 76.59574468085107), ('cc:preconj', 66.66666666666666), ('fixed', 35.0), ('acl', 16.037735849056602), ('advmod', 14.176663031624864), ('nmod', 13.412698412698413)]


In [39]:
none_stats(en_zh)

[('CCONJ', 26.159554730983302), ('PRON', 23.40966921119593), ('ADP', 17.24137931034483), ('AUX', 14.000000000000002), ('DET', 13.47517730496454), ('SCONJ', 12.4031007751938), ('ADV', 11.98044009779951)]

[('cc:preconj', 50.0), ('nmod:poss', 26.878612716763005), ('cc', 25.555555555555554), ('det:predet', 25.0), ('case', 18.098958333333336), ('aux', 17.80821917808219), ('nmod:npmod', 15.789473684210526)]


[('X', 62.16216216216216), ('PUNCT', 50.0), ('ADV', 36.98959167333867), ('AUX', 35.55555555555556), ('VERB', 19.269102990033225), ('ADP', 14.722536806342015), ('PRON', 14.025245441795231)]

[('dep', 97.95918367346938), ('dislocated', 66.66666666666666), ('cop', 66.66666666666666), ('obl:tmod', 64.58333333333334), ('punct', 50.0), ('advmod', 37.7431906614786), ('mark', 32.535885167464116)]


In [11]:
align_dict

{'3': ['2'],
 '4': ['3'],
 '5': ['4'],
 '6': ['5', '6'],
 '8': ['8'],
 '9': ['9'],
 '10': ['10'],
 '12': ['12'],
 '14': ['14'],
 '16': ['16'],
 '18': ['18'],
 '19': ['20'],
 '20': ['19'],
 '22': ['21'],
 '24': ['24']}

In [92]:
# Testing code
# n = 44
# en_n, en_g = conll2graph(en_fr[n][2])
# fr_n, fr_g = conll2graph(en_fr[n][3])
# (unaligned_en, unalined_fr, one_to_many_en, one_to_many_fr, 
#      alignment_edges) = preprocess_alignment(en_fr[n][4])
# print(len(alignment_edges))
# for en, fr in sorted(alignment_edges, key = lambda x: int(x[0])):
#     print(en_n[normalise_key(en)]['wordform'] + '->' + fr_n[normalise_key(fr)]['wordform'])
# print()

for i, record in enumerate(en_ru):
    en_n, en_g = conll2graph(record[2])
    fr_n, fr_g = conll2graph(record[3])
    (unaligned_en, unalined_fr, one_to_many_en, one_to_many_fr, 
     alignment_edges) = preprocess_alignment(record[4])
    source_sent, target_sent = extract_raw_sentences(record)
    # Extract highest-positioned counterparts from one-to-many alignments
    for node_en, nodes_fr in one_to_many_en.items():
        min_depth = 1000
        arg_min = 'X'
        for n_fr in nodes_fr:
            current_depth = get_node_depth(normalise_key(n_fr), fr_g)
            if current_depth < min_depth:
                min_depth = current_depth
                arg_min = n_fr
        if arg_min == 'X':
            raise ValueError("Minimum-depth node not found")
        alignment_edges.append((node_en, arg_min))
    for node_fr, nodes_en in one_to_many_fr.items():
        min_depth = 1000
        arg_min = 'X'
        for n_en in nodes_en:
            current_depth = get_node_depth(normalise_key(n_en), en_g)
            if current_depth < min_depth:
                min_depth = current_depth
                arg_min = n_en
        if arg_min == 'X':
            raise ValueError("Minimum-depth node not found")
        alignment_edges.append((arg_min, node_fr))
    for c in combs(alignment_edges, 2):
        p, q = c
        en1, fr1 = map(normalise_key, p)
        en2, fr2 = map(normalise_key, q)
        if fr_n[fr1]['pos'] == 'CCONJ' or fr_n[fr2]['pos'] == 'CCONJ':
            continue # CCONJs were not aligned for Russian
        path_en = get_path(en1, en2, en_g)
        path_fr = get_path(fr1, fr2, fr_g)
        if len(path_en) == 2:
            path_en_stripped = strip_directions(path_en)
            path_fr_stripped = strip_directions(path_fr)
            if path_en_stripped != path_fr_stripped and path_en_stripped == list(reversed(path_fr_stripped)):
                print(i+1)
                print(source_sent, f'[{en_n[en1]["wordform"]}->{en_n[en2]["wordform"]}]', path_en)
                print(target_sent, f'[{fr_n[fr1]["wordform"]}->{fr_n[fr2]["wordform"]}]', path_fr)
                print()

111
The judge in Duffy's fraud and breach of trust trial ultimately ruled they were within the Senate's rules when he cleared Duffy of all charges. [Duffy->charges] ['obj_up', 'obl_down']
Судья в процессе по делу Даффи (Duffy) о мошенничестве и злоупотреблении доверием в конечном счете принял решение, что таковые имели место в пределах правил Сената, сняв при этом с Даффи все обвинения. [Даффи->обвинения] ['obl_up', 'obj_down']

260
News of the company’s deceit - which had run for years - wiped tens of billions of euros from VW's value and cost chief executive Martin Winterkorn his job. [value->tens] ['obl_up', 'obj_down']
Новости о мошенничестве компании были на слуху уже несколько лет, тем самым понизив стоимость «Фольксвагена» на десятки миллиардов евро и лишив главного исполнительного директора Мартина Винтеркорна его должности. [стоимость->десятки] ['obj_up', 'obl_down']

275
This means that they have not benefited from the uplift that the fall in sterling has given to overseas as