In [1]:
import sqlite3
import pymorphy2
import json
from collections import Counter
from itertools import combinations as combs
from queue import SimpleQueue

In [2]:
def normalise_key(k):
    """Shift a 0-based index to its 1-based counterpart.

    The argument is anything `int()` accepts (an int or a numeric string);
    the shifted index is returned as a string.
    """
    one_based = int(k) + 1
    return f'{one_based}'

In [54]:
def conll2graph(record):
    """Converts a sentence described using CoNLL-U format
    (http://universaldependencies.org/format.html) to a graph.

    Comment lines (starting with '#'), blank lines and compound surface
    tokens (ID containing '-') are skipped.

    Returns a tuple (nodes, graph):
    nodes maps every token ID to {'wordform': ..., 'pos': ...};
    graph maps every ID that occurs as a token or as a head to an adjacency
    list of (node_key, relation_label, direction) tuples, where direction is
    'up' for the edge from a token to its head and 'down' for the edge from
    the head to that token."""
    graph = {}
    nodes = {}
    for line in record.splitlines():
        # A blank line (CoNLL-U sentences are terminated by one) has no
        # tab-separated fields and would otherwise fail on fields[1].
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split('\t')
        key = fields[0]
        # Ignore compound surface keys for aux, du, etc.
        if '-' in key:
            continue
        # lemma would be better, but there are no lemmas in Russian PUD
        # take care of this at a later stage
        wordform = fields[1]
        pos = fields[3]
        parent = fields[6]
        relation = fields[7]
        nodes[key] = {'wordform': wordform, 'pos': pos}
        # Every dependency is stored twice so the graph can be walked in
        # both directions.
        graph.setdefault(key, []).append((parent, relation, 'up'))
        graph.setdefault(parent, []).append((key, relation, 'down'))
    return (nodes, graph)

In [4]:
def extract_raw_sentences(record):
    """Pulls the raw source and target sentences out of a record.

    Both sentences are read from the comment lines of record[3]: the target
    from the first line starting with '# text = ', the source from the first
    line starting with '# text_en = '.

    Returns (source, target). Raises ValueError when either line is missing;
    a missing target is reported in preference to a missing source."""
    target_tag = '# text = '
    source_tag = '# text_en = '
    comment_lines = record[3].splitlines()

    target = next(
        (ln[len(target_tag):] for ln in comment_lines if ln.startswith(target_tag)),
        None,
    )
    if target is None:
        raise ValueError('No target sentence found')

    source = next(
        (ln[len(source_tag):] for ln in comment_lines if ln.startswith(source_tag)),
        None,
    )
    if source is None:
        raise ValueError('No source sentence found')

    return (source, target)

In [5]:
def preprocess_alignment(alignment_str):
    """Sorts the edges of a whitespace-separated 'en-fr' alignment string.

    An edge with 'X' on one side marks the word on the other side as
    unaligned. Of the remaining edges, those whose source word takes part in
    more than one edge are grouped by source word; otherwise, those whose
    target word takes part in more than one edge are grouped by target word;
    all others are kept as plain one-to-one edges.

    Returns a tuple
    (unaligned_en, unaligned_fr, one_to_many_en, one_to_many_fr, edges)
    where the first two are lists of keys, the next two are dicts mapping a
    key to the list of its counterparts, and edges is a list of (en, fr)
    tuples."""
    src_only, tgt_only, linked = [], [], []
    src_fanout, tgt_fanout = Counter(), Counter()

    # First pass: set the unaligned words aside and count how many real
    # edges touch each word.
    for token in alignment_str.split():
        src, tgt = token.split('-')
        if src == 'X':
            tgt_only.append(tgt)
            continue
        if tgt == 'X':
            src_only.append(src)
            continue
        src_fanout[src] += 1
        tgt_fanout[tgt] += 1
        linked.append((src, tgt))

    # Second pass: distribute the real edges; the source-side check wins
    # when both ends have several edges.
    src_groups, tgt_groups, one_to_one = {}, {}, []
    for pair in linked:
        src, tgt = pair
        if src_fanout[src] > 1:
            src_groups.setdefault(src, []).append(tgt)
        elif tgt_fanout[tgt] > 1:
            tgt_groups.setdefault(tgt, []).append(src)
        else:
            one_to_one.append(pair)

    return (src_only, tgt_only, src_groups, tgt_groups, one_to_one)

In [6]:
def get_path(node1, node2, graph):
    """Finds a shortest path between two nodes of a dependency graph.

    graph maps a node key to a list of (neighbour, relation, direction)
    tuples. The result is the list of 'relation_direction' labels of the
    edges walked from node1 to node2; it is empty when both nodes are the
    same. Raises ValueError when node2 cannot be reached from node1."""
    if node1 == node2:
        return []

    # For every discovered node: the node it was reached from, plus the
    # label and direction of the connecting edge.
    came_from = {}
    seen = {node1}
    frontier = SimpleQueue()
    frontier.put(node1)

    # Breadth-first search, so the first hit on node2 is a shortest path.
    while not frontier.empty():
        here = frontier.get()
        for there, label, way in graph[here]:
            if there == node2:
                # Walk the back-pointers to node1, collecting labels from
                # the last edge to the first, then flip the list.
                steps = ['_'.join((label, way))]
                cursor = here
                while cursor != node1:
                    cursor, back_label, back_way = came_from[cursor]
                    steps.append('_'.join((back_label, back_way)))
                steps.reverse()
                return steps
            if there not in seen:
                came_from[there] = (here, label, way)
                frontier.put(there)
                seen.add(there)

    raise ValueError("UD graph is not connected.")

In [82]:
def strip_directions(path):
    """Removes the trailing '_up' / '_down' direction from every step of a path.

    path is a list of 'relation_direction' strings as produced by get_path.
    Only the last underscore-separated part is dropped, so a relation label
    that itself contains an underscore is returned intact. Returns a new list
    of relation labels in the same order."""
    return [step.rsplit('_', 1)[0] for step in path]

In [37]:
def get_node_depth(node, graph, root='0'):
    """Returns the number of edges between the root node and the given node.

    graph maps a node key to a list of tuples whose first element is a
    neighbour key. The search is breadth-first from root (by default '0'),
    so the value returned is the length of a shortest path. The root itself
    has depth 0. Raises IndexError when node cannot be reached from root."""
    # Without this guard the search would find the root again through one of
    # its children and report a depth of 2.
    if node == root:
        return 0

    q = SimpleQueue()
    q.put((root, 0))
    visited = {root}
    while not q.empty():
        current_node, current_depth = q.get()
        for neighbour, *_ in graph[current_node]:
            if neighbour == node:
                return current_depth + 1
            if neighbour not in visited:
                visited.add(neighbour)
                q.put((neighbour, current_depth + 1))
    raise IndexError("Target node unreachable")

In [7]:
# Open the SQLite database file 'pud_30_12.db'; the path is relative to the
# current working directory.
conn = sqlite3.connect('pud_30_12.db')
# Cursor used by the queries that load the `en-ru` and `en-fr` tables.
cursor = conn.cursor()

In [8]:
# Keep only the first 500 `en-ru` and the first 776 `en-fr` records.
# fetchmany stops reading after the requested number of rows instead of
# materialising the whole table and slicing the list afterwards.
en_ru = cursor.execute('select * from `en-ru`').fetchmany(500)
en_fr = cursor.execute('select * from `en-fr`').fetchmany(776)

In [55]:
# Parse both sides of one en-fr record for interactive inspection: column 2
# holds the English CoNLL-U text, column 3 the target-language one. The
# English side is parsed here too because the following cell displays `en_n`,
# which no earlier cell defines when the notebook is run top to bottom.
# NOTE(review): the stored output of that cell may stem from a different
# record - confirm after re-running.
en_n, en_g = conll2graph(en_fr[35][2])
fr_n, fr_g = conll2graph(en_fr[35][3])

In [62]:
# Display the node table (wordform and POS tag per token key) of the parsed
# English sentence; `en_n` has to be bound by a cell executed before this one.
en_n

{'1': {'wordform': 'The', 'pos': 'DET'},
 '2': {'wordform': 'company', 'pos': 'NOUN'},
 '3': {'wordform': 'told', 'pos': 'VERB'},
 '4': {'wordform': 'the', 'pos': 'DET'},
 '5': {'wordform': 'BBC', 'pos': 'PROPN'},
 '6': {'wordform': 'it', 'pos': 'PRON'},
 '7': {'wordform': 'would', 'pos': 'AUX'},
 '8': {'wordform': 'be', 'pos': 'AUX'},
 '9': {'wordform': 'the', 'pos': 'DET'},
 '10': {'wordform': 'responsibility', 'pos': 'NOUN'},
 '11': {'wordform': 'of', 'pos': 'ADP'},
 '12': {'wordform': 'each', 'pos': 'DET'},
 '13': {'wordform': 'airline', 'pos': 'NOUN'},
 '14': {'wordform': 'brand', 'pos': 'NOUN'},
 '15': {'wordform': 'to', 'pos': 'PART'},
 '16': {'wordform': 'decide', 'pos': 'VERB'},
 '17': {'wordform': 'whether', 'pos': 'SCONJ'},
 '18': {'wordform': 'to', 'pos': 'PART'},
 '19': {'wordform': 'charge', 'pos': 'VERB'},
 '20': {'wordform': 'passengers', 'pos': 'NOUN'},
 '21': {'wordform': 'an', 'pos': 'DET'},
 '22': {'wordform': 'access', 'pos': 'NOUN'},
 '23': {'wordform': 'fee', 'po

In [41]:
# Spot check: number of edges between node '0' and token '16' in the parsed
# target-language graph `fr_g`.
get_node_depth('16', fr_g)

5

In [92]:
# Testing code
# n = 44
# en_n, en_g = conll2graph(en_fr[n][2])
# fr_n, fr_g = conll2graph(en_fr[n][3])
# (unaligned_en, unalined_fr, one_to_many_en, one_to_many_fr, 
#      alignment_edges) = preprocess_alignment(en_fr[n][4])
# print(len(alignment_edges))
# for en, fr in sorted(alignment_edges, key = lambda x: int(x[0])):
#     print(en_n[normalise_key(en)]['wordform'] + '->' + fr_n[normalise_key(fr)]['wordform'])
# print()

def _shallowest_key(candidates, graph):
    """Returns the 0-based key among candidates whose node is closest to the
    root of graph; the earliest candidate wins a tie."""
    if not candidates:
        raise ValueError("Minimum-depth node not found")
    return min(candidates, key=lambda k: get_node_depth(normalise_key(k), graph))


# For every en-ru record, look for pairs of aligned words that are linked by
# a two-edge path in the English tree and by a path with the same relation
# labels in reverse order in the target tree, and print each such pair.
for i, record in enumerate(en_ru):
    en_n, en_g = conll2graph(record[2])
    fr_n, fr_g = conll2graph(record[3])
    # `unalined_fr` (sic) keeps its original spelling in case other cells
    # refer to it.
    (unaligned_en, unalined_fr, one_to_many_en, one_to_many_fr,
     alignment_edges) = preprocess_alignment(record[4])
    source_sent, target_sent = extract_raw_sentences(record)

    # Extract highest-positioned counterparts from one-to-many alignments
    for node_en, nodes_fr in one_to_many_en.items():
        alignment_edges.append((node_en, _shallowest_key(nodes_fr, fr_g)))
    for node_fr, nodes_en in one_to_many_fr.items():
        alignment_edges.append((_shallowest_key(nodes_en, en_g), node_fr))

    for first_edge, second_edge in combs(alignment_edges, 2):
        en1, fr1 = map(normalise_key, first_edge)
        en2, fr2 = map(normalise_key, second_edge)
        if fr_n[fr1]['pos'] == 'CCONJ' or fr_n[fr2]['pos'] == 'CCONJ':
            continue # CCONJs were not aligned for Russian

        path_en = get_path(en1, en2, en_g)
        if len(path_en) != 2:
            continue
        # The target path is only needed for two-edge source paths, so it is
        # computed after the length check rather than for every pair.
        path_fr = get_path(fr1, fr2, fr_g)

        path_en_stripped = strip_directions(path_en)
        path_fr_stripped = strip_directions(path_fr)
        # Report only true reversals: the relation sequences differ, yet one
        # is the mirror image of the other.
        if (path_en_stripped != path_fr_stripped
                and path_en_stripped == list(reversed(path_fr_stripped))):
            print(i + 1)
            print(source_sent, f'[{en_n[en1]["wordform"]}->{en_n[en2]["wordform"]}]', path_en)
            print(target_sent, f'[{fr_n[fr1]["wordform"]}->{fr_n[fr2]["wordform"]}]', path_fr)
            print()

111
The judge in Duffy's fraud and breach of trust trial ultimately ruled they were within the Senate's rules when he cleared Duffy of all charges. [Duffy->charges] ['obj_up', 'obl_down']
Судья в процессе по делу Даффи (Duffy) о мошенничестве и злоупотреблении доверием в конечном счете принял решение, что таковые имели место в пределах правил Сената, сняв при этом с Даффи все обвинения. [Даффи->обвинения] ['obl_up', 'obj_down']

260
News of the company’s deceit - which had run for years - wiped tens of billions of euros from VW's value and cost chief executive Martin Winterkorn his job. [value->tens] ['obl_up', 'obj_down']
Новости о мошенничестве компании были на слуху уже несколько лет, тем самым понизив стоимость «Фольксвагена» на десятки миллиардов евро и лишив главного исполнительного директора Мартина Винтеркорна его должности. [стоимость->десятки] ['obj_up', 'obl_down']

275
This means that they have not benefited from the uplift that the fall in sterling has given to overseas as