In [11]:
import sqlite3
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from queue import Queue
from itertools import combinations as combs

In [2]:
def conll2graph(record):
    """Convert one sentence in CoNLL-U format to a node table and a graph.

    See http://universaldependencies.org/format.html for the column layout.

    Parameters
    ----------
    record : str
        The CoNLL-U text of a sentence: one token per line, tab-separated.
        Comment lines (starting with '#') and blank lines are ignored.

    Returns
    -------
    (nodes, graph) : tuple of dict
        ``nodes`` maps the token ID (column 1, kept as a string) to a dict
        with the keys 'wordform', 'pos', 'relation' and 'parent'.
        ``graph`` maps every token ID, and every ID that occurs as a parent,
        to an adjacency list of ``(node_key, relation_label, direction)``
        tuples, where direction is 'up' (towards the parent) or 'down'
        (towards the child).

    Raises
    ------
    IndexError
        If a non-blank, non-comment token line has fewer than 8 columns.
    """
    graph = {}
    nodes = {}
    for line in record.splitlines():
        # Blank lines (e.g. a trailing sentence separator) carry no token;
        # without this guard they would fail on the column lookups below.
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split('\t')
        key = fields[0]
        # Ignore compound surface keys for aux, du, etc. (IDs like "3-4").
        # Ignore hidden additional nodes for orphan handling (IDs like "8.1").
        if '-' in key or '.' in key:
            continue
        parent = fields[6]
        relation = fields[7]
        nodes[key] = {
            'wordform': fields[1],
            'pos': fields[3],
            'relation': relation,
            'parent': parent
        }
        # Every dependency is stored twice so that the graph can be walked
        # in both directions; the child entry is created before the parent's.
        graph.setdefault(key, []).append((parent, relation, 'up'))
        graph.setdefault(parent, []).append((key, relation, 'down'))
    return (nodes, graph)

In [3]:
def get_node_depth(node, graph):
    """Return the distance of ``node`` from the artificial root '0'.

    A BFS-based implementation. Edges are followed regardless of their
    'up'/'down' direction, so the result is the number of edges on the
    shortest path between '0' and ``node``.

    Parameters
    ----------
    node : str
        Key of the target node.
    graph : dict
        Adjacency lists whose entries start with the neighbour's key
        (the layout produced by ``conll2graph``).

    Returns
    -------
    int
        0 for the root itself, 1 for its direct neighbours, and so on.

    Raises
    ------
    IndexError
        If ``node`` cannot be reached from '0'.
    KeyError
        If a visited node (including '0') has no entry in ``graph``.
    """
    root = '0'
    # The loop below only recognises the target when it shows up as a
    # neighbour, so the root has to be handled up front.
    if node == root:
        return 0
    q = Queue()
    q.put((root, 0))
    visited = {root}
    while not q.empty():
        current_node, current_depth = q.get()
        for neighbour, *_ in graph[current_node]:
            if neighbour == node:
                return current_depth + 1
            if neighbour not in visited:
                visited.add(neighbour)
                q.put((neighbour, current_depth + 1))
    raise IndexError("Target node unreachable")

In [4]:
def highest_or_none(indices, graph):
    """Pick the node from ``indices`` that lies closest to the root.

    Parameters
    ----------
    indices : sequence
        Node identifiers (converted with ``str`` before lookup). A first
        element equal to 'X' is treated as "no node"
        (presumably the marker for an unaligned token — confirm against
        the alignment format).
    graph : dict
        Dependency graph passed through to ``get_node_depth``.

    Returns
    -------
    str or None
        The key with the smallest depth (the first one wins on ties), or
        None when ``indices`` is empty or starts with 'X'.
    """
    if not indices or indices[0] == 'X':
        return None
    # min() returns the first minimal element, i.e. the same tie-breaking
    # as a strict "<" comparison while scanning left to right.
    return min(
        (str(i) for i in indices),
        key=lambda key: get_node_depth(key, graph)
    )

In [6]:
def get_path(node1, node2, graph):
    """Return the labelled shortest path from ``node1`` to ``node2``.

    The graph is searched breadth-first. Each step of the result is the
    string ``relation + '_' + direction`` of the edge that was followed,
    listed in order from ``node1`` to ``node2``. An empty list is returned
    when both arguments are the same node.

    Raises ValueError when ``node2`` cannot be reached from ``node1``.
    """
    if node1 == node2:
        return []

    # For every discovered node: (predecessor, edge label, edge direction).
    came_from = {}
    seen = {node1}
    frontier = Queue()
    frontier.put(node1)

    while not frontier.empty():
        here = frontier.get()
        for nxt, label, way in graph[here]:
            if nxt == node2:
                # Collect the steps back to the start, then flip them.
                steps = [label + '_' + way]
                cursor = here
                while cursor != node1:
                    cursor, back_label, back_way = came_from[cursor]
                    steps.append(back_label + '_' + back_way)
                steps.reverse()
                return steps
            if nxt not in seen:
                seen.add(nxt)
                came_from[nxt] = (here, label, way)
                frontier.put(nxt)

    raise ValueError("UD graph is not connected.")

In [23]:
# Load the verified rows of the `en-ko` table: the two sentence columns and
# the JSON-encoded alignment between them.
conn = sqlite3.connect('pud_2019_05_25.db')
try:
    cursor = conn.cursor()
    # NOTE(review): the second column is selected as `ru` although the table
    # is `en-ko` and the values are stored in `ko` — presumably the schema
    # reuses that column name for every language pair; confirm.
    rows = cursor.execute(
        'SELECT `en`, `ru`, `alignment` FROM `en-ko` WHERE `verified` = 1'
    ).fetchall()
finally:
    # Release the database handle even if the query fails.
    conn.close()

en = []
ko = []
alignments = []
for en_, ko_, alignment_str in rows:
    en.append(en_)
    ko.append(ko_)
    alignments.append(json.loads(alignment_str))

In [24]:
# Both dictionaries are rebuilt from scratch so the cell can be re-run safely.
# confusion_dict_pos:   English POS tag    -> Counter of Korean POS tags
# confusion_dict_paths: English relation   -> Counter of Korean paths
confusion_dict_pos = {}
confusion_dict_paths = {}


def strip_direction(x):
    """Drop the trailing '_up' / '_down' marker from a step made by get_path."""
    # Split from the right so a relation label containing '_' stays intact.
    return x.rsplit('_', 1)[0]


for i, (en_record, ko_record, alignment) in enumerate(zip(en, ko, alignments)):
    en_n, en_g = conll2graph(en_record)
    ko_n, ko_g = conll2graph(ko_record)
    # Simplify the alignment to a set of one-to-one pairs: every English key
    # is paired with its highest aligned Korean node ('None' if there is none).
    one_to_one = []
    for k, v in alignment.items():
        if k == 'X':
            # Do not analyse stuff added on the Ko side for now
            continue
        one_to_one.append((k, str(highest_or_none(v, ko_g))))

    # POS confusion dict
    for head, tail in one_to_one:
        # Skip technical additional nodes
        if '.' in head:
            continue
        try:
            en_pos = en_n[head]['pos']
        except KeyError:
            # The alignment refers to a key missing from the parsed English
            # sentence: report the sentence and move on.
            print(i, en_record)
            continue
        ko_pos = 'None' if tail == 'None' else ko_n[tail]['pos']
        confusion_dict_pos.setdefault(en_pos, Counter())[ko_pos] += 1

    # Path confusion dict
    for (en_head, ko_head), (en_tail, ko_tail) in combs(one_to_one, 2):
        # Skip technical additional nodes. Both endpoints of the current
        # pair are checked (not a variable left over from the loop above).
        if '.' in en_head or '.' in en_tail:
            continue
        # Skip keys missing from the parsed sentence; the POS loop has
        # already reported them, and get_path cannot handle unknown nodes.
        if en_head not in en_n or en_tail not in en_n:
            continue
        en_path_arr = get_path(en_head, en_tail, en_g)
        # Only directly connected English nodes (a single edge) are analysed.
        if len(en_path_arr) != 1:
            continue
        en_path = strip_direction(en_path_arr[0])
        if ko_head == ko_tail:
            ko_path = 'Nodes collapsed'
        elif ko_head == 'None' and ko_tail == 'None':
            ko_path = 'Both endpoints unaligned'
        elif ko_head == 'None' or ko_tail == 'None':
            ko_path = 'One endpoint unaligned'
        else:
            ko_path_arr = get_path(ko_head, ko_tail, ko_g)
            ko_path = '->'.join(map(strip_direction, ko_path_arr))
        confusion_dict_paths.setdefault(en_path, Counter())[ko_path] += 1

In [27]:
def confusion_dict2matrix(cd):
    """Turn a nested count mapping into a confusion matrix.

    Parameters
    ----------
    cd : map[string -> Counter[string -> int]]
        Outer keys become the rows; inner keys are the observed outcomes.

    Returns
    -------
    pandas.DataFrame
        Integer matrix whose index is the sorted outer keys and whose
        columns are those same keys followed by 'Other'. Counts for inner
        keys that are not row keys are summed into the 'Other' column.
        (If 'Other' is itself a row key the column label occurs twice.)
    """
    row_keys = sorted(cd)
    column_keys = row_keys + ['Other']
    # Column position of every known key; anything else goes to the last one.
    position = {key: pos for pos, key in enumerate(row_keys)}
    other_position = len(row_keys)
    conf_matrix = np.zeros(
        (len(row_keys), len(column_keys)),
        int
    )
    # Fill the array directly instead of using chained DataFrame indexing
    # (`df.loc[row][col] += val`), which may update a temporary copy rather
    # than the frame itself.
    for row_key, counter in cd.items():
        row = position[row_key]
        for k, val in counter.items():
            conf_matrix[row, position.get(k, other_position)] += val
    return pd.DataFrame(conf_matrix, index=row_keys, columns=column_keys)

In [28]:
# Show the POS confusion matrix: rows are English POS tags, columns are the
# POS tags of the aligned Korean nodes; Korean values that never occur as an
# English tag (including 'None' for unaligned tokens) are pooled in 'Other'.
confusion_dict2matrix(confusion_dict_pos)

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X,Other
ADJ,190,0,44,0,0,42,0,678,20,4,4,146,0,0,0,50,0,75
ADP,5,0,14,0,0,1,0,126,0,3,0,7,0,0,0,21,0,273
ADV,56,0,211,1,17,5,0,146,1,10,5,2,0,0,0,28,0,132
AUX,33,0,0,4,0,3,0,8,0,1,0,0,0,0,0,10,0,43
CCONJ,2,0,0,0,16,1,0,2,0,0,0,1,0,0,0,1,0,12
DET,6,0,9,0,0,25,0,25,0,2,0,0,0,0,0,6,0,39
INTJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
NOUN,12,0,14,6,0,10,0,2976,4,3,8,38,0,0,0,58,0,204
NUM,0,0,1,0,0,48,0,157,148,0,0,1,0,0,0,0,0,24
PART,14,0,2,0,0,0,0,1,0,3,0,0,0,0,0,30,0,9


In [30]:
# Rebuild the POS confusion matrix and write it to 'en_ko_pos.csv'.
# NOTE(review): index=False leaves the row labels out of the file; the rows
# follow the column order (without the final 'Other' column) — confirm that
# consumers of the CSV do not need explicit row labels.
confusion_dict2matrix(confusion_dict_pos).to_csv('en_ko_pos.csv', index=False)

In [29]:
# Show the path confusion matrix: rows are the relations of single English
# edges, columns are the corresponding Korean paths; Korean paths that are
# not a row label (multi-step paths, collapsed or unaligned endpoints) are
# pooled in 'Other'.
confusion_dict2matrix(confusion_dict_paths)

Unnamed: 0,acl,acl:relcl,advcl,advmod,amod,appos,aux,aux:pass,case,cc,...,nummod,obj,obl,obl:npmod,obl:tmod,orphan,parataxis,punct,xcomp,Other
acl,0,42,3,2,0,0,1,0,0,0,...,0,3,0,0,0,0,0,0,0,87
acl:relcl,0,39,5,2,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,96
advcl,0,2,35,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,161
advmod,0,4,12,198,4,0,30,0,0,0,...,1,5,0,0,0,0,0,0,0,334
amod,0,93,1,21,109,0,0,0,0,0,...,16,4,0,0,0,0,0,0,0,337
appos,0,26,1,0,0,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,53
aux,0,1,2,0,0,0,4,0,0,0,...,0,1,0,0,0,0,0,0,0,50
aux:pass,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
case,0,5,0,13,0,0,0,0,0,0,...,13,13,0,0,1,0,0,0,0,309
cc,0,0,1,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,22


In [31]:
# Rebuild the path confusion matrix and write it to 'en_ko_paths.csv'.
# NOTE(review): index=False leaves the row labels out of the file; the rows
# follow the column order (without the final 'Other' column) — confirm that
# consumers of the CSV do not need explicit row labels.
confusion_dict2matrix(confusion_dict_paths).to_csv('en_ko_paths.csv', index=False)