In [1]:
import sqlite3
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from queue import Queue
from itertools import combinations as combs

In [2]:
def conll2graph(record):
    """Converts sentences described using CoNLL-U format 
    (http://universaldependencies.org/format.html) to graphs. 
    Returns a dictionary of nodes (wordforms and POS tags indexed 
    by line numbers) together with a graph of the dependencies encoded 
    as adjacency lists of (node_key, relation_label, direction[up or down]) tuples."""
    graph = {}
    nodes = {}
    for line in record.splitlines():
        if line.startswith('#'):
            continue
        fields = line.strip('\n').split('\t')
        key = fields[0]
        # Ignore compound surface keys for aux, du, etc.
        # Ignore hidden additional nodes for orphan handling
        if '-' in key or '.' in key:
            continue
        wordform = fields[1] 
        pos = fields[3]
        parent = fields[6]
        relation = fields[7]
        nodes[key] = {
            'wordform': wordform,
            'pos': pos,
            'relation': relation,
            'parent': parent
        }
        if key not in graph:
            graph[key] = []
        if parent not in graph:
            graph[parent] = []
        graph[key].append((parent, relation, 'up'))
        graph[parent].append((key, relation, 'down'))
    return (nodes, graph)

In [3]:
def get_node_depth(node, graph):
    """A BFS-based implementation."""
    cur_depth = 0
    q = Queue()
    q.put(('0',0))
    visited = set()
    visited.add('0')
    while not q.empty():
        current_node, current_depth = q.get()
        for neighbour, *_ in graph[current_node]:
            if neighbour == node:
                return current_depth+1
            elif neighbour not in visited:
                q.put((neighbour, current_depth+1))
            visited.add(neighbour)
    raise IndexError("Target node unreachable")

In [4]:
def highest_or_none(indices, graph):
    if indices[0] == 'X':
        return None
    min_depth = 1000
    argmin = None
    for i in indices:
        key = str(i)
        depth = get_node_depth(key, graph)
        if depth < min_depth:
            min_depth = depth
            argmin = key
    assert argmin is not None
    return argmin

In [5]:
def get_path(node1, node2, graph):
    if node1 == node2:
        return []
    
    # BFS with edge labels for paths
    q = Queue()
    # Remembers where we came from and the edge label
    sources = {}
    
    q.put(node1)
    visited = set()
    visited.add(node1)
    
    while not q.empty():
        current = q.get()
        for neighbour, relation, direction in graph[current]:
            if neighbour == node2:
                path = [relation+'_'+direction]
                source = current
                while source != node1:
                    prev_node, prev_relation, prev_direction = sources[source]
                    path.append(prev_relation+'_'+prev_direction)
                    source = prev_node
                return list(reversed(path))
            elif neighbour not in visited:
                sources[neighbour] = (current, relation, direction)
                q.put(neighbour)
            visited.add(neighbour)
            
    raise ValueError("UD graph is not connected.")

In [20]:
def confusion_dict2matrix(cd):
    'Takes as input a map[string -> Counter[string -> int]]. Returns a Pandas dataframe.'
    row_keys = sorted(cd)
    additional_column_keys = set()
    for val in cd.values():
        for key in val:
            if val not in row_keys:
                additional_column_keys.add(key)
    column_keys = row_keys + sorted(additional_column_keys)
    conf_matrix = np.zeros(
        (len(row_keys), len(column_keys)),
        int
    )
    conf_df = pd.DataFrame(conf_matrix)
    conf_df.index = row_keys
    conf_df.columns = column_keys
    for row_key, counter in cd.items():
        for column_key, val in counter.items():
#             if k in column_keys:
#                 column_key = k
#             else:
#                 column_key = 'Other'
            conf_df.loc[row_key][column_key] += val
    return conf_df

In [7]:
def compute_confusion_dicts(en, ko, alignments):
    confusion_dict_pos = {}
    confusion_dict_paths = {}

    strip_direction = lambda x: x.split('_')[0]

    for i in range(len(en)):
        en_n, en_g = conll2graph(en[i])
        ko_n, ko_g = conll2graph(ko[i])
        alignment = alignments[i]
        # Simplify the alignment to a set of one-to-one pairs
        one_to_one = []
        for k, v in alignment.items():
            if k == 'X':
                # Do not analyse stuff added on the Ko side for now
                continue
            head = k
            tail = str(highest_or_none(v, ko_g))
            one_to_one.append((head, tail))
        # POS confusion dict
        for pair in one_to_one:
            head, tail = pair
            # Skip technical additional nodes
            if '.' in head:
                continue
            try:
                en_pos = en_n[head]['pos']
            except KeyError:
                print(i, en[i])
                continue
            if tail == 'None':
                ko_pos = 'None'
            else:
                ko_pos = ko_n[tail]['pos']
            if en_pos not in confusion_dict_pos:
                confusion_dict_pos[en_pos] = Counter()
            confusion_dict_pos[en_pos][ko_pos] += 1
        # Path confusion dict
        for pair in combs(one_to_one, 2):
            (en_head, ko_head), (en_tail, ko_tail) = pair
            # Skip technical additional nodes
            if '.' in head:
                continue
            en_path_arr = get_path(en_head, en_tail, en_g)
            if len(en_path_arr) > 1:
                continue
            en_path = strip_direction(en_path_arr[0])
            if ko_head == ko_tail:
                ko_path = 'Nodes collapsed'
            elif ko_head == 'None' and ko_tail == 'None':
                ko_path = 'Both endpoints unaligned'
            elif ko_head == 'None' or ko_tail == 'None':
                ko_path = 'One endpoint unaligned'
            else:
                ko_path_arr = get_path(ko_head, ko_tail, ko_g)
                ko_path = '->'.join(
                    list(map(strip_direction, ko_path_arr))
                )
            if en_path not in confusion_dict_paths:
                confusion_dict_paths[en_path] = Counter()
            confusion_dict_paths[en_path][ko_path] += 1
        
    return confusion_dict_pos, confusion_dict_paths

In [8]:
def extract_data_for_lang(target_lang='fr', dbpath='pud_current.db'):
    conn = sqlite3.connect('pud_current.db')
    cursor = conn.cursor()
    en = []
    ko = []
    alignments = []
    for en_, ko_, alignment_str in cursor.execute(
        f'SELECT `en`, `ru`, `alignment` FROM `en-{target_lang}` WHERE `verified` = 1'
    ):
        en.append(en_)
        ko.append(ko_)
        alignments.append(json.loads(alignment_str))
    conn.close()
    return en, ko, alignments

In [9]:
# Different datasets may have different number of 
# source blocks due to unverified sentences.
en_fr, fr, en_fr_align = extract_data_for_lang()
en_ru, ru, en_ru_align = extract_data_for_lang('ru')
en_zh, zh, en_zh_align = extract_data_for_lang('zh')
en_ko, ko, en_ko_align = extract_data_for_lang('ko')

In [16]:
en_fr_cd_pos, en_fr_cd_paths = compute_confusion_dicts(en_fr, fr, en_fr_align)

In [21]:
confusion_dict2matrix(en_fr_cd_pos)

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,...,DET.1,NOUN.1,NUM.1,None,PRON,PROPN,SCONJ,SYM,VERB,X
ADJ,1066,17,64,0,0,2,0,200,2,0,...,2,200,2,95,1,17,0,0,62,2
ADP,14,398,24,0,1,4,0,31,0,0,...,4,31,0,31,1,1,1,0,28,0
ADV,41,34,478,1,5,6,0,66,1,0,...,6,66,1,78,8,2,5,0,24,0
AUX,0,0,2,4,0,0,0,0,0,0,...,0,0,0,8,0,0,0,0,78,0
CCONJ,1,4,16,0,483,0,0,1,0,0,...,0,1,0,58,0,1,0,1,0,0
DET,31,5,16,0,0,19,0,11,7,0,...,19,11,7,16,4,1,0,0,5,0
INTJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
NOUN,142,13,14,1,0,1,0,3554,11,0,...,1,3554,11,157,16,28,0,6,64,1
NUM,8,1,0,0,0,11,0,32,391,0,...,11,32,391,5,1,2,0,0,1,0
PART,0,1,51,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,2,0


In [13]:
confusion_dict2matrix(en_fr_cd_paths)

Unnamed: 0,acl,acl:relcl,advcl,advmod,amod,appos,aux,aux:pass,case,cc,...,nsubj:pass,nummod,obj,obl,obl:npmod,obl:tmod,orphan,parataxis,xcomp,Other
acl,6,7,3,0,8,0,0,0,0,0,...,1,0,2,2,0,0,0,0,25,73
acl:relcl,3,101,1,1,5,3,0,0,0,0,...,1,0,2,1,0,0,0,0,0,72
advcl,2,0,106,2,0,0,0,0,1,0,...,1,0,0,16,0,0,0,4,5,115
advmod,0,3,1,399,9,2,0,0,7,0,...,1,0,6,32,0,1,0,0,7,268
amod,1,2,0,10,889,6,0,0,2,0,...,0,4,1,5,0,0,0,0,0,251
appos,0,0,0,0,2,92,0,0,0,0,...,0,0,0,0,0,1,0,0,0,38
aux,0,0,0,2,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,60,12
aux:pass,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
case,0,0,0,1,0,0,0,0,404,1,...,0,0,12,18,0,0,0,0,0,83
cc,0,0,0,8,0,0,0,0,0,388,...,0,0,0,0,0,0,0,0,0,150


In [25]:
en_ru_cd_pos, en_ru_cd_paths = compute_confusion_dicts(en_ru, ru, en_ru_align)
en_zh_cd_pos, en_zh_cd_paths = compute_confusion_dicts(en_zh, zh, en_zh_align)
en_ko_cd_pos, en_ko_cd_paths = compute_confusion_dicts(en_ko, ko, en_ko_align)

In [27]:
confusion_dict2matrix(en_fr_cd_pos).to_csv('en_fr_pos.csv')
confusion_dict2matrix(en_ru_cd_pos).to_csv('en_ru_pos.csv')
confusion_dict2matrix(en_zh_cd_pos).to_csv('en_zh_pos.csv')
confusion_dict2matrix(en_ko_cd_pos).to_csv('en_ko_pos.csv')
confusion_dict2matrix(en_fr_cd_paths).to_csv('en_fr_paths.csv')
confusion_dict2matrix(en_ru_cd_paths).to_csv('en_ru_paths.csv')
confusion_dict2matrix(en_zh_cd_paths).to_csv('en_zh_paths.csv')
confusion_dict2matrix(en_ko_cd_paths).to_csv('en_ko_paths.csv')