In [33]:
import sqlite3
import json
import re
from collections import Counter

## Preprocessing

In [18]:
def conll2graph(record):
    """Convert one sentence in CoNLL-U format into a node table and a dependency graph.

    The format is described at http://universaldependencies.org/format.html.

    Parameters
    ----------
    record : str
        The lines of a single sentence separated by newlines. Comment lines
        (starting with '#') and blank lines are skipped, as are rows whose ID
        contains '-' (multiword surface tokens) or '.' (empty nodes).

    Returns
    -------
    (nodes, graph) : tuple of two dicts
        ``nodes`` maps each token ID (kept as the string found in the first
        column) to a dict with the keys 'wordform', 'pos', 'relation' and
        'parent', read from columns 2, 4, 8 and 7 respectively.
        ``graph`` maps an ID to an adjacency list of
        ``(other_id, relation_label, direction)`` tuples: direction is 'up'
        for the edge from a token to its head and 'down' for the edge from a
        head to a dependent. Every head ID gets an entry in ``graph`` even if
        it has no row of its own (e.g. the head '0' of the root token), so
        ``graph`` may contain keys that are missing from ``nodes``.
    """
    graph = {}
    nodes = {}
    for line in record.splitlines():
        # Blank lines carry no columns; without this guard they would fail
        # with an IndexError when the fields are read below.
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split('\t')
        key = fields[0]
        # Ignore compound surface keys for aux, du, etc.
        # Ignore hidden additional nodes for orphan handling
        if '-' in key or '.' in key:
            continue
        wordform = fields[1]
        pos = fields[3]
        parent = fields[6]
        relation = fields[7]
        nodes[key] = {
            'wordform': wordform,
            'pos': pos,
            'relation': relation,
            'parent': parent
        }
        # Register the edge in both directions so the graph can be walked
        # from a dependent to its head and from a head to its dependents.
        graph.setdefault(key, []).append((parent, relation, 'up'))
        graph.setdefault(parent, []).append((key, relation, 'down'))
    return (nodes, graph)

In [10]:
def extract_blocks(fobj):
    """Split an iterable of lines into blocks separated by blank lines.

    Every line is stripped of surrounding whitespace. Consecutive non-empty
    lines form one block and are joined with '\\n'; any number of empty lines
    between blocks acts as a single separator. Returns the blocks as a list
    of strings, in the order they were read.
    """
    collected = []
    current = []
    for raw_line in fobj:
        text = raw_line.strip()
        if text:
            current.append(text)
        elif current:
            # First empty line after some content: the current block is done.
            collected.append('\n'.join(current))
            current = []
    # The input may end without a trailing empty line.
    if current:
        collected.append('\n'.join(current))
    return collected

In [4]:
# Directory holding the treebank and alignment files read by the cells below
# (relative to the directory the notebook is run from).
DATADIR = '../data 3/'

In [5]:
! exa '../data 3/'

[1;34mcorrected[0m                          en_esl-ud-test.conllu.amalign.json
[1;32men_esl-ud-dev.conllu[0m               en_esl-ud-test.conllu.malign
[1;32men_esl-ud-dev.conllu.align[0m         en_esl-ud-test.conllu.malign.json
[1;32men_esl-ud-dev.conllu.align.json[0m    [1;32men_esl-ud-train.conllu[0m
en_esl-ud-dev.conllu.amalign       [1;32men_esl-ud-train.conllu.align[0m
en_esl-ud-dev.conllu.amalign.json  [1;32men_esl-ud-train.conllu.align.json[0m
en_esl-ud-dev.conllu.malign        en_esl-ud-train.conllu.amalign
en_esl-ud-dev.conllu.malign.json   en_esl-ud-train.conllu.amalign.json
[1;32men_esl-ud-test.conllu[0m              en_esl-ud-train.conllu.malign
[1;32men_esl-ud-test.conllu.align[0m        en_esl-ud-train.conllu.malign.json
[1;32men_esl-ud-test.conllu.align.json[0m   [1;32mREADME.txt[0m
en_esl-ud-test.conllu.amalign      


In [7]:
# Load the token alignments for the training split.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale.
with open(f'{DATADIR}/en_esl-ud-train.conllu.amalign.json', 'r', encoding='utf-8') as inp:
    alignment = json.load(inp)

In [9]:
# Peek at the first alignment record: element 0 is the sentence id (with
# surrounding whitespace), element 1 holds three parallel lists -- aligned
# word pairs, aligned index pairs and a 0/1 flag per pair.
from pprint import pprint
pprint(list(enumerate(alignment[0])))

[(0, ' 0100_2000_6-doc2664.xml-34\n'),
 (1,
  [[[['I'], ['I']],
    [['was'], ['was']],
    [['shoked'], ['shocked']],
    [['because'], ['because']],
    [['I'], ['I']],
    [['had'], ['had']],
    [['alredy'], ['already']],
    [['spoken'], ['spoken']],
    [['with'], ['with']],
    [['them'], ['them']],
    [['and'], ['and']],
    [['I'], ['I']],
    [['had'], ['had']],
    [['taken'], ['got']],
    [['two'], ['two']],
    [['autographs'], ['autographs']],
    [['.'], ['.']]],
   [[[1], [1]],
    [[2], [2]],
    [[3], [3]],
    [[4], [4]],
    [[5], [5]],
    [[6], [6]],
    [[7], [7]],
    [[8], [8]],
    [[9], [9]],
    [[10], [10]],
    [[11], [11]],
    [[12], [12]],
    [[13], [13]],
    [[14], [14]],
    [[15], [15]],
    [[16], [16]],
    [[17], [17]]],
   [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])]


In [17]:
# Maps a sentence id to its uncorrected block, corrected block and alignment.
alignment_dict = {}
# CoNLL-U files are read explicitly as UTF-8 so that decoding does not depend
# on the platform's default locale.
with open(f'{DATADIR}/en_esl-ud-train.conllu', 'r', encoding='utf-8') as inp:
    blocks_uncorrected = extract_blocks(inp)
with open(f'{DATADIR}/corrected/en_cesl-ud-train.conllu', 'r', encoding='utf-8') as inp:
    blocks_corrected = extract_blocks(inp)
# Both files must describe the same number of sentences.
assert len(blocks_uncorrected) == len(blocks_corrected), (
    f'{len(blocks_uncorrected)} uncorrected vs {len(blocks_corrected)} corrected sentences'
)

In [13]:
# Inspect the first uncorrected block: '#' comment lines followed by one
# tab-separated row per token.
blocks_uncorrected[0]

'# sent_id = 0100_2000_6-doc2664.xml-34\n# text = I was shoked because I had alredy spoken with them and I had taken two autographs .\n# error_annotation = I was <ns type="S"><i>shoked</i><c>shocked</c></ns> because I had <ns type="S"><i>alredy</i><c>already</c></ns> spoken with them and I had <ns type="RV"><i>taken</i><c>got</c></ns> two autographs.\n1\tI\t_\tPRON\tPRP\t_\t3\tnsubj\t_\t_\n2\twas\t_\tAUX\tVBD\t_\t3\tcop\t_\t_\n3\tshoked\t_\tADJ\tJJ\t_\t0\troot\t_\t_\n4\tbecause\t_\tSCONJ\tIN\t_\t8\tmark\t_\t_\n5\tI\t_\tPRON\tPRP\t_\t8\tnsubj\t_\t_\n6\thad\t_\tAUX\tVBD\t_\t8\taux\t_\t_\n7\talredy\t_\tADV\tRB\t_\t8\tadvmod\t_\t_\n8\tspoken\t_\tVERB\tVBN\t_\t3\tadvcl\t_\t_\n9\twith\t_\tADP\tIN\t_\t10\tcase\t_\t_\n10\tthem\t_\tPRON\tPRP\t_\t8\tobl\t_\t_\n11\tand\t_\tCCONJ\tCC\t_\t14\tcc\t_\t_\n12\tI\t_\tPRON\tPRP\t_\t14\tnsubj\t_\t_\n13\thad\t_\tAUX\tVBD\t_\t14\taux\t_\t_\n14\ttaken\t_\tVERB\tVBN\t_\t8\tconj\t_\t_\n15\ttwo\t_\tNUM\tCD\t_\t16\tnummod\t_\t_\n16\tautographs\t_\tNOUN\tNNS\t_\t

In [16]:
# The sentence id is read from the '# sent_id = ...' comment that opens a block.
# '.' does not match a newline, so the group ends at the end of that line; no
# further comment line has to follow for the pattern to match.
sent_id_pattern = re.compile(r'# sent_id = (.*)')
sent_id_pattern.match(blocks_uncorrected[0]).group(1)

'0100_2000_6-doc2664.xml-34'

In [29]:
def get_sent_id(block):
    """Return the sentence id from the '# sent_id = ...' line that opens a block.

    Raises ValueError if the block does not start with such a line.
    """
    match = sent_id_pattern.match(block)
    if match is None:
        raise ValueError(f'Block does not start with a sent_id comment: {block[:60]!r}')
    return match.group(1)


# Create one entry per sentence from the uncorrected side ...
for b_unc in blocks_uncorrected:
    sent_id = get_sent_id(b_unc)
    alignment_dict[sent_id] = {
        'uncorrected': b_unc,
        'corrected': None,
        'alignment': None
    }

# ... then attach the corrected block stored under the same sentence id ...
for b_cor in blocks_corrected:
    sent_id = get_sent_id(b_cor)
    assert sent_id in alignment_dict, f'corrected sentence {sent_id!r} has no uncorrected counterpart'
    alignment_dict[sent_id]['corrected'] = b_cor

# ... and finally the alignment; ids in the alignment file carry surrounding
# whitespace, hence the strip().
for alignment_tuple in alignment:
    sent_id = alignment_tuple[0].strip()
    assert sent_id in alignment_dict, f'alignment refers to unknown sentence {sent_id!r}'
    alignment_dict[sent_id]['alignment'] = alignment_tuple[1]

## Analysis

In [39]:
def first_or_none(lst):
    """Return the first element of ``lst``, or None when there is nothing usable.

    None is returned for an empty list and also when the first element is
    falsy -- an empty string, but equally 0 or None.
    """
    if len(lst) == 0:
        return None
    head = lst[0]
    return head if head else None

In [31]:
def get_upgoing_relation(node, graph):
    """Return the relation label of the first 'up' edge stored for ``node``.

    ``graph`` maps a node key to a list of edge tuples whose second item is
    the relation label and whose last item is the direction. Raises KeyError
    if ``node`` is not in ``graph`` and AssertionError if the node has no
    'up' edge (or that edge carries no label).
    """
    label = next(
        (edge[1] for edge in graph[node] if edge[-1] == 'up'),
        None,
    )
    assert label is not None
    return label

In [44]:
def _pos_and_edge_type(index, nodes, graph):
    """Return (POS tag, incoming relation label) for one side of an alignment.

    ``index`` is a token index or None. None means the pair has no token on
    this side; the placeholder label 'None' is then returned for both values.
    """
    if index is None:
        return 'None', 'None'
    key = str(index)  # nodes and graph are keyed by the ID as a string
    return nodes[key]['pos'], get_upgoing_relation(key, graph)


# Both dicts map a label on the uncorrected side to a Counter of the labels
# found on the corrected side.
confusion_dict_pos = {}
confusion_dict_edge_type = {}

for k, v in alignment_dict.items():
    alignment_words, alignment_indices, alignment_inside_edit = v['alignment']

    nodes_unc, graph_unc = conll2graph(v['uncorrected'])
    nodes_cor, graph_cor = conll2graph(v['corrected'])

    # Alignments are two-element tuples of single-element lists.
    # Extract only alignments of elements inside edits and
    # convert them to simple two-element lists.
    edited_indices = [
        [first_or_none(sub_el) for sub_el in el]
        for i, el in enumerate(alignment_indices)
        if alignment_inside_edit[i]
    ]

    for head, tail in edited_indices:
        # head indexes the uncorrected sentence, tail the corrected one. The
        # two sides are looked up independently, so a pair that is missing on
        # both sides is counted as ('None', 'None') instead of raising KeyError.
        head_pos, head_edge_type = _pos_and_edge_type(head, nodes_unc, graph_unc)
        tail_pos, tail_edge_type = _pos_and_edge_type(tail, nodes_cor, graph_cor)

        confusion_dict_pos.setdefault(head_pos, Counter())[tail_pos] += 1
        confusion_dict_edge_type.setdefault(head_edge_type, Counter())[tail_edge_type] += 1

In [47]:
import numpy as np
import pandas as pd
def confusion_dict2matrix(cd):
    """Turn a nested count mapping into a confusion matrix.

    Parameters
    ----------
    cd : dict
        A map[string -> Counter[string -> int]]: outer keys are row labels,
        inner keys are column labels, values are counts.

    Returns
    -------
    pandas.DataFrame
        Integer frame with one row per outer key (sorted) and one column per
        outer key plus a final 'Other' column. Counts whose inner key is not
        one of the outer keys are accumulated in 'Other'.
    """
    row_keys = sorted(cd)
    column_keys = row_keys + ['Other']
    known_columns = set(column_keys)
    conf_df = pd.DataFrame(
        np.zeros((len(row_keys), len(column_keys)), dtype=int),
        index=row_keys,
        columns=column_keys,
    )
    for row_key, counter in cd.items():
        for label, count in counter.items():
            column_key = label if label in known_columns else 'Other'
            # Address the cell with a single .loc call: chained indexing
            # (.loc[row][col]) may update a temporary copy and leave the
            # frame unchanged.
            conf_df.loc[row_key, column_key] += count
    return conf_df

In [48]:
# Rows: POS tag of the token in the uncorrected sentence; columns: POS tag of
# the token it is aligned to in the corrected sentence. 'None' marks a missing
# token on that side.
confusion_dict2matrix(confusion_dict_pos)

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,None,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X,Other
ADJ,538,6,51,1,0,27,0,48,1,28,0,2,1,0,0,0,10,0,0
ADP,8,919,33,4,4,16,0,9,0,279,11,11,0,2,13,0,30,0,0
ADV,38,23,472,3,6,11,0,5,0,69,4,16,0,6,11,0,5,1,0
AUX,1,1,3,591,1,1,0,1,0,93,8,7,0,0,0,0,136,0,0
CCONJ,2,4,5,1,73,1,0,1,0,18,0,0,0,1,5,0,0,0,0
DET,21,14,14,3,1,537,0,12,2,328,14,60,0,3,5,0,3,0,0
INTJ,0,0,0,0,0,0,5,0,0,0,1,0,0,2,0,0,0,0,0
NOUN,66,6,6,2,0,23,0,1857,1,80,10,5,14,1,0,0,41,0,0
NUM,8,0,0,0,0,6,0,5,48,1,0,1,0,0,0,0,0,0,0
,27,335,63,135,42,763,0,68,2,0,50,233,2,362,58,0,84,1,0


In [49]:
# Rows: relation label on the edge to the head in the uncorrected sentence;
# columns: the same for the aligned token in the corrected sentence. 'None'
# marks a missing token on that side.
confusion_dict2matrix(confusion_dict_edge_type)

Unnamed: 0,None,acl,acl:relcl,advcl,advmod,amod,appos,aux,aux:pass,case,...,obl,obl:npmod,obl:tmod,orphan,parataxis,punct,root,vocative,xcomp,Other
,0,3,3,17,56,17,1,74,11,313,...,55,1,0,0,5,362,31,0,17,0
acl,6,40,3,3,0,2,0,3,1,2,...,0,0,0,0,0,0,0,0,4,0
acl:relcl,3,1,108,3,1,0,0,20,5,0,...,0,0,0,0,0,0,1,0,0,0
advcl,10,4,3,250,5,1,0,20,3,6,...,10,0,1,0,4,0,4,0,7,0
advmod,71,2,0,2,544,17,0,1,0,18,...,6,1,0,0,0,5,4,0,7,0
amod,26,1,3,0,21,289,0,0,0,3,...,3,0,0,0,0,0,1,0,2,0
appos,0,0,0,0,0,0,16,0,0,0,...,0,0,0,0,1,0,0,0,0,0
aux,62,0,12,15,3,1,0,353,7,0,...,0,0,0,0,3,0,37,0,2,0
aux:pass,13,0,1,2,0,0,0,10,17,0,...,0,0,0,0,0,0,9,0,1,0
case,255,1,0,9,29,3,0,1,0,933,...,1,0,0,0,1,3,5,0,6,0
