In [1]:
import sqlite3
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from queue import Queue
from itertools import combinations as combs

In [3]:
import PUDAnalisysLib as PAL
import importlib

In [15]:
# Reload the PAL module so that changes made to its source since the import are picked up.
PAL = importlib.reload(PAL)

In [50]:
# Only process one-to-one alignments.
# Iterate over the one-to-one aligned word pairs and check whether the
# two heads have unaligned dependents with the same edge label.

def add_alignments(en, ru, content_word_alignments):
    """Count unaligned dependents that could be paired up by edge label.

    For every one-to-one pair yielded by ``PAL.one_to_one`` for
    ``content_word_alignments``, the still-unaligned, non-punctuation
    'down' neighbours of both heads are collected and greedily matched
    whenever their edge labels are equal.  The alignment itself is not
    modified; only a tally of the matched label pairs is returned.

    Args:
        en: source-side sentence, handed to ``PAL.conll2graph``.
        ru: target-side sentence, handed to ``PAL.conll2graph``.
        content_word_alignments: mapping from a source node to an iterable
            of the target nodes it is aligned with.

    Returns:
        Counter keyed by ``(source_label, target_label)`` tuples.
    """
    _, source_graph = PAL.conll2graph(en)
    _, target_graph = PAL.conll2graph(ru)

    # Every target node that already takes part in some alignment.
    aligned_targets = {
        tgt for tgts in content_word_alignments.values() for tgt in tgts
    }

    def free_dependents(graph, head, taken):
        # Neighbour entries are indexed as [0]=node, [1]=edge label,
        # [2]=direction -- layout assumed from how they are used here.
        return [
            nb for nb in graph.get(head, [])
            if nb[2] == 'down' and nb[1] != 'punct' and nb[0] not in taken
        ]

    label_pair_counts = Counter()
    for src_head, tgt_head in PAL.one_to_one(content_word_alignments):
        src_kids = free_dependents(source_graph, src_head, content_word_alignments)
        tgt_kids = free_dependents(target_graph, tgt_head, aligned_targets)

        # Optimistic matching that ignores word order: walk the source
        # dependents from the last one backwards and pair each with the
        # first remaining target dependent carrying the same label.  The
        # one-child-on-each-side case is just the shortest run of this loop.
        for src_kid in reversed(src_kids):
            if not tgt_kids:
                break
            label = src_kid[1]
            hit = next(
                (pos for pos, cand in enumerate(tgt_kids) if label == cand[1]),
                None,
            )
            if hit is None:
                continue
            label_pair_counts[(label, tgt_kids[hit][1])] += 1
            # A target dependent can be used only once.
            del tgt_kids[hit]
    return label_pair_counts

In [51]:
def compute_stats_for_additional_alignments(lang):
    """Tally the label pairs found by ``add_alignments`` over a whole language pair.

    Args:
        lang: language code passed to ``PAL.get_data_for_lang``.

    Returns:
        List of ``(label_pair, count)`` tuples, most frequent first
        (the result of ``Counter.most_common()``).
    """
    source_sents, target_sents, sent_alignments = PAL.get_data_for_lang(lang)
    totals = Counter()
    # The three sequences are walked in parallel, one sentence at a time.
    for sentence_triple in zip(source_sents, target_sents, sent_alignments):
        totals.update(add_alignments(*sentence_triple))
    return totals.most_common()

## Only align single children (exactly one unaligned dependent on each side)

In [46]:
# NOTE(review): execution counts 46-49 precede the add_alignments definition shown
# above (In [50]); the ('punct', 'punct') rows in these outputs cannot come from that
# definition, which drops 'punct' edges, so they reflect an earlier version of the cell.
compute_stats_for_additional_alignments('fr')

[(('det', 'det'), 686),
 (('case', 'case'), 395),
 (('punct', 'punct'), 136),
 (('nmod:poss', 'nmod:poss'), 84),
 (('mark', 'mark'), 79),
 (('nsubj', 'nsubj'), 22),
 (('fixed', 'fixed'), 20),
 (('obj', 'obj'), 3),
 (('aux:pass', 'aux:pass'), 3),
 (('aux', 'aux'), 2),
 (('cop', 'cop'), 2),
 (('advmod', 'advmod'), 2),
 (('advcl', 'advcl'), 1)]

In [47]:
compute_stats_for_additional_alignments('ru')

[(('case', 'case'), 348),
 (('punct', 'punct'), 267),
 (('cc', 'cc'), 191),
 (('det', 'det'), 46),
 (('mark', 'mark'), 4),
 (('advmod', 'advmod'), 3),
 (('cc:preconj', 'cc:preconj'), 2),
 (('aux:pass', 'aux:pass'), 1)]

In [48]:
compute_stats_for_additional_alignments('zh')

[(('case', 'case'), 192),
 (('punct', 'punct'), 177),
 (('det', 'det'), 34),
 (('mark', 'mark'), 4),
 (('cc', 'cc'), 3),
 (('cop', 'cop'), 1),
 (('aux:pass', 'aux:pass'), 1)]

In [49]:
compute_stats_for_additional_alignments('ko')

[(('punct', 'punct'), 118),
 (('det', 'det'), 38),
 (('cc', 'cc'), 26),
 (('nmod:poss', 'nmod:poss'), 11),
 (('advmod', 'advmod'), 3),
 (('nummod', 'nummod'), 1)]

## Try to align more (greedy label matching when a head has several unaligned children)

In [52]:
compute_stats_for_additional_alignments('fr')

[(('det', 'det'), 1549),
 (('case', 'case'), 1348),
 (('nmod:poss', 'nmod:poss'), 185),
 (('aux:pass', 'aux:pass'), 157),
 (('mark', 'mark'), 149),
 (('cop', 'cop'), 137),
 (('aux', 'aux'), 114),
 (('nsubj', 'nsubj'), 73),
 (('fixed', 'fixed'), 37),
 (('nsubj:pass', 'nsubj:pass'), 14),
 (('advmod', 'advmod'), 12),
 (('obj', 'obj'), 11),
 (('expl', 'expl'), 9),
 (('obl', 'obl'), 5),
 (('ccomp', 'ccomp'), 4),
 (('acl:relcl', 'acl:relcl'), 2),
 (('advcl', 'advcl'), 1),
 (('conj', 'conj'), 1),
 (('parataxis', 'parataxis'), 1)]

In [53]:
compute_stats_for_additional_alignments('ru')

[(('case', 'case'), 756),
 (('cc', 'cc'), 357),
 (('mark', 'mark'), 122),
 (('det', 'det'), 92),
 (('aux:pass', 'aux:pass'), 90),
 (('nsubj', 'nsubj'), 60),
 (('cop', 'cop'), 40),
 (('advmod', 'advmod'), 18),
 (('aux', 'aux'), 11),
 (('nsubj:pass', 'nsubj:pass'), 6),
 (('cc:preconj', 'cc:preconj'), 5),
 (('obl', 'obl'), 5),
 (('amod', 'amod'), 4),
 (('obj', 'obj'), 3),
 (('ccomp', 'ccomp'), 2),
 (('fixed', 'fixed'), 1),
 (('appos', 'appos'), 1),
 (('nummod', 'nummod'), 1),
 (('flat', 'flat'), 1),
 (('nmod', 'nmod'), 1)]

In [54]:
compute_stats_for_additional_alignments('zh')

[(('case', 'case'), 391),
 (('aux', 'aux'), 79),
 (('det', 'det'), 74),
 (('cop', 'cop'), 69),
 (('aux:pass', 'aux:pass'), 47),
 (('mark', 'mark'), 8),
 (('cc', 'cc'), 3),
 (('ccomp', 'ccomp'), 2),
 (('nummod', 'nummod'), 1),
 (('nsubj', 'nsubj'), 1),
 (('appos', 'appos'), 1)]

In [55]:
compute_stats_for_additional_alignments('ko')

[(('det', 'det'), 85),
 (('cop', 'cop'), 55),
 (('cc', 'cc'), 54),
 (('nmod:poss', 'nmod:poss'), 20),
 (('nsubj', 'nsubj'), 12),
 (('advmod', 'advmod'), 12),
 (('advcl', 'advcl'), 3),
 (('nummod', 'nummod'), 1),
 (('acl:relcl', 'acl:relcl'), 1),
 (('nsubj:pass', 'nsubj:pass'), 1),
 (('aux', 'aux'), 1)]

In [67]:
def edge_label_counts(blocks):
    """Count how often each dependency relation occurs across ``blocks``.

    Args:
        blocks: iterable of sentences, each handed to ``PAL.conll2graph``.

    Returns:
        Counter mapping the ``'relation'`` value of every node to the
        number of nodes carrying it.
    """
    label_totals = Counter()
    for block in blocks:
        # Only the node dictionary is needed; the graph part is ignored.
        nodes, _graph = PAL.conll2graph(block)
        label_totals.update(node['relation'] for node in nodes.values())
    return label_totals

In [70]:
# Load the English and Chinese data; the third returned item is not needed for label counts.
en, zh, _ = PAL.get_data_for_lang('zh')

In [71]:
edge_label_counts(zh)

Counter({'punct': 2896,
         'mark': 291,
         'nmod': 702,
         'case': 1319,
         'nummod': 808,
         'compound': 1775,
         'nsubj': 1774,
         'advcl': 514,
         'xcomp': 475,
         'discourse:sp': 87,
         'advmod': 1330,
         'dep': 397,
         'obj': 1522,
         'amod': 419,
         'appos': 248,
         'flat:name': 142,
         'obl:tmod': 214,
         'acl:relcl': 448,
         'mark:relcl': 626,
         'obl': 578,
         'case:loc': 351,
         'root': 999,
         'mark:prt': 337,
         'det': 338,
         'cop': 251,
         'clf': 356,
         'ccomp': 403,
         'aux': 685,
         'csubj': 72,
         'conj': 383,
         'cc': 283,
         'obl:patient': 39,
         'acl': 19,
         'aux:pass': 79,
         'nsubj:pass': 70,
         'obl:agent': 22,
         'discourse': 1,
         'vocative': 1,
         'iobj': 15,
         'flat': 91,
         'dislocated': 5,
         'parataxis': 2,
    

In [72]:
# Load the English and French data; the third returned item is not needed for label counts.
# Note that this rebinds `en`, which the Chinese load above also assigned.
en, fr, _ = PAL.get_data_for_lang('fr')

In [73]:
edge_label_counts(fr)

Counter({'punct': 2550,
         'mark': 450,
         'fixed': 452,
         'det': 3585,
         'advmod': 1002,
         'amod': 1392,
         'nsubj': 1422,
         'case': 3427,
         'nmod': 1819,
         'advcl': 219,
         'obl': 1404,
         'dislocated': 3,
         'parataxis': 105,
         'cop': 226,
         'aux': 569,
         'root': 999,
         'flat:name': 227,
         'appos': 275,
         'obl:tmod': 79,
         'acl:relcl': 225,
         'obj': 1093,
         'xcomp': 407,
         'discourse': 30,
         'nmod:poss': 277,
         'ccomp': 305,
         'iobj': 36,
         'cc': 545,
         'conj': 653,
         'nsubj:pass': 200,
         'aux:pass': 227,
         'expl': 85,
         'nummod': 243,
         'acl': 28,
         'compound': 78,
         'det:predet': 20,
         'csubj': 23,
         'vocative': 1,
         'flat': 17,
         'dep': 9,
         'orphan': 4,
         'goeswith': 3,
         'csubj:pass': 1})

In [69]:
# NOTE(review): this cell ran as In [69], before the loads in In [70] and In [72],
# so the `en` counted here was bound by an earlier cell not visible in this part
# of the notebook; re-run top to bottom to confirm the output.
edge_label_counts(en)

Counter({'punct': 2447,
         'mark': 555,
         'nsubj': 1391,
         'case': 2499,
         'det': 2044,
         'amod': 1335,
         'nmod': 1076,
         'cop': 316,
         'advcl': 292,
         'compound': 810,
         'obl': 1237,
         'root': 999,
         'advmod': 856,
         'flat': 229,
         'parataxis': 96,
         'nmod:tmod': 39,
         'acl:relcl': 211,
         'obj': 873,
         'aux': 410,
         'obl:npmod': 18,
         'cc': 574,
         'nmod:poss': 365,
         'acl': 193,
         'ccomp': 135,
         'xcomp': 271,
         'conj': 634,
         'nsubj:pass': 239,
         'aux:pass': 274,
         'dislocated': 2,
         'reparandum': 1,
         'nummod': 254,
         'appos': 142,
         'fixed': 104,
         'expl': 62,
         'csubj': 27,
         'iobj': 10,
         'obl:tmod': 17,
         'orphan': 7,
         'compound:prt': 69,
         'discourse': 1,
         'vocative': 1,
         'nmod:npmod': 19,
    