In [1]:
from collections import Counter
from math import log2

In [12]:
def get_counts(blocks):
    pos_counts = Counter()
    edge_label_counts = Counter()
    for b in blocks:
        for l in b.splitlines():
            if l.startswith('#'):
                continue
            fields = l.split('\t')
            pos = fields[3]
            edge = fields[7]
            if pos not in {'X', '_', 'PUNCT'}:
                pos_counts[pos] += 1
            if edge not in {'_', 'punct', 'root'}:
                # Strip refinements to make things more similar
                edge_label_counts[edge.split(':')[0]] += 1
    return (pos_counts, edge_label_counts)

In [18]:
def KL_divergences(path1, path2):
    with open(path1, 'r', encoding='utf-8') as inp:
        blocks1 = inp.read().split('\n\n')
    with open(path2, 'r', encoding='utf-8') as inp:
        blocks2 = inp.read().split('\n\n')
    blocks1 = [b for b in blocks1 if b]
    blocks2 = [b for b in blocks2 if b]
    
    pos_counts1, edge_counts1 = get_counts(blocks1)
    pos_counts2, edge_counts2 = get_counts(blocks2)
    
    # Report violations of absolute continuity for POS
    pos_unique_to_1 = set.difference(
        set(pos_counts1.keys()),
        set(pos_counts2.keys())
    )
    pos_unique_to_2 = set.difference(
        set(pos_counts2.keys()),
        set(pos_counts1.keys())
    )
    if pos_unique_to_1:
        print(f'POS found only in {path1}:', pos_unique_to_1)
    if pos_unique_to_2:
        print(f'POS found only in {path2}:', pos_unique_to_2)
    
    # Now blissfully ignore them
    common_pos = set.intersection(set(pos_counts1.keys()), set(pos_counts2.keys()))
    
    # Report violations of absoluter continuity for edge labels
    edges_unique_to_1 = set.difference(
        set(edge_counts1.keys()),
        set(edge_counts2.keys())
    )
    edges_unique_to_2 = set.difference(
        set(edge_counts2.keys()),
        set(edge_counts1.keys())
    )
    if edges_unique_to_1:
        print(f'Edge labels found only in {path1}:', edges_unique_to_1)
    if edges_unique_to_2:
        print(f'Edge labels found only in {path2}:', edges_unique_to_2)
        
    # Now blissfully ignore them
    common_edge_labels = set.intersection(set(edge_counts1.keys()), set(edge_counts2.keys()))
    
    KL_divergence_POS = 0
    KL_divergence_edges = 0
    
    total_pos1 = sum(pos_counts1.values())
    total_pos2 = sum(pos_counts2.values())
    total_edges1 = sum(edge_counts1.values())
    total_edges2 = sum(edge_counts2.values())
    
    for pos in common_pos:
        P_x = pos_counts1[pos] / total_pos1
        Q_x = pos_counts2[pos] / total_pos2
        KL_divergence_POS += -1 * P_x * log2( Q_x / P_x )
        
    for edge in common_edge_labels:
        P_x = edge_counts1[edge] / total_edges1
        Q_x = edge_counts2[edge] / total_edges2
        KL_divergence_edges += -1 * P_x * log2( Q_x / P_x )
        
    return (KL_divergence_POS, KL_divergence_edges)

In [21]:
! exa *pud-ud*

ar_pud-ud-test.conllu  fr_pud-ud-test.conllu  ru_pud-ud-test.conllu
cs_pud-ud-test.conllu  id_pud-ud-test.conllu  zh_pud-ud-test.conllu
en_pud-ud-test.conllu  ja_pud-ud-test.conllu  zh_pud-ud-test.conllu.txt


In [22]:
other_languages = [
    'ar',
    'cs',
    'fr',
    'id',
    'ja',
    'ru',
    'zh'
]

In [23]:
kl_divs = []
for lang_code in other_languages:
    kl_divs.append((
        KL_divergences('en_pud-ud-test.conllu', f'{lang_code}_pud-ud-test.conllu'),
        lang_code
    ))

POS found only in en_pud-ud-test.conllu: {'INTJ'}
Edge labels found only in en_pud-ud-test.conllu: {'reparandum'}
POS found only in en_pud-ud-test.conllu: {'INTJ'}
Edge labels found only in en_pud-ud-test.conllu: {'dislocated', 'reparandum', 'dep', 'goeswith'}
POS found only in en_pud-ud-test.conllu: {'INTJ'}
Edge labels found only in en_pud-ud-test.conllu: {'reparandum'}
POS found only in en_pud-ud-test.conllu: {'INTJ', 'SCONJ'}
Edge labels found only in en_pud-ud-test.conllu: {'expl', 'reparandum', 'orphan'}
Edge labels found only in id_pud-ud-test.conllu: {'clf'}
POS found only in en_pud-ud-test.conllu: {'INTJ'}
Edge labels found only in en_pud-ud-test.conllu: {'reparandum', 'conj', 'vocative', 'discourse', 'goeswith', 'expl', 'orphan', 'xcomp', 'flat', 'parataxis'}
POS found only in en_pud-ud-test.conllu: {'INTJ'}
Edge labels found only in en_pud-ud-test.conllu: {'reparandum', 'dep', 'dislocated'}
POS found only in en_pud-ud-test.conllu: {'SYM', 'INTJ'}
Edge labels found only in en

In [26]:
# Sort by KL divergence in POS distribution
[el[1] for el in sorted(kl_divs, key = lambda x: x[0][0])]

['cs', 'ru', 'fr', 'id', 'zh', 'ar', 'ja']

In [27]:
# Sort by KL divergence in edge-label distribution
[el[1] for el in sorted(kl_divs, key = lambda x: x[0][1])]

['fr', 'id', 'cs', 'ar', 'ru', 'zh', 'ja']