In [1]:
import sqlite3
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from queue import Queue
from itertools import combinations as combs

In [4]:
import PUDAnalisysLib as PAL

In [59]:
from scipy.stats import spearmanr
from scipy.stats.mstats import gmean

In [3]:
## Compute Spearman correlations between the counts of each edge label across aligned block pairs
## Use the pseudo-geometric mean of these correlations (geometric mean of r + 1, minus 1) as the measure of structural coherence

In [18]:
from collections import defaultdict

def get_rel(node):
    """Return the base relation label of ``node``.

    ``node`` is a mapping with a ``'relation'`` string; everything from the
    first ``':'`` onwards (a subtype suffix such as ``'nsubj:pass'``) is
    dropped, so ``'nsubj:pass'`` becomes ``'nsubj'``.
    """
    return node['relation'].split(':', 1)[0]

def dict_factory():
    """Return a fresh ``{'en': [], 'fr': []}`` mapping.

    Used as a ``defaultdict`` factory; every call creates new, independent
    lists so that entries never share state.
    """
    return {'en': [], 'fr': []}

def compute_correlations(lang, min_pairs=10, skip_rels=('punct', 'root'),
                         include_joint_absences=False):
    """Correlate per-relation edge counts across aligned block pairs.

    The two aligned block sequences returned by
    ``PAL.get_data_for_lang(lang)`` are walked in parallel.  Each block is
    turned into a graph with ``PAL.conll2graph`` and its nodes are tallied by
    base relation label (``get_rel``).  For every label the two resulting
    count series are compared with Spearman's rank correlation.

    Parameters
    ----------
    lang
        Passed unchanged to ``PAL.get_data_for_lang``.  The first returned
        sequence is stored under the ``'en'`` key and the second under
        ``'fr'`` whatever ``lang`` is (presumably the second side is the
        language selected by ``lang`` -- confirm against PAL).
    min_pairs : int, default 10
        A label is reported only if it occurs, on either side, in at least
        this many block pairs.
    skip_rels : collection of str, default ``('punct', 'root')``
        Labels that are never reported.
    include_joint_absences : bool, default False
        By default a block pair contributes to a label's vectors only when
        the label occurs on at least one side of that pair, so pairs where
        it is absent on both sides are left out.  NOTE(review): for rare
        labels this leaves mostly (n, 0) / (0, n) observations and pushes the
        correlation towards -1.  Pass ``True`` to correlate over *all* block
        pairs, counting a joint absence as (0, 0).  The ``min_pairs`` filter
        is applied to the number of pairs containing the label in both modes.

    Returns
    -------
    dict
        Maps each retained label to the result object of
        ``scipy.stats.spearmanr`` (its ``correlation`` is NaN when one of the
        vectors is constant).
    """
    en, fr, _ = PAL.get_data_for_lang(lang)

    # One (Counter, Counter) tuple of label counts per aligned block pair.
    pair_counts = []
    for en_block, fr_block in zip(en, fr):
        en_nodes, _ = PAL.conll2graph(en_block)
        fr_nodes, _ = PAL.conll2graph(fr_block)
        pair_counts.append((
            Counter(get_rel(node) for node in en_nodes.values()),
            Counter(get_rel(node) for node in fr_nodes.values()),
        ))

    # Aligned count vectors restricted to the pairs in which the label occurs
    # on at least one side.  Counter lookups yield 0 for a missing label, which
    # keeps both vectors the same length.
    vector_pairs = defaultdict(dict_factory)
    for en_counts, fr_counts in pair_counts:
        labels_here = list(en_counts)
        labels_here.extend(rel for rel in fr_counts if rel not in en_counts)
        for rel in labels_here:
            vector_pairs[rel]['en'].append(en_counts[rel])
            vector_pairs[rel]['fr'].append(fr_counts[rel])

    correlations = {}
    for rel, vectors in vector_pairs.items():
        # Discard excluded and rare relations.
        if rel in skip_rels or len(vectors['en']) < min_pairs:
            continue
        if include_joint_absences:
            en_vec = [en_counts[rel] for en_counts, _fr_counts in pair_counts]
            fr_vec = [fr_counts[rel] for _en_counts, fr_counts in pair_counts]
        else:
            en_vec, fr_vec = vectors['en'], vectors['fr']
        correlations[rel] = spearmanr(en_vec, fr_vec)
    return correlations

In [75]:
# Relation labels that print_cors lists under "Function relations";
# any other label it receives is listed as a content relation.
function_rels = [
    'conj', 'cc', 'case', 'flat',
    'aux', 'det', 'mark', 'cop',
]

def _print_cor_group(title, cors, keys):
    """Print one report section: a row per key, then the pseudo-geometric mean."""
    print(title)
    for key in sorted(keys):
        if key in cors:
            print(f'{key:11}: {cors[key].correlation:.2}')
        else:
            # The relation was filtered out or never observed upstream.
            print(f'{key:11}: n/a')
    print()
    print('Pseudo-geometric mean:')
    wanted = set(keys)
    # Shift correlations from [-1, 1] to [0, 2] so a geometric mean is
    # defined, skip NaN correlations, and shift the result back afterwards.
    shifted = [
        res.correlation + 1
        for key, res in cors.items()
        if key in wanted and not pd.isna(res.correlation)
    ]
    print(gmean(shifted) - 1 if shifted else float('nan'))


def print_cors(lang):
    """Print the per-relation correlations for ``lang`` in two sections.

    The correlations come from ``compute_correlations(lang)``.  Labels that
    are not in ``function_rels`` are listed under "Content relations"; the
    labels in ``function_rels`` are listed under "Function relations".  Each
    section ends with its pseudo-geometric mean (geometric mean of the
    correlations shifted by +1, minus 1, ignoring NaN values).

    A function relation for which no correlation is available is printed as
    ``n/a`` instead of raising ``KeyError``, and a section without any valid
    correlation reports ``nan`` as its mean.
    """
    cors = compute_correlations(lang)
    content_rels = [rel for rel in cors if rel not in function_rels]
    _print_cor_group('Content relations:', cors, content_rels)
    print()
    _print_cor_group('Function relations:', cors, function_rels)

In [76]:
# Print the content/function relation correlation report for lang='fr'.
print_cors('fr')

Content relations:
acl        : -0.0052
advcl      : 0.029
advmod     : 0.39
amod       : 0.57
appos      : 0.14
ccomp      : -0.086
compound   : 0.14
csubj      : -0.34
discourse  : -0.036
expl       : -0.29
fixed      : -0.3
iobj       : -0.62
nmod       : 0.48
nsubj      : 0.78
nummod     : 0.41
obj        : 0.39
obl        : 0.52
parataxis  : -0.14
xcomp      : -0.1

Pseudo-geometric mean:
0.03605456614686342

Function relations:
aux        : -0.038
case       : 0.67
cc         : 0.58
conj       : 0.71
cop        : -0.097
det        : 0.56
flat       : 0.15
mark       : 0.25

Pseudo-geometric mean:
0.3118279168559286


In [77]:
# Print the content/function relation correlation report for lang='ru'.
print_cors('ru')

Content relations:
acl        : 0.026
advcl      : -0.1
advmod     : 0.3
amod       : 0.56
appos      : -0.17
ccomp      : -0.14
compound   : 0.048
csubj      : -0.7
discourse  : 0.71
expl       : -0.72
fixed      : -0.54
iobj       : -0.1
nmod       : 0.41
nsubj      : 0.69
nummod     : 0.39
obj        : 0.27
obl        : 0.42
orphan     : -0.48
parataxis  : 0.0092
xcomp      : -0.27

Pseudo-geometric mean:
-0.08080484893397

Function relations:
aux        : 0.028
case       : 0.54
cc         : 0.61
conj       : 0.66
cop        : -0.21
det        : -0.015
flat       : 0.22
mark       : -0.043

Pseudo-geometric mean:
0.18226162361306208


In [78]:
# Print the content/function relation correlation report for lang='zh'.
print_cors('zh')

Content relations:
acl        : -0.16
advcl      : -0.27
advmod     : 0.27
amod       : 0.15
appos      : -0.11
ccomp      : -0.051
clf        : nan
compound   : 0.27
csubj      : -0.75
dep        : -0.15
discourse  : 0.49
expl       : nan
fixed      : -0.25
iobj       : -0.99
nmod       : 0.16
nsubj      : 0.56
nummod     : 0.29
obj        : 0.29
obl        : 0.27
parataxis  : -0.052
xcomp      : -0.41

Pseudo-geometric mean:
-0.24415707213530946

Function relations:
aux        : -0.14
case       : 0.41
cc         : 0.34
conj       : 0.47
cop        : -0.29
det        : 0.075
flat       : 0.24
mark       : 0.0051

Pseudo-geometric mean:
0.10909795312745896


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [79]:
# Print the content/function relation correlation report for lang='ko'.
print_cors('ko')

Content relations:
acl        : 0.043
advcl      : -0.049
advmod     : 0.3
amod       : 0.04
appos      : -0.54
ccomp      : -0.68
compound   : 0.34
csubj      : -0.9
dep        : -0.032
expl       : nan
fixed      : -0.19
goeswith   : -0.72
iobj       : -0.82
nmod       : 0.12
nsubj      : 0.46
nummod     : 0.48
obj        : 0.25
obl        : 0.13
parataxis  : nan
xcomp      : nan

Pseudo-geometric mean:
-0.28242743519261326

Function relations:
aux        : -0.32
case       : nan
cc         : 0.26
conj       : 0.55
cop        : -0.33
det        : 0.0032
flat       : -0.093
mark       : nan

Pseudo-geometric mean:
-0.03450472405257832
