In [142]:
import sqlite3
import pymorphy2
import json
from collections import Counter
from itertools import combinations as combs
from queue import SimpleQueue

In [2]:
from pymorphy2 import MorphAnalyzer
ma = MorphAnalyzer()

In [52]:
def normalise_key(k):
    """Turn a 0-based index (int or numeric string) into its 1-based string form."""
    one_based = int(k) + 1
    return str(one_based)

In [101]:
def conll2graph(record):
    """Convert one sentence in CoNLL-U format into a node table and a graph.

    Format reference: http://universaldependencies.org/format.html

    Args:
        record: text of a single sentence, one token per line with
            tab-separated columns. Comment lines (starting with '#') and
            blank lines (the usual CoNLL-U sentence terminator) are skipped.

    Returns:
        A ``(nodes, graph)`` tuple:
        * ``nodes`` maps each token ID (column 1, kept as a string) to
          ``{'wordform': ..., 'pos': ...}`` taken from columns 2 and 4.
        * ``graph`` maps a node key to an adjacency list of
          ``(other_key, relation_label, direction)`` tuples, where direction
          is 'up' (towards the head) or 'down' (towards the dependent).
          The head value of the root token (e.g. '0') gets a graph entry of
          its own but has no entry in ``nodes``.
    """
    graph = {}
    nodes = {}
    for line in record.splitlines():
        # Blank lines would otherwise produce a single empty field and crash
        # on fields[1] below.
        if not line.strip() or line.startswith('#'):
            continue
        fields = line.split('\t')
        key = fields[0]
        # lemma would be better, but there are no lemmas in Russian PUD
        # take care of this at a later stage
        wordform = fields[1]
        pos = fields[3]
        parent = fields[6]
        relation = fields[7]
        nodes[key] = {'wordform': wordform, 'pos': pos}
        # Every edge is stored twice so the tree can be walked in both directions.
        graph.setdefault(key, []).append((parent, relation, 'up'))
        graph.setdefault(parent, []).append((key, relation, 'down'))
    return (nodes, graph)

In [136]:
def extract_raw_sentences(record):
    """Pull the raw source and target sentences out of the target-side record.

    The text in ``record[3]`` is scanned for the first line starting with
    '# text = ' (target sentence) and the first line starting with
    '# text_en = ' (source sentence).

    Returns:
        ``(source, target)`` as strings.

    Raises:
        ValueError: if either of the two comment lines is missing; a missing
            target is reported before a missing source.
    """
    target_tag = '# text = '
    source_tag = '# text_en = '
    record_lines = record[3].splitlines()

    target = next(
        (ln[len(target_tag):] for ln in record_lines if ln.startswith(target_tag)),
        None,
    )
    if target is None:
        raise ValueError('No target sentence found')

    source = next(
        (ln[len(source_tag):] for ln in record_lines if ln.startswith(source_tag)),
        None,
    )
    if source is None:
        raise ValueError('No source sentence found')

    return (source, target)

In [152]:
def preprocess_alignment(alignment_str):
    """Split an alignment string into unaligned, one-to-many and one-to-one parts.

    ``alignment_str`` is a whitespace-separated list of 'en-fr' index pairs,
    where 'X' on either side marks a word without a counterpart.

    Returns a 5-tuple:
        unaligned_en   -- indices paired with 'X' on the second side
        unaligned_fr   -- indices paired with 'X' on the first side
        one_to_many_en -- first-side index -> list of its second-side partners,
                          for first-side indices that occur in several edges
        one_to_many_fr -- second-side index -> list of its first-side partners,
                          for the remaining edges whose second-side index
                          occurs in several edges
        remaining      -- all other edges as (en, fr) tuples
    """
    unaligned_en, unaligned_fr = [], []
    linked = []
    for token in alignment_str.split():
        left, right = token.split('-')
        if left == 'X':
            unaligned_fr.append(right)
        elif right == 'X':
            unaligned_en.append(left)
        else:
            linked.append((left, right))

    # How many proper edges touch each index on either side.
    en_fanout = Counter(left for left, _ in linked)
    fr_fanout = Counter(right for _, right in linked)

    one_to_many_en, one_to_many_fr = {}, {}
    one_to_one = []
    for pair in linked:
        left, right = pair
        if en_fanout[left] > 1:
            one_to_many_en.setdefault(left, []).append(right)
        elif fr_fanout[right] > 1:
            one_to_many_fr.setdefault(right, []).append(left)
        else:
            one_to_one.append(pair)

    return (unaligned_en, unaligned_fr, one_to_many_en, one_to_many_fr, one_to_one)

In [110]:
def get_path(node1, node2, graph):
    """Return the labelled path from ``node1`` to ``node2`` found by breadth-first search.

    ``graph`` maps a node key to a list of (neighbour, relation, direction)
    tuples. The result is a list of 'relation_direction' strings in walking
    order from ``node1`` to ``node2``; it is empty when both nodes are the same.

    Raises:
        ValueError: if ``node2`` cannot be reached from ``node1``.
    """
    if node1 == node2:
        return []

    # For every discovered node: (node we came from, edge relation, edge direction).
    came_from = {}
    seen = {node1}
    frontier = SimpleQueue()
    frontier.put(node1)

    while not frontier.empty():
        here = frontier.get()
        for nxt, label, way in graph[here]:
            if nxt == node2:
                # Walk the back-pointers to the start, then flip the order.
                steps = [label + '_' + way]
                cursor = here
                while cursor != node1:
                    cursor, back_label, back_way = came_from[cursor]
                    steps.append(back_label + '_' + back_way)
                steps.reverse()
                return steps
            if nxt not in seen:
                came_from[nxt] = (here, label, way)
                frontier.put(nxt)
                seen.add(nxt)

    raise ValueError("UD graph is not connected.")

In [62]:
# Edit distance is the measure used to compare two paths.
# Adapted, with some modifications, from
# https://www.python-course.eu/levenshtein_distance.php
# 's' and 't' are lists; each list item plays the role of one 'character'.
def iterative_levenshtein(s, t):
    """Return the Levenshtein distance between the sequences ``s`` and ``t``.

    Insertions, deletions and substitutions of a whole item each cost 1.
    """
    len_s, len_t = len(s), len(t)
    if len_s == 0:
        return len_t
    if len_t == 0:
        return len_s

    # table[i][j] holds the distance between s[:i] and t[:j].
    table = [[0] * (len_t + 1) for _ in range(len_s + 1)]
    for i in range(1, len_s + 1):
        table[i][0] = i  # delete every item of the source prefix
    for j in range(1, len_t + 1):
        table[0][j] = j  # insert every item of the target prefix

    for j, t_item in enumerate(t, start=1):
        for i, s_item in enumerate(s, start=1):
            substitution = 0 if s_item == t_item else 1
            table[i][j] = min(
                table[i - 1][j] + 1,                 # deletion
                table[i][j - 1] + 1,                 # insertion
                table[i - 1][j - 1] + substitution,  # substitution
            )

    return table[len_s][len_t]

In [63]:
# Some unimaginative tests
iterative_levenshtein(['obj', 'acl:relcl', 'nsubj'],
                     ['obj', 'conj', 'nsubj'])

1

In [64]:
iterative_levenshtein(['obj', 'conj', 'nsubj'],
                     ['obj', 'conj', 'nsubj'])

0

In [65]:
iterative_levenshtein(
    ['obj', 'conj', 'nsubj'],
    list(reversed(['obj', 'conj', 'nsubj']))
)

2

In [67]:
iterative_levenshtein(
    ['obj', 'conj', 'nsubj'],
    []
)

3

In [66]:
iterative_levenshtein([], [])

0

In [None]:
from itertools import combinations as combs
def get_corpus_stats(records):
    """Collect POS statistics for the unaligned words of a parallel corpus.

    'en' stands for any first language in the parallel corpus,
    'ru' for any second language.

    Args:
        records: iterable of rows where ``r[2]`` and ``r[3]`` hold the
            CoNLL-U text of the first and second language and ``r[4]`` holds
            the alignment string.

    Returns:
        A 5-tuple ``(en_unaligned_pos, en_unaligned_by_pos, ru_unaligned_pos,
        ru_unaligned_by_pos, path_pairs)``. The ``*_pos`` dicts map a POS tag
        to a count; the ``*_by_pos`` dicts map a POS tag to a list of
        ``(wordform, record_index)`` tuples. ``path_pairs`` is currently
        always an empty dict (see the TODO below).
    """
    # Get POS distribution for unaligned words
    en_unaligned_pos = {}
    en_unaligned_by_pos = {}
    ru_unaligned_pos = {}
    ru_unaligned_by_pos = {}
    for i, r in enumerate(records):
        # NOTE(review): get_unaligned is not defined in the visible part of
        # this notebook -- confirm it exists before running.
        unalgnd_en, unalgnd_ru = get_unaligned(r[4])
        # conll2graph returns a (nodes, graph) pair; only the nodes are needed.
        nodes_en, _ = conll2graph(r[2])
        nodes_ru, _ = conll2graph(r[3])
        for key, pos in get_unaligned_pos(unalgnd_ru, nodes_ru).items():
            ru_unaligned_pos[pos] = ru_unaligned_pos.get(pos, 0) + 1
            ru_unaligned_by_pos.setdefault(pos, []).append(
                (nodes_ru[key]['wordform'], i)
            )
        for key, pos in get_unaligned_pos(unalgnd_en, nodes_en).items():
            en_unaligned_pos[pos] = en_unaligned_pos.get(pos, 0) + 1
            en_unaligned_by_pos.setdefault(pos, []).append(
                (nodes_en[key]['wordform'], i)
            )

    # TODO: get paths between nodes in the target language corresponding to
    # 1-edge paths between nodes in the source language. The loop that was
    # meant to fill path_pairs had no body (a syntax error), so the dict is
    # returned empty until that step is implemented.
    path_pairs = {}

    return (
        en_unaligned_pos,
        en_unaligned_by_pos,
        ru_unaligned_pos,
        ru_unaligned_by_pos,
        path_pairs
    )

In [5]:
def get_edges(s):
    """Group the aligned second-side indices by first-side index.

    Args:
        s: whitespace-separated alignment string of 0-based 'head-tail'
           pairs; pairs with 'X' on either side (unaligned words) are ignored.

    Returns:
        dict mapping each 1-based head index (as a string) to the list of
        1-based tail indices (as strings) aligned with it, in input order.
    """
    en_ru = {}
    for edge in s.split():
        head, tail = edge.split('-')
        if head == 'X' or tail == 'X':
            continue
        # Convert to 1-based keys BEFORE touching the dict, so that the list
        # is created and appended to under the same key.
        head = str(int(head) + 1)
        tail = str(int(tail) + 1)
        en_ru.setdefault(head, []).append(tail)
    return en_ru

In [6]:
def get_unaligned_pos(unaligned, nodes):
    """Look up the POS tag of every unaligned word.

    ``unaligned`` holds 0-based indices; ``nodes`` is keyed by 1-based index
    strings. Returns a dict from the 1-based key to that node's 'pos' value.
    """
    one_based_keys = (str(int(idx) + 1) for idx in unaligned)
    return {key: nodes[key]['pos'] for key in one_based_keys}

In [7]:
def lemmatise_ru(wordform):
    """Return the normal form (lemma) of a Russian wordform.

    Takes the first parse produced by the notebook-level pymorphy2
    analyser ``ma``.
    """
    return ma.parse(wordform)[0].normal_form


# Later cells call this helper as `lemmatise(...)`; keep that name bound so
# those cells still run after a kernel restart.
lemmatise = lemmatise_ru

In [85]:
conn = sqlite3.connect('pud_25_12.db')
cursor = conn.cursor()

In [88]:
# Get POS distribution for unaligned words.
# NOTE(review): `records` and `get_unaligned` are not defined in the visible
# part of this notebook -- this cell relies on them already being in the kernel.
en_unaligned_pos = {}      # POS tag -> number of unaligned first-language words
en_unaligned_by_pos = {}   # POS tag -> [(wordform, record index), ...]
ru_unaligned_pos = {}      # POS tag -> number of unaligned second-language words
ru_unaligned_by_pos = {}   # POS tag -> [(wordform, record index), ...]
edges_stats = {}           # TODO: never filled; per-edge statistics were planned here
for i, r in enumerate(records):
    # r[4] is the alignment string, r[2] / r[3] the CoNLL-U text of each side.
    unalgnd_en, unalgnd_ru = get_unaligned(r[4])
    nodes_en, graph_en = conll2graph(r[2])
    nodes_ru, graph_ru = conll2graph(r[3])
    for key, pos in get_unaligned_pos(unalgnd_ru, nodes_ru).items():
        ru_unaligned_pos[pos] = ru_unaligned_pos.get(pos, 0) + 1
        if pos not in ru_unaligned_by_pos:
            ru_unaligned_by_pos[pos] = []
        ru_unaligned_by_pos[pos].append((nodes_ru[key]['wordform'], i))
    for key, pos in get_unaligned_pos(unalgnd_en, nodes_en).items():
        en_unaligned_pos[pos] = en_unaligned_pos.get(pos, 0) + 1
        if pos not in en_unaligned_by_pos:
            en_unaligned_by_pos[pos] = []
        en_unaligned_by_pos[pos].append((nodes_en[key]['wordform'], i))

In [87]:
sum(ru_unaligned_pos.values())

465

In [106]:
Counter(el[0].lower() for el in en_unaligned_by_pos['NOUN']).most_common(10)

[('world', 2),
 ('side', 2),
 ('time', 2),
 ('post', 1),
 ('shoulders', 1),
 ('example', 1),
 ('market', 1),
 ('levels', 1),
 ('targets', 1),
 ('element', 1)]

In [107]:
Counter(el[0].lower() for el in en_unaligned_by_pos['VERB']).most_common(10)

[('have', 3),
 ('take', 2),
 ('teaching', 1),
 ('face', 1),
 ('cast', 1),
 ('secure', 1),
 ('ruin', 1),
 ('needed', 1),
 ('carrying', 1),
 ('read', 1)]

In [131]:
[el for el in en_unaligned_by_pos['VERB'] if el[0] == 'needed']

[('needed', 47)]

In [110]:
Counter(el[0].lower() for el in en_unaligned_by_pos['ADJ']).most_common(10)

[('many', 3),
 ('more', 2),
 ('own', 2),
 ('much', 1),
 ('underlying', 1),
 ('old', 1),
 ('positive', 1),
 ('like', 1),
 ('best', 1),
 ('hawkish', 1)]

In [111]:
Counter(el[0].lower() for el in en_unaligned_by_pos['ADV']).most_common(10)

[('just', 4),
 ('so', 2),
 ('right', 2),
 ('how', 2),
 ('eventually', 2),
 ('here', 1),
 ('again', 1),
 ('slightly', 1),
 ('still', 1),
 ('earlier', 1)]

In [104]:
Counter(lemmatise(el[0].lower()) for el in ru_unaligned_by_pos['NOUN']).most_common(10)

[('город', 4),
 ('проблема', 3),
 ('компания', 3),
 ('размер', 3),
 ('штат', 3),
 ('ситуация', 3),
 ('место', 3),
 ('следующий', 3),
 ('сумма', 2),
 ('результат', 2)]

In [83]:
ru_unaligned_by_pos['X']

[('Wi-Fi', 35),
 ('же', 217),
 ('же', 224),
 ('Именно', 254),
 ('Лишь', 278),
 ('же', 355)]

In [81]:
ru_unaligned_by_pos['PROPN']

[('производителей', 7),
 ('соглашений', 48),
 ('КНР', 245),
 ('VW', 261),
 ('Великобритании', 279),
 ('Америки', 321),
 ('защиты', 333),
 ('Эмиратов', 365)]

In [103]:
Counter(lemmatise(el[0].lower()) for el in ru_unaligned_by_pos['VERB']).most_common(10)

[('мочь', 3),
 ('е.', 3),
 ('позволить', 2),
 ('удаться', 2),
 ('обратить', 2),
 ('сидеть', 2),
 ('обладать', 2),
 ('продолжать', 2),
 ('продолжить', 2),
 ('говорить', 2)]

In [105]:
Counter(lemmatise(el[0].lower()) for el in ru_unaligned_by_pos['ADJ']).most_common(10)

[('такой', 6),
 ('самый', 4),
 ('другой', 3),
 ('никакой', 2),
 ('должный', 2),
 ('столичный', 1),
 ('инновационный', 1),
 ('деловой', 1),
 ('многие', 1),
 ('опасный', 1)]

In [112]:
Counter(lemmatise(el[0].lower()) for el in ru_unaligned_by_pos['ADV']).most_common(10)

[('уже', 5),
 ('только', 3),
 ('там', 3),
 ('весь', 3),
 ('здесь', 2),
 ('очень', 2),
 ('снова', 2),
 ('совсем', 2),
 ('ещё', 2),
 ('сегодня', 1)]

In [13]:
en_fr = [r for r in cursor.execute('select * from `en-fr`')][:740]

In [103]:
en_n, en_g = conll2graph(en_fr[647][2])
fr_n, fr_g = conll2graph(en_fr[647][3])

In [104]:
en_n

{'1': {'wordform': 'Lenny', 'pos': 'PROPN'},
 '2': {'wordform': 'is', 'pos': 'AUX'},
 '3': {'wordform': 'a', 'pos': 'DET'},
 '4': {'wordform': 'persistent', 'pos': 'ADJ'},
 '5': {'wordform': 'bachelor', 'pos': 'NOUN'},
 '6': {'wordform': 'who', 'pos': 'PRON'},
 '7': {'wordform': 'has', 'pos': 'VERB'},
 '8': {'wordform': 'poor', 'pos': 'ADJ'},
 '9': {'wordform': 'luck', 'pos': 'NOUN'},
 '10': {'wordform': 'with', 'pos': 'ADP'},
 '11': {'wordform': 'women', 'pos': 'NOUN'},
 '12': {'wordform': '.', 'pos': 'PUNCT'}}

In [105]:
fr_n

{'1': {'wordform': 'Lenny', 'pos': 'PROPN'},
 '2': {'wordform': 'est', 'pos': 'AUX'},
 '3': {'wordform': 'un', 'pos': 'DET'},
 '4': {'wordform': 'célibataire', 'pos': 'NOUN'},
 '5': {'wordform': 'endurci', 'pos': 'ADJ'},
 '6': {'wordform': 'et', 'pos': 'CCONJ'},
 '7': {'wordform': 'qui', 'pos': 'PRON'},
 '8': {'wordform': 'n’', 'pos': 'ADV'},
 '9': {'wordform': 'a', 'pos': 'VERB'},
 '10': {'wordform': 'aucune', 'pos': 'ADJ'},
 '11': {'wordform': 'chance', 'pos': 'NOUN'},
 '12': {'wordform': 'avec', 'pos': 'ADP'},
 '13': {'wordform': 'les', 'pos': 'DET'},
 '14': {'wordform': 'femmes', 'pos': 'NOUN'},
 '15': {'wordform': '.', 'pos': 'PUNCT'}}

In [111]:
get_path('1', '9', en_g)

['nsubj_up', 'acl:relcl_down', 'obj_down']

In [112]:
get_path('1', '11', fr_g)

['nsubj_up', 'conj_down', 'obj_down']

In [46]:
alignment = en_fr[647][4]

In [47]:
alignment

'0-0 4-3 3-4 6-8 7-9 7-7 8-10 10-13 X-5'

In [59]:
# Check aligned pairs
for c in combs(alignment.split(), 2):
    p, q = c
    if 'X' in p + q:
        continue
    en1, fr1 = map(normalise_key, p.split('-'))
    en2, fr2 = map(normalise_key, q.split('-'))
    print(f'{en_n[en1]["wordform"]}->{en_n[en2]["wordform"]} vs. {fr_n[fr1]["wordform"]}->{fr_n[fr2]["wordform"]}')

Lenny->bachelor vs. Lenny->célibataire
Lenny->persistent vs. Lenny->endurci
Lenny->has vs. Lenny->a
Lenny->poor vs. Lenny->aucune
Lenny->poor vs. Lenny->n’
Lenny->luck vs. Lenny->chance
Lenny->women vs. Lenny->femmes
bachelor->persistent vs. célibataire->endurci
bachelor->has vs. célibataire->a
bachelor->poor vs. célibataire->aucune
bachelor->poor vs. célibataire->n’
bachelor->luck vs. célibataire->chance
bachelor->women vs. célibataire->femmes
persistent->has vs. endurci->a
persistent->poor vs. endurci->aucune
persistent->poor vs. endurci->n’
persistent->luck vs. endurci->chance
persistent->women vs. endurci->femmes
has->poor vs. a->aucune
has->poor vs. a->n’
has->luck vs. a->chance
has->women vs. a->femmes
poor->poor vs. aucune->n’
poor->luck vs. aucune->chance
poor->women vs. aucune->femmes
poor->luck vs. n’->chance
poor->women vs. n’->femmes
luck->women vs. chance->femmes


In [73]:
# Edit distance between the English and French dependency paths for every
# pair of alignment edges of the example sentence.
distances = []
for c in combs(alignment.split(), 2):
    p, q = c
    if 'X' in p + q:
        continue  # at least one of the two edges involves an unaligned word
    en1, fr1 = map(normalise_key, p.split('-'))
    en2, fr2 = map(normalise_key, q.split('-'))
    path_en = get_path(en1, en2, en_g)
    path_fr = get_path(fr1, fr2, fr_g)
    # get_path returns [] when both edges share the same node on one side.
    if not path_en or not path_fr:
        dist = 0 # One-to-many alignment
    else:
        dist = iterative_levenshtein(path_en, path_fr)
    distances.append(dist)

In [74]:
distances

[0,
 0,
 1,
 1,
 3,
 1,
 3,
 0,
 1,
 1,
 3,
 1,
 3,
 1,
 1,
 3,
 1,
 3,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 2,
 2,
 2]

In [153]:
preprocess_alignment(en_fr[0][4])

([],
 [],
 {'2': ['6', '4', '5'], '8': ['13', '12'], '20': ['27', '30']},
 {'16': ['12', '11'], '45': ['31', '32']},
 [('6', '9'),
  ('5', '10'),
  ('16', '19'),
  ('15', '20'),
  ('18', '23'),
  ('28', '34'),
  ('26', '35'),
  ('27', '36'),
  ('25', '38'),
  ('24', '39'),
  ('23', '41'),
  ('33', '47'),
  ('1', '1'),
  ('9', '14'),
  ('29', '43')])

In [167]:
def get_path_pairs(en_fr, corpus_name):
    """Tabulate target-side paths that diverge from one-edge source-side paths.

    For every record, each pair of one-to-one alignment edges is turned into
    a dependency path between the two source words and a path between the two
    target words. Whenever the source path is a single edge and the target
    path differs from it (edit distance > 0), the (source path, target path)
    combination is counted and an example is stored.

    Args:
        en_fr: sequence of records; ``record[2]`` / ``record[3]`` hold the
            CoNLL-U text of the source / target sentence and ``record[4]``
            the alignment string.
        corpus_name: suffix used in the names of the two output files.

    Side effects:
        * Writes ``path_divergences_{corpus_name}.json`` with all entries,
          sorted by descending count.
        * Writes ``path_pairs_examples_{corpus_name}.csv`` (tab-separated)
          with one line per entry and its first example.
        * Prints the 1-based index of every record for which no pair of
          edges could be compared.

    Returns:
        None.
    """
    path_pairs = {}
    for i, record in enumerate(en_fr):
        en_n, en_g = conll2graph(record[2])
        fr_n, fr_g = conll2graph(record[3])
        # TODO: report unaligned, one-to-many
        # Only the remaining one-to-one edges (last item) are used for now.
        alignment_edges = preprocess_alignment(record[4])[-1]
        source_sent, target_sent = extract_raw_sentences(record)
        compared_any = False
        for p, q in combs(alignment_edges, 2):
            en1, fr1 = map(normalise_key, p)
            en2, fr2 = map(normalise_key, q)
            if fr_n[fr1]['pos'] == 'CCONJ' or fr_n[fr2]['pos'] == 'CCONJ':
                continue # CCONJs were not aligned for Russian
            path_en = get_path(en1, en2, en_g)
            path_fr = get_path(fr1, fr2, fr_g)
            dist = iterative_levenshtein(path_en, path_fr)
            compared_any = True
            if len(path_en) == 1 and dist > 0:
                path_fr_str = '->'.join(path_fr)
                # A tuple key keeps the two sides unambiguously apart. The
                # previous string key lost its ' vs .' delimiter to implicit
                # literal concatenation with the join separator.
                key = (path_en[0], path_fr_str)
                if key not in path_pairs:
                    path_pairs[key] = {
                        'path_en': path_en[0],
                        'path_fr': path_fr_str,
                        'count': 0,
                        'examples': []
                    }
                path_pairs[key]['count'] += 1
                path_pairs[key]['examples'].append(
                    (
                        f'{en_n[en1]["wordform"]}->{en_n[en2]["wordform"]}',
                        f'{fr_n[fr1]["wordform"]}->{fr_n[fr2]["wordform"]}',
                        source_sent,
                        target_sent,
                        i+1
                    )
                )
        if not compared_any:
            # Nothing to compare in this sentence; report its 1-based index.
            print(i+1)
    path_pairs_sorted = sorted(path_pairs.values(), key=lambda x: x['count'], reverse=True)
    with open(f'path_divergences_{corpus_name}.json', 'w', encoding='utf-8') as out:
        json.dump(path_pairs_sorted, out, ensure_ascii=False, indent=4)
    # Dump into csv with one example for each type
    with open(f'path_pairs_examples_{corpus_name}.csv', 'w', encoding='utf-8') as out:
        out.write('\t'.join(['PathEn',
                            'PathFr',
                            'Count',
                            'ExampleWordsEn',
                            'ExampleWordsFr',
                            'ExampleSentenceEn',
                            'ExampleSentenceFr',
                            'SentenceID']) + '\n')
        for pp in path_pairs_sorted:
            # Every entry has at least one example because it is created
            # together with its first one.
            out.write(f"{pp['path_en']}\t{pp['path_fr']}\t{pp['count']}\t" +
                      '\t'.join(str(el) for el in pp['examples'][0]) + '\n')

In [168]:
en_ru = [r for r in cursor.execute('select * from `en-ru`')][:400]
get_path_pairs(en_ru, 'en_ru')

# with open('en_fr_average_distances.dat', 'w') as out:
#     out.write(','.join(str(el) for el in average_distances))
# with open('en_fr_all_distances.dat', 'w') as out:
#     out.write(','.join(str(el) for el in all_distances))

291


In [126]:
path_pairs[0]

(['nmod_down'], ['nmod_down'])

In [120]:
path_pairs_1 = [el for el in path_pairs if len(el[0]) == 1]

In [127]:
len(path_pairs_1)

8814

In [124]:
with open('path_pairs.csv', 'w') as out:
    for el in path_pairs_1:
        out.write(f'{el[0][0]}\t{"->".join(el[1])}\n')

In [157]:
# Show a single entry of path_pairs (the first one in iteration order).
# The loop variable is `k`; indexing with `key` only worked because a stale
# global of that name happened to be left in the kernel.
for k in path_pairs:
    print(json.dumps(path_pairs[k], ensure_ascii=False, indent=2))
    break

{
  "path_en": "flat_down",
  "path_fr": "flat:name_down",
  "count": 83,
  "examples": [
    [
      "Joe->Sternlieb",
      "Joe->Sternlieb",
      "“We face a lot of competition, and we think transit can help,” said Joe Sternlieb, president of the Georgetown BID.",
      "« Nous sommes confrontés à une forte concurrence, et nous pensons que les transports pourront aider » a déclaré Joe Sternlieb, le président du centre des affaires de Georgetown.",
      14
    ],
    [
      "Mr->Panvalkar",
      "M.->Panvalkar",
      "There was a time, Mr Panvalkar said, when he felt that they should leave the building.",
      "M. Panvalkar a dit qu'à un moment donné, il a senti qu'ils devraient quitter l'immeuble.",
      21
    ],
    [
      "Andre->Price",
      "Andre->Price",
      "She killed Andre Price III by pressing his face into an air mattress in her sitting room before trying to do the same to her daughter, Angel, police said.",
      "La police a indiqué qu'elle avait tué Andre P

In [163]:
with open('path_divergences.json', 'w', encoding='utf-8') as out:
    json.dump(path_pairs_sorted, out, ensure_ascii=False, indent=4)

In [166]:
# Dump into csv with one example for each type
with open('path_pairs_examples.csv', 'w', encoding='utf-8') as out:
    out.write('\t'.join(['PathEn',
                        'PathFr',
                        'Count',
                        'ExampleWordsEn',
                        'ExampleWordsFr',
                        'ExampleSentenceEn',
                        'ExampleSentenceFr',
                        'SentenceID']) + '\n')
    for pp in path_pairs_sorted:
        out.write(f"{pp['path_en']}\t{pp['path_fr']}\t{pp['count']}\t" + '\t'.join(str(el) for el in pp['examples'][0]) + '\n')

In [95]:
with open('en_fr_len_dep.csv', 'w') as out:
    out.write('SourcePathLen,Dist\n')
    for i, spl in enumerate(source_path_lengths_l):
        out.write(f'{spl}, {all_distances_l[i]}\n')

In [102]:
# Path edit distances for the first 400 en-ru records. Unlike the single
# sentence example, the target path is also tried in reverse and the smaller
# of the two distances is kept.
en_ru = [r for r in cursor.execute('select * from `en-ru`')][:400]
all_distances = []          # one value per compared pair, over all records
average_distances = []      # mean distance per record
source_path_lengths_l = []  # source path length, only for pairs with two non-empty paths
all_distances_l = []        # distance matching each entry of source_path_lengths_l
for i, record in enumerate(en_ru):
    en_n, en_g = conll2graph(record[2])
    fr_n, fr_g = conll2graph(record[3])
    alignment = record[4]
    distances = []
    for c in combs(alignment.split(), 2):
        p, q = c
        if 'X' in p + q:
            continue  # at least one of the two edges involves an unaligned word
        en1, fr1 = map(normalise_key, p.split('-'))
        en2, fr2 = map(normalise_key, q.split('-'))
        path_en = get_path(en1, en2, en_g)
        path_fr = get_path(fr1, fr2, fr_g)
        # get_path returns [] when both edges share the same node on one side.
        if not path_en or not path_fr:
            dist = 0 # One-to-many alignment
        else:
            dist1 = iterative_levenshtein(path_en, path_fr)
            dist2 = iterative_levenshtein(path_en, list(reversed(path_fr)))
            dist = min(dist1, dist2)
            source_path_lengths_l.append(len(path_en))
            all_distances_l.append(dist)
        distances.append(dist)
        all_distances.append(dist)
    try:
        average_distances.append(sum(distances)/len(distances))
    except ZeroDivisionError:
        # No comparable pair in this record: print its 1-based index.
        print(i+1)
        
# with open('en_ru_average_distances.dat', 'w') as out:
#     out.write(','.join(str(el) for el in average_distances))
# with open('en_ru_all_distances.dat', 'w') as out:
#     out.write(','.join(str(el) for el in all_distances))

291


In [93]:
with open('en_ru_len_dep.csv', 'w') as out:
    out.write('SourcePathLen,Dist\n')
    for i, spl in enumerate(source_path_lengths_l):
        out.write(f'{spl}, {all_distances_l[i]}\n')

In [137]:
extract_raw_sentences(en_fr[0])

('“While much of the digital transition is unprecedented in the United States, the peaceful transition of power is not,” Obama special assistant Kori Schulman wrote in a blog post Monday.',
 "« Alors que la plus grande partie de la transition numérique est sans précédent aux États-Unis, la transition sereine du pouvoir, elle, ne l'est pas,» a publié Kori Schulman, assistante spéciale d'Obama, dans un blog ce lundi.")

In [130]:
en_fr[0][3]

"# newdoc id = n01001\n# sent_id = n01001011\n# text = « Alors que la plus grande partie de la transition numérique est sans précédent aux États-Unis, la transition sereine du pouvoir, elle, ne l'est pas,» a publié Kori Schulman, assistante spéciale d'Obama, dans un blog ce lundi.\n# text_en = “While much of the digital transition is unprecedented in the United States, the peaceful transition of power is not,” Obama special assistant Kori Schulman wrote in a blog post Monday.\n1\t«\t_\tPUNCT\t``\t_\t29\tpunct\t_\t_\n2\tAlors\t_\tADV\tRB\t_\t12\tmark\t_\t_\n3\tque\t_\tSCONJ\tIN\t_\t2\tfixed\t_\t_\n4\tla\t_\tDET\tDT\tGender=Fem|Number=Sing\t7\tdet\t_\t_\n5\tplus\t_\tADV\tRBR\t_\t6\tadvmod\t_\t_\n6\tgrande\t_\tADJ\tJJ\tGender=Fem|Number=Sing\t7\tamod\t_\t_\n7\tpartie\t_\tNOUN\tNN\tGender=Fem|Number=Sing\t12\tnsubj\t_\t_\n8\tde\t_\tADP\tIN\t_\t10\tcase\t_\t_\n9\tla\t_\tDET\tDT\tGender=Fem|Number=Sing\t10\tdet\t_\t_\n10\ttransition\t_\tNOUN\tNN\tGender=Fem|Number=Sing\t7\tnmod\t_\t_\n11\tnu