In [17]:
from io import open
import numpy as np
import pandas as pd

from nltk import FreqDist
from nltk import WittenBellProbDist
from nltk.util import ngrams

from conllu import parse_incr

corpora = {}
corpora['en'] = 'UD_English-EWT/en_ewt'
corpora['es'] = 'UD_Spanish-GSD/es_gsd'
corpora['nl'] = 'UD_Dutch-Alpino/nl_alpino'


def train_corpus(lang):
    return 'D:/CS5012/P1 HMM/corpora/' + corpora[lang] + '-ud-train.conllu'  # adjust to local path. debug only


# return corpora[lang] + '-ud-train.conllu'

def test_corpus(lang):
    return 'D:/CS5012/P1 HMM/corpora/' + corpora[lang] + '-ud-test.conllu'  # adjust to local path, debug only


# return corpora[lang] + '-ud-test.conllu'

# Remove contractions such as "isn't".
def prune_sentence(sent):
    return [token for token in sent if type(token['id']) is int]


def conllu_corpus(path):
    data_file = open(path, 'r', encoding='utf-8')
    sents = list(parse_incr(data_file))
    return [prune_sentence(sent) for sent in sents]


# Choose language.
# lang = 'en'

# Limit length of sentences to avoid underflow.
max_len = 100
def get_sents(lang):
    train_sents = conllu_corpus(train_corpus(lang))
    test_sents = conllu_corpus(test_corpus(lang))
    test_sents = [sent for sent in test_sents if len(sent) <= max_len]
    print(len(train_sents), 'training sentences')
    print(len(test_sents), 'test sentences')
    return train_sents, test_sents

def get_tags(lang):
    train_sents, test_sents = get_sents(lang)
    tags = []
    for sent in train_sents:
        for token in sent:
            tags.append(token.get('upos'))
    return tags

def get_bigrams_result(lang):
    tags = get_tags(lang)
    return list(ngrams(tags,2))

In [18]:
bigrams_en = get_bigrams_result(lang="en")

12543 training sentences
2077 test sentences


In [26]:
bigrams_es = get_bigrams_result(lang="es")

14187 training sentences
422 test sentences


In [27]:
bigrams_nl = get_bigrams_result(lang="nl")

12264 training sentences
596 test sentences


In [33]:
def get_same_elements(list1, list2):
    c = len(list1) if len(list1) < len(list2) else len(list2)
    count = 0
    for i in range(c):
        if list1[1] == list2[i]:
            count += 1
    return count

In [36]:
len(bigrams_en)

204584

In [37]:
len(bigrams_es)

382435

In [38]:
len(bigrams_nl)

185882

In [34]:
print("similiarity between en and es: ")
len(bigrams_en) + len(bigrams_es) - 2* get_same_elements(bigrams_en, bigrams_es)

similiarity between en and es: 


581121

In [39]:
print("similiarity between en and nl: ")
len(bigrams_en) + len(bigrams_nl) - 2* get_same_elements(bigrams_en, bigrams_nl)

similiarity between en and nl: 


388210

In [40]:
print("similiarity between en and es: ")
len(bigrams_es) + len(bigrams_nl) - 2* get_same_elements(bigrams_es, bigrams_nl)

similiarity between en and es: 


566641