In [1]:
import json
import os
import re
import string

from cltk.tokenize.word import WordTokenizer
from cltk.stem.lemma import LemmaReplacer
from nltk.translate import AlignedSent, Alignment, IBMModel1, IBMModel2
from nltk.tokenize import wordpunct_tokenize
from nltk.translate.ibm_model import AlignmentInfo

In [2]:
lat_tokenizer = WordTokenizer('latin')
eng_tokenizer = wordpunct_tokenize
lem = LemmaReplacer('latin')

In [3]:
def get_sentence_pair(alignments):
    for a in alignments:
        for s in a:
            #for s in p:
            lat_sent = tokenize_sentences(s[0],lat_tokenizer.tokenize)
            eng_sent = tokenize_sentences(s[1], eng_tokenizer)
            yield [lat_sent,eng_sent]

In [3]:
def tokenize_sentences(sent,tok):
    tokens = []
    if type(sent) == list:
        for s in sent:
            s = re.sub(r'[{}]'.format(string.punctuation),'',s).lower()
            s = re.sub(r'  *', ' ', s)
            tokens += tok(s.lower())
    else:
        sent = re.sub(r'[{}]'.format(string.punctuation),'',sent).lower()
        sent = re.sub(r'  *', ' ', sent)
        tokens += tok(sent)
        
    return tokens

In [4]:
os.chdir('../aligned_sentences/')

In [5]:
os.listdir()

['caes_bc_sentences.json',
 'cicero_lael_friendship_sentences.json',
 'catullus.json',
 'cicero_div_sentences.json',
 'celsus_sentences.json',
 'amm_lat_sentences.json',
 'cicero_off_sentences.json',
 'caes_bg_sentences_03457.json',
 'cicero_sen_falc_sentences.json']

In [6]:
bitext = []

for f in os.listdir():
    if f == 'amm_lat_sentences.json':
        continue
    of = open(f)
    aligned_sentences = json.loads(of.read())
    of.close()

    for a in aligned_sentences:
        for s in a:
            lat_sent = tokenize_sentences(s[0],lat_tokenizer.tokenize)
            eng_sent = tokenize_sentences(s[1], eng_tokenizer)
            if len(lat_sent) == 0 or len(eng_sent) == 0:
                continue
            bitext.append(AlignedSent(lat_sent,eng_sent))

of = open('amm_lat_sentences.json')
aligned_sentences = json.loads(of.read())
of.close()

for a in aligned_sentences:
    for p in a:
        for s in p:
            lat_sent = tokenize_sentences(s[0],lat_tokenizer.tokenize)
            #while '' in lat_sent:
            #    lat_sent.remove('')
            #lat_sent = lem.lemmatize(lat_sent)
            eng_sent = tokenize_sentences(s[1], eng_tokenizer)
            if len(lat_sent) == 0 or len(eng_sent) == 0:
                continue
            bitext.append(AlignedSent(lat_sent,eng_sent))

In [7]:
len(bitext)

13842

In [8]:
lat_mod = IBMModel2(bitext,5)

In [9]:
eng_bitext = []

for f in os.listdir():
    if f == 'amm_lat_sentences.json':
        continue
    of = open(f)
    aligned_sentences = json.loads(of.read())
    of.close()

    for a in aligned_sentences:
        for s in a:
            lat_sent = tokenize_sentences(s[0],lat_tokenizer.tokenize)
            eng_sent = tokenize_sentences(s[1], eng_tokenizer)
            if len(lat_sent) == 0 or len(eng_sent) == 0:
                continue
            eng_bitext.append(AlignedSent(eng_sent, lat_sent))

of = open('amm_lat_sentences.json')
aligned_sentences = json.loads(of.read())
of.close()

for a in aligned_sentences:
    for p in a:
        for s in p:
            lat_sent = tokenize_sentences(s[0],lat_tokenizer.tokenize)
            #while '' in lat_sent:
            #    lat_sent.remove('')
            #lat_sent = lem.lemmatize(lat_sent)
            eng_sent = tokenize_sentences(s[1], eng_tokenizer)
            if len(lat_sent) == 0 or len(eng_sent) == 0:
                continue
            eng_bitext.append(AlignedSent(eng_sent,lat_sent))

In [13]:
print(eng_bitext[0])

<AlignedSent: 'while these things p...' -> 'dum haec in hispania...'>


In [10]:
eng_mod = IBMModel2(eng_bitext,5)

In [16]:
print('Done')

Done


In [103]:
lat_mod.best_model2_alignment(AlignedSent(['vir', 'mittit','florem'],['the','dog','sleeps']))

<nltk.translate.ibm_model.AlignmentInfo at 0x10e4372e8>

In [104]:
ba = _

In [105]:
ba.alignment

(0, 3, 2, 2)

In [82]:
ba.alignment = (0,0,1,2)

In [106]:
lat_mod.prob_t_a_given_s(ba)

1e-12

In [34]:
len(bitext)

13842

In [313]:
a = Alignment([(0,0),(1,1),(2,2)])

In [64]:
al = [AlignedSent(['puer','amat','puella'],['boy','loves','girl'])]

In [34]:
water = eng_mod.translation_table['wonder']

import operator

sorted_x = sorted(water.items(), key=operator.itemgetter(1),reverse=True)

sorted_x

[('mirari', 0.39738726744809927),
 ('mirum', 0.290788864574761),
 ('meditantur', 0.22387139799136865),
 ('ambigere', 0.1826862623044445),
 ('admirari', 0.179655016247271),
 ('edidisset', 0.1705839395142952),
 ('serpentem', 0.1625525042587961),
 ('pallium', 0.15841327057590487),
 ('importunitate', 0.11716463336651324),
 ('mirabilis', 0.11193191876965876),
 ('gratuitam', 0.09733852417391137),
 ('conliniet', 0.09020256829106647),
 ('invisum', 0.0874431166264658),
 ('effudisse', 0.08679655992940684),
 ('mirandum', 0.08567665253373072),
 ('excitaverunt', 0.08218588582184007),
 ('seiungam', 0.07639166228945823),
 ('accommodatur', 0.07639166228945823),
 ('limatur', 0.07639166228945823),
 ('subtilitas', 0.07639166228945823),
 ('superbia', 0.07613365438661937),
 ('usitatis', 0.05921774222614373),
 ('attribuito', 0.056862817054392875),
 ('admirantis', 0.056862817054392875),
 ('cius', 0.056862817054392875),
 ('perstudiosum', 0.056862817054392875),
 ('tribuimus', 0.056862817054392875),
 ('aristo',

In [73]:
import dill as pickle

In [36]:
import pickle

In [74]:
with open('lat_ibm_model_2_1.pk', 'wb') as fout:
    pickle.dump(lat_mod,fout)

In [77]:
with open('eng_ibm_model_2_1.pk', 'wb') as fout:
    pickle.dump(eng_mod,fout)

In [37]:
with open('lat_ibm_model_2_bitext.pk', 'wb') as fout:
    pickle.dump(bitext,fout)

In [38]:
with open('eng_ibm_model_2_bitext.pk', 'wb') as fout:
    pickle.dump(eng_bitext,fout)

In [217]:
l.src_sentence

(None, 'boy', 'loves', 'girl')

In [218]:
l.trg_sentence

('UNUSED', 'puer', 'amat', 'puellam')

In [71]:
bitext[3].alignment

Alignment([(0, 3), (1, 63), (2, 5), (3, 50), (4, 50), (5, 6), (6, 12), (7, 3), (8, 21), (9, 24), (10, 23), (11, 19), (12, 20), (13, 63), (14, 50), (15, 30), (16, 6), (17, 72), (18, 3), (19, 6), (20, 36), (21, 55), (22, 3), (23, 41), (24, 54), (25, 63), (26, 74), (27, 3), (28, 35), (29, 6), (30, 46), (31, 54), (32, 3), (33, 47), (34, 50), (35, 50), (36, 62), (37, 3), (38, 66), (39, 78), (40, 20), (41, 67), (42, 77), (43, 79), (44, 64), (45, 3)])

In [126]:
b = 1
for a in bitext[b].alignment:
    w = a[0]
    m = a[1]
    if w == None:
        word = 'None'
    else:
        word = bitext[b].words[w]
    if m == None:
        mot = 'None'
    else:
        mot = bitext[b].mots[a[1]]
        
    print('{} : {}'.format(word,mot))

hispania : spain
quod : which
adiacet : empties
qua : rhone
-que : the
ad : the
ostium : empties
altera : side
portu : port
portam : empties
gallia : gaul
atque : and
una : one
mare : sea
est : None
erat : was
proxima : attacks
navalibus : empties
ad : towards
aditus : empties
ad : the
id : the
rhodani : rhone
ex : of


In [165]:
bitext[-900].words

['quidam',
 'cataplasmo',
 'calfacientibus',
 'uto',
 'eo1',
 'multus',
 'pondus',
 'noceo',
 'quis1',
 'vesica',
 'urgeo',
 'vulnus',
 'irrito1',
 'qui1',
 'calor1',
 'proficio']

In [70]:
for b in range(len(bitext)):
    for a in bitext[b].alignment:
        print('{} : {}'.format(bitext[b].words[a[0]],bitext[b].mots[a[1]]))

per : by
iuvant : incursion
ordinatus : reconnoitring
bellis : involved
eum : him
moventem : incursion
dederit : makes
mesopotamiam : mesopotamia
nostra : territories
quadam : policy
ferocissimas : stage
isauriam : incursion
arma : frontiers


TypeError: list indices must be integers or slices, not NoneType

In [58]:
bitext[3].words[8]

'ab'

In [59]:
bitext[3].mots[60]

'from'

In [23]:
mod = IBMModel1.train(bitext)

TypeError: train() missing 1 required positional argument: 'parallel_corpus'

In [16]:
mod.translation_table['rex']['king']

0.5675554043221417

In [287]:
mod.translation_table['regina']['queen']

0.6156239752345138

In [279]:
mod.prob_of_alignments(['hic','puer','amat'],['this','boy','loves'],[(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])

TypeError: prob_of_alignments() takes 2 positional arguments but 4 were given

In [297]:
mod.align(bitexts[0])

AttributeError: 'IBMModel2' object has no attribute 'align'

In [295]:
al_s = AlignedSent(['puer','amat','puellam'],['the','boy','loves','the','girl'],a)

In [293]:
a = Alignment([(0,1),(1,2),(2,4)])

In [298]:
bitext[0].src_sentence

AttributeError: 'AlignedSent' object has no attribute 'src_sentence'