# Word2Vec for a Latin Corpus
Here we will implement word2vec using the gensim library

In [1]:
# Import dependencies 
# gensim library and tools for implementing Word2Vec
import gensim
from gensim.summarization import textcleaner
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.test.utils import datapath

# CLTK library used for cleaning Latin txt data
import cltk
from cltk.tokenize.latin.sentence import SentenceTokenizer
from cltk.tokenize.word import WordTokenizer
from cltk.stop.latin import STOPS_LIST
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer

First we'll need to get the file names for the books that we'll be using

In [3]:
import glob

# Create a list of all the txt file names
path = 'corpus/*.txt'
files = glob.glob(path)
print(files)


['corpus/Cicero-Academica.txt', 'corpus/Athanasius-inillud.txt', 'corpus/Augustine_Confessiones.txt', 'corpus/Tacitus-GermaniaAgricola.txt', 'corpus/Caesar-Commentaries-bks1-4.txt', 'corpus/Cicero-CatoMaiorDeSenectute.txt', 'corpus/Persius-satires.txt', 'corpus/Augustine-CityofGod-bks1to6.txt', 'corpus/Cicero-Orationes.txt', 'corpus/TiberiusCatiusSIliasItalicus-Punicorum Libri Septemdecim.txt', 'corpus/AulusGellius-AtticNights.txt', 'corpus/Quintilianus-InstitutionisOratoriaeLiberDecimus.txt', 'corpus/Caesar-Commentaries-bks5-8.txt', 'corpus/Athanasius-DeclarationOfFaith.txt', 'corpus/Boethius1-3.txt', 'corpus/Augustine-Confessiones2.txt']


In [4]:
#Tokenize a document by words and then create a list of tokenized documents
# Normalize J V
jv_replacer = JVReplacer()

# Read documents into a list and preprocess
books = []
for name in files:
    with open(name, 'r') as f_obj:
        books.append(jv_replacer.replace(f_obj.read().lower()))
        


In [5]:
# Initialize latin word tokenizer
word_tokenizer = WordTokenizer('latin')
lemmatizer = LemmaReplacer('latin')


# Tokenize each book
tokenized_books = []
for book in books:
    tokenized_books.append(word_tokenizer.tokenize(book))

In [147]:
# trying to figure out stopwords

#set(tokenized_books[0]).difference(STOPS_LIST)

{'',
 '!',
 '(',
 ')',
 ',',
 '-ne',
 '-que',
 '-ue',
 '.',
 '.»',
 '1',
 '1.',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '2.',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '3.',
 '30',
 '4',
 '4.',
 '5',
 '5.',
 '6',
 '6.',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '?_',
 '_',
 '_dedit_',
 '_deus',
 '_dominus',
 '_dominus_',
 '_ecce',
 '_etenim',
 '_genitus',
 '_i',
 '_mea',
 '_mortui',
 '_nemo',
 '_omnia',
 '_omnia_',
 '_pater',
 '_per',
 '_producant',
 '_quem',
 '_quæcun',
 '_sanctus',
 '_sanctus_',
 '_sicut',
 '_super',
 '_tradita',
 '_uade_',
 '_uenite',
 'a',
 'absit',
 'absque',
 'absurdum',
 'accepi',
 'accepit',
 'accipere',
 'acciperet',
 'accipiente',
 'accurate',
 'adam',
 'adeo',
 'adiunxit',
 'adiutorem',
 'admirari',
 'admodum',
 'adoranda',
 'adueniret',
 'affingere',
 'agitur',
 'agunt',
 'ait',
 'aiunt',
 'aliena',
 'aliquando',
 'aliud',
 'ambos',
 'ame',
 'amplius',
 'angeli',
 'angelus',
 'animalia',
 'a

Here we'll take a look at how many books and words we have

In [6]:
word_count = 0
for i in range(0, len(tokenized_books)):
    word_count = word_count + len(tokenized_books[i])
    print(len(tokenized_books[i]))
print('Total number of tokens is:', word_count)
#print(len(tokenized_books[0]))

29008
2707
187311
16413
24009
10020
6478
74064
16112
114944
5973
14659
36642
1307
24486
93024
Total number of tokens is: 657157


In [7]:
unique_word_count = 0
for i in range(0, len(tokenized_books)):
    unique_word_count = unique_word_count + len(set(tokenized_books[i]))
    print(len(set(tokenized_books[i])))
print('Total number of unique tokens is:', unique_word_count)

6192
979
25351
5764
5512
3428
3157
14639
4444
21698
2514
4539
8115
512
6409
16384
Total number of unique tokens is: 129637


Summary stats

In [8]:
unique_word_count/word_count

0.1972694500705311

# Train the word2vec Model


In [9]:
mod1 = Word2Vec(tokenized_books, size=50, window=10, min_count=10, iter = 20)

In [17]:
mod1.save('first.model')

# See if we get the same thing after loading the model

In [18]:
myModel = Word2Vec.load('first.model')

In [19]:
myModel.wv.most_similar(positive='militum')

[('proelium', 0.9781436920166016),
 ('impedimenta', 0.976380467414856),
 ('contendit', 0.9707452058792114),
 ('legiones', 0.9697864055633545),
 ('exercitum', 0.968395471572876),
 ('caesar', 0.9671887755393982),
 ('mittit', 0.9665457010269165),
 ('legionem', 0.9658355712890625),
 ('equitum', 0.9653707146644592),
 ('equitatu', 0.9648735523223877)]

In [10]:
mod1.wv.most_similar(positive="militum")

[('proelium', 0.9781436920166016),
 ('impedimenta', 0.976380467414856),
 ('contendit', 0.9707452058792114),
 ('legiones', 0.9697864055633545),
 ('exercitum', 0.968395471572876),
 ('caesar', 0.9671887755393982),
 ('mittit', 0.9665457010269165),
 ('legionem', 0.9658355712890625),
 ('equitum', 0.9653707146644592),
 ('equitatu', 0.9648735523223877)]

In [11]:
mod1.wv.most_similar(positive=['caesar', 'ecclesia'], negative=['copias'])

[('effectus', 0.921194851398468),
 ('cogit', 0.904762864112854),
 ('aduentus', 0.8986020088195801),
 ('barbarorum', 0.8955750465393066),
 ('deam', 0.8872822523117065),
 ('ambrosio', 0.8867388963699341),
 ('situm', 0.8856954574584961),
 ('hastas', 0.8839676976203918),
 ('commemoratio', 0.8779969811439514),
 ('scripturis', 0.877773106098175)]

In [12]:
mod1.wv.most_similar(positive=['capiti', 'gallia'])

[('petierunt', 0.974312961101532),
 ('conuersa', 0.9723016023635864),
 ('aestate', 0.9635784029960632),
 ('principibus', 0.9634742140769958),
 ('aditum', 0.9632537961006165),
 ('decem', 0.9632123112678528),
 ('sede', 0.962972104549408),
 ('turrim', 0.9628841876983643),
 ('accipit', 0.9621815085411072),
 ('actis', 0.9616867303848267)]

In [13]:
len(mod1.wv.vocab)

6605

Roman political titles - consul

In [14]:
mod1.wv.doesnt_match("consul tribunus episcopi".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'episcopi'

In [27]:
mod1.wv.doesnt_match("consul tribunus praetor magistris episcopi".split())

'episcopi'

Household roles - Father mother son daughter husband

In [15]:
mod1.wv.doesnt_match("pater mater filius filia maritus".split())

'filia'

In [16]:
mod1.wv.doesnt_match("ambrosio augustini caesar".split())

'caesar'

# Using the latin library corpus

In [28]:
import glob

# Create a list of all the txt file names
path = 'latin-library/latin_text_latin_library/**/*.txt'
files = glob.glob(path, recursive=True)
print(files)




['latin-library/latin_text_latin_library/epistaustras.txt', 'latin-library/latin_text_latin_library/notitia2.txt', 'latin-library/latin_text_latin_library/asserius.txt', 'latin-library/latin_text_latin_library/sall.2.txt', 'latin-library/latin_text_latin_library/12tables.txt', 'latin-library/latin_text_latin_library/sidonius3.txt', 'latin-library/latin_text_latin_library/levis.txt', 'latin-library/latin_text_latin_library/abbofloracensis.txt', 'latin-library/latin_text_latin_library/priapea.txt', 'latin-library/latin_text_latin_library/quum.txt', 'latin-library/latin_text_latin_library/petrarch.rom.txt', 'latin-library/latin_text_latin_library/obsequens.txt', 'latin-library/latin_text_latin_library/marcellinus1.txt', 'latin-library/latin_text_latin_library/newton.scholium.txt', 'latin-library/latin_text_latin_library/varro.ll10.txt', 'latin-library/latin_text_latin_library/sall.phil.txt', 'latin-library/latin_text_latin_library/sall.lep.txt', 'latin-library/latin_text_latin_library/sid

In [29]:
len(files)

2141

In [30]:
#Tokenize a document by words and then create a list of tokenized documents
# Normalize J V
jv_replacer = JVReplacer()

# Read documents into a list and preprocess
books = []
for name in files:
    with open(name, 'r') as f_obj:
        books.append(jv_replacer.replace(f_obj.read().lower()))
        


In [31]:
# Initialize latin word tokenizer
word_tokenizer = WordTokenizer('latin')
lemmatizer = LemmaReplacer('latin')


# Tokenize each book
tokenized_books = []
for book in books:
    tokenized_books.append(word_tokenizer.tokenize(book))

In [32]:
word_count = 0
for i in range(0, len(tokenized_books)):
    word_count = word_count + len(tokenized_books[i])
    print(len(tokenized_books[i]))
print('Total number of tokens is:', word_count)
#print(len(tokenized_books[0]))

5387
6115
14784
26057
1075
4621
121
5481
4560
381
2359
6925
11909
1840
6140
1115
1014
5927
6928
7574
7610
2360
6008
13210
17196
8319
1521
1704
16372
8199
570
37422
417
984
14089
2137
6135
9607
14699
6841
988
5451
3633
8086
2722
487
951
1233
12208
3195
2879
6142
5516
170
10344
13394
10814
285
4483
11628
1222
5371
1699
9197
4883
5511
6672
11397
2326
19964
13373
2259
206
10343
929
2779
3013
2402
14616
68
1534
5735
3087
19830
190
2748
910
6269
103
4143
3114
42663
4766
45262
46
6347
1851
506
7131
2156
8708
13989
6266
140
7226
12070
2648
27729
12314
1204
18155
130
3020
11663
474
1135
923
7348
2357
34298
529
3792
554
849
4381
10956
810
6491
24434
8015
2151
1636
161
28088
13795
27582
19447
6604
507
2771
2427
15755
3844
25426
166
162
98
7555
11983
720
4375
15483
2236
10616
19442
3152
7479
569
7467
1994
3047
8426
601
1798
7681
1051
482
3889
422
111
11953
3277
34417
4511
4998
3524
206
5241
6724
336
1001
745
3864
1879
1874
215
290
12284
66
782
544
5187
9945
9005
7105
1113
1400
9674
14781
4977
3103

In [33]:
unique_word_count = 0
for i in range(0, len(tokenized_books)):
    unique_word_count = unique_word_count + len(set(tokenized_books[i]))
    print(len(set(tokenized_books[i])))
print('Total number of unique tokens is:', unique_word_count)

2117
1260
4291
6201
488
2486
86
2751
2093
233
1304
2657
4328
810
1602
626
608
3278
3348
1374
3012
1488
3014
3962
5769
3142
907
918
5200
3095
296
6850
278
559
4363
1041
2348
2666
3995
3508
535
2800
1953
3122
974
266
572
608
3777
1847
1752
3076
2785
130
4652
4900
4876
203
1950
5280
685
1980
468
4211
1714
2550
2318
3459
1012
6502
3245
1055
139
3269
582
1162
1439
1413
4921
59
812
2814
1663
4050
137
1228
499
2124
77
2008
1486
9892
2053
8433
37
2608
1206
342
1729
946
4001
5161
2989
103
2190
4473
1603
6377
4527
770
4280
96
1085
3975
221
719
511
2696
1263
8070
286
1551
367
505
2286
3891
582
2365
4445
1713
888
714
110
7926
3212
7394
6099
1550
235
1675
1270
4222
1962
5971
131
89
76
2545
4736
457
1223
5079
1019
3950
7142
1286
3429
397
2481
1066
1377
2875
380
624
2402
634
300
1965
310
38
3616
1104
9045
2009
1659
1450
143
2217
2210
230
609
466
1010
962
950
135
217
4745
52
515
346
2369
2623
2330
2726
626
678
2598
5084
1797
9364
2405
1066
1689
1373
5234
2863
470
582
2261
637
1805
593
40
6813
1453
546

5786
1844
4758
2223
2045
1629
1747
5512
1556
2618
1204
4018
1006
2476
991
3045
2538
5045
5697
6543
1763
798
2360
1738
2107
2964
1697
5512
2864
1441
3381
1205
1347
7074
1613
194
307
2117
2370
3294
5635
7453
2343
2983
2743
3368
5560
5978
1428
4631
3191
4294
1978
4063
3435
2909
4071
3389
4394
3813
752
3274
4228
1297
851
1872
4178
3653
3177
4130
2620
2131
2952
2219
1528
924
5061
2704
1982
3417
2630
2967
2610
3419
2497
2777
3194
3264
2341
5186
3074
3608
2144
2161
1529
1507
3040
2002
2570
2842
2682
2846
3394
2862
3700
3884
2710
2793
421
2705
2229
3560
1337
2262
2777
1995
1884
2058
3184
4247
2552
4952
5507
1396
771
517
457
730
881
499
810
1967
710
2115
5790
2244
5397
3565
5731
5277
2973
4090
5019
458
5269
5230
4650
2436
4401
3924
4789
2904
4091
4205
1647
59
1807
2782
280
2592
1408
2385
13239
2527
753
360
2670
3395
3061
2922
2162
1582
9251
5886
1772
378
2528
243
514
3671
2260
3120
442
2538
2951
2913
3089
2955
2704
2775
3000
3074
3063
3601
2920
2538
3302
2772
2701
2956
2732
3667
3658
3085
3613


In [34]:
unique_word_count/word_count

0.30935665611820273

In [35]:
# takes about 8 minutes to train...model has been saved so just load it below
#bigmod = Word2Vec(tokenized_books, size=50, window=10, min_count=10, iter = 20)

### Lets save this model trained on the whole latin library

In [50]:
#bigmod.save('full.model')

In [2]:
bigmod = Word2Vec.load('full.model')

In [112]:
bigmod.wv.most_similar(positive="troiae")

[('sagunti', 0.7756997346878052),
 ('carthaginis', 0.7734566330909729),
 ('argi', 0.7660139203071594),
 ('italum', 0.7657891511917114),
 ('libyae', 0.764527440071106),
 ('danaum', 0.7633456587791443),
 ('teucrorum', 0.7532935738563538),
 ('scipiadae', 0.7476391792297363),
 ('iliacis', 0.7458595037460327),
 ('phrygum', 0.7429102659225464)]

In [61]:
bigmod.wv.doesnt_match('caesar roma gallia africa'.split())

'roma'

In [105]:
bigmod.wv.most_similar(positive=['caesar', 'gallia'])

[('illyrico', 0.7993365526199341),
 ('pompeius', 0.7921752333641052),
 ('mithridatem', 0.7615995407104492),
 ('hispania', 0.7568758130073547),
 ('macedonia', 0.7528416514396667),
 ('africa', 0.7526384592056274),
 ('italia', 0.7494314908981323),
 ('triumphauit', 0.7392575740814209),
 ('iugurtha', 0.7370326519012451),
 ('tigranem', 0.7286438941955566)]

In [39]:
bigmod.wv.most_similar(positive=['capiti', 'gallia'], negative=[''])

[('triumpho', 0.6781671643257141),
 ('socero', 0.6586953401565552),
 ('lucullo', 0.6309340000152588),
 ('triumphauit', 0.6255151629447937),
 ('piceno', 0.6185227632522583),
 ('sceptro', 0.6154395937919617),
 ('gymnasio', 0.6150475740432739),
 ('municipio', 0.6098893284797668),
 ('curru', 0.6086866855621338),
 ('gnaeo', 0.6057745218276978)]

In [70]:
len(bigmod.wv.vocab)

79238

In [42]:
bigmod.wv.doesnt_match("consul tribunus episcopi".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'episcopi'

In [46]:
bigmod.wv.doesnt_match("consul tribunus praetor magistris episcopi".split())

'episcopi'

In [47]:
bigmod.wv.doesnt_match("pater mater filius filia maritus".split())

'maritus'

In [49]:
bigmod.wv.doesnt_match("ambrosio augustini caesar aquinas".split())

'caesar'

In [79]:
# Test Word Analogies
analogy_scores = bigmod.wv.evaluate_word_analogies('Evaluation/latin-analogies.txt')


In [80]:
print(analogy_scores)

(0.4666666666666667, [{'section': 'gender', 'correct': [('PUER', 'PUELLA', 'PATER', 'MATER'), ('PUELLA', 'PUER', 'REGINA', 'REX'), ('PATER', 'MATER', 'REX', 'REGINA'), ('MATER', 'PATER', 'REGINA', 'REX'), ('REX', 'REGINA', 'PATER', 'MATER'), ('REGINA', 'REX', 'MATER', 'PATER')], 'incorrect': [('PUER', 'PUELLA', 'REX', 'REGINA'), ('PUELLA', 'PUER', 'MATER', 'PATER'), ('PATER', 'MATER', 'PUER', 'PUELLA'), ('MATER', 'PATER', 'PUELLA', 'PUER'), ('REX', 'REGINA', 'PUER', 'PUELLA'), ('REGINA', 'REX', 'PUELLA', 'PUER')]}, {'section': 'Church and State', 'correct': [('PAPAM', 'REX', 'EPISCOPUS', 'DUX')], 'incorrect': []}, {'section': 'Leader and Jurisdiction', 'correct': [], 'incorrect': [('PAPAM', 'ROMA', 'AUGUSTINI', 'HIPPO'), ('PAPAM', 'ROMA', 'AMBROSIUS', 'MEDIOLANUM')]}, {'section': 'Total accuracy', 'correct': [('PUER', 'PUELLA', 'PATER', 'MATER'), ('PUELLA', 'PUER', 'REGINA', 'REX'), ('PATER', 'MATER', 'REX', 'REGINA'), ('MATER', 'PATER', 'REGINA', 'REX'), ('REX', 'REGINA', 'PATER', 'MA