# Word2Vec for a Latin Corpus
Here we will implement word2vec using the gensim library

In [1]:
# Import dependencies 
# gensim library and tools for implementing Word2Vec
import gensim
from gensim.summarization import textcleaner
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from gensim.test.utils import datapath

# CLTK library used for cleaning Latin txt data
import cltk
from cltk.tokenize.latin.sentence import SentenceTokenizer
from cltk.tokenize.word import WordTokenizer
from cltk.stop.latin import STOPS_LIST
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer

First we'll need to get the file names for the books that we'll be using

In [2]:
import glob

# Create a list of all the txt file names
path = './*.txt'
files = glob.glob(path)
print(files)


['./Cicero-Academica.txt', './Athanasius-inillud.txt', './Augustine_Confessiones.txt', './Tacitus-GermaniaAgricola.txt', './Caesar-Commentaries-bks1-4.txt', './Cicero-CatoMaiorDeSenectute.txt', './Persius-satires.txt', './Augustine-CityofGod-bks1to6.txt', './Cicero-Orationes.txt', './TiberiusCatiusSIliasItalicus-Punicorum Libri Septemdecim.txt', './AulusGellius-AtticNights.txt', './Quintilianus-InstitutionisOratoriaeLiberDecimus.txt', './Caesar-Commentaries-bks5-8.txt', './Athanasius-DeclarationOfFaith.txt', './Boethius1-3.txt', './Augustine-Confessiones2.txt']


In [3]:
#Tokenize a document by words and then create a list of tokenized documents
# Normalize J V
jv_replacer = JVReplacer()

# Read documents into a list and preprocess
books = []
for name in files:
    with open(name, 'r') as f_obj:
        books.append(jv_replacer.replace(f_obj.read().lower()))
        


In [4]:
# Initialize latin word tokenizer
word_tokenizer = WordTokenizer('latin')
lemmatizer = LemmaReplacer('latin')


# Tokenize each book
tokenized_books = []
for book in books:
    tokenized_books.append(word_tokenizer.tokenize(book))

In [147]:
# trying to figure out stopwords

#set(tokenized_books[0]).difference(STOPS_LIST)

{'',
 '!',
 '(',
 ')',
 ',',
 '-ne',
 '-que',
 '-ue',
 '.',
 '.»',
 '1',
 '1.',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '2.',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '3.',
 '30',
 '4',
 '4.',
 '5',
 '5.',
 '6',
 '6.',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '?_',
 '_',
 '_dedit_',
 '_deus',
 '_dominus',
 '_dominus_',
 '_ecce',
 '_etenim',
 '_genitus',
 '_i',
 '_mea',
 '_mortui',
 '_nemo',
 '_omnia',
 '_omnia_',
 '_pater',
 '_per',
 '_producant',
 '_quem',
 '_quæcun',
 '_sanctus',
 '_sanctus_',
 '_sicut',
 '_super',
 '_tradita',
 '_uade_',
 '_uenite',
 'a',
 'absit',
 'absque',
 'absurdum',
 'accepi',
 'accepit',
 'accipere',
 'acciperet',
 'accipiente',
 'accurate',
 'adam',
 'adeo',
 'adiunxit',
 'adiutorem',
 'admirari',
 'admodum',
 'adoranda',
 'adueniret',
 'affingere',
 'agitur',
 'agunt',
 'ait',
 'aiunt',
 'aliena',
 'aliquando',
 'aliud',
 'ambos',
 'ame',
 'amplius',
 'angeli',
 'angelus',
 'animalia',
 'a

Here we'll take a look at how many books and words we have

In [36]:
word_count = 0
for i in range(0, len(tokenized_books)):
    word_count = word_count + len(tokenized_books[i])
    print(len(tokenized_books[i]))
print('Total number of tokens is:', word_count)
#print(len(tokenized_books[0]))

29008
2707
187311
16413
24009
10020
6478
74064
16112
114944
5973
14659
36642
1307
24486
93024
Total number of tokens is: 657157


In [7]:
unique_word_count = 0
for i in range(0, len(tokenized_books)):
    unique_word_count = unique_word_count + len(set(tokenized_books[i]))
    print(len(set(tokenized_books[i])))
print('Total number of unique tokens is:', unique_word_count)

6192
979
25351
5764
5512
3428
3157
14639
4444
21698
2514
4539
8115
512
6409
16384
Total number of unique tokens is: 129637


Summary stats

In [8]:
unique_word_count/word_count

0.1972694500705311

# Train the word2vec Model


In [9]:
mod1 = Word2Vec(tokenized_books, size=50, window=10, min_count=10, iter = 20)

In [22]:
mod1.wv.most_similar(positive="militum")

[('copiis', 0.9774023294448853),
 ('praesidio', 0.9765084385871887),
 ('copias', 0.9762064218521118),
 ('proelium', 0.9761289358139038),
 ('impedimenta', 0.9758837819099426),
 ('hiberna', 0.9745978116989136),
 ('contendit', 0.9735535979270935),
 ('exercitu', 0.9730570316314697),
 ('reliquos', 0.972862958908081),
 ('exercitum', 0.9726850986480713)]

In [24]:
mod1.wv.most_similar(positive=['caesar', 'ecclesia'], negative=['copias'])

[('situm', 0.9153473377227783),
 ('mensa', 0.909437358379364),
 ('deam', 0.9045636057853699),
 ('aduentus', 0.9020926356315613),
 ('consulit', 0.8966860175132751),
 ('agi', 0.8954918384552002),
 ('daemonibus', 0.8951117396354675),
 ('argento', 0.8892412185668945),
 ('piget', 0.8872928619384766),
 ('tristes', 0.8857428431510925)]

In [96]:
mod1.wv.most_similar(positive=['capiti', 'gallia'])

[('pacem', 0.9747473001480103),
 ('petierunt', 0.9733341932296753),
 ('actis', 0.9721436500549316),
 ('decem', 0.9716939926147461),
 ('aestate', 0.9712899923324585),
 ('turres', 0.9705880880355835),
 ('castellis', 0.9696308970451355),
 ('aditus', 0.9677024483680725),
 ('deditionem', 0.965579628944397),
 ('aditum', 0.9652883410453796)]

In [35]:
len(mod1.wv.vocab)

6605

Roman political titles - consul

In [26]:
mod1.wv.doesnt_match("consul tribunus episcopi".split())

'episcopi'

In [27]:
mod1.wv.doesnt_match("consul tribunus praetor magistris episcopi".split())

'episcopi'

Household roles - Father mother son daughter husband

In [28]:
mod1.wv.doesnt_match("pater mater filius filia maritus".split())

'filia'

In [31]:
mod1.wv.doesnt_match("ambrosio augustini caesar".split())

'caesar'

AttributeError: 'Word2Vec' object has no attribute 'vocab'