In [2]:
from utils.lexical import normalizador
normalizer = normalizador.Normalizador()
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import os
import re



# Getting corpora files

In [3]:
corpora_path = '../data/corpora/'
corpora_dirs = os.listdir(corpora_path)

corpora_files = {} #elements are like corpus:[files]
for corpus in corpora_dirs:
    corpora_files[corpus] = [os.path.join(corpora_path + corpus, f)
             for f in os.listdir(corpora_path + corpus)
             if os.path.isfile(os.path.join(corpora_path + corpus, f))]

# Getting data to construct Word2Vec and Doc2Vec

    Here we are going to collect the data in the following ways:

    *A list of all sentences(tokenized by words) for each corpus
    *A list of documents(tokenized by words) for the entire corpora

In [4]:
corpora_sentences = {} #elements are like: corpus:[sentences]
corpora_documents = [] #elements are like: documents tokenized by words
for corpus in corpora_files:
    corpora_sentences[corpus] = []
    for file in corpora_files[corpus]:
        with open(file, 'r') as text_file:
            #all files in my corpora have only one line
            #because I removed the line brakes during the compilation
            text = text_file.read()
            
            #removing trash read from the sports blog
            if corpus == "esporte":
                text = re.sub("^Pesquisar este blog ", '', text)
            
            #transforming to lowercase
            text = normalizer.to_lowercase(text)
            
            #tokenizing by word - this is for Doc2Vec
            document = normalizer.tokenize_words(text)
            corpora_documents.append(document)
            
            #tokenize text by sentence - this is for Word2Vec
            sentences = normalizer.tokenize_sentences(text)
            #tokenize each sentence by word and add it to the list
            sentences = [normalizer.tokenize_words(sent) for sent in sentences]
            corpora_sentences[corpus].extend(sentences)
    
    #this is also for Doc2Vec
    tagged_documents = [TaggedDocument(words=d, tags=[str(i)]) 
                        for i, d in enumerate(corpora_documents)]

# Creating Word2Vecs for each corpus

### Training the Word2Vecs

In [51]:
w2vmodels = {} #elements are like corpus: w2vmodel
for corpus in corpora_sentences:
    w2vmodels[corpus] = Word2Vec(corpora_sentences[corpus],
        size=200, window=5, min_count=3, workers=4)
        

# Creating Doc2Vec for the corpora

In [9]:
d2vmodel = Doc2Vec(tagged_documents, vector_size=20, window=2, min_count=1, workers=4)

# Using Word2Vec

In [93]:
w2vmodels['esporte'].wv.most_similar('jogo')

[('gol', 0.9986592531204224),
 ('primeiro', 0.9971026182174683),
 ('placar', 0.9967527985572815),
 ('fim', 0.9964091777801514),
 ('fluminense', 0.9958785772323608),
 ('segundo', 0.9954502582550049),
 ('título', 0.9946879148483276),
 ('contra', 0.9945873618125916),
 ('início', 0.994193971157074),
 ('último', 0.9937357902526855)]

In [91]:
w2vmodels['ciencia_e_tecnologia'].wv.most_similar('energia')

[('eletricidade', 0.9373791813850403),
 ('corrente', 0.9100709557533264),
 ('emitida', 0.8989173173904419),
 ('eletromagnética', 0.8983625173568726),
 ('partir', 0.8889291286468506),
 ('colheita', 0.8823054432868958),
 ('estática', 0.8803973197937012),
 ('determinado', 0.8752703666687012),
 ('transformado', 0.875077486038208),
 ('elétrica', 0.8737519383430481)]

# Using Doc2Vec

In [10]:
vector_sci1 = d2vmodel.infer_vector(corpora_documents[0])
vector_sci2 = d2vmodel.infer_vector(corpora_documents[1])
vector_sport1 = d2vmodel.infer_vector(corpora_documents[-1])
vector_sport2 = d2vmodel.infer_vector(corpora_documents[-2])

In [17]:
from scipy import spatial
print(spatial.distance.cosine(vector_sci1, vector_sci1))
print(spatial.distance.cosine(vector_sci1, vector_sci2))
print(spatial.distance.cosine(vector_sci2, vector_sport2))
print(spatial.distance.cosine(vector_sport1, vector_sport2))
print(spatial.distance.cosine(vector_sci2, vector_sport2))

0.0
0.16041719913482666
0.22453200817108154
0.0733189582824707
0.22453200817108154
