In [26]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from pprint import pprint
import multiprocessing

In [4]:
wiki = WikiCorpus("ptwiki-20180101-pages-articles.xml.bz2")

In [5]:
class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True
    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield TaggedDocument([c for c in content], [title])

In [6]:
documents = TaggedWikiDocument(wiki)

In [9]:
pre = Doc2Vec(min_count = 0)
pre.scan_vocab(documents)

In [10]:
for num in range(0,20):
    print('min_count: {}, size of vocab: '.format(num), 
          pre.scale_vocab(min_count=num,dry_run=True)['memory']['vocab']/700)

min_count: 0, size of vocab:  1648766.4285714286
min_count: 1, size of vocab:  1648766.4285714286
min_count: 2, size of vocab:  829600.0
min_count: 3, size of vocab:  601290.0
min_count: 4, size of vocab:  491646.4285714286
min_count: 5, size of vocab:  422730.71428571426
min_count: 6, size of vocab:  376028.5714285714
min_count: 7, size of vocab:  340491.4285714286
min_count: 8, size of vocab:  311821.4285714286
min_count: 9, size of vocab:  289728.5714285714
min_count: 10, size of vocab:  271503.5714285714
min_count: 11, size of vocab:  256016.42857142858
min_count: 12, size of vocab:  242602.14285714287
min_count: 13, size of vocab:  230754.2857142857
min_count: 14, size of vocab:  220620.0
min_count: 15, size of vocab:  211390.0
min_count: 16, size of vocab:  203138.57142857142
min_count: 17, size of vocab:  195667.14285714287
min_count: 18, size of vocab:  188942.85714285713
min_count: 19, size of vocab:  182740.7142857143


In [11]:
cores = multiprocessing.cpu_count()

models = [
    Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=2, iter=10, workers=cores),
    Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=2, iter=10, workers=cores)
]

In [12]:
models[0].build_vocab(documents)
print(str(models[0]))
models[1].reset_from(models[0])time model.t)

Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)


In [13]:
for model in models:
    %%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)

CPU times: user 21h 26min 47s, sys: 7min 28s, total: 21h 34min 16s
Wall time: 4h 29min 48s
CPU times: user 3h 33min 20s, sys: 5min 12s, total: 3h 38min 33s
Wall time: 2h 24min 6s


In [20]:
for model in models:
    print(str(model))
    pprint(model.docvecs.most_similar(positive=["Inteligência"], topn=10))

Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
[('Quociente de inteligência', 0.702828586101532),
 ('Pensamento divergente', 0.6468408107757568),
 ('Teoria g', 0.6415136456489563),
 ('Matrizes Progressivas de Raven', 0.6334096789360046),
 ('Competência (psicologia)', 0.6279304027557373),
 ('Autismo altamente-funcional', 0.6241010427474976),
 ('Lógica e racionalidade', 0.6216001510620117),
 ('Modelo científico', 0.616083025932312),
 ('Psicologia comportamental', 0.6154487133026123),
 ('Inteligência emocional', 0.6144423484802246)]
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
[('Teoria g', 0.6904677748680115),
 ('Teste psicológico', 0.6620209217071533),
 ('Inteligência emocional', 0.6590678095817566),
 ('Zona de desenvolvimento proximal', 0.6482210159301758),
 ('Psicologia da personalidade', 0.6440185308456421),
 ('Quociente de inteligência', 0.6423906087875366),
 ('Competência social', 0.6331567168235779),
 ('Gênio (pessoa)', 0.6320877075195312),
 ('Análise sensorial', 0.6313241720199585),
 ('Efeit

In [15]:
for model in models:
    print(str(model))
    pprint(model.docvecs.most_similar(positive=["Lady Gaga"], topn=10))

Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
[('Adele', 0.6877843141555786),
 ('Christina Aguilera', 0.670610249042511),
 ('Rihanna', 0.6683495044708252),
 ('Mariah Carey', 0.6533797979354858),
 ('Born This Way', 0.6523556709289551),
 ('LoveGame', 0.6518796682357788),
 ('Katy Perry', 0.6504320502281189),
 ('Single Ladies (Put a Ring on It)', 0.6492538452148438),
 ('Bad Romance', 0.6467785835266113),
 ('The Fame', 0.6421879529953003)]
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
[('The Fame', 0.6751108169555664),
 ('Bad Romance', 0.6534554958343506),
 ('The Fame Monster', 0.6474413275718689),
 ('Born This Way', 0.6441248655319214),
 ('Born This Way (canção)', 0.6429762840270996),
 ('Just Dance', 0.6380847096443176),
 ('LoveGame', 0.6323598623275757),
 ('Joanne (álbum)', 0.6285462379455566),
 ('Speechless (canção)', 0.6284322738647461),
 ('Fashion of His Love', 0.6250290870666504)]


In [16]:
for model in models:
    print(str(model))
    vec = [model.docvecs["Lady Gaga"] - model["american"] + model["japanese"]]
    pprint([m for m in model.docvecs.most_similar(vec, topn=11) if m[0] != "Lady Gaga"])


Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
[('How Beautiful You Are', 0.5841714143753052),
 ('Hiroko Shimabukuro', 0.5547493696212769),
 ('Born This Way (canção)', 0.5529972910881042),
 ('Fairies (álbum)', 0.552701473236084),
 ('Dos Angeles', 0.5523363351821899),
 ('The 1st Concert In Japan "Shinee World"', 0.5519884824752808),
 ('Beyoncé: The Ultimate Performer', 0.5501809120178223),
 ('Beloved', 0.5420556664466858),
 ('GBI (German Bold Italic)', 0.5419186949729919),
 ('Spiderwebs', 0.5405997633934021)]
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
[('Osaka (cidade)', 0.5056620240211487),
 ('Rikki', 0.4907521605491638),
 ('Tanaka Isson', 0.4895462691783905),
 ('◯◯ Ganbaranakutemo Eenende!!', 0.48410287499427795),
 ('Kumi Koda', 0.4797716736793518),
 ('Watarasebashi', 0.4783862829208374),
 ('Kiyoshi Hikawa', 0.4778934717178345),
 ('Babymetal × Kiba of Akiba', 0.4744834899902344),
 ('Kimaguren', 0.4740353524684906),
 ('Puffy Amiyumi x Puffy', 0.47362032532691956),
 ('Bairros do Japão', 0.4734594

  app.launch_new_instance()


In [24]:
for model in models:
    print(str(model))
    vec = [model["rei"] - model["homem"] + model["mulher"]]
    pprint([m for m in model.most_similar(vec, topn=11) if m[0] != "rei"])


Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
[('rainha', 0.7577492594718933),
 ('consorte', 0.6718885898590088),
 ('filha', 0.6586146354675293),
 ('esposa', 0.6581082344055176),
 ('princesa', 0.6393976211547852),
 ('herdeira', 0.6299619674682617),
 ('sujarinee', 0.6268012523651123),
 ('somanat', 0.6259405016899109),
 ('mahendradatta', 0.6246206760406494),
 ('jonsdotter', 0.6238503456115723)]
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
[('rainha', 0.6301841735839844),
 ('esposa', 0.6233345866203308),
 ('mulher', 0.6099225878715515),
 ('princesa', 0.5634665489196777),
 ('concubina', 0.5331172347068787),
 ('filha', 0.5082984566688538),
 ('dama', 0.49854812026023865),
 ('monarca', 0.4933313727378845),
 ('consorte', 0.4898892939090729),
 ('madrasta', 0.4833236038684845)]


  app.launch_new_instance()


In [31]:
doc = "Acrescenta dispositivos ao Código de Processo Penal para determinar a especificação de gênero no 
        inquérito policial, processo penal e estatísticas correspondentes e dá outras providências."

tokens = simple_preprocess(doc)
print(tokens)

['acrescenta', 'dispositivos', 'ao', 'código', 'de', 'processo', 'penal', 'para', 'determinar', 'especificação', 'de', 'gênero', 'no', 'inquérito', 'policial', 'processo', 'penal', 'estatísticas', 'correspondentes', 'dá', 'outras', 'providências']


In [32]:
for model in models:
    print(str(model))
    inferred = model.infer_vector(tokens)
    sims = model.docvecs.most_similar([inferred], topn=10)
    pprint(sims)

Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
[('Fraude processual', 0.738481879234314),
 ('Falso testemunho ou falsa perícia', 0.7367019653320312),
 ('Processo de conhecimento', 0.736674427986145),
 ('Estrito cumprimento de dever legal', 0.7224631309509277),
 ('Princípio dispositivo', 0.721563458442688),
 ('Carta testemunhável', 0.7192434072494507),
 ('Judicialiforme', 0.7121644020080566),
 ('Despacho', 0.7117525339126587),
 ('Prescrição retroativa', 0.7102814316749573),
 ('Erro judicial', 0.7086154222488403)]
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
[('Acacia robusta robusta', 0.6625255346298218),
 ('Acacia microcarpa', 0.6595112681388855),
 ('Coleophora sudanella', 0.6506903171539307),
 ('Acacia philippinarum', 0.6487383246421814),
 ('Perotrochus metivieri', 0.6475683450698853),
 ('4280 Simonenko', 0.6459099054336548),
 ('Exsilirarcha', 0.6446590423583984),
 ('Tau5 Serpentis', 0.643500804901123),
 ('Tetramorium impurum', 0.6431348323822021),
 ('2003 TK58', 0.6425009965896606)]


In [51]:
docs = [
    "Acrescenta dispositivos ao Código de Processo Penal para determinar a especificação de gênero no inquérito policial, processo penal e estatísticas correspondentes e dá outras providências.",
    "Dá nova redação ao § 1º do art. 7º da Lei nº 8.666, de 21 de junho de 1993, para estabelecer a obrigatoriedade do sistema de modelagem da informação da construção, identificado pela sigla inglesa BIM - Building Information Model, na confecção de projetos executivos de obras e serviços de engenharia contratados pelos órgãos e entidades da administração pública, e dá outras providências.",
    "Determina a inclusão de quadras poliesportivas nos projetos de construção de novos estabelecimentos públicos de ensino fundamental e de ensino médio.",
    "Torna obrigatória a presença de profissional da área de Fonoaudiologia em todas escolas públicas e privadas de ensino fundamental."
]

In [52]:
for doc in docs:
    for model in models:
        print(str(model))
        sims = model.docvecs.similarity_unseen_docs(model, doc, 'Engenharia',
                                                    alpha=0.1, min_alpha=0.0001,
                                                   steps=5)
        pprint(sims)

Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
-0.009694335071942716
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
-0.009694335071942716
Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
0.08458677396092397
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
0.08458677396092397
Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
0.04800966110464222
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
0.04800966110464222
Doc2Vec(dbow+w,d200,n5,w8,mc2,s0.001,t8)
-0.09792388854478891
Doc2Vec(dm/m,d200,n5,w8,mc2,s0.001,t8)
-0.09792388854478891


Por que o modelo atribui a mesma distância para documentos tão distintos?