In [1]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel
from gensim.models.keyedvectors import KeyedVectors
from gensim.similarities import WmdSimilarity, MatrixSimilarity
from nltk.corpus import stopwords

import json
import numpy as np
import re
import string
import wmd

# Initialize logging.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

# Declare variables

In [2]:
articles_path = "../datasets/ptwiki-20170820-pages-articles-parsed.json"
embeddings_path = "../datasets/glove_s300.txt"

# Preprocessing

In [3]:
# Remove stopwords.
stop_words = stopwords.words('portuguese')

# String Ponctuation
punctuation = string.punctuation
numbers = "0123456789"

# Regex for removing all non-alphabet letters or spaces
regex = re.compile('([' + punctuation + numbers + ']|\n|\t)')

def preprocessing(document):
    # Remove all non-alphabet letters or spaces
    processed_document = regex.sub(' ', document)

    # Remove extra spaces and lower
    processed_document = [token.lower()
                          for token in processed_document.split(' ')
                          if token and token not in stop_words]
    
    return processed_document

# Get Corpora

In [4]:
def load_corpora(corpora_path, token_min, corpora_limit):
    documents = {}
    with open(corpora_path, mode="r", encoding="utf-8") as fp:
        for line in fp:
            # Parse Json
            json_document = json.loads(line)
            
            # Get information needed
            title, text = json_document["title"], json_document["text"]
            
            # Apply preprocessing
            document = preprocessing(text)

            # Add
            if len(document) >= token_min:
                documents[title] = document
            if len(documents) >= corpora_limit:
                break
    return documents

# Load data

## Articles

In [5]:
%time articles = load_corpora(articles_path, 100, 10001)

CPU times: user 1min, sys: 596 ms, total: 1min 1s
Wall time: 1min 1s


### Mean of words per article

In [6]:
sum([len(text) for text in articles.values()])/len(articles)

910.4079592040796

## Embeddings

In [7]:
%time embeddings = KeyedVectors.load_word2vec_format(embeddings_path, limit=30000)

CPU times: user 15 s, sys: 108 ms, total: 15.1 s
Wall time: 15.1 s


# Experiments

## Dictionary

In [8]:
%time dictionary = Dictionary(articles.values(), prune_at=30000)

CPU times: user 14.2 s, sys: 52 ms, total: 14.2 s
Wall time: 14.2 s


## BoW

In [9]:
%time corpus = [dictionary.doc2bow(text) for text in articles.values()]

CPU times: user 9.73 s, sys: 88 ms, total: 9.82 s
Wall time: 9.82 s


In [10]:
%time tfidf = TfidfModel(corpus)

CPU times: user 2.04 s, sys: 24 ms, total: 2.06 s
Wall time: 2.06 s


In [11]:
%time BoW_similarity = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary), num_best=10)

CPU times: user 28.8 s, sys: 652 ms, total: 29.4 s
Wall time: 14.9 s


## WMD

instance = WmdSimilarity(processed_articles, embeddings, num_best=10)

%time instance[preprocessing("maior tenista de todos os tempos")]

### Embedding index

In [12]:
token2idx = {token:vocab.index
             for token, vocab in embeddings.vocab.items()}

### Dictionary index to Embedding index

In [13]:
idx2idx = {dict_idx:token2idx[token]
           for dict_idx, token in dictionary.items()
           if token in token2idx}

### Convert from text to nbow

In [14]:
def get_nbow(document):
    title, text = document
    
    # Generate BoW
    bow = dictionary.doc2bow(text)
    
    # Convert index from dictionary to index from embedding
    bow = {idx2idx[idx]: count
            for idx, count in bow
            if idx in idx2idx}
    
    # Get indices
    indices = list(bow.keys())
    
    # Get and normalize frequencies
    counters = np.array(list(bow.values()), dtype=np.float32)
    counters = counters/counters.sum()
    
    return (title, (title, indices, counters))

In [15]:
nbows = dict([get_nbow(article) for article in articles.items()])

### Instantiate WMD

In [16]:
% time WMD_similarity = wmd.WMD(embeddings.syn0, nbows)

CPU times: user 12 ms, sys: 16 ms, total: 28 ms
Wall time: 26.7 ms


## Tests BoW x WMD

### Some articles available

In [17]:
titles = list(articles.keys())

In [18]:
list(enumerate(titles[:50]))

[(0, 'Astronomia'),
 (1, 'América Latina'),
 (2, 'Albino Forjaz de Sampaio'),
 (3, 'Anno Domini'),
 (4, 'Aquiles'),
 (5, 'Anarcocapitalismo'),
 (6, 'Anarquismo'),
 (7, 'Albert Einstein'),
 (8, 'Aquecimento global'),
 (9, 'Adriano'),
 (10, 'Alexandre (nome)'),
 (11, 'Afonso, Príncipe de Portugal (1475–1491)'),
 (12, 'A Ideia Perigosa de Darwin'),
 (13, 'Agricultura'),
 (14, 'Afeganistão'),
 (15, 'História do Afeganistão'),
 (16, 'Argentina'),
 (17, 'Algoritmo'),
 (18, 'Ananás'),
 (19, 'Angola'),
 (20, 'Aves'),
 (21, 'Arara-azul-de-lear'),
 (22, 'Arara-azul-grande'),
 (23, 'Ararinha-azul'),
 (24, 'Alfred Jules Ayer'),
 (25, 'Antropologia'),
 (26, 'Arqueologia'),
 (27, 'Alexandre Rodrigues Ferreira'),
 (28, 'Aldous Huxley'),
 (29, 'Anatomia'),
 (30, 'A Carta da Terra'),
 (31, 'Arquivo sonoro'),
 (32, 'Aleksandr Oparin'),
 (33, 'Amapá'),
 (34, 'Amazonas'),
 (35, 'Alagoas'),
 (36, 'Astrofísica'),
 (37, 'Azulona'),
 (38, 'Aristóteles'),
 (39, 'Lista de aves do Brasil'),
 (40, 'Albânia'),
 (4

### Astronomia

### BoW

In [19]:
%time sims = BoW_similarity[tfidf[corpus[0]]]
[(titles[sim[0]], sim[1]) for sim in sims]

CPU times: user 200 ms, sys: 0 ns, total: 200 ms
Wall time: 111 ms


[('Astronomia', 0.99999982118606567),
 ('Astrometria', 0.41765323281288147),
 ('Estrela', 0.3328622579574585),
 ('Galáxia', 0.31383705139160156),
 ('Sol', 0.31361353397369385),
 ('Radiação eletromagnética', 0.30375868082046509),
 ('Cosmologia', 0.29916435480117798),
 ('Astrofísica', 0.29271420836448669),
 ('Sistema Solar', 0.28780707716941833),
 ('Planeta', 0.27009952068328857)]

### WMD

In [20]:
%time WMD_similarity.nearest_neighbors("Astronomia", early_stop=0.10)

2017-12-01 15:35:31,232 : INFO : Vocabulary size: 500 500
2017-12-01 15:35:31,240 : INFO : WCD
2017-12-01 15:35:35,671 : INFO : 4.4
2017-12-01 15:35:35,673 : INFO : First K WMD
2017-12-01 15:35:39,240 : INFO : [(-5.517649173736572, 'Ganímedes (satélite)'), (-5.402247428894043, 'Caronte (satélite)'), (-4.901270389556885, 'Netuno (planeta)'), (-5.1248860359191895, 'Planetologia'), (-5.09043025970459, '90377 Sedna'), (-4.854953289031982, 'Saturno (planeta)'), (-4.777352333068848, 'Cosmologia'), (-4.9420671463012695, 'Ceres (planeta anão)'), (-5.051176071166992, 'Astrometria'), (-4.984040260314941, 'Astrofísica')]
2017-12-01 15:35:39,242 : INFO : 3.6
2017-12-01 15:35:39,244 : INFO : P&P
2017-12-01 15:36:39,256 : INFO : 830 0.9373493975903614 5.1815619468688965 [(-4.824405193328857, 'Estrela binária'), (-4.804336071014404, 'Via Láctea'), (-4.804930686950684, 'Galáxia')] ['Estrela', 'Sistema Solar', 'Sol']
2017-12-01 15:36:46,073 : INFO : stopped by early_stop condition


CPU times: user 2min 11s, sys: 1.33 s, total: 2min 12s
Wall time: 1min 14s


[('Estrela', 4.394160747528076),
 ('Sol', 4.530681610107422),
 ('Planeta', 4.708134651184082),
 ('Mercúrio (planeta)', 4.722616195678711),
 ('Sistema Solar', 4.776794910430908),
 ('Cosmologia', 4.777352333068848),
 ('Universo', 4.778275489807129),
 ('Via Láctea', 4.804336071014404),
 ('Galáxia', 4.804930686950684),
 ('Estrela binária', 4.824405193328857)]

In [21]:
%time WMD_similarity.nearest_neighbors("Astronomia", early_stop=0.010)

2017-12-01 15:36:46,090 : INFO : Vocabulary size: 500 500
2017-12-01 15:36:46,092 : INFO : WCD
2017-12-01 15:36:50,406 : INFO : 4.3
2017-12-01 15:36:50,408 : INFO : First K WMD
2017-12-01 15:36:53,860 : INFO : [(-5.517649173736572, 'Ganímedes (satélite)'), (-5.402247428894043, 'Caronte (satélite)'), (-4.901270389556885, 'Netuno (planeta)'), (-5.1248860359191895, 'Planetologia'), (-5.09043025970459, '90377 Sedna'), (-4.854953289031982, 'Saturno (planeta)'), (-4.777352333068848, 'Cosmologia'), (-4.9420671463012695, 'Ceres (planeta anão)'), (-5.051176071166992, 'Astrometria'), (-4.984040260314941, 'Astrofísica')]
2017-12-01 15:36:53,861 : INFO : 3.5
2017-12-01 15:36:53,862 : INFO : P&P
2017-12-01 15:37:13,418 : INFO : stopped by early_stop condition


CPU times: user 42.6 s, sys: 372 ms, total: 43 s
Wall time: 27.3 s


[('Estrela', 4.394160747528076),
 ('Sol', 4.530681610107422),
 ('Planeta', 4.708134651184082),
 ('Mercúrio (planeta)', 4.722616195678711),
 ('Sistema Solar', 4.776794910430908),
 ('Cosmologia', 4.777352333068848),
 ('Universo', 4.778275489807129),
 ('Via Láctea', 4.804336071014404),
 ('Galáxia', 4.804930686950684),
 ('Estrela binária', 4.824405193328857)]

### Algoritmos

### BoW

In [23]:
%time sims = BoW_similarity[tfidf[corpus[17]]]
[(titles[sim[0]], sim[1]) for sim in sims]

CPU times: user 188 ms, sys: 0 ns, total: 188 ms
Wall time: 100 ms


[('Algoritmo', 1.0),
 ('Algoritmo de Kruskal', 0.4165273904800415),
 ('Algoritmo de Dijkstra', 0.38106128573417664),
 ('Algoritmo de Euclides', 0.37403565645217896),
 ('Quicksort', 0.3235933780670166),
 ('Programação de computadores', 0.31062531471252441),
 ('Algoritmo de Ford-Fulkerson', 0.2949603796005249),
 ('Alonzo Church', 0.27957868576049805),
 ('Problema do caixeiro-viajante', 0.27397683262825012),
 ('Heapsort', 0.27035114169120789)]

### WMD

In [24]:
%time WMD_similarity.nearest_neighbors("Algoritmo", early_stop=0.10)

2017-12-01 03:04:24,626 : INFO : Vocabulary size: 330 500
2017-12-01 03:04:24,628 : INFO : WCD
2017-12-01 03:04:28,629 : INFO : 4.0
2017-12-01 03:04:28,630 : INFO : First K WMD
2017-12-01 03:04:30,980 : INFO : [(-5.190297603607178, 'Lisp'), (-5.142288684844971, 'Computação quântica'), (-5.127190113067627, 'Criptografia'), (-4.930139541625977, 'Teoria da computação'), (-5.0168280601501465, 'Haskell (linguagem de programação)'), (-4.8569769859313965, 'Icon (linguagem de programação)'), (-4.888482093811035, 'Programação de computadores'), (-4.6164984703063965, 'Computador quântico'), (-4.8993449211120605, 'Compilador'), (-4.9773850440979, 'Análise numérica')]
2017-12-01 03:04:30,981 : INFO : 2.4
2017-12-01 03:04:30,982 : INFO : P&P
2017-12-01 03:05:31,598 : INFO : 670 0.7477611940298508 4.993443012237549 [(-4.955425262451172, 'Programa de computador'), (-4.930139541625977, 'Teoria da computação'), (-4.918553829193115, 'Núcleo (sistema operacional)')] ['Computador quântico', 'Compilador', 

CPU times: user 2min 23s, sys: 1.48 s, total: 2min 25s
Wall time: 1min 22s


[('Computador quântico', 4.6164984703063965),
 ('Ciência da computação', 4.795504570007324),
 ('Inteligência artificial', 4.839584827423096),
 ('Icon (linguagem de programação)', 4.8569769859313965),
 ('Programação de computadores', 4.888482093811035),
 ('Compilador', 4.8993449211120605),
 ('Núcleo (sistema operacional)', 4.918553829193115),
 ('Linguagem formal', 4.923614501953125),
 ('Teoria da computação', 4.930139541625977),
 ('Programa de computador', 4.955425262451172)]

### Austrália

### BoW

In [25]:
%time sims = BoW_similarity[tfidf[corpus[45]]]
[(titles[sim[0]], sim[1]) for sim in sims]

CPU times: user 220 ms, sys: 0 ns, total: 220 ms
Wall time: 112 ms


[('Austrália', 1.0000002384185791),
 ('Bandeira da Austrália', 0.4579404890537262),
 ('Oceania', 0.43976134061813354),
 ('Brisbane', 0.38360887765884399),
 ('Economia da Austrália', 0.36113718152046204),
 ('Perth (Austrália Ocidental)', 0.3513389527797699),
 ('Aborígenes australianos', 0.31155544519424438),
 ('Nauru', 0.29495945572853088),
 ('Ilhas Ashmore e Cartier', 0.27670067548751831),
 ('Melbourne', 0.27420958876609802)]

### WMD

In [26]:
%time WMD_similarity.nearest_neighbors("Austrália", early_stop=0.10)

2017-12-01 03:05:47,166 : INFO : Vocabulary size: 500 500
2017-12-01 03:05:47,167 : INFO : WCD
2017-12-01 03:05:51,675 : INFO : 4.5
2017-12-01 03:05:51,676 : INFO : First K WMD
2017-12-01 03:05:55,433 : INFO : [(-3.952953338623047, 'Tonga'), (-3.677894353866577, 'Filipinas'), (-3.7729878425598145, 'Singapura'), (-3.612440824508667, 'África do Sul'), (-3.6635212898254395, 'Japão'), (-3.4707348346710205, 'Canadá'), (-3.416084051132202, 'Malásia'), (-3.5614395141601562, 'Coreia do Sul'), (-3.2323861122131348, 'Nova Zelândia'), (-3.552936315536499, 'Tailândia')]
2017-12-01 03:05:55,434 : INFO : 3.8
2017-12-01 03:05:55,435 : INFO : P&P
2017-12-01 03:06:55,542 : INFO : 870 0.9551724137931035 4.720662593841553 [(-3.5622050762176514, 'Argentina'), (-3.5614395141601562, 'Coreia do Sul'), (-3.4707348346710205, 'Canadá')] ['Estados Unidos', 'Nova Zelândia', 'México']
2017-12-01 03:07:01,293 : INFO : stopped by early_stop condition


CPU times: user 2min 15s, sys: 1.58 s, total: 2min 17s
Wall time: 1min 14s


[('Nova Zelândia', 3.2323861122131348),
 ('Estados Unidos', 3.4053266048431396),
 ('Malásia', 3.416084051132202),
 ('Índia', 3.432809352874756),
 ('Canadá', 3.4707348346710205),
 ('México', 3.4919443130493164),
 ('Reino Unido', 3.5022168159484863),
 ('Tailândia', 3.552936315536499),
 ('Coreia do Sul', 3.5614395141601562),
 ('Argentina', 3.5622050762176514)]

### Nintendo

### BoW

In [27]:
%time sims = BoW_similarity[tfidf[corpus[6884]]]
[(titles[sim[0]], sim[1]) for sim in sims]

CPU times: user 184 ms, sys: 0 ns, total: 184 ms
Wall time: 90.3 ms


[('Nintendo', 0.99999994039535522),
 ('Sega', 0.56802898645401001),
 ('Sega Game Gear', 0.55226325988769531),
 ('Emulador', 0.45704707503318787),
 ('Sega Saturn', 0.4535183310508728),
 ('Dreamcast', 0.4030613899230957),
 ('Sega Master System', 0.29286766052246094),
 ('Jogo on-line', 0.23883216083049774),
 ('Jogo de computador', 0.23267152905464172),
 ('Darmstádtio', 0.18947595357894897)]

### WMD

In [28]:
%time WMD_similarity.nearest_neighbors("Nintendo", early_stop=0.10)

2017-12-01 03:07:01,412 : INFO : Vocabulary size: 500 500
2017-12-01 03:07:01,413 : INFO : WCD
2017-12-01 03:07:05,297 : INFO : 3.9
2017-12-01 03:07:05,298 : INFO : First K WMD
2017-12-01 03:07:08,159 : INFO : [(-6.0954060554504395, 'AmigaOS'), (-5.48569393157959, 'Berserk (mangá)'), (-5.741476058959961, 'Warcraft II'), (-5.061034679412842, 'Sega Game Gear'), (-5.149575710296631, 'Jogo de computador'), (-4.792379856109619, 'Emulador'), (-4.325271129608154, 'Sega Master System'), (-4.101327419281006, 'Sega Saturn'), (-3.9657070636749268, 'Sega'), (-4.925591468811035, 'Dreamcast')]
2017-12-01 03:07:08,160 : INFO : 2.9
2017-12-01 03:07:08,161 : INFO : P&P
2017-12-01 03:08:08,617 : INFO : 340 0.6617647058823529 6.167176723480225 [(-5.149575710296631, 'Jogo de computador'), (-5.124457836151123, 'Symbian'), (-5.147664546966553, 'Steve Jobs')] ['Sega Saturn', 'Sega', 'Dreamcast']
2017-12-01 03:09:04,645 : INFO : stopped by early_stop condition


CPU times: user 3min 18s, sys: 1.6 s, total: 3min 20s
Wall time: 2min 3s


[('Sega', 3.9657070636749268),
 ('Sega Saturn', 4.101327419281006),
 ('Sega Master System', 4.325271129608154),
 ('Emulador', 4.792379856109619),
 ('Dreamcast', 4.925591468811035),
 ('Computador pessoal', 5.026719570159912),
 ('Sega Game Gear', 5.061034679412842),
 ('Symbian', 5.124457836151123),
 ('Steve Jobs', 5.147664546966553),
 ('Jogo de computador', 5.149575710296631)]

### Administração

### BoW

In [29]:
%time sims = BoW_similarity[tfidf[corpus[75]]]
[(titles[sim[0]], sim[1]) for sim in sims]

CPU times: user 228 ms, sys: 0 ns, total: 228 ms
Wall time: 116 ms


[('Administração', 1.0000002384185791),
 ('Organização', 0.23896037042140961),
 ('Direito administrativo', 0.21482208371162415),
 ('Economia', 0.20210880041122437),
 ('Teoria', 0.19930656254291534),
 ('Ciência', 0.17526255548000336),
 ('Objetivo', 0.16347302496433258),
 ('Max Horkheimer', 0.14986594021320343),
 ('Sistema de informação', 0.14733175933361053),
 ('Teoria das cordas', 0.14614959061145782)]

### WMD

In [30]:
%time WMD_similarity.nearest_neighbors("Administração", early_stop=0.10)

2017-12-01 03:09:04,820 : INFO : Vocabulary size: 490 500
2017-12-01 03:09:04,821 : INFO : WCD
2017-12-01 03:09:10,454 : INFO : 5.6
2017-12-01 03:09:10,456 : INFO : First K WMD
2017-12-01 03:09:14,938 : INFO : [(-5.5233612060546875, 'Invéxis'), (-5.309115886688232, 'Zona de desenvolvimento proximal'), (-5.093391418457031, 'Teletrabalho'), (-5.1121368408203125, 'Inteligência empresarial'), (-5.062416076660156, 'Sistema integrado de gestão empresarial'), (-4.8951897621154785, 'Comunicação'), (-4.942392349243164, 'Logosofia'), (-4.936980724334717, 'Responsabilidade social'), (-4.9502949714660645, 'Seis Sigma'), (-4.99066686630249, 'Organização')]
2017-12-01 03:09:14,939 : INFO : 4.5
2017-12-01 03:09:14,940 : INFO : P&P
2017-12-01 03:10:17,685 : INFO : 220 0.4636363636363636 4.673911094665527 [(-4.936980724334717, 'Responsabilidade social'), (-4.930944919586182, 'Política'), (-4.904395580291748, 'Gestalt')] ['Método científico', 'Arquivística', 'Sistema de informação']
2017-12-01 03:11:17,

CPU times: user 4min 12s, sys: 2.53 s, total: 4min 14s
Wall time: 2min 45s


[('Economia', 4.76804256439209),
 ('Arquivística', 4.788609504699707),
 ('Ciência', 4.793127059936523),
 ('Sistema de informação', 4.858047962188721),
 ('Método científico', 4.862987518310547),
 ('Psicologia', 4.871213436126709),
 ('Sociologia', 4.894178867340088),
 ('Comunicação', 4.8951897621154785),
 ('Gestalt', 4.904395580291748),
 ('Política', 4.930944919586182)]