In [1]:
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords

import json
import numpy as np
import os
import re
import string
import sys

sys.path.append(os.path.abspath('../wrapper'))
import fastwmd

# Initialize logging.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

# Declare variables

In [2]:
articles_path = "../dataset/wikipedia/AA/wiki_00"
embeddings_path = "../dataset/wikipedia/glove_s300.txt"

# Preprocessing

In [3]:
# Remove stopwords.
stop_words = stopwords.words('portuguese')

# String Ponctuation
punctuation = string.punctuation
numbers = "0123456789"

# Regex for removing all non-alphabet letters or spaces
regex = re.compile('([' + punctuation + numbers + ']|\n|\t)')

def preprocessing(document):
    # Remove all non-alphabet letters or spaces
    processed_document = regex.sub(' ', document)

    # Remove extra spaces and lower
    processed_document = [token.lower()
                          for token in processed_document.split(' ')
                          if token and token not in stop_words]
    
    return processed_document

# Get Corpora

In [4]:
def load_corpora(corpora_path, token_min, corpora_limit):
    documents = {}
    with open(corpora_path, mode="r", encoding="utf-8") as fp:
        for line in fp:
            # Parse Json
            json_document = json.loads(line)
            
            # Get information needed
            title, text = json_document["title"], json_document["text"]
            
            # Apply preprocessing
            document = preprocessing(text)

            # Add
            if len(document) >= token_min:
                documents[title] = document
            if len(documents) >= corpora_limit:
                break
    return documents

# Load data

## Articles

In [5]:
%time articles = load_corpora(articles_path, 100, 10001)

CPU times: user 49.6 s, sys: 463 ms, total: 50 s
Wall time: 50.1 s


### Mean of words per article

In [6]:
sum([len(text) for text in articles.values()])/len(articles)

1001.5756989247312

## Embeddings

In [7]:
%time embeddings = fastwmd.Embeddings(embeddings_path, 30000)

CPU times: user 2.86 s, sys: 56 ms, total: 2.92 s
Wall time: 2.91 s


## Related words

In [8]:
%time cache = fastwmd.RelatedWords(embeddings, 16)

CPU times: user 1min 51s, sys: 476 ms, total: 1min 52s
Wall time: 1min 51s


# Experiments

## Dictionary

In [9]:
%time dictionary = Dictionary(articles.values(), prune_at=30000)

CPU times: user 13 s, sys: 28 ms, total: 13 s
Wall time: 13 s


## BoW

In [10]:
%time corpus = [dictionary.doc2bow(text) for text in articles.values()]

CPU times: user 7.3 s, sys: 156 ms, total: 7.46 s
Wall time: 7.46 s


## WMD

### Embedding index

In [11]:
token2idx = {token:index
             for index, token in enumerate(embeddings.get_tokens())}

### Dictionary index to Embedding index

In [12]:
idx2idx = {dict_idx:token2idx[token]
           for dict_idx, token in dictionary.items()
           if token in token2idx}

### Convert from text to nbow

In [13]:
def get_nbow(document):
    title, text = document
    
    # Generate BoW
    bow = dictionary.doc2bow(text)
    
    # Convert index from dictionary to index from embedding
    bow = {idx2idx[idx]: count
            for idx, count in bow
            if idx in idx2idx}
    
    # Get indices
    indices = list(bow.keys())
    
    # Get and normalize frequencies
    counters = np.array(list(bow.values()), dtype=np.float32)
    counters = counters/counters.sum()
    
    #return (title, (title, indices, counters))
    nbow = [(index, freq) for index, freq in zip(indices, counters)]
    nbow = sorted(nbow, key=lambda x:x[0])
    return (title, nbow)

In [14]:
nbows = dict([get_nbow(article) for article in articles.items()])

### Instantiate

In [15]:
title1 = 'Brasil'
title2 = 'Alemanha'

### WMD

In [16]:
%time wmd = fastwmd.WMD(embeddings)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 10 µs


In [17]:
%time wmd.compute_distance(nbows[title1], nbows[title2])

CPU times: user 1min 18s, sys: 637 ms, total: 1min 19s
Wall time: 1min 18s


3.700040102005005

### RWMD

In [18]:
%time rwmd = fastwmd.RWMD(embeddings)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 6.44 µs


In [19]:
%time rwmd.compute_distance(nbows[title1], nbows[title2])

CPU times: user 176 ms, sys: 20 ms, total: 196 ms
Wall time: 196 ms


3.326050043106079

### Rel-WMD

In [20]:
%time rel_wmd = fastwmd.RelWMD(cache)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 9.3 µs


In [21]:
%time rel_wmd.compute_distance(nbows[title1], nbows[title2])

CPU times: user 83.2 ms, sys: 24 µs, total: 83.2 ms
Wall time: 83.8 ms


3.932708263397217

### Rel-RWMD

In [22]:
%time rel_rwmd = fastwmd.RelRWMD(cache)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 6.2 µs


In [23]:
%time rel_rwmd.compute_distance(nbows[title1], nbows[title2])

CPU times: user 9.63 ms, sys: 0 ns, total: 9.63 ms
Wall time: 9.59 ms


3.4973998069763184

## Tests RWMD x Rel-RWMD

### Some articles available

In [24]:
titles = [(key, len(doc)) for key, doc in nbows.items()]

In [25]:
list(enumerate(titles[:20]))

[(0, ('Astronomia', 858)),
 (1, ('América Latina', 1962)),
 (2, ('Albino Forjaz de Sampaio', 416)),
 (3, ('Anno Domini', 520)),
 (4, ('Aquiles', 1296)),
 (5, ('Anarcocapitalismo', 1278)),
 (6, ('Anarquismo', 2564)),
 (7, ('Albert Einstein', 2445)),
 (8, ('Aquecimento global', 3420)),
 (9, ('Adriano', 1336)),
 (10, ('Alexandre (nome)', 99)),
 (11, ('Afonso, Príncipe de Portugal (1475–1491)', 85)),
 (12, ('A Ideia Perigosa de Darwin', 144)),
 (13, ('Agricultura', 390)),
 (14, ('Afeganistão', 981)),
 (15, ('História do Afeganistão', 1550)),
 (16, ('Argentina', 2536)),
 (17, ('Algoritmo', 837)),
 (18, ('Ananás', 679)),
 (19, ('Angola', 2059))]

In [26]:
nbows1 = [nbows['Algoritmo']]
nbows2 = list(nbows.values())
%time distances = rwmd.compute_distances(nbows1, nbows2[:1000])

CPU times: user 22.8 s, sys: 212 ms, total: 23 s
Wall time: 22.9 s


In [27]:
standard_rank = [(title, distance) for title, distance in zip(list(nbows.keys()), distances[0])]
standard_rank = sorted(standard_rank, key=lambda x: x[1])

In [30]:
%time hashed_distances = rel_rwmd.compute_distances(nbows1, nbows2[:1000])

CPU times: user 2.42 s, sys: 8.08 ms, total: 2.42 s
Wall time: 2.41 s


In [31]:
hashed_rank = [(title, distance) for title, distance in zip(list(nbows.keys()), hashed_distances[0])]
hashed_rank = sorted(hashed_rank, key=lambda x: x[1])

In [32]:
rank_template = "\t{0}\t{1:.2f}"
for std_rank, hash_rank in zip(standard_rank, hashed_rank):
    print(std_rank[0]==hash_rank[0], rank_template.format(*std_rank), rank_template.format(*hash_rank))

True 	Algoritmo	0.00 	Algoritmo	0.00
True 	Compilador	3.57 	Compilador	3.89
True 	Núcleo (sistema operacional)	4.05 	Núcleo (sistema operacional)	4.31
False 	Programa de computador	4.12 	Sistema operativo	4.44
True 	Computador quântico	4.16 	Computador quântico	4.55
False 	Sistema operativo	4.17 	C (linguagem de programação)	4.55
False 	C (linguagem de programação)	4.18 	Rede neural artificial	4.60
False 	Software	4.29 	Programa de computador	4.63
False 	Rede neural artificial	4.29 	Linguagem formal	4.72
False 	Linguagem formal	4.33 	Criptografia	4.74
False 	Programação de computadores	4.35 	Comunicação	4.76
False 	Multitarefa	4.36 	Lógica	4.77
False 	Criptografia	4.41 	Software	4.78
False 	Comunicação	4.44 	Peer-to-peer	4.78
True 	Expressão regular	4.45 	Expressão regular	4.80
False 	Perl	4.45 	Multitarefa	4.83
False 	Lógica	4.45 	Programação de computadores	4.84
False 	Peer-to-peer	4.46 	Termodinâmica	4.84
False 	Ciência	4.46 	TCP/IP	4.85
False 	Termodinâmica	4.47 	Unix	4.85
True 	Ci