# Approximate NN with `PySpaRNN` (for sparse data)
This tutorial will show you how to use the `PySpaRNNIndexer` object for approximate nearest neighbour search.

In [1]:
import os
import numpy as np

import gensim
from gensim.similarities.index import PySpaRNNIndexer
from gensim.models.word2vec import Word2Vec

# How to use `PySpaRNNIndexer`
Let's prepare some data first: For this we will load text data from the Lee Corpus.

In [2]:
# This is adapted from /docs/notebooks/annoytutorial.ipynb
# Path to 'lee_background.cor' under the installed gensim package's test/test_data directory.
training_file = os.path.join(gensim.__path__[0], 'test', 'test_data', 'lee_background.cor')

class MyText(object):
    """Re-iterable stream of tokenised documents read from `training_file`.

    Every call to `__iter__` re-opens the file, so the object can be iterated
    more than once. Each yielded item is the result of
    `gensim.utils.simple_preprocess` applied to one line of the file.
    """
    def __iter__(self):
        # The context manager guarantees the handle is closed once the pass
        # over the file ends, instead of leaking one open file per iteration.
        with open(training_file) as corpus_file:
            for line in corpus_file:
                # Assume there's one document per line, tokens separated by whitespace
                yield gensim.utils.simple_preprocess(line)

# Lazily streamed corpus: MyText re-opens the training file on every pass.
sentences = MyText()
# Train Word2Vec on the stream with min_count = 1 (presumably so that no token is
# dropped for being rare -- confirm against the gensim Word2Vec documentation).
model = Word2Vec(sentences, min_count = 1)
# NOTE(review): presumably populates model.wv.syn0norm, which the cells below read --
# confirm for the gensim version in use.
model.init_sims()

In [3]:
# Build a PySpaRNN-backed indexer for the trained model, configured with num_clusters = 2.
pysparnn_index = PySpaRNNIndexer(model, num_clusters = 2)

In [35]:
# Pick one row of the model's syn0norm array at random to serve as the query vector
word_vectors = model.wv.syn0norm
K = len(word_vectors)
vector = word_vectors[np.random.randint(0, K)]

Now we can query for data points in proximity using `pysparnn` as the indexer. For example, let's try to find the ten nearest neighbours of our (randomly drawn) vector.

In [36]:
# Ask for the 10 most similar entries to `vector`, routing the search through pysparnn_index.
model.most_similar([vector], topn = 10, indexer = pysparnn_index)

[('funds', 1.000000238419),
 ('beyond', 0.99431658),
 ('top', 0.99428999),
 ('continuing', 0.99417728),
 ('month', 0.99401522),
 ('crash', 0.99400979),
 ('plans', 0.99398941),
 ('strip', 0.99397492),
 ('laws', 0.99395961),
 ('peter', 0.9939574)]

Let's compare it with Gensim's default indexer...

In [37]:
# Ask for the 10 most similar entries to `vector` without passing an `indexer`.
model.most_similar([vector], topn = 10)

[('funds', 1.0),
 ('beyond', 0.9943166375160217),
 ('top', 0.9942900538444519),
 ('continuing', 0.9941773414611816),
 ('month', 0.9940152168273926),
 ('crash', 0.994009792804718),
 ('plans', 0.9939892292022705),
 ('strip', 0.9939751625061035),
 ('laws', 0.9939596056938171),
 ('peter', 0.9939572811126709)]

# A first benchmark
If we benchmark `PySpaRNNIndexer` against the default indexer, we find that it is much slower. This is because it was developed primarily for sparse data. More on this below!

In [7]:
# Time the top-10 query without an `indexer` argument.
%timeit model.most_similar([vector], topn = 10)

1000 loops, best of 3: 589 µs per loop


In [8]:
# Time the same top-10 query when it is routed through pysparnn_index.
%timeit model.most_similar([vector], topn = 10, indexer = pysparnn_index)

10 loops, best of 3: 32.9 ms per loop


# Benchmarking NN implementations (on artificial data)
We will now benchmark PySpaRNN's performance alongside `Annoy` (another library for approximate NN) and a brute–force exact nearest neighbour search using `scikit-learn`.

In [9]:
from scipy.sparse import csr_matrix

from annoy import AnnoyIndex
from pysparnn.cluster_index import MultiClusterIndex
from sklearn.neighbors import NearestNeighbors

## Generate some data

In [10]:
# Number of vectors
N = 1000
# Number of features
# NOTE(review): this rebinds K, which the Word2Vec section above set to
# len(model.wv.syn0norm); the two values mean different things.
K = 50000
# Sparsity parameters
# f is the fraction of the K features drawn per vector; M is that count rounded up.
# Draws can repeat, so a vector ends up with at most M non-zero entries.
f = 0.0005
M = int(np.ceil(f * K))

In [11]:
# Hallucinate a corpus: each of the N documents is a list of M (feature id, weight)
# pairs, with ids drawn by np.random.randint(0, K) and weights by np.random.normal
corpus = [
    list(zip(np.random.randint(0, K, size = M), np.random.normal(size = M)))
    for doc_id in range(N)
]

# Build the feature matrix (dense, N x K) and give every row its own number as label
labels = np.arange(0, N, 1)
vectors = np.zeros((N, K))
for row, document in enumerate(corpus):
    for column, weight in document:
        # A feature id that was drawn twice for one document keeps the later weight
        vectors[row, column] = weight

# Randomly draw some sample vectors to predict on
indices = np.random.randint(N, size = 100)

## Benchmarking `Annoy`

In [12]:
# Train the Annoy model
# The metric is passed explicitly: 'angular' is Annoy's documented default, and
# newer Annoy releases warn when it is left implicit.
a = AnnoyIndex(K, metric = 'angular')
# The loop variable is deliberately not called `vector`: that name holds the
# Word2Vec query vector used by the earlier most_similar / %timeit cells, and
# overwriting it here would break those cells when they are re-run.
for i, row in enumerate(vectors):
    a.add_item(i, row)
n_trees = 300
a.build(n_trees)

True

In [13]:
def test_annoy(model, indices, k = 3):

    for i in indices:
        dense = vectors[i]
        sparse = csr_matrix(dense)
        model.get_nns_by_vector(dense, k)

In [14]:
# Time one full pass of test_annoy: one query (default k = 3) per entry of `indices`.
print('Benchmarking annoy:')
%timeit test_annoy(a, indices)

Benchmarking annoy:
1 loop, best of 3: 5.18 s per loop


## Benchmarking `PySpaRNN`

In [15]:
# Train the PySpaRNN model
# The dense feature matrix is converted to CSR before indexing. `labels` holds the row
# numbers 0..N-1 and is passed as the second argument -- presumably the identifiers
# that searches return; confirm against the PySpaRNN documentation.
b = MultiClusterIndex(csr_matrix(vectors), labels)

In [16]:
def test_pysparnn(model, indices, k = 3):

    for i in indices:
        dense = vectors[i]
        sparse = csr_matrix(dense)
        model.search(sparse, k = k, return_distance = False)

In [17]:
# Time one full pass of test_pysparnn: one search (default k = 3) per entry of `indices`.
print('Benchmarking PySpaRNN:')
%timeit test_pysparnn(b, indices)

Benchmarking PySpaRNN:
1 loop, best of 3: 721 ms per loop


## Benchmarking `scikit-learn`'s brute–force (exact) NN

In [18]:
# Train the (brute-force) scikit-learn exact NN search
# n_neighbors = 3 matches the default k of the benchmark functions in this notebook.
c = NearestNeighbors(n_neighbors = 3, algorithm = 'brute')
c.fit(vectors)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [19]:
def test_sklearn(model, indices, k = 3):

    for i in indices:
        dense = vectors[i]
        sparse = csr_matrix(dense)
        model.kneighbors(dense.reshape(1, -1), n_neighbors = k, return_distance = False)

In [20]:
# Time one full pass of test_sklearn: one query (default k = 3) per entry of `indices`.
print('Benchmarking sklearn:')
%timeit test_sklearn(c, indices)

Benchmarking sklearn:
1 loop, best of 3: 18.8 s per loop
