# Example tutorial

## Get data

In [1]:
import numpy as np

# Problem sizes for the synthetic tutorial data.
d = 64        # vector dimensionality
nb = 100000   # number of database vectors
nq = 10000    # number of query vectors

# Fixed seed: the database is drawn first, then the queries, so the
# random stream (and therefore every printed result) is reproducible.
np.random.seed(1234)

# Database vectors, with a per-row ramp added to the first coordinate.
xb = np.random.random(size=(nb, d)).astype(np.float32)
xb[:, 0] += np.arange(nb) / 1e3

# Query vectors, built the same way.
xq = np.random.random(size=(nq, d)).astype(np.float32)
xq[:, 0] += np.arange(nq) / 1e3

In [2]:
# Display the database matrix shape; expected to be (nb, d).
xb.shape

(100000, 64)

## Building an index and adding the vectors to it

In [3]:
# Build an IndexFlatL2 over the d-dimensional synthetic vectors and load xb into it.
import faiss                   # make faiss available
index = faiss.IndexFlatL2(d)   # build the index
print(index.is_trained)        # prints True: no train() call is made before add()
index.add(xb)                  # add vectors to the index
print(index.ntotal)            # number of vectors now stored (nb)

True
100000


## Searching

In [4]:
k = 4                          # we want to see 4 nearest neighbors
# Sanity check: query with the first 5 database vectors themselves, so each
# row's first neighbor should be its own id with distance 0.
D, I = index.search(xb[:5], k) # sanity check
print(I)                       # neighbor ids, one row per query
print(D)                       # matching distances, one row per query

[[  0 393 363  78]
 [  1 555 277 364]
 [  2 304 101  13]
 [  3 173  18 182]
 [  4 288 370 531]]
[[0.        7.1751738 7.20763   7.2511625]
 [0.        6.3235645 6.684581  6.799946 ]
 [0.        5.7964087 6.391736  7.2815123]
 [0.        7.2779055 7.527987  7.6628466]
 [0.        6.7638035 7.2951202 7.3688145]]


In [5]:
# Search with the real query set; D and I from the sanity check are overwritten.
D, I = index.search(xq, k)     # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries

[[ 381  207  210  477]
 [ 526  911  142   72]
 [ 838  527 1290  425]
 [ 196  184  164  359]
 [ 526  377  120  425]]
[[ 9900 10500  9309  9831]
 [11055 10895 10812 11321]
 [11353 11103 10164  9787]
 [10571 10664 10632  9638]
 [ 9628  9554 10036  9582]]


# OGB analysis

In [6]:
import numpy as np
import pandas as pd
from ogb.lsc import WikiKG90Mv2Dataset
import sys
import os

In [None]:
# Root directory of the OGB-LSC WikiKG90Mv2 data. Set the OGB_ROOT environment
# variable to point elsewhere; the fallback is the original hard-coded location.
rootdir = os.environ.get('OGB_ROOT', '/db2/users/minjunpark/ogb/rawdata')
dataset = WikiKG90Mv2Dataset(root=rootdir)

In [9]:
# Print the headline dataset statistics, then grab the entity feature matrix.
print(dataset.num_entities) # number of entities --> 91230610
print(dataset.num_relations) # number of relation types --> 1387
print(dataset.num_feat_dims) # dimensionality of entity/relation features --> 768

entity_feat = dataset.entity_feat # np.array of shape (91230610, 768)

91230610
1387
768


In [20]:
# Convert the whole feature matrix to float32 and rebind entity_feat to the result
# (presumably because faiss is fed float32, as in the tutorial cells above).
# NOTE(review): 91230610 x 768 float32 values is roughly 280 GB in memory; confirm
# the host can hold it before re-running.
entity_feat = entity_feat.astype('float32')
entity_feat.shape

(91230610, 768)

In [58]:
# Persist the float32 feature matrix to the working directory (np.save appends '.npy').
np.save('entity_feat_float32', entity_feat)

In [21]:
# Take the first 100 entity vectors as a small query set for trial searches.
entity_feat_subs = entity_feat[:100, :].astype('float32')
entity_feat_subs.shape

(100, 768)

In [55]:
# Save the 100-row sample; the double underscore in the file name matches the np.load call below.
np.save('entity_feat__subs_float32', entity_feat_subs)

In [57]:
# Display the in-memory sample for comparison with the copy reloaded from disk.
entity_feat_subs

array([[ 0.08996582,  0.10931396,  0.08935547, ...,  0.03344727,
        -0.07281494, -0.03805542],
       [-0.12127686, -0.10900879,  0.02449036, ...,  0.05871582,
         0.1899414 , -0.08221436],
       [-0.12127686, -0.10900879,  0.02449036, ...,  0.05871582,
         0.1899414 , -0.08221436],
       ...,
       [-0.04736328, -0.20275879,  0.07397461, ..., -0.10705566,
         0.00597763,  0.09545898],
       [-0.12127686, -0.10900879,  0.02449036, ...,  0.05871582,
         0.1899414 , -0.08221436],
       [-0.05117798, -0.10369873,  0.02655029, ...,  0.0165863 ,
         0.02526855, -0.06860352]], dtype=float32)

In [56]:
# Round-trip check: reload the saved sample and display it.
np.load('entity_feat__subs_float32.npy')

array([[ 0.08996582,  0.10931396,  0.08935547, ...,  0.03344727,
        -0.07281494, -0.03805542],
       [-0.12127686, -0.10900879,  0.02449036, ...,  0.05871582,
         0.1899414 , -0.08221436],
       [-0.12127686, -0.10900879,  0.02449036, ...,  0.05871582,
         0.1899414 , -0.08221436],
       ...,
       [-0.04736328, -0.20275879,  0.07397461, ..., -0.10705566,
         0.00597763,  0.09545898],
       [-0.12127686, -0.10900879,  0.02449036, ...,  0.05871582,
         0.1899414 , -0.08221436],
       [-0.05117798, -0.10369873,  0.02655029, ...,  0.0165863 ,
         0.02526855, -0.06860352]], dtype=float32)

# Build index

In [22]:
# Build an IndexFlatL2 over the full entity feature matrix and time the whole step.
# This rebinds `d` and `index`, replacing the values set by the tutorial cells.
import time

start_time = time.time()


# Feature dimensionality; same value as the num_feat_dims printed earlier.
d = 768 

index = faiss.IndexFlatL2(d)   # build the index
print(index.is_trained)
index.add(entity_feat)                  # add vectors to the index
print(index.ntotal)            # should equal the number of rows in entity_feat

# Wall-clock time for index construction plus the add() call.
print("--- %s seconds ---" % (time.time() - start_time))

True
91230610
--- 128.20114755630493 seconds ---


# Search

In [23]:
# Time a top-4 search over the full index using the entity_feat_subs sample as queries.
start_time = time.time()

k = 4 # we want to see 4 nearest neighbors
D, I = index.search(entity_feat_subs, k) # sanity check

print("--- %s seconds ---" % (time.time() - start_time))

--- 116.28497314453125 seconds ---


In [34]:
# Save the neighbor-id matrix from the most recent search. Going by the execution
# counts this ran after the k = 10 search, which matches the 10 ids per row shown
# when the file is reloaded.
np.save('index_candidates', I)

In [35]:
# Round-trip check: reload the saved neighbor ids and display them.
np.load('index_candidates.npy')

array([[       0, 50424036, 69118298, 64142503, 25739129, 26710876,
        59540621, 16595952, 51530979, 60240973],
       [       1, 20761291, 21832273,  5334315,  7180347, 40892943,
        43315237,  5986400, 41523041, 25190690],
       [       2, 13761364, 62502943,  7181879,  7507149, 52509816,
         7497860,  7500037,  7182781,  7500222],
       [       3, 45345864, 24142363, 76594045, 13287222, 40204954,
        77718369, 76615506, 27063636, 27063614],
       [       4, 19828401, 13260696, 10064833, 11737469, 10064834,
        11615307, 10064832, 10064835, 11704366],
       [       5,  5317159,  5321756, 39477755, 61033440, 21881780,
        82743303, 18840279, 22707960, 82725938],
       [       6,  8022475, 76174846, 44260192, 17123410, 65128343,
         4907180, 90240419, 22578522, 83065650],
       [       7, 59716253, 46444299, 91230602, 46442069,    67710,
        46442420, 91230606, 20575460, 54788665],
       [       8, 91230388, 91229719, 91230474, 91230439,   9174

In [25]:
# Time a top-10 search over the full index using the entity_feat_subs sample as queries.
start_time = time.time()

# Search
k = 10                          # we want to see 10 nearest neighbors
D, I = index.search(entity_feat_subs, k) # top-k ids (I) and distances (D) per query row

print("--- %s seconds ---" % (time.time() - start_time))

--- 191.92717385292053 seconds ---


In [46]:
# Validation dataset
valid_task = dataset.valid_dict['h,r->t'] # get a dictionary storing the h,r->t task.
hr = valid_task['hr']
h = hr[:,0]
t = valid_task['t']

In [54]:
# Gather the feature vector of every validation head entity, one row per entry of h.
# Index with the 1-D id array directly: wrapping the ids in a nested list
# ([[...]]) is read by NumPy >= 1.23 as a 2-D array index and yields a
# (1, N, 768) array instead of the (N, 768) query matrix a 2-D search needs.
entity_feat_subs = entity_feat[h]

In [None]:
def get_candidates(K, entity_feat_subs, name):
    """Search the notebook-global ``index`` and save the top-``K`` results.

    Parameters
    ----------
    K : int
        Number of nearest neighbors to retrieve for each query row.
    entity_feat_subs : array-like
        Query vectors, passed unchanged to ``index.search``.
    name : str
        Tag embedded in the output file names.

    Side effects
    ------------
    Writes ``I_{name}_candidates.npy`` (neighbor ids) and
    ``D_{name}_candidates.npy`` (distances) to the current working directory.

    Returns
    -------
    None
    """
    # Use the K argument. The previous version passed the notebook-global
    # lowercase `k`, so the requested candidate count was silently ignored.
    D, I = index.search(entity_feat_subs, K)
    np.save(f'I_{name}_candidates', I)
    np.save(f'D_{name}_candidates', D)
    return None

In [None]:
# Generate and save candidate neighbors for the validation head entities.
K = 1000        # number of candidates requested per query
name = 'valid'  # tag used in the saved I_/D_ file names
get_candidates(K, entity_feat_subs, name)