In [74]:
from sklearn.neighbors.kde import KernelDensity
import numpy as np
from scipy.stats import ks_2samp

import numpy as np

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

# Data

In [41]:
f_key = [102, 102, 102, 102, 102, 102, 107, 107, 107, 106, 106, 106, 106, 106, 106, 106, 106, 107, 107, 107, 107, 106, 106, 107, 107, 107, 106]
f2_key = [x+5 for x in f_key]
f_org = [149,149,149,149,149,149,149,149,149,199,199,199,199,199,199,199,199,149,149,149,149,199,199,149,149]
area = [677.21,1700.45,1060.68,180.67,97.4,89.63,636.2,338.33,173.35,957.59,172.157,3.38,1236.30,1010.41,245.94,319.9,851.13,9.72,17.11]
key_m = KernelDensity(kernel="gaussian", bandwidth=0.2)
key_np = np.asarray(f_key).reshape(-1,1)
key2_m = KernelDensity(kernel="gaussian", bandwidth=0.2)
key2_np = np.asarray(f2_key).reshape(-1,1)
org_m = KernelDensity(kernel="gaussian", bandwidth=0.2)
org_np = np.asarray(f_org).reshape(-1,1)
area_m = KernelDensity(kernel="gaussian", bandwidth=0.2)
area_np= np.asarray(area).reshape(-1,1)

In [42]:
kde_key = key_m.fit(key_np)
kde_key2 = key2_m.fit(key2_np)
kde_org = org_m.fit(org_np)
kde_area = area_m.fit(area_np)

## compare f_key and f_org

In [43]:
key_samples = [kde_key.sample()[0][0] for x in range(10)]
key2_samples= [kde_key2.sample()[0][0] for x in range(10)]
org_samples = [kde_org.sample()[0][0] for x in range(10)]
area_samples= [kde_area.sample()[0][0] for x in range(10)]

In [44]:
ks_key_key = ks_2samp(key_samples, key_samples)
ks_key_key2 = ks_2samp(key_samples, key2_samples)
ks_key_org = ks_2samp(key_samples, org_samples)
ks_key_area= ks_2samp(key_samples, area_samples)
dvalue = 0.5
pvalue = 0.001

In [45]:
dvalue_key_key = ks_key_key[0]
pvalue_key_key =ks_key_key[1]
if dvalue_key_key < dvalue and pvalue_key_key > pvalue:
    print("key equals key")
else:
    print("key diff than key")

key equals key


In [46]:
dvalue_key_key2 = ks_key_key2[0]
pvalue_key_key2 =ks_key_key2[1]
if dvalue_key_key2 < dvalue and pvalue_key_key2 > pvalue:
    print("key equals key2")
else:
    print("key diff than key2")

key diff than key2


In [47]:
dvalue_key_org = ks_key_org[0]
pvalue_key_org =ks_key_org[1]
if dvalue_key_org < dvalue and pvalue_key_org > pvalue:
    print("key equals org")
else:
    print("key diff than org")

key diff than org


In [48]:
dvalue_key_area = ks_key_area[0]
pvalue_key_area =ks_key_area[1]
if dvalue_key_area < dvalue and pvalue_key_area > pvalue:
    print("key equals area")
else:
    print("key diff than area")

key diff than area


# Non-statistical distance

### Euclidean

In [49]:
d_key_key = np.linalg.norm(np.array(key_samples)-np.array(key_samples))
d_key_key

0.0

In [50]:
d_key_key2 = np.linalg.norm(np.array(key_samples)-np.array(key2_samples))
d_key_key2

19.167475270128165

In [51]:
d_key_org = np.linalg.norm(np.array(key_samples)-np.array(org_samples))
d_key_org

199.85559977670724

In [52]:
d_key_area = np.linalg.norm(np.array(key_samples)-np.array(area_samples))
d_key_area

1817.3793185243383

In [53]:
d_org_area = np.linalg.norm(np.array(org_samples)-np.array(area_samples))
d_org_area

1715.6691436679548

### Manhattan

In [54]:
def manhattan_distance(x, y):
    return sum(abs(a-b) for a,b in zip(x,y))

In [55]:
d_key_key = manhattan_distance(key_samples, key_samples)
d_key_key

0.0

In [56]:
d_key_key2 = manhattan_distance(key_samples, key2_samples)
d_key_key2

55.53150147402944

In [57]:
d_key_org = manhattan_distance(key_samples, org_samples)
d_key_org

587.97534562809187

In [58]:
d_key_area = manhattan_distance(key_samples, area_samples)
d_key_area

3969.0113723640361

# LSH aproximate NN

In [71]:
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.distances import EuclideanDistance

In [72]:
dimension = 10
rbp = RandomBinaryProjections('rbp', 10)
engine = Engine(dimension, lshashes=[rbp], distance=EuclideanDistance())

In [73]:
engine.distance

<nearpy.distances.euclidean.EuclideanDistance at 0x10cc1c9b0>

In [96]:
engine.clean_all_buckets()
engine.store_vector(np.array(key_samples), "key-key")
engine.store_vector(np.array(key2_samples), "key-key2")
engine.store_vector(np.array(org_samples), "key-org")
engine.store_vector(np.array(area_samples), "key-area")

In [103]:
query = [102, 102, 104, 106, 106, 102, 104, 105, 106, 107]
N = engine.neighbours(np.array(query))

In [106]:
labels = []
for el in N:
    (data, label, value) = el
    labels.append(label)
print(str(labels))

['key-key2', 'key-key']
