In [1]:
import pandas as pd
import numpy as np

import faiss

import fasttext
import datasketch
from tools.embedding import FastTextTableEmbedder
from tools.utils.utils import get_mongodb_collections, get_one_document_from_mongodb_by_key
from tools.utils.settings import DefaultPath as dp

In [3]:
tabemb = FastTextTableEmbedder(dp.model_path.fasttext + '/cc.en.300.bin')



In [4]:
d = 300
n_bits = 2 * d

In [5]:
mongoclient, collections = get_mongodb_collections(size='standard')

In [None]:
xb = np.empty(shape=(0, d + 1))
for i in range(1000):
    doc = get_one_document_from_mongodb_by_key('_id_numeric', i, *collections)
    colemb = tabemb.embedding_table(doc['content'], doc['numeric_columns'], [])
    ids = np.expand_dims(np.repeat([i], colemb.shape[0]), axis=0)
    colemb = np.concatenate((ids.T, colemb), axis=1)
    xb = np.concatenate((xb, colemb), axis=0)

In [None]:
xq = np.empty(shape=(0, d + 1))
for i in range(110, 120):
    doc = get_one_document_from_mongodb_by_key('_id_numeric', i, *collections)
    colemb = tabemb.embedding_table(doc['content'], doc['numeric_columns'], [])
    ids = np.expand_dims(np.repeat([i], colemb.shape[0]), axis=0)
    colemb = np.concatenate((ids.T, colemb), axis=1)
    xq = np.concatenate((xq, colemb), axis=0)

In [7]:
xb_ids = np.empty(shape=(0, 1))
xb_ids

array([], shape=(0, 1), dtype=float64)

In [16]:
ids = np.expand_dims(np.repeat([1], 2), axis=0)
ids

array([[1, 1]])

In [12]:
ids.shape

(1, 3)

In [17]:
xb_ids = np.concatenate((xb_ids, ids.T))
xb_ids

array([[0.],
       [0.],
       [0.],
       [1.],
       [1.]])

In [18]:
xb = np.empty(shape=(0, d))
xb_ids = np.empty(shape=(0, 1))
for i in range(1000):
    doc = get_one_document_from_mongodb_by_key('_id_numeric', i, *collections)
    colemb = tabemb.embedding_table(doc['content'], doc['numeric_columns'], [])
    ids = np.expand_dims(np.repeat([i], colemb.shape[0]), axis=0)
    xb_ids = np.concatenate((xb_ids, ids.T))
    xb = np.concatenate((xb, colemb), axis=0)

In [20]:
xq = np.empty(shape=(0, d))
xq_ids = np.empty(shape=(0, 1))
for i in range(110, 120):
    doc = get_one_document_from_mongodb_by_key('_id_numeric', i, *collections)
    colemb = tabemb.embedding_table(doc['content'], doc['numeric_columns'], [])
    ids = np.expand_dims(np.repeat([i], colemb.shape[0]), axis=0)
    xq_ids = np.concatenate((xq_ids, ids.T))
    xq = np.concatenate((xq, colemb), axis=0)

In [22]:
xb.shape, xb_ids.shape, xq.shape, xq_ids.shape

((3622, 300), (3622, 1), (34, 300), (34, 1))

In [42]:
# index_lsh = faiss.IndexLSH(d, n_bits)
index_iv = faiss.index_factory(tabemb.get_dimension(), "IVF3000_HNSW32,Flat,IDMap2")

In [43]:
# index_lsh.train(xb[:, 1:])
# index_lsh.train(xb)
index_iv.train(xb)



In [46]:
# index_up = faiss.IndexIDMap2(index_lsh)
index_up = faiss.IndexIDMap2(index_iv)

In [47]:
# index_up.add_with_ids(xb[:, 1:], xb[:, 0])
index_up.add_with_ids(xb, xb_ids[:, 0])

RuntimeError: Error in void faiss::IndexIDMapTemplate<IndexT>::add(faiss::idx_t, const typename IndexT::component_t*) [with IndexT = faiss::Index; faiss::idx_t = long int; typename IndexT::component_t = float] at /home/conda/feedstock_root/build_artifacts/faiss-split_1685210641191/work/faiss/IndexIDMap.cpp:43: add does not make sense with IndexIDMap, use add_with_ids

In [29]:
# D, I = index_up.search(xq[:, 1:], 5)
D, I = index_up.search(xq, 5)

In [30]:
I

array([[110, 114, 115, 339, 299],
       [110, 114, 144, 140, 115],
       [111, 114, 115, 142, 143],
       [111, 116, 116,  77, 915],
       [112, 757, 114, 311, 140],
       [112, 550, 550, 550, 697],
       [113, 110, 114, 144, 115],
       [114, 110, 115, 339, 299],
       [114, 110, 144, 140, 115],
       [114, 142, 143, 111, 139],
       [114, 401, 396, 134, 341],
       [  4,   6,   8,  10,  12],
       [115, 114, 110, 339, 299],
       [115, 110, 114, 140, 144],
       [115, 114, 111, 142, 143],
       [115, 134, 133, 169, 167],
       [  4,   6,   8,  10,  12],
       [116, 139, 111, 114, 111],
       [116, 645, 797, 116, 539],
       [116, 983, 777, 778, 415],
       [116, 991, 518, 101, 557],
       [116, 302, 991, 557, 509],
       [117,  67,  65,  61,  88],
       [117, 357, 722, 392, 681],
       [117, 351, 915, 188, 387],
       [117, 117, 681, 681, 681],
       [117, 117, 681, 681, 681],
       [118, 816,  95, 753, 577],
       [118, 119, 403, 273, 301],
       [118, 4

In [34]:
# res = np.concatenate((np.expand_dims(xq[:, 0], axis=0).T, I), axis=1)
res = np.concatenate((xq_ids, I), axis=1)

In [36]:
# res2 = np.split(res[:, 1:], np.unique(res[:, 0], return_index=True)[1][1:])
res2 = np.split(res[:, 1:], np.unique(res[:, 0], return_index=True)[1][1:])

In [16]:
x = sorted(list(zip(*np.unique(res2[0], return_counts=True))), key=lambda x: x[1], reverse=True)[:3]
x

[(110.0, 2), (114.0, 2), (115.0, 2)]

In [37]:
final_results = []
for i, qid in enumerate(range(110, 120)):
    x = sorted(list(zip(*np.unique(res2[i], return_counts=True))), key=lambda x: x[1], reverse=True)[:3]
    final_results.append((qid, [int(y[0]) for y in x]))

In [18]:
final_results

[(110, [110, 114, 115]),
 (111, [111, 116, 77]),
 (112, [550, 112, 114]),
 (113, [110, 113, 114]),
 (114, [114, 110, 115]),
 (115, [115, 114, 110]),
 (116, [116, 111, 557]),
 (117, [117, 681, 61]),
 (118, [118, 95, 119]),
 (119, [119, 144, 275])]

In [38]:
from tools.utils.utils import apply_sloth, create_token_set


for qid, rids in final_results:
    qdoc = get_one_document_from_mongodb_by_key('_id_numeric', qid, *collections)
    qset = create_token_set(qdoc['content'], 'set', qdoc['numeric_columns'])
    print(qid, end=': ')
    for rid in rids:
        if rid == qid: continue
        rdoc = get_one_document_from_mongodb_by_key('_id_numeric', rid, *collections)
        rset = create_token_set(rdoc['content'], 'set', rdoc['numeric_columns'])
        intersection = set(qset).intersection(rset)
        sloth_ov = apply_sloth(qdoc['content'], rdoc['content'], qdoc['numeric_columns'], rdoc['numeric_columns'])
        print(f'{rid}->{len(intersection)}->{sloth_ov}', end='\t')
    print()

110: 114->27->40	115->7->7	
111: 116->0->0	77->0->0	
112: 550->0->0	114->0->0	
113: 110->7->7	114->3->3	
114: 110->27->40	115->8->7	
115: 114->8->7	110->7->7	
116: 111->0->0	557->0->0	
117: 681->11->20	61->1->1	
118: 95->2->2	119->0->0	
119: 144->0->0	275->0->0	


In [23]:
qdoc = get_one_document_from_mongodb_by_key('_id_numeric', 114, *collections)
rdoc = get_one_document_from_mongodb_by_key('_id_numeric', 110, *collections)

In [24]:
pd.DataFrame(qdoc['content'])

Unnamed: 0,0,1,2,3,4,5
0,DF,Bruno Rodrigo,Portuguesa,Signed,2012,
1,DF,Bruno Aguiar,Guarani,Signed,2012,
2,MF,Marquinhos,Avaí,Signed,2012,
3,GK,Vladimir,Fortaleza,Loan return,2014,
4,GK,Felipe,Paraná Clube,Loan return,2012,
5,MF,Roberto Brum,Figueirense,Loan return,2011,
6,DF,Durval,Sport,Signed,2011,
7,MF,Wesley,Atlético–PR,Loan Return,2012,
8,MF,Giovanni,Mogi Mirim,Signed,2010,
9,FW,Zé Eduardo,Pinheiros,Loaned,2010,


In [25]:
pd.DataFrame(rdoc['content'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,GK,Felipe,6,0,23,0,9,0,1,0,39,0
1,GK,Rafael,32,0,0,0,2,0,2,0,36,0
2,DF,Alex Sandro,24,1,1,1,5,1,0,0,30,3
3,DF,Léo,18,0,12,0,5,0,2,0,37,0
4,DF,Maranhão,18,2,2,0,4,0,0,0,24,2
5,DF,Pará,31,0,20,1,9,0,2,0,62,1
6,DF,Bruno Aguiar,10,0,6,0,3,0,0,0,19,0
7,DF,Bruno Rodrigo,1,0,5,0,0,0,0,0,6,0
8,DF,Durval,34,2,20,1,10,0,2,0,66,3
9,DF,Edu Dracena,28,2,16,0,9,2,2,0,55,4
