In [6]:
import os
import random
from collections import Counter
from time import time

import mmh3
import pandas as pd
from neo4j import GraphDatabase

In [45]:
small = True

# Connection settings. User and password can be overridden through the
# environment so real credentials do not have to be saved in the notebook;
# the fallbacks are the local values this notebook was written with.
USER = os.environ.get("NEO4J_USER", "neo4j")
PASSWD = os.environ.get("NEO4J_PASSWORD", "12345678")

AUTH = (USER, PASSWD)
DATABASE = "neo4j"
URI = "bolt://localhost:7687"

# Picks 100 Table nodes by ordering on rand(); the sample differs on every run.
random_query = "MATCH (t:Table) RETURN t.table_id, rand() as r ORDER BY r LIMIT 100"

# For the table identified by $_query_id, follows HAS relationships to its
# tokens and back to every other table holding the same token. Per result
# table it sums the smaller of the two relationships' token_count values and
# keeps the 10 tables with the highest total.
query = """
    MATCH (n:Table WHERE n.table_id = $_query_id)-[r:HAS]->(t:Token)<-[p:HAS]-(m:Table WHERE m.table_id <> $_query_id)
    RETURN 
        n.table_id as query_id,
        m.table_id as result_id,
        SUM(
            CASE 
                WHEN r.token_count >= p.token_count THEN p.token_count 
                ELSE r.token_count 
            END
        ) AS token_count 
        ORDER BY token_count DESC 
        LIMIT 10
"""

with GraphDatabase.driver(uri=URI, auth=AUTH) as driver:
    with driver.session(database=DATABASE) as session:
        # The first column of each sampled row is the table_id.
        query_ids = [x[0] for x in session.run(query=random_query).values()]

        # Bound up front so the following cells still find `results` when the
        # sample is empty. It is overwritten on every iteration, so after the
        # loop it holds the rows of the LAST query only.
        results = []

        start = time()
        for i, query_id in enumerate(query_ids):
            step = time()
            results = session.run(query=query, parameters={"_query_id": query_id}).values()
            # Per-query wall-clock time in seconds.
            print(i, time() - step)

# Total wall-clock time of the query loop (includes closing session and driver).
print(time() - start)


0 0.3802149295806885
1 0.5239365100860596
2 0.028276920318603516
3 0.4801781177520752
4 0.820742130279541
5 0.4060025215148926
6 0.3363802433013916
7 0.7338411808013916
8 0.009866476058959961
9 0.15889477729797363
10 0.1411290168762207
11 0.2759852409362793
12 0.16452956199645996
13 0.17923879623413086
14 0.008026123046875
15 0.02301168441772461
16 0.40921688079833984
17 0.7401940822601318
18 0.05110645294189453
19 0.4713280200958252
20 0.02402973175048828
21 0.3295102119445801
22 0.2197568416595459
23 0.32523679733276367
24 0.029572248458862305
25 0.1396961212158203
26 0.5397114753723145
27 0.18746423721313477
28 0.09686398506164551
29 0.42729783058166504
30 0.5434448719024658
31 0.02565169334411621
32 0.1103060245513916
33 0.05005049705505371
34 0.035195350646972656
35 0.4129676818847656
36 0.41570425033569336
37 0.1549663543701172
38 0.01949143409729004
39 0.3014848232269287
40 0.7963628768920898
41 0.01740241050720215
42 0.15748953819274902
43 0.6655876636505127
44 0.11324310302734

In [26]:
# Rows fetched by the most recent query of the timing loop; each row is
# [query_id, result_id, token_count], following the query's RETURN clause.
results

[[2153179, 1770653, 18],
 [2153179, 556455, 16],
 [2153179, 113890, 16],
 [2153179, 202923, 16],
 [2153179, 477153, 16],
 [2153179, 477152, 16],
 [2153179, 467926, 16],
 [2153179, 271572, 16],
 [2153179, 261211, 16],
 [2153179, 67824, 16]]

In [36]:
# Wrap the raw query rows in a DataFrame, naming the columns in the order the
# Cypher RETURN clause emits them.
results = pd.DataFrame(
    data=results,
    columns=['query_id', 'result_id', 'token_count'],
)

results

Unnamed: 0,query_id,result_id,token_count
0,889958,884711,11
1,889958,882024,10
2,889958,882907,10
3,889958,1387915,9
4,889958,1242790,8
5,889958,1242785,8
6,889958,882187,8
7,889958,2553148,7
8,889958,1828293,7
9,889958,2606051,7


In [37]:
# Total token_count per (query_id, result_id) pair, listed from the smallest
# overlap to the largest.
(
    results
    .pivot_table(
        values=['token_count'],
        index=['query_id', 'result_id'],
        aggfunc='sum',
    )
    .sort_values(by=['token_count'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,token_count
query_id,result_id,Unnamed: 2_level_1
889958,1828293,7
889958,2553148,7
889958,2606051,7
889958,882187,8
889958,1242785,8
889958,1242790,8
889958,1387915,9
889958,882024,10
889958,882907,10
889958,884711,11


In [30]:
from tools.utils.utils import get_mongodb_collections


# Obtain the MongoDB client and the collections; `collections` is unpacked
# into the document lookup helper further down.
# NOTE(review): `small=False` is hard-coded here although an earlier cell sets
# `small = True` -- confirm which dataset size is intended.
mongoclient, collections = get_mongodb_collections(small=False)

In [38]:
from tools.utils.utils import get_one_document_from_mongodb_by_key


# Fetch the two table documents under comparison by their numeric ids.
tab1, tab2 = [
    get_one_document_from_mongodb_by_key('_id_numeric', table_id, *collections)
    for table_id in (1770653, 2153179)
]

In [39]:
def get_table_tokens_counter(table, numeric_columns):
    """Count the cell values of a table's non-numeric columns.

    Args:
        table: iterable of rows, each row an iterable of cell values.
        numeric_columns: sequence indexed by column position; only columns
            whose entry equals 0 contribute tokens.

    Returns:
        A Counter mapping each normalised token to its number of occurrences.
        Missing values (per ``pd.isna``) and falsy cells are ignored.
    """
    def prepare_token(token):
        # Normalise to text and replace pipe / newline characters with spaces.
        return str(token).replace('|', ' ').replace('\n', ' ')

    counts = Counter()
    for row in table:
        for col_idx, cell in enumerate(row):
            # Skip missing or empty cells before looking at the column flag.
            if pd.isna(cell) or not cell:
                continue
            if numeric_columns[col_idx] == 0:
                counts[prepare_token(cell)] += 1

    return counts

In [40]:
# Token counters of the non-numeric columns, one per table document.
c1, c2 = [
    get_table_tokens_counter(doc['content'], doc['numeric_columns'])
    for doc in (tab1, tab2)
]

In [41]:
# One row per token found in both tables: [token, count in c1, count in c2].
common = [
    [tokid, c1[tokid], c2[tokid]]
    for tokid in set(c1.keys()).intersection(c2.keys())
]
# Closing 'tot' row: sum over the shared tokens of the smaller of the two counts.
common.append(['tot', 'tot', sum(min(n1, n2) for _, n1, n2 in common)])
pd.DataFrame(common)

Unnamed: 0,0,1,2
0,"New Delhi, India",18,16
1,tot,tot,16


In [42]:
# Display the cell grid of tab1 for visual inspection.
pd.DataFrame(tab1['content'])

Unnamed: 0,0,1,2,3,4,5,6,7
0,Outcome,No.,Date,Tournament,Surface,Partner,Opponents,Score
1,Winner,1.,2 November 1998,"New Delhi, India",Hard,{{flagicon|IND}} Sai Jayalakshmy Jayaram,{{flagicon|THA}} Montika Anuchan\n{{flagicon|T...,"7–6(7–4), 1–6, 6–2"
2,Runner-up,2.,10 May 1999,"Lucknow, India",Carpet,{{flagicon|IND}} Sai Jayalakshmy Jayaram,{{flagicon|IND}} Shruti Dhawan\n{{flagicon|IND...,0–1 ret.
3,Runner-up,3.,18 October 1999,"Jakarta, Indonesia",Hard,{{flagicon|IND}} Sai Jayalakshmy Jayaram,{{flagicon|INA}} Liza Andriyani\n{{flagicon|TH...,"0–6, 3–6"
4,Winner,4.,16 April 2000,"Mumbai, India",Hard,{{flagicon|IND}} Sai Jayalakshmy Jayaram,{{flagicon|IND}} Manisha Malhotra\n{{flagicon|...,"6–4, 4–6, 2–1 Ret"
...,...,...,...,...,...,...,...,...
62,Runner-up,62.,20 June 2011,"New Delhi, India",Hard,{{flagicon|ISR}} Keren Shlomo,{{flagicon|KOR}} Kim Hae-sung\n{{flagicon|KOR}...,"5–7, 0–6"
63,Winner,63.,7 May 2012,"New Delhi, India",Hard,{{flagicon|IND}} Ankita Raina,{{flagicon|CHN}} Liu Yuxuan\n{{flagicon|China}...,"6–1, 6–4"
64,Winner,64.,21 May 2012,"New Delhi, India",Hard,{{flagicon|India}} Ankita Raina,{{flagicon|India}} Sri Peddy Reddy\n{{flagicon...,"6–3, 6–2"
65,Runner-up,65.,15 April 2013,"Chennai, India",Clay,{{flagicon|India}} Ankita Raina,{{flagicon|India}} Natasha Palha\n{{flagicon|I...,"7–5, 3–6, [6–10]"


In [43]:
# Display the cell grid of tab2 for visual inspection.
pd.DataFrame(tab2['content'])

Unnamed: 0,0,1,2,3
0,SFL 2017: Finals - The Final Battle: Sher-E-Pu...,{{dts|format=dmy|2017|February|25}},Siri Fort Sports Complex,"New Delhi, India"
1,SFL 2017: Battle for Bronze - Bengaluru Tigers...,{{dts|format=dmy|2017|February|24}},Siri Fort Sports Complex,"New Delhi, India"
2,SFL 2017: Semifinals - The Ultimate Showdown: ...,{{dts|format=dmy|2017|February|18}},Siri Fort Sports Complex,"New Delhi, India"
3,SFL 2017: Semifinals - King of the Jungle: She...,{{dts|format=dmy|2017|February|17}},Siri Fort Sports Complex,"New Delhi, India"
4,SFL 2017: Gujarat Warriors vs. Goa Pirates,{{dts|format=dmy|2017|February|12}},Siri Fort Sports Complex,"New Delhi, India"
5,SFL 2017: Big City Brawl - Mumbai Maniacs vs. ...,{{dts|format=dmy|2017|February|11}},Siri Fort Sports Complex,"New Delhi, India"
6,SFL 2017: War of the North - Delhi Heroes vs. ...,{{dts|format=dmy|2017|February|10}},Siri Fort Sports Complex,"New Delhi, India"
7,SFL 2017: Battle of the Bold - U.P. Nawabs vs....,{{dts|format=dmy|2017|February|05}},Siri Fort Sports Complex,"New Delhi, India"
8,SFL 2017: Coastal Collision - Mumbai Maniacs v...,{{dts|format=dmy|2017|February|04}},Siri Fort Sports Complex,"New Delhi, India"
9,SFL 2017: Metro Mayhem - Gujarat Warriors vs. ...,{{dts|format=dmy|2017|February|03}},Siri Fort Sports Complex,"New Delhi, India"
