In [45]:
import time
from collections import Counter, defaultdict
from itertools import islice

import chromadb
import jsonlines
import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm

from code.fasttext.embedding_utils import TableEncoder
from code.utils.utils import rebuild_table
from code.utils.settings import DefaultPath

In [7]:
# Encoder used below to turn a table into row and column embeddings.
tabenc = TableEncoder()

In [10]:
# Open the on-disk Chroma database and list the collections it contains.
chroma_db_dir = (
    DefaultPath.db_path.chroma
    + 'double_collection_v1/'
    + 'v_35_50000_add_label_True_with_metadatas_True'
)
chroma_client = chromadb.PersistentClient(chroma_db_dir)
chroma_client.list_collections()

[Collection(name=rows), Collection(name=columns)]

In [11]:
# Grab handles to the two collections queried in the tests below.
row_collection, column_collection = (
    chroma_client.get_collection(name) for name in ('rows', 'columns')
)

In [9]:
# Map table _id -> rebuilt table for the first 50000 records of the JSONL dump.
# The reader is opened in a `with` block so the file handle is closed, and
# islice stops reading at the cut-off instead of parsing the rest of the file
# only to discard it.
# NOTE(review): 50000 presumably mirrors the "50000" in the Chroma DB folder
# name; confirm they are meant to stay in sync.
with jsonlines.open(DefaultPath.data_path.wikitables + 'sloth_tables.jsonl') as reader:
    sloth_tables = {
        json_table['_id']: rebuild_table(json_table)
        for json_table in islice(reader, 50000)
    }

In [128]:
# Load the SLOTH results and order them by o_a, then jsim, then a%,
# all in descending order.
sloth_results = (
    pl.read_csv(DefaultPath.data_path.wikitables + 'train_set_turl_malaguti.csv')
    .sort(by=['o_a', 'jsim', 'a%'], descending=True)
)

In [129]:
# Preview the first rows of the sorted results.
sloth_results.head()

r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""12158330-1""","""3341919-1""",0.965971,3036,0.932432
"""30210174-4""","""6897174-2""",1.0,2646,1.0
"""6374194-6""","""6374194-7""",0.783883,2142,0.468197
"""14342092-1""","""37953453-1""",0.55753,1778,0.958491
"""16517223-1""","""16519520-1""",0.915589,1640,0.622627


## Test: Identical tables

In [102]:
# Select the pair to analyse and check that both of its tables were loaded
# into sloth_tables.
idx_row = 1
r_loaded = sloth_results[idx_row]['r_id'].item() in sloth_tables
s_loaded = sloth_results[idx_row]['s_id'].item() in sloth_tables
print(r_loaded and s_loaded)
sloth_results[idx_row]

True


r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""30210174-4""","""6897174-2""",1.0,2646,1.0


In [40]:
# Use the record selected through idx_row (rather than a hard-coded index) so
# this cell always analyses the same pair that the previous cell checked.
sloth_result_rec = sloth_results[idx_row]
r_id = sloth_result_rec['r_id'].item()
s_id = sloth_result_rec['s_id'].item()
r_table = sloth_tables[r_id]
s_table = sloth_tables[s_id]

# Embed the r_table; the result is a (row embeddings, column embeddings) pair.
# NOTE(review): the positional True is presumably the "add_label" flag that
# matches the "add_label_True" database name; confirm in TableEncoder.
r_row_embeddings, r_column_embeddings = tabenc.full_embedding(r_table, True)
r_row_embeddings.shape, r_column_embeddings.shape

In [41]:
# Show the table stored under r_id.
r_table

Unnamed: 0,AC #,Assembly Constituency Name,Reserved for,District,Winner,No. of Votes,% of Votes,Party,Party.1
0,1,Mekliganj,Scheduled Caste,Cooch Behar,Paresh Chandra Adhikari,72040,48.88%,,All India Forward Bloc
1,2,Mathabhanga,Scheduled Caste,Cooch Behar,Binay Krishna Barman,78249,46.45%,,All India Trinamool Congress
2,3,Cooch Behar Uttar,Scheduled Caste,Cooch Behar,Nagendra Nath Roy,84825,45.11%,,All India Forward Bloc
3,4,Cooch Behar Dakshin,,Cooch Behar,Akshay Thakur,72028,47.04%,,All India Forward Bloc
4,5,Sitalkuchi,Scheduled Caste,Cooch Behar,Hiten Barman,84651,44.21%,,All India Trinamool Congress
...,...,...,...,...,...,...,...,...,...
289,290,Mayureswar,,Birbhum,Asoke Roy,67478,42.31%,,Communist Party of India (Marxist)
290,291,Rampurhat,,Birbhum,Asish Banerjee,75066,45.79%,,All India Trinamool Congress
291,292,Hansan,,Birbhum,Asit Mal,73370,46.72%,,Indian National Congress
292,293,Nalhati,,Birbhum,Abhijit Mukherjee,76047,49.02%,,Indian National Congress


In [38]:
# Show the table stored under s_id.
s_table

Unnamed: 0,AC #,Assembly Constituency Name,Reserved for,District,Winner,No. of Votes,% of Votes,Party,Party.1
0,1,Mekliganj,Scheduled Caste,Cooch Behar,Paresh Chandra Adhikari,72040,48.88%,,All India Forward Bloc
1,2,Mathabhanga,Scheduled Caste,Cooch Behar,Binay Krishna Barman,78249,46.45%,,All India Trinamool Congress
2,3,Cooch Behar Uttar,Scheduled Caste,Cooch Behar,Nagendra Nath Roy,84825,45.11%,,All India Forward Bloc
3,4,Cooch Behar Dakshin,,Cooch Behar,Akshay Thakur,72028,47.04%,,All India Forward Bloc
4,5,Sitalkuchi,Scheduled Caste,Cooch Behar,Hiten Barman,84651,44.21%,,All India Trinamool Congress
...,...,...,...,...,...,...,...,...,...
289,290,Mayureswar,,Birbhum,Asoke Roy,67478,42.31%,,Communist Party of India (Marxist)
290,291,Rampurhat,,Birbhum,Asish Banerjee,75066,45.79%,,All India Trinamool Congress
291,292,Hansan,,Birbhum,Asit Mal,73370,46.72%,,Indian National Congress
292,293,Nalhati,,Birbhum,Abhijit Mukherjee,76047,49.02%,,Indian National Congress


### Row querying

In [42]:
# For every row embedding of r_table, fetch its 20 nearest neighbours from the
# 'rows' collection, skipping entries that belong to r_table itself.
res = row_collection.query(
    query_embeddings=r_row_embeddings.tolist(),
    where={'table_id': {'$ne': r_id}},
    include=['metadatas', 'distances'],
    n_results=20,
)

In [60]:
# Count how often each table_id occurs among all returned neighbours
# (res['metadatas'] holds one list of hits per query embedding) and rank the
# tables from most to least frequent.
table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in res['metadatas']
    for hit in hits_per_query
).most_common()
table_id_freq[:5]

### Column querying

In [63]:
# For every column embedding of r_table, fetch its 20 nearest neighbours from
# the 'columns' collection, skipping entries that belong to r_table itself.
res = column_collection.query(
    query_embeddings=r_column_embeddings.tolist(),
    where={'table_id': {'$ne': r_id}},
    include=['metadatas', 'distances'],
    n_results=20,
)

In [64]:
# Count how often each table_id occurs among the neighbours returned by the
# column query and rank the tables from most to least frequent.
table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in res['metadatas']
    for hit in hits_per_query
).most_common()
table_id_freq[:5]

### Conclusions

In [66]:
# Recall the record under test to compare it with the frequencies above.
sloth_result_rec

r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""30210174-4""","""6897174-2""",1.0,2646,1.0


In both cases (rows and columns) the best match is the table with id '6897174-2', which is also the one that fully overlaps our starting table. We can therefore assume that
the method is able to find identical tables.

## Test: Low Jaccard - High Overlap

In [130]:
# Select the pair to analyse and check that both of its tables were loaded
# into sloth_tables.
idx_row = 3
r_loaded = sloth_results[idx_row]['r_id'].item() in sloth_tables
s_loaded = sloth_results[idx_row]['s_id'].item() in sloth_tables
print(r_loaded and s_loaded)
sloth_results[idx_row]

True


r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""14342092-1""","""37953453-1""",0.55753,1778,0.958491


In [73]:
# Resolve the selected record into its two table ids and the matching tables.
sloth_result_rec = sloth_results[idx_row]
r_id, s_id = (sloth_result_rec[col].item() for col in ('r_id', 's_id'))
r_table, s_table = sloth_tables[r_id], sloth_tables[s_id]

# Embed the r_table; the result is a (row embeddings, column embeddings) pair.
r_row_embeddings, r_column_embeddings = tabenc.full_embedding(r_table, True)
r_row_embeddings.shape, r_column_embeddings.shape

In [74]:
# Show the table stored under r_id.
r_table

Unnamed: 0,Name,Rural municipality (RM),Population (2011),Population (2006),Change (%),Land area (km²),Population density (per km²)
0,Abbey,Miry Creek No. 229,115,130,-11.5,0.77,149.1
1,Abernethy,Abernethy No. 186,196,197,-0.5,1.03,189.9
2,Albertville,Wise Creek No. 77,140,110,27.3,1.12,124.7
3,Alida,Reciprocity No. 32,131,106,23.6,0.37,357.1
4,Alvena,Fish Creek No. 402,55,55,0,0.43,128
...,...,...,...,...,...,...,...
260,Yarbo,Langenburg No. 181,53,72,-26.4,0.83,63.9
261,Young,Morris No. 312,239,263,-9.1,2.51,95.2
262,Zelma,Bayne No. 371,35,30,16.7,0.72,48.9
263,Zenon Park,Arborfield No. 456,187,192,-2.6,0.56,336.1


In [75]:
# Show the table stored under s_id.
s_table

Unnamed: 0,Name,Status,Rural municipality,Population (2011),Population (2006),Change (%),Land area (km²),Population density (per km 2 )
0,Estevan,City,Estevan No. 5,11054,10084,9.6,18.85,586.6
1,Flin Flon (part),City,—,"229 This population does not include 5,363 in ...",242,-5.4,2.37 This area does not include km2 (sqmi) in ...,96.4
2,Humboldt,City,Humboldt No. 370,5678,4998,13.6,13.46,421.9
3,Lloydminster (part) The balance of Lloydminste...,City,Britannia No. 502,"9772 This population does not include 18,032 i...",8118,20.4,17.34 This area does not include km2 (sqmi) in...,563.6
4,Martensville Martensville is Saskatchewan's sm...,City,Corman Park No. 344,7716,4978,55,6.23,1239.3
...,...,...,...,...,...,...,...,...
466,West End,Resort village,Fertile Belt No. 183,17,26,-34.6,0.34,50.4
467,Total City,–,–,595678,547615,8.8,675.25,811.0
468,Total Town,–,–,137611,127795,7.7,473.50,290.6
469,Total Village,–,–,43130,41091,5.0,302.29,142.7


### Row querying

In [78]:
# For every row embedding of r_table, fetch its 20 nearest neighbours from the
# 'rows' collection, skipping entries that belong to r_table itself.
res = row_collection.query(
    query_embeddings=r_row_embeddings.tolist(),
    where={'table_id': {'$ne': r_id}},
    include=['metadatas', 'distances'],
    n_results=20,
)

In [79]:
# Count how often each table_id occurs among the neighbours returned by the
# row query and rank the tables from most to least frequent.
row_table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in res['metadatas']
    for hit in hits_per_query
).most_common()
row_table_id_freq[:5]

### Column querying

In [81]:
# For every column embedding of r_table, fetch its 20 nearest neighbours from
# the 'columns' collection, skipping entries that belong to r_table itself.
col_res = column_collection.query(
    query_embeddings=r_column_embeddings.tolist(),
    where={'table_id': {'$ne': r_id}},
    include=['metadatas', 'distances'],
    n_results=20,
)

In [82]:
# Count how often each table_id occurs among the neighbours returned by the
# column query and rank the tables from most to least frequent.
col_table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in col_res['metadatas']
    for hit in hits_per_query
).most_common()
col_table_id_freq[:5]

### Conclusions

In [84]:
# Recall the record under test to compare it with the frequencies above.
sloth_result_rec

r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""14342092-1""","""37953453-1""",0.55753,1778,0.958491


Are these results acceptable for us? On rows, the table with id "37953453-1" appears more than 4000 times in the results, which is fine; on columns, however, the table
"37953453-1" still gets the top score, even though its Jaccard similarity is not very high.

## Test: High Jaccard - Low Overlap

In [85]:
# Select the pair to analyse and check that both of its tables were loaded
# into sloth_tables.
idx_row = 4
r_loaded = sloth_results[idx_row]['r_id'].item() in sloth_tables
s_loaded = sloth_results[idx_row]['s_id'].item() in sloth_tables
print(r_loaded and s_loaded)
sloth_results[idx_row]

r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""16517223-1""","""16519520-1""",0.915589,1640,0.622627


In [87]:
# Resolve the selected record into its two table ids and the matching tables.
sloth_result_rec = sloth_results[idx_row]
r_id, s_id = (sloth_result_rec[col].item() for col in ('r_id', 's_id'))
r_table, s_table = sloth_tables[r_id], sloth_tables[s_id]

# Embed the r_table; the result is a (row embeddings, column embeddings) pair.
r_row_embeddings, r_column_embeddings = tabenc.full_embedding(r_table, True)
r_row_embeddings.shape, r_column_embeddings.shape

In [88]:
# Show the table stored under r_id.
r_table

Unnamed: 0,Rank,Representative,Party,District,Seniority date,Notes
0,1,John Dingell,D,MI-15,"December 13, 1955",
1,2,John Conyers,D,MI-14,"January 3, 1965",
2,3,Dave Obey,D,WI-7,"April 1, 1969",
3,4,Charles B. Rangel,D,NY-15,"January 3, 1971",
4,5,Bill Young,R,FL-10,"January 3, 1971",
...,...,...,...,...,...,...
436,,Jean Schmidt,R,OH-02,"August 3, 2005",
437,,John Campbell,R,CA-48,"December 7, 2005",
438,,Brian Bilbray,R,CA-50,"June 13, 2006",Also served from 1995 to 2001.
439,,Shelley Sekula-Gibbs,R,TX-22,"November 13, 2006",Left the House in 2007 .


In [89]:
# Show the table stored under s_id.
s_table

Unnamed: 0,Rank,Representative,Party,District,Seniority date,Notes
0,1,John Dingell,D,MI-15,"December 13, 1955",
1,2,John Conyers,D,MI-14,"January 3, 1965",
2,3,Dave Obey,D,WI-7,"April 1, 1969",
3,4,Phil Crane,R,IL-08,"November 25, 1969",Left the House in 2005 .
4,5,Charles B. Rangel,D,NY-15,"January 3, 1971",
...,...,...,...,...,...,...
434,435,Chris Van Hollen,D,MD-8,"January 3, 2003",
435,,Randy Neugebauer,R,TX-19,"June 3, 2003",
436,,Ben Chandler,D,KY-6,"February 17, 2004",
437,,Stephanie Herseth,D,SD,"June 1, 2004",


### Row querying

In [92]:
# For every row embedding of r_table, fetch its 20 nearest neighbours from the
# 'rows' collection, skipping entries that belong to r_table itself.
res = row_collection.query(
    query_embeddings=r_row_embeddings.tolist(),
    n_results=20,
    include=['metadatas', 'distances'],
    where={'table_id': {'$ne': r_id}}
)

# Count how often each table_id occurs among all returned neighbours and rank
# the tables from most to least frequent.
row_table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in res['metadatas']
    for hit in hits_per_query
).most_common()
row_table_id_freq[:5]

### Column querying

In [95]:
# For every column embedding of r_table, fetch its 20 nearest neighbours from
# the 'columns' collection, skipping entries that belong to r_table itself.
col_res = column_collection.query(
    query_embeddings=r_column_embeddings.tolist(),
    n_results=20,
    include=['metadatas', 'distances'],
    where={'table_id': {'$ne': r_id}}
)

# Count how often each table_id occurs among all returned neighbours and rank
# the tables from most to least frequent.
col_table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in col_res['metadatas']
    for hit in hits_per_query
).most_common()
col_table_id_freq[:10]

### Conclusions

In [96]:
# Recall the record under test to compare it with the frequencies above.
sloth_result_rec

r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""16517223-1""","""16519520-1""",0.915589,1640,0.622627


That's interesting: even though the s_table with id "16519520-1" is the best match in the row ranking, its frequency gap from the second-best table is small (1786 vs. 1625) compared to the previous cases. In addition, the s_table is still at the top with a frequency score of 5, but other tables obtain the same score.

## Test: Low Jaccard - Low Overlap

In [143]:
# idx_row is pinned to a fixed value; presumably it was first drawn at random
# with the snippet below and then hard-coded so the cell is reproducible.
# Note: random.randint(0, n) includes n and could produce an out-of-range
# index, so randrange is the safe form.
# import random
# idx_row = random.randrange(sloth_results.shape[0])
idx_row = 160614
# Print the index together with whether both tables of the pair were loaded.
print(idx_row, sloth_results[idx_row]['r_id'].item() in sloth_tables.keys() and sloth_results[idx_row]['s_id'].item() in sloth_tables.keys())
sloth_results[idx_row]

160614 True


r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""22130361-4""","""27804692-10""",0.684211,20,0.416667


In [144]:
# Resolve the selected record into its two table ids and the matching tables.
sloth_result_rec = sloth_results[idx_row]
r_id, s_id = (sloth_result_rec[col].item() for col in ('r_id', 's_id'))
r_table, s_table = sloth_tables[r_id], sloth_tables[s_id]

# Embed the r_table; the result is a (row embeddings, column embeddings) pair.
r_row_embeddings, r_column_embeddings = tabenc.full_embedding(r_table, True)
r_row_embeddings.shape, r_column_embeddings.shape

((8, 300), (6, 300))

In [145]:
# Show the table stored under r_id.
r_table

Unnamed: 0,Rank,Nation,Gold,Silver,Bronze,Total
0,1,Canada,2,0,0,2
1,2,South Korea,1,0,0,1
2,2,China,1,0,0,1
3,4,Japan,0,2,1,3
4,5,United States,0,1,0,1
5,5,Germany,0,1,0,1
6,7,Russia,0,0,2,2
7,8,France,0,0,1,1


In [146]:
# Show the table stored under s_id.
s_table

Unnamed: 0,Rank,Nation,Gold,Silver,Bronze,Total
0,1,United States,1,0,0,1
1,1,Russia,1,0,0,1
2,1,South Korea,1,0,0,1
3,1,Kazakhstan,1,0,0,1
4,5,Canada,0,2,1,3
5,6,Japan,0,1,1,2
6,7,Germany,0,1,0,1
7,8,Italy,0,0,2,2


### Row querying

In [147]:
# For every row embedding of r_table, fetch its 20 nearest neighbours from the
# 'rows' collection, skipping entries that belong to r_table itself.
res = row_collection.query(
    query_embeddings=r_row_embeddings.tolist(),
    n_results=20,
    include=['metadatas', 'distances'],
    where={'table_id': {'$ne': r_id}}
)

# Count how often each table_id occurs among all returned neighbours and rank
# the tables from most to least frequent.
row_table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in res['metadatas']
    for hit in hits_per_query
).most_common()
# Display the five most frequent tables; the recorded output shows a top-5
# list, but the cell had no final expression to produce it.
row_table_id_freq[:5]

[('40038256-2', 3),
 ('31765907-1', 2),
 ('25399779-9', 2),
 ('25399779-8', 2),
 ('7913218-1', 2)]

In [153]:
# Show the s_id of the pair under test, to look for it in the ranking above.
s_id

'27804692-10'

### Column querying

In [148]:
# For every column embedding of r_table, fetch its 20 nearest neighbours from
# the 'columns' collection, skipping entries that belong to r_table itself.
col_res = column_collection.query(
    query_embeddings=r_column_embeddings.tolist(),
    n_results=20,
    include=['metadatas', 'distances'],
    where={'table_id': {'$ne': r_id}}
)

# Count how often each table_id occurs among all returned neighbours and rank
# the tables from most to least frequent.
col_table_id_freq = Counter(
    hit['table_id']
    for hits_per_query in col_res['metadatas']
    for hit in hits_per_query
).most_common()
col_table_id_freq[:10]

[('855087-1', 8),
 ('27804692-10', 7),
 ('27804692-9', 3),
 ('2687410-1', 3),
 ('494758-1', 3),
 ('19238108-2', 3),
 ('25399779-8', 2),
 ('6746942-2', 2),
 ('19239156-2', 2),
 ('28352026-1', 2)]

### Conclusions

In [154]:
# Recall the record under test to compare it with the frequencies above.
sloth_result_rec

r_id,s_id,jsim,o_a,a%
str,str,f64,i64,f64
"""22130361-4""","""27804692-10""",0.684211,20,0.416667


OK: s_id does not appear among the top entries of the row frequency table, nor is it the best match in the column frequency table (it ranks second). This seems consistent with our SLOTH values.