In [1]:
import os
import jsonlines
import random
import pandas as pd
from time import time
from tqdm.notebook import tqdm
from itertools import product

from code.fasttext.embedding_utils import TableEncoder
from code.utils.settings import DefaultPath
from code.utils.utils import rebuild_table

import chromadb

In [2]:
# Open the persistent Chroma store (medium-size database, with metadata).
# The location is assembled by plain string concatenation, so no separator is
# inserted between the parts.
# NOTE(review): 'single_collection' and 'v_0_25000_add_label_False' end up
# directly adjacent in the path -- confirm this matches the directory on disk.
chroma_db_location = DefaultPath.db_path.chroma + 'single_collection' + 'v_0_25000_add_label_False'
chroma_client = chromadb.PersistentClient(chroma_db_location)

# Encoder used further down to obtain row and column embeddings of a table.
tabenc = TableEncoder()

In [3]:
# Load the CSV of table pairs; its 'r_id' and 's_id' columns are read below.
pairs_csv_path = DefaultPath.data_path.wikitables + 'train_set_turl_malaguti.csv'
sloth_tested_pairs = pd.read_csv(pairs_csv_path)

In [4]:
# Every distinct id that occurs in either the 'r_id' or the 's_id' column.
r_side_ids = set(sloth_tested_pairs['r_id'])
ids = list(r_side_ids.union(sloth_tested_pairs['s_id']))
len(ids)

113098

In [None]:
# Presumably an upper bound for generating random ids.
# NOTE(review): no other cell visible in this notebook reads this constant --
# confirm it is still needed.
RANDOM_ID_UPPER_BOUND = 500_000

In [6]:
# Exploratory check: print the docstring of jsonlines.Reader.iter.
print(jsonlines.Reader.iter.__doc__)


        Iterate over all lines.

        This is the iterator equivalent to repeatedly calling
        :py:meth:`~Reader.read()`. If no arguments are specified, this
        is the same as directly iterating over this :py:class:`Reader`
        instance.

        When `skip_invalid` is set to ``True``, invalid lines will be
        silently ignored.

        See :py:meth:`~Reader.read()` for a description of the other
        arguments.
        


In [5]:
def get_random_table(table_ids, tables):
    """Pick one table id at random and rebuild the matching table.

    Args:
        table_ids: non-empty sequence of table ids to draw from.
        tables: mapping from table id to the stored record that is handed
            to ``rebuild_table``.

    Returns:
        A ``(table_id, table)`` tuple, where ``table`` is whatever
        ``rebuild_table`` returns for the selected record.

    Raises:
        IndexError: if ``table_ids`` is empty.
        KeyError: if the drawn id is not a key of ``tables``.
    """
    # random.choice always returns an existing element; random.randint has an
    # inclusive upper bound and is therefore unsuitable as a direct list index.
    table_id = random.choice(table_ids)
    return table_id, rebuild_table(tables[table_id])


# Index every record of the JSONL file by its '_id' field.
with jsonlines.open(DefaultPath.data_path.wikitables + 'medium_train_tables.jsonl', 'r') as reader:
    wikitables = {obj['_id']: obj for obj in reader}

In [13]:
# Draw one table id uniformly at random.
# random.choice replaces ids[random.randint(0, len(ids))]: randint includes its
# upper bound, so that form could yield index len(ids) and raise IndexError.
random_id = random.choice(ids)
# To reproduce the outputs saved in this notebook, pin the id instead:
# random_id = '34163802-12'
random_id

'34163802-12'

In [14]:
# Rebuild the table for the selected id from its stored record and display it.
table = rebuild_table(wikitables[random_id])
table

Unnamed: 0,Blue Group,Skip,W,L
0,United States,Korey Dropkin,7,0
1,Switzerland,Michael Brunner,6,1
2,Czech Republic,Marek Černovský,4,3
3,China,Bai Yang,3,4
4,Norway,Markus Skogvold,3,4
5,South Korea,Kang Sue-yeon,2,5
6,New Zealand,Luke Steele,2,5
7,Estonia,Robert-Kent Päll,1,6


In [15]:
# Embed the table; full_embedding returns two values, unpacked here as the
# row embeddings and the column embeddings.
row_embeddings, columns_embeddings = tabenc.full_embedding(table)

In [16]:
# List the collections available in the opened store.
chroma_client.list_collections()

[Collection(name=column-base), Collection(name=row-base)]

In [17]:
# Query the 'row-base' collection with the embedding of the first row of the
# query table, excluding entries whose 'table_id' metadata equals the query
# table's own id.
row_collection = chroma_client.get_collection('row-base')
first_row_vector = row_embeddings[0].tolist()
res = row_collection.query(
    query_embeddings=[first_row_vector],
    n_results=20,
    where={'table_id': {"$ne": random_id}},
    include=['metadatas', 'distances']
)
res['ids']

[['34163295-11#0',
  '34116170-5#0',
  '31320420-16#6',
  '26467694-14#0',
  '31320420-18#6',
  '39394739-6#6',
  '6927800-16#0',
  '17089081-1#0',
  '40746958-6#6',
  '34960687-5#7',
  '15263714-1#0',
  '14938467-1#2',
  '14938467-1#0',
  '14938301-1#0',
  '31152396-1#4',
  '37985834-1#0',
  '4672654-9#2',
  '13292247-8#5',
  '27600119-1#2',
  '1184881-2#3']]

In [18]:
# Show the query id and row index, followed by the values of the query row itself.
print(random_id, 0)
table.loc[0].tolist()

34163802-12 0


['United States', 'Korey Dropkin', '7', '0']

In [19]:
# Walk the ranked hits of the single row query: each returned id has the form
# '<table_id>#<row_id>'. For every hit, report its rank and distance, rebuild
# the table it belongs to, and print the matched row.
table_res = []
for rank, hit in enumerate(res['ids'][0]):
    table_id, row_id = hit.split('#')
    print(f"#{rank} table_id={table_id}, row_id={row_id}, distance={res['distances'][0][rank]}")
    hit_table = rebuild_table(wikitables[table_id])
    table_res.append(hit_table)
    print(hit_table.loc[int(row_id)].tolist(), end='\n\n')

#0 table_id=34163295-11, row_id=0, distance=0.0
['United States', 'Korey Dropkin', '7', '0']

#1 table_id=34116170-5, row_id=0, distance=0.0
['United States', 'Korey Dropkin', '7', '0']

#2 table_id=31320420-16, row_id=6, distance=0.04128605127334595
['7', 'Switzerland', '0.960', '+0.001']

#3 table_id=26467694-14, row_id=0, distance=0.0443727970123291
['FEU Tamaraws', '7', '0', '--']

#4 table_id=31320420-18, row_id=6, distance=0.045673269778490067
['7', 'Austria', '0.955', '+0.003']

#5 table_id=39394739-6, row_id=6, distance=0.04576185345649719
['7', 'Colón', '0', '']

#6 table_id=6927800-16, row_id=0, distance=0.04729574918746948
['FEU Lady Tamaraws', '7', '0', '--']

#7 table_id=17089081-1, row_id=0, distance=0.048360053449869156
['', 'Italy (ITA)', '7', '0']

#8 table_id=40746958-6, row_id=6, distance=0.05248917266726494
['7', 'Perak', '0', '']

#9 table_id=34960687-5, row_id=7, distance=0.05280337482690811
['Poland', 'Eugeniusz Blaszczak', '0', '7']

#10 table_id=15263714-1, row

In [20]:
# Full table behind the rank-0 hit.
table_res[0]

Unnamed: 0,Blue Group,Skip,W,L
0,United States,Korey Dropkin,7,0
1,Switzerland,Michael Brunner,6,1
2,Czech Republic,Marek Černovský,4,3
3,China,Bai Yang,3,4
4,Norway,Markus Skogvold,3,4
5,South Korea,Kang Sue-yeon,2,5
6,New Zealand,Luke Steele,2,5
7,Estonia,Robert-Kent Päll,1,6


In [21]:
# Full table behind the rank-1 hit.
table_res[1]

Unnamed: 0,Blue Group,Skip,W,L
0,United States,Korey Dropkin,7,0
1,Switzerland,Michael Brunner,6,1
2,Czech Republic,Marek Černovský,4,3
3,China,Bai Yang,3,4
4,Norway,Markus Skogvold,3,4
5,South Korea,Kang Sue-yeon,2,5
6,New Zealand,Luke Steele,2,5
7,Estonia,Robert-Kent Päll,1,6


In [22]:
# Full table behind the rank-9 hit.
table_res[9]

Unnamed: 0,Group B,Skip,W,L
0,Sweden,Jalle Jungnell,6,1
1,Switzerland,Urs Bucher,6,1
2,Canada,Chris Daw,5,2
3,South Korea,Kim Hak-sung,5,2
4,Italy,Egidio Marchese,3,4
5,Bulgaria,Ivan Chopov,2,5
6,Germany,Jens Jäger,1,6
7,Poland,Eugeniusz Blaszczak,0,7


In [28]:
# Column-side experiment on a second randomly drawn table.
# random.choice replaces ids[random.randint(0, len(ids))]: randint includes its
# upper bound, so that form could yield index len(ids) and raise IndexError.
random2_id = random.choice(ids)
row2_embeddings, column2_embeddings = tabenc.full_embedding(
    rebuild_table(wikitables[random2_id])
)

# Query the 'column-base' collection with all column embeddings at once,
# excluding entries whose 'table_id' metadata equals the query table's own id.
big_res = chroma_client.get_collection('column-base').query(
    query_embeddings=column2_embeddings.tolist(),
    n_results=20,
    where={'table_id': {"$ne": random2_id}},
    include=['metadatas', 'distances']
)

In [29]:
# Inspect the raw result of the column query.
big_res

{'ids': [['1122759-2#0',
   '1122914-4#0',
   '1122807-2#0',
   '1123018-4#0',
   '1122814-2#0',
   '1122657-3#0',
   '1122899-2#0',
   '1122667-2#0',
   '1123085-5#0',
   '1122761-2#0',
   '1122900-2#0',
   '1122790-2#0',
   '1123093-4#0',
   '1123054-5#0',
   '36643612-2#0',
   '16278496-4#0',
   '19429974-2#0',
   '26917365-1#0',
   '38992245-4#0',
   '36036510-6#0'],
  ['36689903-6#6',
   '36689783-10#7',
   '36690022-1#5',
   '36690022-1#6',
   '36689984-8#6',
   '36689984-5#6',
   '28926087-6#3',
   '36690022-19#6',
   '36690022-19#5',
   '36690022-19#4',
   '28935165-1#4',
   '36689984-8#4',
   '36690022-1#4',
   '28935006-1#4',
   '36057138-1#4',
   '13979587-1#3',
   '17633684-1#1',
   '17633696-2#0',
   '17633684-2#0',
   '17633709-2#0'],
  ['5858994-10#1',
   '37513707-2#2',
   '31211860-1#2',
   '31211860-2#2',
   '31226882-1#6',
   '31226882-1#3',
   '24035057-2#2',
   '28699093-1#2',
   '28699454-6#2',
   '28699564-6#2',
   '39828330-3#1',
   '25108961-1#2',
   '35826606-

In [35]:
from collections import defaultdict

# Count, per table, how many of the returned neighbours belong to it.
freq = defaultdict(int)

for idres in big_res['ids']:
    # The loop variable is deliberately not called `id`: at notebook scope that
    # name would replace the builtin id() for the rest of the kernel session.
    for hit_id in idres:
        # Returned ids have the form '<table_id>#<item index>'; only the table
        # part is counted.
        table_id, item_id = hit_id.split('#')
        freq[table_id] += 1

# Five most frequent tables, highest count first. sorted() is stable, so ties
# keep the order in which the tables were first encountered.
top_tables = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)[:5]
s = [{'table_id': tid, 'f': f} for tid, f in top_tables]
s

[{'table_id': '36690022-1', 'f': 6},
 {'table_id': '36690022-19', 'f': 6},
 {'table_id': '36689984-8', 'f': 4},
 {'table_id': '36689903-6', 'f': 2},
 {'table_id': '36689783-10', 'f': 2}]

In [37]:
# Display `table` again.
# NOTE(review): `table` was built from random_id, whereas the column query that
# produced big_res used the table of random2_id, which is never displayed --
# confirm this is the table meant to be compared with the results below.
table

Unnamed: 0,Blue Group,Skip,W,L
0,United States,Korey Dropkin,7,0
1,Switzerland,Michael Brunner,6,1
2,Czech Republic,Marek Černovský,4,3
3,China,Bai Yang,3,4
4,Norway,Markus Skogvold,3,4
5,South Korea,Kang Sue-yeon,2,5
6,New Zealand,Luke Steele,2,5
7,Estonia,Robert-Kent Päll,1,6


In [36]:
# Table with the highest hit count in `s`.
rebuild_table(wikitables[s[0]['table_id']])

Unnamed: 0,Type,Name,Title,Royal house,From,To,Refs
0,Sovereign,Ptolemy I Soter,Pharaoh,Ptolemaic dynasty,305 BC,285 BC,


In [38]:
# Table in second position of `s`.
rebuild_table(wikitables[s[1]['table_id']])

Unnamed: 0,Type,Name,Title,Royal house,From,To,Refs
0,Sovereign,Énna Aignech,High King,Milesians,313 BC,293 BC,


In [39]:
# Table in third position of `s`.
rebuild_table(wikitables[s[2]['table_id']])

Unnamed: 0,Type,Name,Title,Royal house,From,To,Refs
0,Sovereign,Seleucus I Nicator,King,Seleucid dynasty,305 BC,September 281 BC,
