In [1]:
import pandas as pd
import jsonlines

import chromadb

from code.fasttext.embedding_utils import TableEncoder

In [2]:
# Chroma client created with default settings.
# NOTE(review): presumably an in-memory (non-persistent) client, so collections
# do not outlive the kernel — confirm against the installed chromadb version.
chroma_client = chromadb.Client()

# Encoder used further down to turn a rebuilt table into row and column embeddings.
tabenc = TableEncoder()

In [3]:
def rebuild_table(table):
    """Turn a raw table record into a pandas DataFrame of cell texts.

    Parameters
    ----------
    table : mapping
        Must expose ``'tableData'`` (an iterable of rows, each row an iterable
        of cells that carry a ``'text'`` entry) and ``'tableHeaders'`` (a
        sequence whose first element supplies the column labels).

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``table['tableData']``; only the ``'text'`` value
        of each cell is kept.
    """
    cell_texts = []
    for raw_row in table['tableData']:
        cell_texts.append([cell['text'] for cell in raw_row])

    # Only the first header row is used as column labels.
    header_row = table['tableHeaders'][0]
    return pd.DataFrame(data=cell_texts, columns=header_row)

In [4]:
# Pair file read below; later cells use its `r_id` and `s_id` columns.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
PAIRS_CSV_PATH = '/home/giovanni/unimore/TESI/src/data/train_set_turl_malaguti.csv'
# Only the first rows of the pair file are loaded.
MAX_PAIRS = 1000

sloth_tested_pairs = pd.read_csv(PAIRS_CSV_PATH, nrows=MAX_PAIRS)

In [5]:
# Distinct table IDs that appear on either side of a pair (`r_id` or `s_id`).
# The list order is whatever iterating the set yields — it is not sorted, so it
# should not be relied upon.
ids = list(set(sloth_tested_pairs['r_id']).union(sloth_tested_pairs['s_id']))
# Shown as the cell output: how many distinct table IDs were collected.
len(ids)

1905

In [6]:
# Index every table record of the JSON-lines dump by its `_id` so that later
# cells can look tables up directly by ID.
wikitables = {}
with jsonlines.open('/home/giovanni/unimore/TESI/src/data/small_train_tables.jsonl', 'r') as reader:
    wikitables.update((record['_id'], record) for record in reader)

In [7]:
# Start from empty collections so that re-running this cell never mixes in
# embeddings left over from an earlier run.
#
# Each collection is deleted independently: previously a failure on
# "column-base" skipped the deletion of "row-base", which could then make the
# `create_collection` call below fail on an already-existing collection.
deleted_any = False
for stale_name in ("column-base", "row-base"):
    try:
        chroma_client.delete_collection(stale_name)
        deleted_any = True
    except Exception:
        # Best effort: deletion fails when the collection does not exist yet
        # (e.g. first run on a fresh client). The exception class appears to
        # differ between chromadb releases, hence the broad (but not bare) catch.
        continue

if deleted_any:
    print('Collections deleted')

# One collection holds per-column embeddings, the other per-row embeddings.
collection_column_base = chroma_client.create_collection(name='column-base')
collection_row_base = chroma_client.create_collection(name='row-base')
print('Collections created')

Collections created


In [8]:
from tqdm.notebook import tqdm
from time import time

### Non-persistent version

In [23]:
# One record per successfully processed table:
# [table ID, table name, rows, columns, build ms, embedding ms, indexing ms, total ms].
# Reset on every run of the cell so that timings from earlier runs do not accumulate.
time_stat_data = []

for table_id in tqdm(ids):
    # Some IDs referenced by the pair file are missing from the loaded table
    # dump; report them and move on.
    try:
        wikitable = wikitables[table_id]
    except KeyError:
        print(f'Table ID {table_id} not found')
        continue

    # --- 1. rebuild the DataFrame from the raw record ---
    start_build = time()
    table = rebuild_table(wikitable)
    end_build = time()

    # --- 2. compute row and column embeddings ---
    start_emb = time()
    row_embeddings, column_embeddings = tabenc.full_embedding(table, False, False)
    end_emb = time()

    # --- 3. index the embeddings; IDs have the form "<table_id>#<position>" ---
    start_ind = time()
    collection_column_base.add(
        ids=[f"{table_id}#{col_idx}" for col_idx in range(column_embeddings.shape[0])],
        embeddings=column_embeddings.tolist()
    )

    collection_row_base.add(
        ids=[f"{table_id}#{row_idx}" for row_idx in range(row_embeddings.shape[0])],
        embeddings=row_embeddings.tolist()
    )
    end_ind = time()

    # Records without a page title get a placeholder name.
    table_name = wikitable.get('pgTitle', 'no name')

    # `time()` returns seconds; durations are stored in milliseconds.
    time_stat_data.append(
        [
            table_id, table_name,
            table.shape[0], table.shape[1],
            (end_build - start_build) * 1000,
            (end_emb - start_emb) * 1000,
            (end_ind - start_ind) * 1000,
            (end_ind - start_build) * 1000
        ]
    )

  0%|          | 0/1905 [00:00<?, ?it/s]

Table ID 27308904-2 not found


In [None]:
# Column labels, in the same order as the values appended to `time_stat_data`.
time_stat_columns = [
    'table ID',
    'table name',
    'rows',
    'columns',
    'table building time (ms)',
    'embedding time (ms)',
    'indexing time (ms)',
    'proc time (ms)',
]

time_stat = pd.DataFrame(time_stat_data, columns=time_stat_columns)

In [None]:
# Summary statistics of the table sizes and of the per-stage timings.
time_stat.describe()

Unnamed: 0,rows,columns,table building time (ms),embedding time (ms),indexing time (ms),proc time (ms)
count,1904.0,1904.0,1904.0,1904.0,1904.0,1904.0
mean,9.091387,6.196429,1.408098,19.458132,29.825277,50.691507
std,7.464475,3.88122,1.933812,21.405615,18.969362,37.818361
min,1.0,1.0,0.398159,0.854492,5.360365,7.105827
25%,4.0,2.0,0.702322,5.165815,17.880023,25.846541
50%,6.0,6.0,0.918269,12.784243,25.937796,40.263653
75%,13.0,9.0,1.322329,25.599778,34.763575,62.889636
max,62.0,36.0,31.152248,248.507261,192.1134,387.294531


In [None]:
# Tables ranked from the slowest to the fastest overall processing time.
time_stat.sort_values(by='proc time (ms)', ascending=False)

Unnamed: 0,table ID,table name,rows,columns,table building time (ms),embedding time (ms),indexing time (ms),proc time (ms)
1817,7906462-3,1936–37 Serie A,16,17,1.486063,193.695068,192.113400,387.294531
1820,35487645-3,2012–13 Football League,24,11,1.284122,233.566999,120.319128,355.170250
6,13062324-4,1996–97 Divizia A,18,19,6.570339,248.507261,94.145298,349.222898
1453,4918676-1,Keith Lasley,21,12,14.785290,124.416113,116.213799,255.415201
1733,6266485-1,Pejman Nouri,18,11,11.580467,107.092619,136.185169,254.858255
...,...,...,...,...,...,...,...,...
1109,3482286-1,Now I Got Worry,3,2,0.465393,1.612425,7.637024,9.714842
1569,31200223-1,1999 Albirex Niigata season,3,2,0.562191,1.375198,7.085800,9.023190
86,38375789-2,Pain Killer (Moumoon album),1,4,1.018047,1.469612,6.472588,8.960247
330,6491071-1,Rhymes & Reasons (Carole King album),3,2,0.494480,1.603842,6.782532,8.880854


In [None]:
# The ten tables whose embedding step took the longest.
time_stat.sort_values(by='embedding time (ms)', ascending=False).head(10)

Unnamed: 0,table ID,table name,rows,columns,table building time (ms),embedding time (ms),indexing time (ms),proc time (ms)
6,13062324-4,1996–97 Divizia A,18,19,6.570339,248.507261,94.145298,349.222898
1820,35487645-3,2012–13 Football League,24,11,1.284122,233.566999,120.319128,355.17025
1817,7906462-3,1936–37 Serie A,16,17,1.486063,193.695068,192.1134,387.294531
1195,35100306-1,List of Falling Skies characters,26,4,0.816107,172.505856,35.578251,208.900213
1801,31317245-1,1999 Campeonato Ecuatoriano de Fútbol Serie A,12,11,1.24383,139.872551,95.269203,236.385584
1327,13057852-4,2001–02 Divizia A,16,17,5.622625,132.285118,111.820936,249.72868
1470,16288454-2,2008 National Youth Competition (rugby league)...,16,28,3.982544,125.593424,66.144943,195.720911
741,2281909-1,2005 World Aquatics Championships,27,6,0.948668,125.410795,79.470396,205.829859
1176,27525541-1,2004–05 Football League Championship,24,11,1.271486,125.206947,52.687168,179.165602
1453,4918676-1,Keith Lasley,21,12,14.78529,124.416113,116.213799,255.415201


In [None]:
# Preview of table 3482286-1, one of the fastest tables in the processing-time ranking.
rebuild_table(wikitables['3482286-1']).head()

Unnamed: 0,Professional ratings,Professional ratings.1
0,Allmusic,
1,Pitchfork Media,(8.5/10)
2,Rolling Stone,


In [None]:
# Preview of table 35100306-1, which is among the ten slowest tables to embed.
rebuild_table(wikitables['35100306-1']).head()

Unnamed: 0,Name,Actor/actress,Starring seasons,Recurring seasons
0,Tom Mason,Noah Wyle,"1 , 2 , 3",
1,A former Boston University military history pr...,A former Boston University military history pr...,A former Boston University military history pr...,A former Boston University military history pr...
2,Anne Glass,Moon Bloodgood,"1 , 2 , 3",
3,"Doctor to the 2nd Mass, she was a pediatrician...","Doctor to the 2nd Mass, she was a pediatrician...","Doctor to the 2nd Mass, she was a pediatrician...","Doctor to the 2nd Mass, she was a pediatrician..."
4,Hal Mason,Drew Roy,"1 , 2 , 3",


In [None]:
# Preview of the first rows of table 12660383-1.
rebuild_table(wikitables['12660383-1']).head()

Unnamed: 0,Country,2006,2005,2004,2003
0,Singapore,1,1,2,4
1,Switzerland,2,3,3,2
2,United States,3,4,7,11
3,Ireland,4,2,1,1
4,Denmark,5,7,10,6
