In [3]:
from tools.utils.settings import DefaultPath as defpath

from tools.table_bert.table_bert import TableBertModel

model = TableBertModel.from_pretrained(
    defpath.model_path.tabert + '/tabert_base_k3/model.bin',
)

In [4]:
model.output_size

768

In [11]:
from tools.table_bert import Table, Column

table = Table(
    id='List of countries by GDP (PPP)',
    header=[
        Column('Nation', 'text', sample_value='United States'),
        Column('Gross Domestic Product', 'real', sample_value='21,439,453')
    ],
    data=[
        ['United States', '21,439,453'],
        ['China', '27,308,857'],
        ['European Union', '22,774,165'],
    ]
).tokenize(model.tokenizer)

table2 = Table(
    id='List of comix',
    header=[
        Column('Name', 'text', sample_value='Mafalda'),
        Column('Author', 'real', sample_value='Quino'),
        Column('Score', 'real', sample_value=4)
    ],
    data=[
        ['Mafalda', 'Quino', 1],
        ['The Peanuts', 'Charles M. Schulz', 3],
        ['Spiderman', 'Stan Lee', 5],
    ]
).tokenize(model.tokenizer)


# To visualize table in an IPython notebook:
# display(table.to_data_frame(), detokenize=True)

context = ""

# model takes batched, tokenized inputs
context_encoding, column_encoding, info_dict = model.encode(
    contexts=[[], [], []],
    tables=[table, table2]
)

column_encoding.shape

torch.Size([2, 3, 768])

In [16]:
z = column_encoding[0]
z.detach().numpy().shape

(3, 768)

In [22]:
table3 = Table(
    id='List of comix',
    header=[
        Column('Name', 'text', sample_value='Mafalda'),
        Column('Author', 'real', sample_value='Quino'),
        Column('Score', 'real', sample_value=4)
    ],
    data=[
        ['Mafalda', 'Quino', 1],
        ['The Peanuts', 'Charles M. Schulz', 3],
        ['Spiderman', 'Stan Lee', 5],
    ]
).tokenize(model.tokenizer)


# To visualize table in an IPython notebook:
# display(table.to_data_frame(), detokenize=True)

# model takes batched, tokenized inputs
context_encoding, column_encoding, info_dict = model.encode(
    contexts=[[]],
    tables=[table3]
)

In [24]:
model.tokenizer.tokenize("Nocontext")

['no', '##con', '##text']

In [32]:
import torch

torch.cuda.device_count()

0

In [21]:

# To visualize table in an IPython notebook:
# display(table.to_data_frame(), detokenize=True)

context2 = ""

context_encoding2, column_encoding2, info_dict2 = model.encode(
    contexts=[model.tokenizer.tokenize(context2)],
    tables=[table2]
)

In [22]:
column_encoding.shape, column_encoding2.shape

(torch.Size([1, 2, 768]), torch.Size([1, 3, 768]))

In [7]:
nation_emb, gdp_emb = column_encoding[0, 0, :], column_encoding[0, 1, :]
comix_emb, author_emb = column_encoding2[0, 0, :], column_encoding2[0, 1, :]

In [8]:
import torch
cosim = torch.nn.CosineSimilarity(dim=0)

In [9]:
float(cosim(nation_emb, gdp_emb))

0.7574862837791443

In [10]:
float(cosim(nation_emb, comix_emb))

0.7895163297653198

In [11]:
float(cosim(comix_emb, author_emb))

0.934893012046814

In [38]:
from tools.utils.utils import get_mongodb_collections, get_one_document_from_mongodb_by_key


mongoclient, collections = get_mongodb_collections(True)

doc = get_one_document_from_mongodb_by_key('_id_numeric', 10178, *collections)

doc['content']

[['1960', '1960', '1960', '1960', '1960'],
 ['Aleksandër Moisiu', '', '', '', ''],
 ['Ansambli i këngëve dhe valleve', '', '', '', ''],
 ['Divjakë 2', '', '', '', ''],
 ['Fusha e rilinduar', '', '', '', ''],
 ['Kurora e gjelbër', '', '', '', ''],
 ['Në duart tuaja', '', '', '', ''],
 ['Në gjurmët e shekujve', '', '', '', ''],
 ['Vallëzimi i shqipeve', '', '', '', ''],
 ['Zëri i paqës', '', '', '', ''],
 ['1961', '1961', '1961', '1961', '1961'],
 ['Debatik', '', '', '', ''],
 ['Gjithmonë fitimtarë', '', '', '', ''],
 ['Jubileu i lavdishëm', '', '', '', ''],
 ['Kalitja', '', '', '', ''],
 ['Nën flamurin e M-L', '', '', '', ''],
 ['Në prag të festës', '', '', '', ''],
 ['Ngadhnjimtarë në çdo betejë', '', '', '', ''],
 ['Për fitore të reja', '', '', '', ''],
 ['Pranverë fitoresh', '', '', '', ''],
 ['Program madhështor', '', '', '', ''],
 ['Reportazh nga Kurbneshi', '', '', '', ''],
 ['1962', '1962', '1962', '1962', '1962'],
 ['Me peshkatarët tanë', '', '', '', ''],
 ['Vllazërit Tare', '',

In [39]:
import pandas as pd
pd.DataFrame(doc['content'])

Unnamed: 0,0,1,2,3,4
0,1960,1960,1960,1960,1960
1,Aleksandër Moisiu,,,,
2,Ansambli i këngëve dhe valleve,,,,
3,Divjakë 2,,,,
4,Fusha e rilinduar,,,,
...,...,...,...,...,...
121,Reportazh nga Mirdita,,,,
122,"Rruga jote, shok",,,,
123,Shkolla dhe praktika,,,,
124,Toka të përtërira,,,,


In [43]:
doc['numeric_columns']

[0, 0, 0, 0, 0]

In [41]:
from tools.utils.utils import prepare_token


numeric_columns = doc['numeric_columns']
pd.DataFrame([[prepare_token(cell) for i, cell in enumerate(row) if numeric_columns[i] == 0] for row in doc['content']])

Unnamed: 0,0,1,2,3,4
0,1960,1960,1960,1960,1960
1,Aleksandër Moisiu,,,,
2,Ansambli i këngëve dhe valleve,,,,
3,Divjakë 2,,,,
4,Fusha e rilinduar,,,,
...,...,...,...,...,...
121,Reportazh nga Mirdita,,,,
122,"Rruga jote, shok",,,,
123,Shkolla dhe praktika,,,,
124,Toka të përtërira,,,,


In [55]:
import multiprocessing as mp

lock = mp.Lock()
mongoclient, collections = get_mongodb_collections(False)

def foo():
    def worker(inp):
        global lock
        
        doc = get_one_document_from_mongodb_by_key('_id_numeric', inp, *collections)
        
        assert inp == doc['_id_numeric']
        # lock.acquire()
        # print(inp, doc['_id_numeric'], type(inp))
        # lock.release()

    print(locals())
    work = range(10000)

    with mp.Pool(72) as pool:

        pool.map(worker, work)


foo()

{'worker': <function foo.<locals>.worker at 0x7fbb1462a290>}


AttributeError: Can't pickle local object 'foo.<locals>.worker'