In [1]:
import numpy as np
import pandas as pd
import torch

from table_bert import TableBertModel
from table_bert import Table, Column



In [2]:
model = TableBertModel.from_pretrained(
    '/home/giovanni/unimore/TESI/TaBERT/pre-trained-models/tabert_base_k3/model.bin',
)

In [6]:
stadium_df = pd.read_csv('~/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv').drop('Image', axis=1)
presidents_df = pd.read_csv('~/unimore/TESI/src/data/us_presidents/List_of_presidents_of_the_United_States_1.csv').drop(['Portrait', 'Party'], axis=1)

In [7]:
stadium_df.head()

Unnamed: 0,Rank\n(England only),Stadium,Town / City,Capacity,Team,League
0,1.0,Wembley Stadium,"Wembley, London",90000,"England (Men's, women's and youth)",
1,2.0,Old Trafford,"Old Trafford, Greater Manchester",74031,Manchester United,Premier League
2,3.0,Tottenham Hotspur Stadium,"Tottenham, London",62850,Tottenham Hotspur,Premier League
3,4.0,London Stadium,"Stratford, London",62500,West Ham United,Premier League
4,5.0,Anfield,"Anfield, Liverpool",61276,Liverpool,Premier League


In [8]:
presidents_df.head()

Unnamed: 0,No.,Name\n(Birth–Death),Term,Party.1,Election,Vice President
0,1,George Washington\n(1732–1799),"April 30, 1789\n–\nMarch 4, 1797",Unaffiliated,1788–1789\n\n1792,John Adams
1,2,John Adams\n(1735–1826),"March 4, 1797\n–\nMarch 4, 1801",Federalist,1796,Thomas Jefferson
2,3,Thomas Jefferson\n(1743–1826),"March 4, 1801\n–\nMarch 4, 1809",Democratic-\nRepublican,1800\n\n\n1804,Aaron Burr\n\nGeorge Clinton
3,4,James Madison\n(1751–1836),"March 4, 1809\n–\nMarch 4, 1817",Democratic-\nRepublican,1808\n\n1812,"George Clinton\n\nVacant after\nApril 20, 1812..."
4,5,James Monroe\n(1758–1831),"March 4, 1817\n–\nMarch 4, 1825",Democratic-\nRepublican,1816\n\n1820,Daniel D. Tompkins


## First Comparison with TaBERT: stadium vs clubs_1

I expect to find a very high similarity between some columns of the two tables, since they have identical/similar column names/values.

In [9]:
# no casting at all: all the columns are interpreted as 'text' columns
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Rank
(England only)  143 non-null    float64
 1   Stadium              147 non-null    object 
 2   Town / City          147 non-null    object 
 3   Capacity             147 non-null    object 
 4   Team                 147 non-null    object 
 5   League               146 non-null    object 
dtypes: float64(1), object(5)
memory usage: 7.0+ KB


In [11]:
presidents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   No.                 46 non-null     int64 
 1   Name
(Birth–Death)  46 non-null     object
 2   Term                46 non-null     object
 3   Party.1             46 non-null     object
 4   Election            46 non-null     object
 5   Vice President      46 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.3+ KB


In [10]:
def get_col_type(df: pd.DataFrame, c: str):
    dtype = df.dtypes[c]
    if 'int' in str(dtype).lower() or 'float' in str(dtype).lower(): return 'real'
    else: return 'text'

In [13]:
def apply_tabert(ids, dataframes, contexts, onlytext=True):
    con_col_info = []
    for (id, df, context) in zip(ids, dataframes, contexts):
        if onlytext:
            header = [Column(c, 'text', sample_value=df[c].sample()) for c in df.columns]
        else:
            header = [Column(c, get_col_type(df, c), sample_value=df[c].sample()) for c in df.columns]

        data = [df[c].to_list() for c in df.columns]

        table = Table(
            id=id,
            header=header,
            data=data
        ).tokenize(model.tokenizer)

        context_encoding, column_encoding, info_dict = model.encode(
            contexts=[model.tokenizer.tokenize(context)],
            tables=[table]
        )
        con_col_info.append([context_encoding, column_encoding, info_dict])
    return con_col_info

In [33]:
con_col_info = apply_tabert(
    ids=[
        'A list of UK football stadiums',
        'A table with data about US presidents'
    ],
    dataframes=[
        stadium_df,
        presidents_df
    ],
    contexts=[
        '',
        ''
    ]
)

In [34]:
len(con_col_info), len(con_col_info[0]), len(con_col_info[1])

(2, 3, 3)

In [35]:
col_emb_stadium = con_col_info[0][1]
col_emb_presidents = con_col_info[1][1]
stadium_df.shape, tuple(col_emb_stadium.shape), '---', presidents_df.shape, tuple(col_emb_presidents.shape)

((147, 6), (1, 6, 768), '---', (46, 6), (1, 6, 768))

In [36]:
cos = torch.nn.CosineSimilarity(dim=0)

comparisons = pd.DataFrame(columns=['stadium', 'presidents', 'cosine similarity', 'np cosim', 'dot product'])

In [37]:
for i, col_gdp in enumerate(stadium_df.columns):
    for j, col_pop in enumerate(presidents_df.columns):
        cosim = cos(col_emb_stadium[0, i, :], col_emb_presidents[0, j, :])
        es, ep = col_emb_stadium[0, i, :].cpu().detach().numpy(),col_emb_presidents[0, j, :].cpu().detach().numpy()
        npcosim = np.dot(es, ep) / (np.linalg.norm(es) * np.linalg.norm(ep))
        dotp = torch.dot(col_emb_stadium[0, i, :], col_emb_presidents[0, j, :])
        comparisons.loc[len(comparisons)] = [col_gdp, col_pop, float(cosim), float(npcosim), float(dotp)]

In [39]:
comparisons

Unnamed: 0,stadium,presidents,cosine similarity,np cosim,dot product
0,Rank\n(England only),No.,0.888168,0.888168,0.720056
1,Rank\n(England only),Name\n(Birth–Death),0.895579,0.895579,0.930715
2,Rank\n(England only),Term,0.87069,0.87069,0.179924
3,Rank\n(England only),Party.1,0.874172,0.874172,0.307881
4,Rank\n(England only),Election,0.875474,0.875474,0.336428
5,Rank\n(England only),Vice President,0.88108,0.88108,0.387447
6,Stadium,No.,0.885399,0.885399,0.236481
7,Stadium,Name\n(Birth–Death),0.886511,0.886511,0.259515
8,Stadium,Term,0.887904,0.887904,0.289589
9,Stadium,Party.1,0.877186,0.877186,0.0
