In [27]:
import numpy as np
import pandas as pd
import torch

from table_bert import TableBertModel
from table_bert import Table, Column

In [28]:
model = TableBertModel.from_pretrained(
    '/home/giovanni/unimore/TESI/TaBERT/pre-trained-models/tabert_base_k3/model.bin',
)

In [29]:
stadium_df = pd.read_csv('~/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv').drop('Image', axis=1)
clubs_1_df = pd.read_csv('~/unimore/TESI/src/data/uk_football/Premier_League_1.csv')
clubs_2_df = pd.read_csv('~/unimore/TESI/src/data/uk_football/Premier_League_2.csv')

In [4]:
stadium_df.head()

Unnamed: 0,Rank\n(England only),Stadium,Town / City,Capacity,Team,League
0,1.0,Wembley Stadium,"Wembley, London",90000,"England (Men's, women's and youth)",
1,2.0,Old Trafford,"Old Trafford, Greater Manchester",74031,Manchester United,Premier League
2,3.0,Tottenham Hotspur Stadium,"Tottenham, London",62850,Tottenham Hotspur,Premier League
3,4.0,London Stadium,"Stratford, London",62500,West Ham United,Premier League
4,5.0,Anfield,"Anfield, Liverpool",61276,Liverpool,Premier League


In [5]:
clubs_1_df.head()

Unnamed: 0,Team,Location,Stadium,Capacity
0,Arsenal,London (Holloway),Emirates Stadium,60704
1,Aston Villa,Birmingham,Villa Park,42657
2,Bournemouth,Bournemouth,Vitality Stadium,11307
3,Brentford,London (Brentford),Gtech Community Stadium,17250
4,Brighton & Hove Albion,Brighton,American Express Stadium,31876


## First Comparison with TaBERT: stadium vs clubs_1

I expect to find a very high similarity between some columns of the two tables, since they have identical/similar column names/values.

In [6]:
# no casting at all: all the columns are interpreted as 'text' columns
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Rank
(England only)  143 non-null    float64
 1   Stadium              147 non-null    object 
 2   Town / City          147 non-null    object 
 3   Capacity             147 non-null    object 
 4   Team                 147 non-null    object 
 5   League               146 non-null    object 
dtypes: float64(1), object(5)
memory usage: 7.0+ KB


In [7]:
def get_col_type(df: pd.DataFrame, c: str):
    dtype = df.dtypes[c]
    if 'int' in str(dtype).lower() or 'float' in str(dtype).lower(): return 'real'
    else: return 'text'

In [31]:
def apply_tabert(ids, dataframes, contexts, onlytext=True):
    con_col_info = []
    for (id, df, context) in zip(ids, dataframes, contexts):
        if onlytext:
            header = [Column(c, 'text', sample_value=df[c].sample()) for c in df.columns]
        else:
            header = [Column(c, get_col_type(df, c), sample_value=df[c].sample()) for c in df.columns]

        data = [df[c].to_list() for c in df.columns]

        table = Table(
            id=id,
            header=header,
            data=data
        ).tokenize(model.tokenizer)

        context_encoding, column_encoding, info_dict = model.encode(
            #contexts=[model.tokenizer.tokenize(context)],
            tables=[table]
        )
        con_col_info.append([context_encoding, column_encoding, info_dict])
    return con_col_info

In [32]:
con_col_info = apply_tabert(
    ids=[
        'A list of UK football stadiums',
        'A table with data about UK football clubs'
    ],
    dataframes=[
        stadium_df,
        clubs_1_df
    ],
    contexts=[
        'Show me the stadium with the highest capacity',
        'List all the clubs in alphabetical order'
    ]
)

TypeError: encode() missing 1 required positional argument: 'contexts'

In [10]:
col_emb_stadium = con_col_info[0][1]
col_emb_clubs_1 = con_col_info[1][1]
stadium_df.shape, col_emb_stadium.shape, '---', clubs_1_df.shape, col_emb_clubs_1.shape

((147, 6), torch.Size([1, 6, 768]), '---', (20, 4), torch.Size([1, 4, 768]))

In [11]:
cos = torch.nn.CosineSimilarity(dim=0)

comparisons = pd.DataFrame(columns=['stadium', 'clubs_1', 'cosine similarity', 'dot product'])

In [12]:
for i, col_gdp in enumerate(stadium_df.columns):
    for j, col_pop in enumerate(clubs_1_df.columns):
        cosim = cos(col_emb_stadium[0, i, :], col_emb_clubs_1[0, j, :])
        dotp = torch.dot(col_emb_stadium[0, i, :], col_emb_clubs_1[0, j, :])
        comparisons.loc[len(comparisons)] = [col_gdp, col_pop, float(cosim), float(dotp)]

In [14]:
comparisons

Unnamed: 0,stadium,clubs_1,cosine similarity,dot product
0,Rank\n(England only),Team,0.84891,0.033736
1,Rank\n(England only),Location,0.852501,0.023411
2,Rank\n(England only),Stadium,0.862368,0.029372
3,Rank\n(England only),Capacity,0.88431,0.009613
4,Stadium,Team,0.882794,0.313855
5,Stadium,Location,0.911545,0.716238
6,Stadium,Stadium,0.934575,0.926189
7,Stadium,Capacity,0.93806,0.582603
8,Town / City,Team,0.857127,0.0
9,Town / City,Location,0.895922,0.574726


Also in this case the cosine similarity is always really high, why?

Dot product is generally more correct, but also in this case there are some strange things, such as 
> DP(Town/City, Team)=0.07 and DP(Town/City, Capacity)=0.46

In [15]:
stadium_df['Town / City'].sample(5)

89                      Yeovil
75     Kings Park, Bournemouth
133                   Solihull
40                       Wigan
17                    Coventry
Name: Town / City, dtype: object

In [16]:
clubs_1_df['Team'].sample(5)

4     Brighton & Hove Albion
12           Manchester City
10                 Liverpool
5                    Burnley
9                     Fulham
Name: Team, dtype: object

## Second Comparison: stadium vs clubs_1 with casting

Similar to the previous pipeline, but adding specific datatype for each dataframe

In [17]:
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Rank
(England only)  143 non-null    float64
 1   Stadium              147 non-null    object 
 2   Town / City          147 non-null    object 
 3   Capacity             147 non-null    object 
 4   Team                 147 non-null    object 
 5   League               146 non-null    object 
dtypes: float64(1), object(5)
memory usage: 7.0+ KB


In [18]:
stadium_df['Capacity'] = stadium_df['Capacity'].apply(lambda e: int(str(e).replace(',', '')))
stadium_df = stadium_df.convert_dtypes()
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Rank
(England only)  143 non-null    Int64 
 1   Stadium              147 non-null    string
 2   Town / City          147 non-null    string
 3   Capacity             147 non-null    Int64 
 4   Team                 147 non-null    string
 5   League               146 non-null    string
dtypes: Int64(2), string(4)
memory usage: 7.3 KB


In [19]:
clubs_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     object
 1   Location  20 non-null     object
 2   Stadium   20 non-null     object
 3   Capacity  20 non-null     object
dtypes: object(4)
memory usage: 768.0+ bytes


In [20]:
clubs_1_df['Capacity'] = clubs_1_df['Capacity'].apply(lambda e: int(str(e).replace(',', '')))
clubs_1_df = clubs_1_df.convert_dtypes()
clubs_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     string
 1   Location  20 non-null     string
 2   Stadium   20 non-null     string
 3   Capacity  20 non-null     Int64 
dtypes: Int64(1), string(3)
memory usage: 788.0 bytes


In [21]:
con_col_info = apply_tabert(
    ids=[
        'A list of UK football stadiums',
        'A table with data about UK football clubs'
    ],
    dataframes=[
        stadium_df,
        clubs_1_df
    ],
    contexts=[
        'Show me the stadium with the highest capacity',
        'List all the clubs in alphabetical order'
    ],
    onlytext=False
)

In [22]:
col_emb_stadium = con_col_info[0][1]
col_emb_clubs_1 = con_col_info[1][1]
stadium_df.shape, col_emb_stadium.shape, '---', clubs_1_df.shape, col_emb_clubs_1.shape

((147, 6), torch.Size([1, 6, 768]), '---', (20, 4), torch.Size([1, 4, 768]))

In [23]:
cos = torch.nn.CosineSimilarity(dim=0)

comparisons_cast = pd.DataFrame(columns=['stadium', 'clubs_1', 'cosine similarity', 'dot product'])

In [24]:
for i, col_gdp in enumerate(stadium_df.columns):
    for j, col_pop in enumerate(clubs_1_df.columns):
        cosim = cos(col_emb_stadium[0, i, :], col_emb_clubs_1[0, j, :])
        dotp = torch.dot(col_emb_stadium[0, i, :], col_emb_clubs_1[0, j, :])
        comparisons_cast.loc[len(comparisons_cast)] = [col_gdp, col_pop, float(cosim), float(dotp)]

In [26]:
comparisons_merged = pd.merge(comparisons, comparisons_cast, how='inner', on=['stadium', 'clubs_1'], suffixes=['', '-cast'])
comparisons_merged[['stadium', 'clubs_1', 'cosine similarity', 'cosine similarity-cast', 'dot product', 'dot product-cast']]

Unnamed: 0,stadium,clubs_1,cosine similarity,cosine similarity-cast,dot product,dot product-cast
0,Rank\n(England only),Team,0.84891,0.880481,0.033736,0.155279
1,Rank\n(England only),Location,0.852501,0.875334,0.023411,0.104386
2,Rank\n(England only),Stadium,0.862368,0.882969,0.029372,0.16583
3,Rank\n(England only),Capacity,0.88431,0.892818,0.009613,0.0
4,Stadium,Team,0.882794,0.915041,0.313855,0.416904
5,Stadium,Location,0.911545,0.935032,0.716238,0.759797
6,Stadium,Stadium,0.934575,0.954525,0.926189,1.0
7,Stadium,Capacity,0.93806,0.92985,0.582603,0.288839
8,Town / City,Team,0.857127,0.886166,0.0,0.002025
9,Town / City,Location,0.895922,0.917949,0.574726,0.52892
