In [1]:
import pandas as pd

from code.fasttext.embedding_utils import TableEncoder, compare_embeddings_of

# Embedding Table Columns

Now we try to embed every column of a single table.

Using fastText, for every cell in a categorical column we compute a tokenization of the cell and then a vectorization of its tokens:

> cell --> token sequence --> token vector sequence --> cell vector

In [2]:
# Single encoder instance, reused by every embedding/comparison cell below.
tabenc = TableEncoder()

### Comparing embeddings from UK football datasets - only categorical columns

In [3]:
# Load the two UK-football tables keeping only their textual columns: 'Capacity' is dropped
# from both (plus the rank and image columns of the stadium table), and every row that still
# has a missing value is discarded.
# NOTE(review): the paths are absolute and machine-specific — consider a configurable data dir.
stadium_df = (
    pd.read_csv('/home/giovanni/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv')
    .drop(columns=['Rank\n(England only)', 'Image', 'Capacity'])
    .dropna()
)

pl1_df = (
    pd.read_csv('/home/giovanni/unimore/TESI/src/data/uk_football/Premier_League_1.csv')
    .drop(columns='Capacity')
    .dropna()
)

In [4]:
# Check which columns (and dtypes) are left in the stadium table after the drops above.
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, 1 to 146
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Stadium      146 non-null    object
 1   Town / City  146 non-null    object
 2   Team         146 non-null    object
 3   League       146 non-null    object
dtypes: object(4)
memory usage: 5.7+ KB


In [5]:
# Peek at the first rows to see what the cell values of the stadium table look like.
stadium_df.head()

Unnamed: 0,Stadium,Town / City,Team,League
1,Old Trafford,"Old Trafford, Greater Manchester",Manchester United,Premier League
2,Tottenham Hotspur Stadium,"Tottenham, London",Tottenham Hotspur,Premier League
3,London Stadium,"Stratford, London",West Ham United,Premier League
4,Anfield,"Anfield, Liverpool",Liverpool,Premier League
5,Emirates Stadium,"Holloway, London",Arsenal,Premier League


In [6]:
# Check which columns (and dtypes) are left in the Premier League table after the drop above.
pl1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     object
 1   Location  20 non-null     object
 2   Stadium   20 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


In [7]:
# Peek at the first rows to see what the cell values of the Premier League table look like.
pl1_df.head()

Unnamed: 0,Team,Location,Stadium
0,Arsenal,London (Holloway),Emirates Stadium
1,Aston Villa,Birmingham,Villa Park
2,Bournemouth,Bournemouth,Vitality Stadium
3,Brentford,London (Brentford),Gtech Community Stadium
4,Brighton & Hove Albion,Brighton,American Express Stadium


In [8]:
# (rows, columns) of the stadium table, then of the Premier League table.
stadium_df.shape, pl1_df.shape

((146, 4), (20, 3))

In [9]:
# Materialise the column embeddings of each table into a list so they can be
# measured and indexed in the next cell.
stadium_embeddings = [*tabenc.create_column_embeddings(stadium_df)]
pl1_embeddings = [*tabenc.create_column_embeddings(pl1_df)]

In [10]:
# Sanity check, in order: number of stadium embeddings, length of the first stadium embedding,
# number of Premier League embeddings, length of the first Premier League embedding.
len(stadium_embeddings), len(stadium_embeddings[0]), len(pl1_embeddings), len(pl1_embeddings[0])

(4, 300, 3, 300)

In [11]:
# TODO what is the best way to store/process cell embeddings? Since they are needed for both
# row and column embeddings, it would be better to build a single cell-embedding matrix and
# then operate on it

In [12]:
# Compare the two tables (categorical columns only) with the shared encoder; the displayed
# result lists column pairs (C1, C2) with their cosine similarity.
compare_embeddings_of(stadium_df, pl1_df, tabenc)

Unnamed: 0,C1,C2,cosine similarity
0,Stadium,Stadium,0.967901
1,Town / City,Location,0.940481
2,Team,Team,0.924574
3,Town / City,Team,0.8469
4,Team,Location,0.760189
5,Team,Stadium,0.737822
6,Stadium,Team,0.65113
7,Town / City,Stadium,0.526184
8,Stadium,Location,0.506453
9,League,Team,0.477763


### Compare complete tables

In [13]:
# TODO how to handle unknown column data types? 'Capacity' in the stadium dataframe has the ',' character
# in its values, so pandas doesn't recognize it as a float (read_csv's `thousands=','` option may be worth trying)

In [14]:
# Reload both tables, this time keeping 'Capacity': only the rank and image columns of the
# stadium table are dropped, then rows with missing values are discarded.
# NOTE(review): this rebinds stadium_df / pl1_df from the earlier load cell, and the paths are
# absolute and machine-specific — consider distinct names and a configurable data dir.
stadium_df = (
    pd.read_csv('/home/giovanni/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv')
    .drop(columns=['Rank\n(England only)', 'Image'])
    .dropna()
)

pl1_df = (
    pd.read_csv('/home/giovanni/unimore/TESI/src/data/uk_football/Premier_League_1.csv')
    .dropna()
)

In [15]:
# Show the dtypes pandas infers for the full stadium table, 'Capacity' included.
# convert_dtypes() returns a converted copy, so stadium_df itself is left unchanged.
stadium_df.convert_dtypes().info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, 1 to 146
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Stadium      146 non-null    string
 1   Town / City  146 non-null    string
 2   Capacity     146 non-null    string
 3   Team         146 non-null    string
 4   League       146 non-null    string
dtypes: string(5)
memory usage: 6.8 KB


In [16]:
# Show the dtypes pandas infers for the full Premier League table, 'Capacity' included.
# convert_dtypes() returns a converted copy, so pl1_df itself is left unchanged.
pl1_df.convert_dtypes().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     string
 1   Location  20 non-null     string
 2   Stadium   20 non-null     string
 3   Capacity  20 non-null     string
dtypes: string(4)
memory usage: 768.0 bytes


In [17]:
# Same comparison as before, now on the complete tables (with 'Capacity' left unprocessed).
# NOTE(review): the saved output of this cell is a NameError for 'compare_column_embeddings_of',
# a name this cell never references — looks like a stale output or a leftover name inside the
# helper; verify and re-run from a fresh kernel.
compare_embeddings_of(stadium_df, pl1_df, tabenc)

NameError: name 'compare_column_embeddings_of' is not defined

Without any kind of column preprocessing the output contains a lot of NaN values, since fastText 
cannot encode continuous values, such as 'Capacity'...