In [99]:
import re
import numpy as np
import pandas as pd

In [100]:
import compress_fasttext
import fasttext

In [101]:
model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    '/home/giovanni/unimore/TESI/src/models/fastText/cc.en.300.compressed.bin'
)

In [102]:
tokenization = fasttext.FastText.tokenize('That\'s my first trial with fasttext; it\s tricky!')
tokenization

["That's", 'my', 'first', 'trial', 'with', 'fasttext;', 'it\\s', 'tricky!']

# Embedding a Table

Now we try to embed every column of a single table:

through fastText, for every cell in a categorical column, we'll compute a tokenization and then a vectorization of the cell.

> cell --> token sequence --> token vector sequence --> cell vector

In [126]:
def my_tokenizer(s: str):
    return [re.sub('[^a-z]+', '', x) for x in s.lower().split()]

In [127]:
def compute_column_embeddings(df: pd.DataFrame):
    return \
        (
            np.mean( 
                np.array( 
                    [
                        np.sum( 
                            [model.get_vector(token) for token in cell], # compute cell embedding
                            axis=0)
                        for cell in df[column].apply(my_tokenizer)  # compute column embedding
                    ]
                ),
                axis=0
            ) 
        for column in df.columns
        )

### Comparing embeddings from UK football datasets - only categorical columns

In [120]:
stadium_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv'
    ).drop(['Rank\n(England only)', 'Image', 'Capacity'], axis=1).dropna()

pl1_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/Premier_League_1.csv'
).drop('Capacity', axis=1).dropna()

In [118]:
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, 1 to 146
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Stadium      146 non-null    object
 1   Town / City  146 non-null    object
 2   Team         146 non-null    object
 3   League       146 non-null    object
dtypes: object(4)
memory usage: 5.7+ KB


In [105]:
stadium_df.head()

Unnamed: 0,Stadium,Town / City,Team,League
0,Wembley Stadium,"Wembley, London","England (Men's, women's and youth)",
1,Old Trafford,"Old Trafford, Greater Manchester",Manchester United,Premier League
2,Tottenham Hotspur Stadium,"Tottenham, London",Tottenham Hotspur,Premier League
3,London Stadium,"Stratford, London",West Ham United,Premier League
4,Anfield,"Anfield, Liverpool",Liverpool,Premier League


In [121]:
pl1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     object
 1   Location  20 non-null     object
 2   Stadium   20 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


In [123]:
pl1_df.head()

Unnamed: 0,Team,Location,Stadium
0,Arsenal,London (Holloway),Emirates Stadium
1,Aston Villa,Birmingham,Villa Park
2,Bournemouth,Bournemouth,Vitality Stadium
3,Brentford,London (Brentford),Gtech Community Stadium
4,Brighton & Hove Albion,Brighton,American Express Stadium


In [124]:
stadium_df.shape, pl1_df.shape

((146, 4), (20, 3))

In [134]:
stadium_embeddings = list(compute_column_embeddings(stadium_df))
pl1_embeddings = list(compute_column_embeddings(pl1_df))

In [135]:
len(stadium_embeddings), len(stadium_embeddings[0]), len(pl1_embeddings), len(pl1_embeddings[0])

(4, 300, 3, 300)

In [111]:
# TODO what is the best manner to store/process cell embeddings? Since they are needed both
# for row and column embeddings, it would be better to create a sort of cell-embedding-matrix and 
# then operate on it

In [130]:
comparisons = pd.DataFrame(columns=['C1', 'C2', 'cosine similarity'])

In [137]:
for i, es in enumerate(stadium_df.columns):
    for j, ep in enumerate(pl1_df.columns):
        ei = stadium_embeddings[i]
        ej = pl1_embeddings[j]

        cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))

        comparisons.loc[len(comparisons)] = [es, ep, cosim]

In [138]:
comparisons

Unnamed: 0,C1,C2,cosine similarity
0,Stadium,Team,0.672407
1,Stadium,Location,0.516966
2,Stadium,Stadium,0.96706
3,Town / City,Team,0.79341
4,Town / City,Location,0.954044
5,Town / City,Stadium,0.561669
6,Team,Team,0.917976
7,Team,Location,0.726615
8,Team,Stadium,0.74328
9,League,Team,0.503647
