In [2]:
import re
import numpy as np
import pandas as pd

In [3]:
import compress_fasttext
import fasttext

In [4]:
model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    '/home/giovanni/unimore/TESI/src/models/fastText/cc.en.300.compressed.bin'
)

In [5]:
tokenization = fasttext.FastText.tokenize('That\'s my first trial with fasttext; it\s tricky!')
tokenization

["That's", 'my', 'first', 'trial', 'with', 'fasttext;', 'it\\s', 'tricky!']

# Embedding a Table

Now we try to embed every column of a single table:

through fastText, for every cell in a categorical column, we'll compute a tokenization and then a vectorization of the cell.

> cell --> token sequence --> token vector sequence --> cell vector

In [6]:
def my_tokenizer(s: str):
    return [re.sub('[^a-z]+', '', x) for x in s.lower().split()]

In [102]:
def np_cosine_similarity(a1, a2):
    return np.dot(a1, a2) / (np.linalg.norm(a1) * np.linalg.norm(a2))

In [27]:
def encode_cell(cell):
    return \
        np.mean( 
            [model.get_vector(token) for token in cell], # compute cell embedding
            axis=0
            )

In [28]:
def compute_column_embeddings(df: pd.DataFrame):
    return \
        (
            np.mean( 
                np.array( 
                    [
                        encode_cell(cell)
                        for cell in df[column].apply(my_tokenizer)  # compute column embedding
                    ]
                ),
                axis=0
            ) 
        for column in df.columns
        )

### Comparing embeddings from UK football datasets - only categorical columns

In [8]:
stadium_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv'
    ).drop(['Rank\n(England only)', 'Image', 'Capacity'], axis=1).dropna()

pl1_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/Premier_League_1.csv'
).drop('Capacity', axis=1).dropna()

In [9]:
stadium_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, 1 to 146
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Stadium      146 non-null    object
 1   Town / City  146 non-null    object
 2   Team         146 non-null    object
 3   League       146 non-null    object
dtypes: object(4)
memory usage: 5.7+ KB


In [10]:
stadium_df.head()

Unnamed: 0,Stadium,Town / City,Team,League
1,Old Trafford,"Old Trafford, Greater Manchester",Manchester United,Premier League
2,Tottenham Hotspur Stadium,"Tottenham, London",Tottenham Hotspur,Premier League
3,London Stadium,"Stratford, London",West Ham United,Premier League
4,Anfield,"Anfield, Liverpool",Liverpool,Premier League
5,Emirates Stadium,"Holloway, London",Arsenal,Premier League


In [11]:
pl1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     object
 1   Location  20 non-null     object
 2   Stadium   20 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes


In [12]:
pl1_df.head()

Unnamed: 0,Team,Location,Stadium
0,Arsenal,London (Holloway),Emirates Stadium
1,Aston Villa,Birmingham,Villa Park
2,Bournemouth,Bournemouth,Vitality Stadium
3,Brentford,London (Brentford),Gtech Community Stadium
4,Brighton & Hove Albion,Brighton,American Express Stadium


In [13]:
stadium_df.shape, pl1_df.shape

((146, 4), (20, 3))

In [14]:
stadium_embeddings = list(compute_column_embeddings(stadium_df))
pl1_embeddings = list(compute_column_embeddings(pl1_df))

In [15]:
len(stadium_embeddings), len(stadium_embeddings[0]), len(pl1_embeddings), len(pl1_embeddings[0])

(4, 300, 3, 300)

In [16]:
# TODO what is the best manner to store/process cell embeddings? Since they are needed both
# for row and column embeddings, it would be better to create a sort of cell-embedding-matrix and 
# then operate on it

In [17]:
def compare_datasets(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    emb1, emb2 = list(compute_column_embeddings(df1)), list(compute_column_embeddings(df2))
    comparisons = pd.DataFrame(columns=['C1', 'C2', 'cosine similarity'])
    for i, es in enumerate(df1.columns):
        for j, ep in enumerate(df2.columns):
            ei = emb1[i]
            ej = emb2[j]

            cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))

            comparisons.loc[len(comparisons)] = [es, ep, cosim]

    return comparisons

In [29]:
compare_datasets(stadium_df, pl1_df)

  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))


Unnamed: 0,C1,C2,cosine similarity
0,Stadium,Team,0.672407
1,Stadium,Location,0.516966
2,Stadium,Stadium,0.96706
3,Stadium,Capacity,
4,Town / City,Team,0.79341
5,Town / City,Location,0.954044
6,Town / City,Stadium,0.561669
7,Town / City,Capacity,
8,Capacity,Team,
9,Capacity,Location,


### Compare complete tables

In [19]:
# TODO how to handle unknown columns data types? 'Capacity' in stadium dataframe has the ',' character 
# in its values, and so pandas doesn't recognize it as a float

In [20]:
stadium_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv'
    ).drop(['Rank\n(England only)', 'Image'], axis=1).dropna()

pl1_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/Premier_League_1.csv'
    ).dropna()

In [21]:
stadium_df.convert_dtypes().info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, 1 to 146
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Stadium      146 non-null    string
 1   Town / City  146 non-null    string
 2   Capacity     146 non-null    string
 3   Team         146 non-null    string
 4   League       146 non-null    string
dtypes: string(5)
memory usage: 6.8 KB


In [22]:
pl1_df.convert_dtypes().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Team      20 non-null     string
 1   Location  20 non-null     string
 2   Stadium   20 non-null     string
 3   Capacity  20 non-null     string
dtypes: string(4)
memory usage: 768.0 bytes


In [23]:
compare_datasets(stadium_df, pl1_df)

  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))
  cosim = np.dot(ei, ej) / (np.linalg.norm(ei) * np.linalg.norm(ej))


Unnamed: 0,C1,C2,cosine similarity
0,Stadium,Team,0.672407
1,Stadium,Location,0.516966
2,Stadium,Stadium,0.96706
3,Stadium,Capacity,
4,Town / City,Team,0.79341
5,Town / City,Location,0.954044
6,Town / City,Stadium,0.561669
7,Town / City,Capacity,
8,Capacity,Team,
9,Capacity,Location,


Without any kind of column preprocessing we have in output a lot of NaN values, since fastText 
cannot encode continuos values, such as 'Capacity'...

## Computing row embedding - only known categorical columns

In [145]:
stadium_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/List_of_football_stadiums_in_England_1.csv'
    ).drop(['Rank\n(England only)', 'Image', 'Capacity'], axis=1).dropna(ignore_index=True)

pl1_df = pd.read_csv(
    '/home/giovanni/unimore/TESI/src/data/uk_football/Premier_League_1.csv'
    ).drop('Capacity', axis=1).dropna(ignore_index=True)

In [146]:
stadium_df.head()

Unnamed: 0,Stadium,Town / City,Team,League
0,Old Trafford,"Old Trafford, Greater Manchester",Manchester United,Premier League
1,Tottenham Hotspur Stadium,"Tottenham, London",Tottenham Hotspur,Premier League
2,London Stadium,"Stratford, London",West Ham United,Premier League
3,Anfield,"Anfield, Liverpool",Liverpool,Premier League
4,Emirates Stadium,"Holloway, London",Arsenal,Premier League


In [147]:
pl1_df.head()

Unnamed: 0,Team,Location,Stadium
0,Arsenal,London (Holloway),Emirates Stadium
1,Aston Villa,Birmingham,Villa Park
2,Bournemouth,Bournemouth,Vitality Stadium
3,Brentford,London (Brentford),Gtech Community Stadium
4,Brighton & Hove Albion,Brighton,American Express Stadium


### No label input in embedding creation

In [148]:
row = stadium_df.loc[1]
row

Stadium        Tottenham Hotspur Stadium
Town / City            Tottenham, London
Team                   Tottenham Hotspur
League                    Premier League
Name: 1, dtype: object

In [149]:
for idx, row in stadium_df.head(3).iterrows():
    for cell in row:
        print(cell)
    print()

Old Trafford
Old Trafford, Greater Manchester
Manchester United
Premier League

Tottenham Hotspur Stadium
Tottenham, London
Tottenham Hotspur
Premier League

London Stadium
Stratford, London
West Ham United
Premier League



In [150]:
def row_embedding(row):
    return \
        np.mean(
            [encode_cell(cell) for cell in row],
            axis=0
        )

In [151]:
def create_row_embeddings(df: pd.DataFrame):
    return \
        (     
            row_embedding(row) for _, row in df.iterrows()
        )

In [152]:
np.array([np.mean([encode_cell(cell) for cell in row], axis=0) for _, row in stadium_df.iterrows()]).shape

(146, 300)

In [153]:
stadium_emb_row = list(create_row_embeddings(stadium_df))
pl1_emb_row = list(create_row_embeddings(pl1_df))

In [154]:
np.array(stadium_emb_row).shape, np.array(pl1_emb_row).shape

((146, 300), (20, 300))

In [156]:
comparisons_row = pd.DataFrame(columns=['R1', 'R2', 'cosine similarity'])

for i, emb_s in enumerate(stadium_emb_row):
    for j, emb_p in enumerate(pl1_emb_row):
        cosim = np_cosine_similarity(emb_s, emb_p)
        comparisons_row.loc[len(comparisons_row)] = [i, j, cosim]
comparisons_row = comparisons_row.convert_dtypes()

In [157]:
comparisons_row.head()

Unnamed: 0,R1,R2,cosine similarity
0,0,0,0.974807
1,0,1,0.954423
2,0,2,0.956373
3,0,3,0.978458
4,0,4,0.979733


In [158]:
stadium_row =   stadium_df.loc[1]
pl1_row =           pl1_df.loc[0]
np_cosine_similarity(row_embedding(stadium_row), row_embedding(pl1_row))

0.9848310839114431

In [159]:
comparisons_row_sorted = \
    comparisons_row.sort_values(by='cosine similarity', ascending=False, ignore_index=True)

In [160]:
for i in range(5):
    r1, r2, cosim = comparisons_row_sorted.loc[i]
    r1, r2 = int(r1), int(r2)
    s = stadium_df.loc[r1]
    p = pl1_df.loc[r2]
    print(f"#{i}: {cosim}\n\t{r1}: {' '.join([c for c in s])}\n\t{r2}: {' '.join([c for c in p])}")
    print()

#0: 0.9975808457329806
	19: Bramall Lane Sheffield Sheffield United Premier League
	16: Sheffield United Sheffield Bramall Lane

#1: 0.9973654447219926
	6: St James' Park Newcastle upon Tyne Newcastle United Premier League
	14: Newcastle United Newcastle upon Tyne St James' Park

#2: 0.9967100065340178
	1: Tottenham Hotspur Stadium Tottenham, London Tottenham Hotspur Premier League
	17: Tottenham Hotspur London (Tottenham) Tottenham Hotspur Stadium

#3: 0.9964319158297446
	56: Brentford Community Stadium Brentford, London Brentford Premier League
	3: Brentford London (Brentford) Gtech Community Stadium

#4: 0.996405089976656
	37: Selhurst Park Selhurst, London Crystal Palace Premier League
	7: Crystal Palace London (Selhurst) Selhurst Park

