# Fasttext Sandbox

### Install fasttext if needed

In [None]:
# https://fasttext.cc/
pip install fasttext

In [168]:
import pandas as pd
import numpy as np
import nltk
import fasttext
import fasttext.util
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emilnuutinen/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Load the fasttext English pretrained model

In [50]:
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')



### Load the sts-benchmark data and remove lines that contain errors. 

In [51]:

# Remove "warn_bad_lines=False" to print the lines with errors.
train_df = pd.read_csv('stsbenchmark/sts-train.csv', sep='\t', engine='python', header=None, encoding='utf-8', error_bad_lines=False, warn_bad_lines=False)

#data = []
#with open('stsbenchmark/sts-train.csv') as f:
#    for line in f.read().splitlines():
#        splits = line.split('\t')
#        data.append({
#            'score': float(splits[4]),
#            's1': splits[5],
#            's2': splits[6]
#        }) 

### Investigate the genres, data shape and look at the head

In [52]:
print(train_df[0].value_counts())
print('\n')
print('Train dataset shape: ' + str(train_df.shape))
print('\n')
train_df.head()

main-news        2976
main-captions    2000
main-forum        438
Name: 0, dtype: int64


Train dataset shape: (5414, 7)




Unnamed: 0,0,1,2,3,4,5,6
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.


### Two sentence paires that I will be comparing

In [53]:
print(train_df.loc[0])
print('\n')
print(train_df.loc[45])

0                  main-captions
1                         MSRvid
2                       2012test
3                              1
4                              5
5         A plane is taking off.
6    An air plane is taking off.
Name: 0, dtype: object


0                     main-captions
1                            MSRvid
2                          2012test
3                                68
4                                 1
5       A man is playing the piano.
6    A woman is playing the violin.
Name: 45, dtype: object


In [54]:
s1 = train_df.loc[0][5]
s2 = train_df.loc[0][6]
s3 = train_df.loc[45][5]
s4 = train_df.loc[45][6]

print(f's1 = {s1}')
print(f's2 = {s2}')
print('\n')
print(f's3 = {s3}')
print(f's4 = {s4}')

s1 = A plane is taking off.
s2 = An air plane is taking off.


s3 = A man is playing the piano.
s4 = A woman is playing the violin.


In [55]:
from scipy.spatial import distance

s1_vec = ft.get_sentence_vector(s1)
s2_vec = ft.get_sentence_vector(s2)
s3_vec = ft.get_sentence_vector(s3)
s4_vec = ft.get_sentence_vector(s4)

print(f's1 vs s2 = {1-distance.cosine(s1_vec,s2_vec)}')
print(f's3 vs s4 = {1-distance.cosine(s3_vec,s4_vec)}')
print(f's1 vs s3 = {1-distance.cosine(s1_vec,s3_vec)}')
print(f's1 vs s4 = {1-distance.cosine(s1_vec,s4_vec)}')

s1 vs s2 = 0.8981232047080994
s3 vs s4 = 0.9621221423149109
s1 vs s3 = 0.718084990978241
s1 vs s4 = 0.7184442281723022


In [56]:
from scipy.stats import pearsonr

corr1, _ = pearsonr(s1_vec,s2_vec)
corr2, _ = pearsonr(s3_vec,s4_vec)
corr3, _ = pearsonr(s2_vec,s3_vec)
corr4, _ = pearsonr(s1_vec,s4_vec)

print(f's1 vs s2 = {corr1}')
print(f's3 vs s4 = {corr2}')
print(f's1 vs s3 = {corr3}')
print(f's1 vs s4 = {corr4}')

s1 vs s2 = 0.8981070027806705
s3 vs s4 = 0.9621134995201079
s1 vs s3 = 0.6361684902231468
s1 vs s4 = 0.7199339771232091


### Getting the human scores and the fasttext scores and comparing them

**https://ixa2.si.ehu.es/stswiki/index.php/STS_benchmark_reproducibility**

> The averaged word embedding baselines compute a sentence embedding by averaging word embeddings and then using cosine to compute pairwise sentence similarity scores. 

> FastText: Since, to our knowledge, the tokenizer and preprocessing used for the pre-trained FastText embeddings is not publicly described. We use the following heuristics to preprocess and tokenize sentences for Fast-Text: numbers are converted into words, text is lowercased, and finally prefixed, suffixed and infixed punctuation is recursively removed from each token that does not match an entry in the model’s lexicon;

In [183]:
data = []
with open('stsbenchmark/sts-dev.csv') as f:
    for line in f.read().splitlines():
        splits = line.split('\t')
        data.append({
            'score': float(splits[4]),
            's1': splits[5],
            's2': splits[6]
        })

# removes punctuation from sentences
tokenizer = nltk.RegexpTokenizer(r"\w+")

# lowecase, tokenize and remove punctuation from sentences
for x in data:
    x['s1'].lower()
    x['s2'].lower()
    x['s1'] = tokenizer.tokenize(x['s1'])
    x['s2'] = tokenizer.tokenize(x['s2'])
    x['s1'] = ' '.join(x['s1'])
    x['s2'] = ' '.join(x['s2'])

In [184]:
data[3]

{'score': 2.4,
 's1': 'A woman is playing the guitar',
 's2': 'A man is playing guitar'}

In [188]:
score_human = []

for x in data:
    score = x['score']/5
    score_human.append(score)

In [202]:
score_machine = []

for x in data:
    s1_vec = ft.get_sentence_vector(x['s1'])
    s2_vec = ft.get_sentence_vector(x['s2'])
    score = (1-distance.cosine(s1_vec,s2_vec))
    score_machine.append(score)

In [215]:
result, _ = pearsonr(score_machine, score_human)
print("%.1f" % (result*100))

56.1


### Numbers and written numbers are expressed very differently in fasttext
Converting numbers to written number probably produce much better results.

In [220]:
vec_1 = ft.get_word_vector('40')
vec_2 = ft.get_word_vector('forty')
vec_3 = ft.get_word_vector('41')
vec_4 = ft.get_word_vector('forty-one')
print(f'40 and forty = {1-distance.cosine(vec_1,vec_2)}')
print(f'41 and forty-one = {1-distance.cosine(vec_3,vec_4)}')
print(f'40 and 41 = {1-distance.cosine(vec_1,vec_3)}')
print(f'forty and forty-one = {1-distance.cosine(vec_2,vec_4)}')

40 and forty = 0.5712938904762268
41 and forty-one = 0.465349406003952
40 and 41 = 0.8811241388320923
forty and forty-one = 0.81319260597229
