# Testing fastText for semantic similarity

Install [fasttext](https://fasttext.cc/) if needed

In [None]:
!pip install fasttext

In [1]:
import fasttext
import fasttext.util
import nltk
import numpy as np
import pandas as pd
from scipy.spatial import distance
from scipy.stats import pearsonr

## 1. Get fastText ready to use

We are using the normal English model.

More info about the fastText [models](https://fasttext.cc/docs/en/crawl-vectors.html) for different languages.

> We distribute pre-trained word vectors for 157 languages, trained on Common Crawl and Wikipedia using fastText. These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives.

In [2]:
# Fetch the pre-trained English vectors and load them into `ft`.
# NOTE(review): `if_exists='ignore'` presumably reuses an already-downloaded
# file instead of fetching it again - confirm against the fasttext.util docs.
fasttext.util.download_model('en', if_exists='ignore')  # English
# Assumes 'cc.en.300.bin' is the file the download step leaves in the working
# directory - TODO confirm.
ft = fasttext.load_model('cc.en.300.bin')



Example using the word "learning".

In [3]:
# Look the embedding for "learning" up once, then inspect its shape and
# its first 20 components.
learning_vec = ft.get_word_vector('learning')
print(learning_vec.shape)
print(learning_vec[:20])

# Nearest words to "learning" (may take some time). Left as the final
# expression so the notebook displays the returned list.
ft.get_nearest_neighbors('learning')

(300,)
[-3.1326819e-02  5.8432957e-03  3.5721278e-05  3.2791961e-02
 -9.6422508e-03 -5.0007503e-02  1.6288273e-02  3.5059921e-02
 -6.6784739e-02 -1.8172603e-03 -1.8895891e-02 -5.0050311e-02
  5.2792020e-02  3.0742858e-02  1.2085622e-02 -1.8491376e-03
  5.5508241e-02 -9.5799835e-03  3.2117605e-02  1.1655847e-02]


[(0.7456761598587036, 'learing'),
 (0.6895476579666138, 'Learning'),
 (0.6878188848495483, 'learning.This'),
 (0.6796225309371948, 'learning.The'),
 (0.6753033399581909, 'learning.It'),
 (0.6706692576408386, 'learning.So'),
 (0.6673312187194824, 'learning.What'),
 (0.6648250222206116, 'learning.But'),
 (0.664309024810791, 'learning-'),
 (0.6633586883544922, 'learning.As')]

## 2. Load the sts-benchmark training data and remove bad lines. 

In [3]:
# Read the tab-separated STS training split, keeping only file columns 4-6,
# named here as the human score and the two sentences.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are dropped silently
# (use 'warn' instead to have each dropped line reported).
train_df = pd.read_table(
    'stsbenchmark/sts-train.csv',
    on_bad_lines='skip',
    skip_blank_lines=True,
    usecols=[4, 5, 6],
    names=["score", "s1", "s2"])


## 3. A quick look at the dataset we are using

In [4]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,score,s1,s2
0,5.0,A plane is taking off.,An air plane is taking off.
1,3.8,A man is playing a large flute.,A man is playing a flute.
2,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,2.6,Three men are playing chess.,Two men are playing chess.
4,4.25,A man is playing the cello.,A man seated is playing the cello.


In [5]:
# Preview the last rows of the training data.
train_df.tail()

Unnamed: 0,score,s1,s2
5706,0.0,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia
5707,0.0,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...
5708,0.0,President heading to Bahrain,President Xi: China to continue help to fight ...
5709,0.0,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders
5710,0.0,Putin spokesman: Doping charges appear unfounded,The Latest on Severe Weather: 1 Dead in Texas ...


## 4. Comparing two sentence pairs with fastText as an example

In [6]:
# Pick two example sentence pairs by row label (rows 0 and 45).
# Columns are addressed by name: the previous `train_df.loc[0][1]` form used an
# integer key on a string-labelled Series, a positional fallback that pandas
# has deprecated.
s1 = train_df.loc[0, 's1']
s2 = train_df.loc[0, 's2']
s3 = train_df.loc[45, 's1']
s4 = train_df.loc[45, 's2']

print(f's1 = {s1}')
print(f's2 = {s2}')
print('\n')
print(f's3 = {s3}')
print(f's4 = {s4}')

s1 = A plane is taking off.
s2 = An air plane is taking off.


s3 = A man is playing the piano.
s4 = A woman is playing the violin.


fastText fails when two sentences share many of the same words but are semantically different.

In [7]:
# Sentence embeddings for the four example sentences.
s1_vec = ft.get_sentence_vector(s1)
s2_vec = ft.get_sentence_vector(s2)
s3_vec = ft.get_sentence_vector(s3)
s4_vec = ft.get_sentence_vector(s4)

# For each annotated pair, print the cosine distance, the human score taken
# from the training frame, and the cosine similarity rescaled by 5 so it can
# be read next to the human score.
annotated_pairs = [
    ('s1 vs s2', s1_vec, s2_vec, 0),
    ('s3 vs s4', s3_vec, s4_vec, 45),
]
for pair_label, vec_a, vec_b, row_label in annotated_pairs:
    # Compute the distance once and reuse it for both printed values.
    cos_dist = distance.cosine(vec_a, vec_b)
    print(f'{pair_label} = {cos_dist}')
    # Column looked up by name instead of the deprecated positional `[0]`.
    print(f"Human score = {train_df.loc[row_label, 'score']}")
    print(f'fastText score = {round((1 - cos_dist) * 5, 1)}')

# Cross-pair distances for reference (sentences from different pairs).
print(f's1 vs s3 = {distance.cosine(s1_vec, s3_vec)}')
print(f's1 vs s4 = {distance.cosine(s1_vec, s4_vec)}')

s1 vs s2 = 0.10187679529190063
Human score = 5.0
fastText score = 4.5
s3 vs s4 = 0.03787785768508911
Human score = 1.0
fastText score = 4.8
s1 vs s3 = 0.28191500902175903
s1 vs s4 = 0.28155577182769775


## 5. Getting the human scores and the fasttext scores and comparing them

**https://ixa2.si.ehu.es/stswiki/index.php/STS_benchmark_reproducibility**

> The averaged word embedding baselines compute a sentence embedding by averaging word embeddings and then using cosine to compute pairwise sentence similarity scores. 

> FastText: Since, to our knowledge, the tokenizer and preprocessing used for the pre-trained FastText embeddings is not publicly described. We use the following heuristics to preprocess and tokenize sentences for Fast-Text: numbers are converted into words, text is lowercased, and finally prefixed, suffixed and infixed punctuation is recursively removed from each token that does not match an entry in the model's lexicon;

### 5.1 Load the dev data and preprocess it

In [8]:
# Read the tab-separated STS dev split, keeping only file columns 4-6,
# named here as the human score and the two sentences.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are dropped silently.
dev_df = pd.read_table(
    'stsbenchmark/sts-dev.csv',
    on_bad_lines='skip',
    skip_blank_lines=True,
    usecols=[4, 5, 6],
    names=["score", "s1", "s2"])

# Keeps only runs of word characters, which removes punctuation from sentences.
tokenizer = nltk.RegexpTokenizer(r"\w+")


def normalize_sentence(sentence):
    """Return `sentence` tokenized on word characters, re-joined with single spaces, and lowercased."""
    return ' '.join(tokenizer.tokenize(sentence)).lower()


# Some sentence cells are read as floats, so cast to str before tokenizing.
# NOTE(review): those are presumably NaN from incomplete rows - confirm; after
# the cast they are kept as the literal text 'nan' rather than being dropped.
# One column-wise `map` per sentence column replaces the previous row-wise
# `DataFrame.apply(axis=1)` passes and yields the same strings.
for sentence_col in ('s1', 's2'):
    dev_df[sentence_col] = dev_df[sentence_col].astype(str).map(normalize_sentence)

In [9]:
# Preview the first rows of the preprocessed dev data.
dev_df.head()

Unnamed: 0,score,s1,s2
0,5.0,a man with a hard hat is dancing,a man wearing a hard hat is dancing
1,4.75,a young child is riding a horse,a child is riding a horse
2,5.0,a man is feeding a mouse to a snake,the man is feeding a mouse to the snake
3,2.4,a woman is playing the guitar,a man is playing guitar
4,2.75,a woman is playing the flute,a man is playing a flute


### 5.2 Get the scores and normalize them

In [10]:
# Human scores as a plain list, then divided by 5 so they can be compared
# with the cosine-similarity scores computed from fastText.
dev_scores = dev_df['score'].tolist()
score_human = [gold_score / 5 for gold_score in dev_scores]

In [11]:
# fastText score per dev pair: cosine similarity (1 - cosine distance) of the
# two sentence vectors.
# Columns are iterated by name rather than by tuple position, and the loop uses
# its own variable names so it no longer overwrites the `s1_vec` / `s2_vec`
# values computed for the worked example earlier in the notebook.
score_machine = []

for sentence_a, sentence_b in zip(dev_df['s1'], dev_df['s2']):
    vec_a = ft.get_sentence_vector(str(sentence_a))
    vec_b = ft.get_sentence_vector(str(sentence_b))
    score_machine.append(1 - distance.cosine(vec_a, vec_b))

### 5.3 Compare human and fastText scores

In [12]:
# Pearson correlation between the fastText and human scores, printed as a
# percentage with one decimal. The second value returned by pearsonr is
# discarded. `pearsonr` is imported in the notebook's import cell.
result, _ = pearsonr(score_machine, score_human)
print(f'Pearsonr: {result * 100:.1f}')

Pearsonr: 55.3


## 6. Numbers and written numbers are expressed very differently in fasttext
Converting numbers to written form could produce better results.

In [13]:
# Word vectors for two numbers written as digits and as words.
vec_1, vec_2, vec_3, vec_4 = (
    ft.get_word_vector(token) for token in ('40', 'forty', '41', 'forty-one')
)

# Each entry: pre-padded label (so the printed values line up) and the two
# vectors whose cosine distance is reported.
comparisons = [
    ('40 and forty        =   ', vec_1, vec_2),
    ('41 and forty-one    =   ', vec_3, vec_4),
    ('40 and 41           =   ', vec_1, vec_3),
    ('forty and forty-one =   ', vec_2, vec_4),
]
for label, left_vec, right_vec in comparisons:
    print(f'{label}{distance.cosine(left_vec, right_vec)}')

40 and forty        =   0.4287061095237732
41 and forty-one    =   0.534650593996048
40 and 41           =   0.11887586116790771
forty and forty-one =   0.18680739402770996
