# Fasttext Sandbox

Install fasttext if needed

In [2]:
# https://fasttext.cc/
pip install fasttext

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import fasttext
import fasttext.util

In [4]:
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')



In [5]:
# Load the data and remove lines that contain errors. 
# Remove "warn_bad_lines=False" to print the lines with errors.
train_df = pd.read_csv('stsbenchmark/sts-train.csv', sep='\t', engine='python', header=None, encoding='utf-8', error_bad_lines=False, warn_bad_lines=False)

#data = []
#with open('stsbenchmark/sts-train.csv') as f:
#    for line in f.read().splitlines():
#        splits = line.split('\t')
#        data.append({
#            'score': float(splits[4]),
#            's1': splits[5],
#            's2': splits[6]
#        }) 

In [6]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.


In [7]:
print(train_df[0].value_counts())
print('Train dataset shape: ' + str(train_df.shape))

main-news        2976
main-captions    2000
main-forum        438
Name: 0, dtype: int64
Train dataset shape: (5414, 7)


In [36]:
print(train_df.loc[0])
print('\n')
print(train_df.loc[45])

0                  main-captions
1                         MSRvid
2                       2012test
3                              1
4                              5
5         A plane is taking off.
6    An air plane is taking off.
Name: 0, dtype: object


0                     main-captions
1                            MSRvid
2                          2012test
3                                68
4                                 1
5       A man is playing the piano.
6    A woman is playing the violin.
Name: 45, dtype: object


In [38]:
s1 = train_df.loc[0][5]
s2 = train_df.loc[0][6]
s3 = train_df.loc[45][5]
s4 = train_df.loc[45][6]

print(f's1 = {s1}')
print(f's2 = {s2}')
print('\n')
print(f's3 = {s3}')
print(f's4 = {s4}')

s1 = A plane is taking off.
s2 = An air plane is taking off.


s3 = A man is playing the piano.
s4 = A woman is playing the violin.


In [42]:
from scipy.spatial import distance

s1_vec = ft.get_sentence_vector(s1)
s2_vec = ft.get_sentence_vector(s2)
s3_vec = ft.get_sentence_vector(s3)
s4_vec = ft.get_sentence_vector(s4)

print(f's1 vs s2 = {1-distance.cosine(s1_vec,s2_vec)}')
print(f's3 vs s4 = {1-distance.cosine(s3_vec,s4_vec)}')
print(f's1 vs s3 = {1-distance.cosine(s1_vec,s3_vec)}')
print(f's1 vs s4 = {1-distance.cosine(s1_vec,s4_vec)}')

s1 vs s2 = 0.8981232047080994
s3 vs s4 = 0.9621221423149109
s1 vs s3 = 0.718084990978241
s1 vs s4 = 0.7184442281723022


In [43]:
from scipy.stats import pearsonr

corr1, _ = pearsonr(s1_vec,s2_vec)
corr2, _ = pearsonr(s3_vec,s4_vec)
corr3, _ = pearsonr(s2_vec,s3_vec)
corr4, _ = pearsonr(s1_vec,s4_vec)

print(f's1 vs s2 = {corr1}')
print(f's3 vs s4 = {corr2}')
print(f's1 vs s3 = {corr3}')
print(f's1 vs s4 = {corr4}')

s1 vs s2 = 0.8981070027806705
s3 vs s4 = 0.9621134995201079
s1 vs s3 = 0.6361684902231468
s1 vs s4 = 0.7199339771232091
