# Testing TF-IDF for semantic similarity

We are using the scikit-learn [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

## 1. Load the sts-benchmark data and remove lines that contain erros.

In [3]:
# Remove "warn_bad_lines=False" to print the lines that have errors.
train_df = pd.read_csv('stsbenchmark/sts-train.csv', sep='\t', engine='python', header=None, encoding='utf-8', error_bad_lines=False, warn_bad_lines=False)


## 2. A quick look at the dataset we are using

In [4]:
print(train_df.loc[0])
print('\n')
print(train_df.loc[45])

train_df.head()

0                  main-captions
1                         MSRvid
2                       2012test
3                              1
4                              5
5         A plane is taking off.
6    An air plane is taking off.
Name: 0, dtype: object


0                     main-captions
1                            MSRvid
2                          2012test
3                                68
4                                 1
5       A man is playing the piano.
6    A woman is playing the violin.
Name: 45, dtype: object


Unnamed: 0,0,1,2,3,4,5,6
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.


## 3. Comparing two sentence paires with TF-IDF as an example

In [5]:
s1 = train_df.loc[0][5]
s2 = train_df.loc[0][6]
s3 = train_df.loc[45][5]
s4 = train_df.loc[45][6]

print(f's1 = {s1}')
print(f's2 = {s2}')
print('\n')
print(f's3 = {s3}')
print(f's4 = {s4}')

s1 = A plane is taking off.
s2 = An air plane is taking off.


s3 = A man is playing the piano.
s4 = A woman is playing the violin.


In [6]:
from scipy.spatial import distance

vectorizer = TfidfVectorizer()

sentence_vectors = vectorizer.fit_transform([s1, s2, s3, s4])

s1_vec = sentence_vectors.toarray()[0].tolist()
s2_vec = sentence_vectors.toarray()[1].tolist()
s3_vec = sentence_vectors.toarray()[2].tolist()
s4_vec = sentence_vectors.toarray()[3].tolist()

print(f's1 vs s2 = {distance.cosine(s1_vec,s2_vec)}')
print(f'Human score = {train_df.loc[0][4]}')
print(f'TF-IDF Score = {round((1-distance.cosine(s1_vec,s2_vec))*5,1)}')

print(f's3 vs s4 = {distance.cosine(s3_vec,s4_vec)}')
print(f'Human score = {train_df.loc[45][4]}')
print(f'TF-IDF Score = {round((1-distance.cosine(s3_vec,s4_vec))*5,1)}')

print(f's1 vs s3 = {distance.cosine(s1_vec,s3_vec)}')
print(f's1 vs s4 = {distance.cosine(s1_vec,s4_vec)}')


s1 vs s2 = 0.2812725311459734
Human score = 5.0
TF-IDF Score = 3.6
s3 vs s4 = 0.5689084892431033
Human score = 1.0
TF-IDF Score = 2.2
s1 vs s3 = 0.9006492066302417
s1 vs s4 = 0.9006492066302417


## 4. Getting the human score and the TF-IDF scores and comparing them

### 4.1 Load the data and preprocess it

In [8]:
import nltk

data = []
with open('stsbenchmark/sts-dev.csv') as f:
    for line in f.read().splitlines():
        splits = line.split('\t')
        data.append({
            'score': float(splits[4]),
            's1': splits[5],
            's2': splits[6]
        })

# removes punctuation from sentences
tokenizer = nltk.RegexpTokenizer(r"\w+")

# lowercase, tokenize and remove punctuation from sentences
for x in data:
    x['s1'].lower()
    x['s2'].lower()
    x['s1'] = tokenizer.tokenize(x['s1'])
    x['s2'] = tokenizer.tokenize(x['s2'])
    x['s1'] = ' '.join(x['s1'])
    x['s2'] = ' '.join(x['s2'])

In [9]:
data[3]

{'score': 2.4,
 's1': 'A woman is playing the guitar',
 's2': 'A man is playing guitar'}

### 4.2 Get the scores and normalize them

In [10]:
score_human = []

for x in data:
    score = x['score']/5
    score_human.append(score)

In [11]:
score_machine = []

vectorizer = TfidfVectorizer()

for x in data:
    sentence_vectors = vectorizer.fit_transform([x['s1'], x['s2']])
    s1_vec = sentence_vectors.toarray()[0].tolist()
    s2_vec = sentence_vectors.toarray()[1].tolist()
    score = (1-distance.cosine(s1_vec,s2_vec))
    score_machine.append(score)

### 4.3 Compare human and TF-IDF scores

In [18]:
from scipy.stats import pearsonr

result, _ = pearsonr(score_machine, score_human)
print('Pearsonr:', end=' ')
print("%.1f" % (result*100))

Pearsonr: 65.2
