## Finding Semantic Textual Similarity



*   0 means highly similar
*   1 means highly dissimilar



In [None]:
#importing libraries
import numpy as np
import pandas as pd

import re
from tqdm import tqdm

import collections

from sklearn.cluster import KMeans

from nltk.stem import WordNetLemmatizer  # For Lemmetization of words
from nltk.corpus import stopwords  # Load list of stopwords
from nltk import word_tokenize # Convert paragraph in tokens

import pickle
import sys

from gensim.models import word2vec # For represent words in vectors
import gensim

In [None]:
# Load the text-pair dataset from a CSV file in the working directory,
# then report its dimensions and preview the first five rows.
data = pd.read_csv("Text_Similarity_Dataset.csv")
print("Shape of data : ", data.shape)
data.head(5)

Shape of data :  (4023, 3)


Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [None]:
# Count the null entries in each column of the dataset.
data.isnull().sum()

Unique_ID    0
text1        0
text2        0
dtype: int64

In [None]:
#Data preprocessing 
def decontracted(phrase):
    """Expand common English contractions in *phrase* and return the result.

    The substitutions are applied in order. The two irregular forms
    ("won't", "can't") come first so that the generic "n't" rule does not
    turn them into "wo not" / "ca not".
    """
    rules = (
        # irregular forms
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # regular suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:
# `nltk` itself must be imported here: the earlier `from nltk ... import`
# lines bind only the imported names, not the package name, so
# `nltk.download` would otherwise raise NameError when the cells are run
# top to bottom.
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus used by the preprocessing loops.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
preprocessed_text1 = []

# Look the stopword list up once instead of once per token, and keep it in a
# set so each membership test is constant-time.
english_stopwords = set(stopwords.words('english'))

for sentance in tqdm(data['text1'].values):
    sent = decontracted(sentance)
    # Blank out literal backslash escape sequences left in the raw text.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    # Collapse every run of non-alphanumeric characters into a single space.
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # Lowercase before filtering so capitalised stopwords ("The", "And")
    # are compared in the same case as the lowercase stopword entries.
    sent = sent.lower()

    preprocessed_text1.append(
        ' '.join(e for e in sent.split() if e not in english_stopwords)
    )

100%|██████████| 4023/4023 [03:18<00:00, 20.31it/s]


In [None]:
# Overwrite the raw 'text1' column with its cleaned version and preview
# the first five rows.
data['text1'] = preprocessed_text1
data.head(5)


Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail spot ads internet search ...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions miss net 2025 40 uk population still ...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short ginepri fifteen year old...,ruddock backs yapp s credentials wales coach m...
3,3,diageo buy us wine firm diageo world biggest s...,mci shares climb on takeover bid shares in us ...
4,4,careful code new european directive could put ...,media gadgets get moving pocket-sized devices ...


In [None]:
from tqdm import tqdm
preprocessed_text2 = []

# Look the stopword list up once instead of once per token, and keep it in a
# set so each membership test is constant-time.
english_stopwords = set(stopwords.words('english'))

# tqdm is for printing the status bar
for sentance in tqdm(data['text2'].values):
    sent = decontracted(sentance)
    # Blank out literal backslash escape sequences left in the raw text.
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    # Collapse every run of non-alphanumeric characters into a single space.
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # Lowercase before filtering so capitalised stopwords ("The", "And")
    # are compared in the same case as the lowercase stopword entries.
    sent = sent.lower()

    preprocessed_text2.append(
        ' '.join(e for e in sent.split() if e not in english_stopwords)
    )

100%|██████████| 4023/4023 [03:17<00:00, 20.35it/s]


In [None]:
# Overwrite the raw 'text2' column with its cleaned version and preview
# the first three rows.

data['text2'] = preprocessed_text2

data.head(3)

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail spot ads internet search ...,newcastle 2 1 bolton kieron dyer smashed home ...
1,1,millions miss net 2025 40 uk population still ...,nasdaq planning 100m share sale owner technolo...
2,2,young debut cut short ginepri fifteen year old...,ruddock backs yapp credentials wales coach mik...


In [None]:
def word_tokenizer(text):
    """Split *text* into word tokens and return them lemmatized.

    Tokenization uses NLTK's ``word_tokenize``; each token is then passed
    through ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    raw_tokens = word_tokenize(text)
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, raw_tokens))

In [None]:

# Load the pre-trained Google News word vectors (binary word2vec format)
# into a gensim KeyedVectors object. The path below is a remote URL, so this
# cell needs network access.

wordmodelfile="https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
wordmodel= gensim.models.KeyedVectors.load_word2vec_format(wordmodelfile, binary=True)

In [None]:
# Fetch the WordNet corpus; WordNetLemmatizer (used in word_tokenizer)
# relies on it.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# One dissimilarity score per row of `data`, in row order.
# Scale: 0 means highly similar, 1 means highly dissimilar.
similarity = []

for s1, s2 in zip(data['text1'], data['text2']):
    # Identical texts need no embedding lookup.
    if s1 == s2:
        similarity.append(0.0)
        continue

    # Keep only tokens that have an embedding. `word in wordmodel` is the
    # KeyedVectors membership test, so the vocabulary mapping does not have
    # to be fetched by attribute name on every row.
    s1words = [word for word in word_tokenizer(s1) if word in wordmodel]
    s2words = [word for word in word_tokenizer(s2) if word in wordmodel]

    # Test for emptiness AFTER the vocabulary filter: a text whose tokens
    # are all out-of-vocabulary would otherwise hand n_similarity an empty
    # list, which it cannot score. Such pairs are treated as dissimilar.
    if not s1words or not s2words:
        similarity.append(1.0)
        continue

    # n_similarity returns a cosine similarity; subtract it from 1 to get
    # the 0-similar / 1-dissimilar scale used here.
    similarity.append(1 - wordmodel.n_similarity(s1words, s2words))

In [None]:
# Pair each row's Unique_ID with its computed similarity score and preview
# the first ten rows.
final_score = pd.DataFrame({'Unique_ID':data.Unique_ID,
                     'Similarity_score':similarity})
final_score.head(10)

Unnamed: 0,Unique_ID,Similarity_score
0,0,0.389471
1,1,0.292066
2,2,0.27289
3,3,0.184955
4,4,0.171677
5,5,0.249288
6,6,0.319132
7,7,0.152613
8,8,0.20936
9,9,0.098709


In [None]:
# Write the scores to 'Solution.csv' in the working directory, without the
# DataFrame index column.
final_score.to_csv('Solution.csv',index=False)
