Implementing the following paper https://ieeexplore.ieee.org/document/1644735

In [1]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn


[nltk_data] Downloading package wordnet to C:\Users\Nguyen Viet
[nltk_data]     Hoa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def check_if_in_wordnet(word):
    wn_lemmas = set(wordnet.all_lemma_names())

    if word in wn_lemmas:
        return True
    else: 
        return False

In [2]:
def compute_word_similarity(word1,word2):
    first_word = wn.synsets(word1)[0]
    second_word = wn.synsets(word2)[0]
    return (first_word.wup_similarity(second_word))

In [3]:
def max_length(sen1, sen2):
    len1= len(nltk.word_tokenize(sen1))
    len2= len(nltk.word_tokenize(sen2))
    return max(len1,len2)


In [4]:
first_sen = "Three dogs pulling a man on a bicycle through the snow"
second_sen = "The dogs are pulling a man on a type of bike through the snow"
maxlength = max_length(first_sen,second_sen)

In [5]:
tokenized_sen1 = [token in nltk.word_tokenize(first_sen)
tokenized_sen2 = nltk.word_tokenize(first_sen)

In [6]:
sentence = [tokenized_sen1,tokenized_sen2]
count = len(tokenized_sen1)*len(tokenized_sen2)

sum = 0.0
for i in sentence[0]:
    for j in sentence[1]:
        if compute_word_similarity(i,j) != None:
            sum = sum + float(compute_word_similarity(i,j))
            
print(sum/count)
        

IndexError: list index out of range

## Semantic Similarity


In [None]:
from collections import Counter

def unique_string(str_):
    words = str_.split(' ')
    c = Counter(words)
    return [w for w in words if c[w] == 1]

In [None]:
token1 = unique_string(first_sen)
token2 = unique_string(second_sen)
joint_token = token1 + token2
print(joint_token)

In [None]:
threshold = 0

word_score1 = [] 

for token in joint_token: 
    if token in token1: 
        word_score1.append(1)
    else:
        max = 0
        for word in token1: 
            if compute_word_similarity(token,word) != None:
                score = compute_word_similarity(token,word)
                if score > max : 
                    max = score
        word_score1.append(max)
    
word_score2 = [] 

for token in joint_token: 
    if token in token2: 
        word_score2.append(1)
    else:
        max = 0
        for word in token2: 
            if compute_word_similarity(token,word) != None:
                score = compute_word_similarity(token,word)
                if score > max : 
                    max = score
        word_score2.append(max)



In [None]:
from numpy import dot
from numpy.linalg import norm

semantic_score = dot(word_score1, word_score2)/(norm(word_score1)*norm(word_score2))
print(semantic_score)

## Word Order Similarity

In [None]:
def find_the_most_similar_word(token, sen):
    max = 0
    index = 0
    for i, word in enumerate(sen): 
        print(i,word)
        if compute_word_similarity(token,word) != None:
            score = compute_word_similarity(token,word)
            if score > max : 
                max = score
                index = i
    return (index,max)

In [None]:
print(find_the_most_similar_word("dog",token1))

In [None]:
threshold = 0.7
word_order_vec1 = []

for token in joint_token: 
    if token in token1: 
        for index,word in enumerate(token1): 
            if word == token: 
                word_order_vec1.append(index)
    else: 
        (index,score) = find_the_most_similar_word(token,token1)
        if score >= threshold: 
            word_order_vec1.append(index)
        else: 
            word_order_vec1.append(0)
            
word_order_vec2 = []
for token in joint_token: 
    if token in token2: 
        for index,word in enumerate(token2): 
            if word == token: 
                word_order_vec2.append(index)
    else: 
        (index,score) = find_the_most_similar_word(token,token2)
        if score >= threshold: 
            word_order_vec2.append(index)
        else: 
            word_order_vec2.append(0)

print(word_order_vec2)
print(word_order_vec1)

In [None]:
import numpy as np 

word_order_vec1 = np.array(word_order_vec1)
word_order_vec2 = np.array(word_order_vec2)

word_order_similarity = 1 - (norm(word_order_vec1-word_order_vec2)/norm(word_order_vec2+word_order_vec1))

print(word_order_similarity)

## Overall similarity

In [None]:
alpha = 0.7
overall_similarity = alpha*semantic_score + (1-alpha)*word_order_similarity
print(overall_similarity)