In [34]:
import warnings
warnings.filterwarnings('ignore')

## Task 1

Study Section 5 of Chapter 2 of NLTK online book, and try to reproduce the coding examples and try to use your own examples of wording to identify the synsets, hyponyms, hypernyms, and various semantic similarity between two words of your choice.

### 5.1 Synsets

**Example from Book** 

Consider the sentence in (1). If we replace the word motorcar in (1) by automobile, to get (2), the meaning of the sentence stays pretty much the same:
	
1. Benz is credited with the invention of the motorcar.
2. Benz is credited with the invention of the automobile.

Since everything else in the sentence has remained unchanged, we can conclude that the words motorcar and automobile have the same meaning, i.e. they are synonyms. We can explore these words with the help of WordNet.

In [22]:
from nltk.corpus import wordnet as wn

In [23]:
wn.synsets('motorcar')

[Synset('car.n.01')]

In [24]:
wn.synsets('automobile')

[Synset('car.n.01'), Synset('automobile.v.01')]

In [2]:
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [3]:
wn.synset('car.n.01').definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [4]:
wn.synset('car.n.01').examples()

['he needs a car to get to work']

In [5]:
wn.synset('car.n.01').lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [6]:
wn.lemma('car.n.01.automobile')

Lemma('car.n.01.automobile')

In [7]:
wn.lemma('car.n.01.automobile').synset()

Synset('car.n.01')

In [8]:
wn.lemma('car.n.01.automobile').name()

'automobile'

In [9]:
wn.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [10]:
for synset in wn.synsets('car'):
    print(synset.lemma_names())

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']


In [13]:
wn.lemmas('car')

[Lemma('car.n.01.car'),
 Lemma('car.n.02.car'),
 Lemma('car.n.03.car'),
 Lemma('car.n.04.car'),
 Lemma('cable_car.n.01.car')]

### 5.2 Hyponyms

In [14]:
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
types_of_motorcar[0]

Synset('ambulance.n.01')

In [15]:
sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas())

['Model_T',
 'S.U.V.',
 'SUV',
 'Stanley_Steamer',
 'ambulance',
 'beach_waggon',
 'beach_wagon',
 'bus',
 'cab',
 'compact',
 'compact_car',
 'convertible',
 'coupe',
 'cruiser',
 'electric',
 'electric_automobile',
 'electric_car',
 'estate_car',
 'gas_guzzler',
 'hack',
 'hardtop',
 'hatchback',
 'heap',
 'horseless_carriage',
 'hot-rod',
 'hot_rod',
 'jalopy',
 'jeep',
 'landrover',
 'limo',
 'limousine',
 'loaner',
 'minicar',
 'minivan',
 'pace_car',
 'patrol_car',
 'phaeton',
 'police_car',
 'police_cruiser',
 'prowl_car',
 'race_car',
 'racer',
 'racing_car',
 'roadster',
 'runabout',
 'saloon',
 'secondhand_car',
 'sedan',
 'sport_car',
 'sport_utility',
 'sport_utility_vehicle',
 'sports_car',
 'squad_car',
 'station_waggon',
 'station_wagon',
 'stock_car',
 'subcompact',
 'subcompact_car',
 'taxi',
 'taxicab',
 'tourer',
 'touring_car',
 'two-seater',
 'used-car',
 'waggon',
 'wagon']

### 5.3 Hypernyms

In [16]:
motorcar.hypernyms()

[Synset('motor_vehicle.n.01')]

In [18]:
paths = motorcar.hypernym_paths()
len(paths)

2

In [19]:
[synset.name() for synset in paths[0]]

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'artifact.n.01',
 'instrumentality.n.03',
 'container.n.01',
 'wheeled_vehicle.n.01',
 'self-propelled_vehicle.n.01',
 'motor_vehicle.n.01',
 'car.n.01']

In [20]:
[synset.name() for synset in paths[1]]

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'artifact.n.01',
 'instrumentality.n.03',
 'conveyance.n.03',
 'vehicle.n.01',
 'wheeled_vehicle.n.01',
 'self-propelled_vehicle.n.01',
 'motor_vehicle.n.01',
 'car.n.01']

In [21]:
motorcar.root_hypernyms()

[Synset('entity.n.01')]

### 5.4 Semantic Similariy

In [26]:
motorcar = wn.synset('car.n.01')
automobile_1 = wn.synset('car.n.01')
automobile_2 = wn.synset('automobile.v.01')

In [28]:
motorcar.lowest_common_hypernyms(automobile_1)

[Synset('car.n.01')]

In [29]:
motorcar.lowest_common_hypernyms(automobile_2)

[]

In [30]:
wn.synset('car.n.01').min_depth()

10

In [32]:
motorcar.path_similarity(automobile_1)

1.0

In [33]:
motorcar.path_similarity(automobile_2)

0.07142857142857142

**Own Example**

Let's use the WordNet library to identify synsets, hyponyms, hypernyms and measure semantic similarity between two words, "dog" and "cat".

1. I have a pet dog.
2. I have a pet cat.

**Synsets**

In [51]:
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [52]:
wn.synsets('cat')

[Synset('cat.n.01'),
 Synset('guy.n.01'),
 Synset('cat.n.03'),
 Synset('kat.n.01'),
 Synset('cat-o'-nine-tails.n.01'),
 Synset('caterpillar.n.02'),
 Synset('big_cat.n.01'),
 Synset('computerized_tomography.n.01'),
 Synset('cat.v.01'),
 Synset('vomit.v.01')]

**Hyponyms**

In [53]:
dog = wn.synset('dog.n.01')
types_of_dog = dog.hyponyms()
types_of_dog

[Synset('basenji.n.01'),
 Synset('corgi.n.01'),
 Synset('cur.n.01'),
 Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'),
 Synset('griffon.n.02'),
 Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'),
 Synset('leonberg.n.01'),
 Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'),
 Synset('pooch.n.01'),
 Synset('poodle.n.01'),
 Synset('pug.n.01'),
 Synset('puppy.n.01'),
 Synset('spitz.n.01'),
 Synset('toy_dog.n.01'),
 Synset('working_dog.n.01')]

In [54]:
cat = wn.synset('cat.n.01')
types_of_cat = cat.hyponyms()
types_of_cat

[Synset('domestic_cat.n.01'), Synset('wildcat.n.03')]

**Hypernyms**

In [56]:
dog.hypernyms()

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [57]:
cat.hypernyms()

[Synset('feline.n.01')]

In [61]:
path_dog = dog.hypernym_paths()
len(path_dog)

2

In [62]:
path_cat = cat.hypernym_paths()
len(path_cat)

1

In [63]:
[synset.name() for synset in path_dog[0]]

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'living_thing.n.01',
 'organism.n.01',
 'animal.n.01',
 'chordate.n.01',
 'vertebrate.n.01',
 'mammal.n.01',
 'placental.n.01',
 'carnivore.n.01',
 'canine.n.02',
 'dog.n.01']

In [64]:
[synset.name() for synset in path_dog[1]]

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'living_thing.n.01',
 'organism.n.01',
 'animal.n.01',
 'domestic_animal.n.01',
 'dog.n.01']

In [65]:
[synset.name() for synset in path_cat[0]]

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'living_thing.n.01',
 'organism.n.01',
 'animal.n.01',
 'chordate.n.01',
 'vertebrate.n.01',
 'mammal.n.01',
 'placental.n.01',
 'carnivore.n.01',
 'feline.n.01',
 'cat.n.01']

In [67]:
dog.root_hypernyms() #most general hypernyms

[Synset('entity.n.01')]

In [68]:
cat.root_hypernyms() #most general hypernyms

[Synset('entity.n.01')]

**Semantic Similarity**

In [79]:
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

In [80]:
dog.min_depth()

8

In [81]:
cat.min_depth()

13

In [82]:
dog.lowest_common_hypernyms(cat)

[Synset('carnivore.n.01')]

In [83]:
dog.path_similarity(cat)

0.2

## Task 2

Identify the synsets of the word “car” and rank them in the order of their frequency of occurrence (most common synset first, less common synset at the end).

In [84]:
car = wn.synsets('car', 'n')
car

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [99]:
for synset in car:
    print(sorted(synset.lemmas()))

[Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.car'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
[Lemma('car.n.02.car'), Lemma('car.n.02.railcar'), Lemma('car.n.02.railroad_car'), Lemma('car.n.02.railway_car')]
[Lemma('car.n.03.car'), Lemma('car.n.03.gondola')]
[Lemma('car.n.04.car'), Lemma('car.n.04.elevator_car')]
[Lemma('cable_car.n.01.cable_car'), Lemma('cable_car.n.01.car')]


In [100]:
synset_frequencies = [(synset, synset.lemmas()[0].count()) for synset in car]

In [101]:
synset_frequencies.sort(key=lambda x: x[1], reverse=True)

In [102]:
for i, (synset, frequency) in enumerate(synset_frequencies, start=1):
    print(f"{i}. Synset: {synset.name()} - Frequency: {frequency}")

1. Synset: car.n.01 - Frequency: 71
2. Synset: car.n.02 - Frequency: 2
3. Synset: car.n.03 - Frequency: 0
4. Synset: car.n.04 - Frequency: 0
5. Synset: cable_car.n.01 - Frequency: 0


In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/moinul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/moinul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
T1 = "Students feel unhappy today about the class today"
T2 = "Many students felt concepts of class test relevant"

In [3]:
#Tokenization

words_T1 = word_tokenize(T1)
words_T2 = word_tokenize(T2)

In [4]:
#Stopwords removal

stop_words = set(stopwords.words('english'))
filtered_words_T1 = [word.lower() for word in words_T1 if word.lower() not in stop_words]
filtered_words_T2 = [word.lower() for word in words_T2 if word.lower() not in stop_words]

In [5]:
#Stemming

stemmer = SnowballStemmer("english")
stemmed_words_T1 = [stemmer.stem(word) for word in filtered_words_T1]
stemmed_words_T2 = [stemmer.stem(word) for word in filtered_words_T2]

In [6]:
#Combining the tokens back into sentence

processed_T1 = " ".join(stemmed_words_T1)
processed_T2 = " ".join(stemmed_words_T2)

## Task 3

Now consider two sentences T1 and T2, each constituted with a set of tokens. For this purpose, study expression (1) of the aforementioned Mihalcea et al.’s paper above (see below).  You can check with a potential implementation available at https://nlpforhackers.io/wordnet-sentence-similarity/.

### Mihalcea Similarity

In [433]:
#Mihalcea's Similarity
import math
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

def mihalcea_similarity(tokens_T1, tokens_T2):
    similarity_score = 0
    
    word_counts = {}
    for doc in [tokens_T1, tokens_T2]:
        for word in doc:
            word_counts[word] = word_counts.get(word, 0) + 1

    idf_values = {}
    total_docs = len(tokens_T1) + len(tokens_T2)
    
    for word, count in word_counts.items():
        idf_values[word] = math.log(total_docs / (count + 1)) 

    def max_similarity(word1, word2):
        max_sim = 0
        for synset1 in wn.synsets(word1):
            for synset2 in wn.synsets(word2):
                sim = synset1.wup_similarity(synset2)
                if sim is not None and sim > max_sim:
                    max_sim = sim
        return max_sim
    
    for word in set(tokens_T1):
        max_sim = 0
        for word2 in tokens_T2:  # Iterate over tokens_T2
            max_sim = max(max_sim, max_similarity(word, word2))
        similarity_score += (max_sim * idf_values.get(word, 0))

    for word in set(tokens_T2):
        max_sim = 0
        for word2 in tokens_T1:  # Iterate over tokens_T1
            max_sim = max(max_sim, max_similarity(word, word2))
        similarity_score += (max_sim * idf_values.get(word, 0))

    denominator = sum(idf_values.get(word, 0) for word in set(tokens_T1)) + sum(idf_values.get(word, 0) for word in set(tokens_T2))

    similarity_score /= (2 * denominator)
    
    return similarity_score

similarity = mihalcea_similarity(words_T1, words_T2)
print("Mihalcea's Similarity:", similarity)

Mihalcea's Similarity: 0.3212801648924751


## Task 4

Start with sentences: T1: “Students feel unhappy today about the class today”. T2: ”Many students felt concepts of class test relevant”,  and study the influence of various preprocessing (stopword removal, stemming) on the result of the sentence-to-sentence similarity above.

### TF_IDF

In [132]:
#TF-IDF vectorization

vectorizer = TfidfVectorizer()

#for original sentences
tfidf_matrix_org = vectorizer.fit_transform([T1, T2])

#for preprocessed sentences
tfidf_matrix_pre = vectorizer.fit_transform([processed_T1, processed_T2])

In [133]:
# Calculating cosine similarity

#for original sentences
cosine_sim_org = cosine_similarity(tfidf_matrix_org[0], tfidf_matrix_org[1])[0][0]

#for preprocessed sentences
cosine_sim_pre = cosine_similarity(tfidf_matrix_pre[0], tfidf_matrix_pre[1])[0][0]

In [134]:
print("Original Sentence 1:", T1)
print("Original Sentence 2:", T2)
print("Cosine Similarity:", cosine_sim_org)
print('\n')
print("Processed Sentence 1:", processed_T1)
print("Processed Sentence 2:", processed_T2)
print("Cosine Similarity:", cosine_sim_pre)

Original Sentence 1: Students feel unhappy today about the class today
Original Sentence 2: Many students felt concepts of class test relevant
Cosine Similarity: 0.12735952979479354


Processed Sentence 1: student feel unhappi today class today
Processed Sentence 2: mani student felt concept class test relev
Cosine Similarity: 0.15592892548708365


## Task 5

Implement a program that calculates the sentence-to-sentence similarity as the result of the FuzzyWuzzy score of comparison of string of both sentences, after initial preprocessing and lemmatization using wordnet lemmatizer. Calculate the new similarity score between sentence T1 and T2.

### FuzzyWuzzy

In [135]:
!pip install fuzzywuzzy



In [35]:
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

#Lemmatization using WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_words_T1 = [lemmatizer.lemmatize(word) for word in filtered_words_T1]
lemmatized_words_T2 = [lemmatizer.lemmatize(word) for word in filtered_words_T2]

# Combining the preprocessed sentence
processed_T1 = " ".join(lemmatized_words_T1)
processed_T2 = " ".join(lemmatized_words_T2)

# Calculate FuzzyWuzzy similarity score
similarity_score = fuzz.token_set_ratio(processed_T1, processed_T2) / 100.00

print("Processed Sentence 1:", processed_T1)
print("Processed Sentence 2:", processed_T2)
print("FuzzyWuzzy Similarity Score:", similarity_score)

Processed Sentence 1: student feel unhappy today class today
Processed Sentence 2: many student felt concept class test relevant
FuzzyWuzzy Similarity Score: 0.58


## Task 6

Now consider a new sentence-to-sentence similarity where the similarity score is calculated as the cosine similarity of embedding vectors of the two sentences and where the embedding vector of each sentence is the average of FastText embedding vector of each word constituting the sentence prior to any pre-processing stage. Write a program that implements this similarity metric and compute the sentence-to-sentence similarity of T1 and T2.

### FastText Embedding

In [137]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Obtaining dependency information for pybind11>=2.2 from https://files.pythonhosted.org/packages/06/55/9f73c32dda93fa4f539fafa268f9504e83c489f460c380371d94296126cd/pybind11-2.11.1-py3-none-any.whl.metadata
  Using cached pybind11-2.11.1-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp311-cp311-macosx_14_0_arm64.whl size=291526 sha256=fadfb94d7fac60f6f35a209d94579dac4aeed08cd055804462721492185ebbb1
  Stored in directory: /Users/moinul/Library/Caches/pip/wheels/12/89/c9/c932d04c4dd65abe347bbb3e6f7668688

In [44]:
import fasttext
import fasttext.util

# fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')



In [101]:
import numpy as np

#Calcuting average for each sentence using Fasttext embeddings
def get_average_embedding(sentence, model):
    embeddings = []
    for word in sentence:
        if word in model:
            embeddings.append(model[word])
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.get_dimension())

embedding_T1 = get_average_embedding(words_T1, ft)
embedding_T2 = get_average_embedding(words_T2, ft)

# Reshape the embeddings for cosine similarity calculation
embedding_T1 = embedding_T1.reshape(1, -1)
embedding_T2 = embedding_T2.reshape(1, -1)

# Calculate cosine similarity between the sentence embeddings
similarity_score = cosine_similarity(embedding_T1, embedding_T2)[0][0]

print("Cosine Similarity of Sentence Embeddings:", similarity_score)

Cosine Similarity of Sentence Embeddings: 0.6717617


## Task 7

Repeat the above process when using Glove, word2vec embeddings.

### GloVe Embedding

In [142]:
#Loading GloVe word vectors
glove_embeddings = {}

with open('glove.6B.300d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split()
        word = parts[0]
        embedding = np.array(parts[1:], dtype=np.float32)
        glove_embeddings[word] = embedding


#Calcuting average for each sentence using GloVe embeddings
def get_average_embedding(sentence, embeddings):
    embeddings_list = [embeddings[word] for word in sentence if word in embeddings]
    if embeddings_list:
        return np.mean(embeddings_list, axis=0)
    else:
        return np.zeros(300)

embedding_T1 = get_average_embedding(words_T1, glove_embeddings)
embedding_T2 = get_average_embedding(words_T2, glove_embeddings)

# Reshape the embeddings for cosine similarity calculation
embedding_T1 = embedding_T1.reshape(1, -1)
embedding_T2 = embedding_T2.reshape(1, -1)

# Calculate cosine similarity
similarity_score = cosine_similarity(embedding_T1, embedding_T2)[0][0]

print("Cosine Similarity using GloVe embeddings:", similarity_score)

Cosine Similarity using GloVe embeddings: 0.7461909


### Word2Vec Embedding

In [144]:
!pip install gensim



In [42]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

w2v_model = api.load('word2vec-google-news-300')

In [43]:
#Calcuting average for each sentence using Word2Vec embeddings
def get_average_embedding(sentence, model):
    embeddings = []
    for word in sentence:
        if word in model:
            embeddings.append(model[word])
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

embedding_T1 = get_average_embedding(words_T1, w2v_model)
embedding_T2 = get_average_embedding(words_T2, w2v_model)

# Reshape the embeddings for cosine similarity calculation
embedding_T1 = embedding_T1.reshape(1, -1)
embedding_T2 = embedding_T2.reshape(1, -1)

# Calculate cosine similarity
similarity_score = cosine_similarity(embedding_T1, embedding_T2)[0][0]

print("Cosine Similarity using Word2Vec embeddings:", similarity_score)

Cosine Similarity using Word2Vec embeddings: 0.6702926


## Task 8

Consider the Quora question-answer pair in Question Pairs Dataset. Write a program that evaluate the sentence-to-sentence similarity using the five methods above (**Mihalacea, FuzzyWuzzy, FastText, Word2Vec, GloVe**) and calculate the score over all pairs, testing pairs. Compare your result with some of results reported in the repository.

In [410]:
import numpy as np
import pandas as pd

In [411]:
df=pd.read_csv('questions.csv')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [412]:
new_df=df.sample(3000,random_state=2)

In [413]:
def tokenize(text):
    return word_tokenize(text.lower())

new_df['Q1_tokens'] = new_df['question1'].apply(tokenize)
new_df['Q2_tokens'] = new_df['question2'].apply(tokenize)

In [425]:
#Mihalcea's Similary

def mihalcea_similarity(tokens_T1, tokens_T2):
    similarity_score = 0
    
    word_counts = {}
    for doc in [tokens_T1, tokens_T2]:
        for word in doc:
            word_counts[word] = word_counts.get(word, 0) + 1

    idf_values = {}
    total_docs = len(tokens_T1) + len(tokens_T2)
    
    for word, count in word_counts.items():
        idf_values[word] = math.log(total_docs / (count + 1)) 

    def max_similarity(word1, word2):
        max_sim = 0
        for synset1 in wn.synsets(word1):
            for synset2 in wn.synsets(word2):
                sim = synset1.wup_similarity(synset2)
                if sim is not None and sim > max_sim:
                    max_sim = sim
        return max_sim
    
    for word in set(tokens_T1):
        max_sim = 0
        for word2 in tokens_T2:  # Iterate over tokens_T2
            max_sim = max(max_sim, max_similarity(word, word2))
        similarity_score += (max_sim * idf_values.get(word, 0))

    for word in set(tokens_T2):
        max_sim = 0
        for word2 in tokens_T1:  # Iterate over tokens_T1
            max_sim = max(max_sim, max_similarity(word, word2))
        similarity_score += (max_sim * idf_values.get(word, 0))

    denominator = sum(idf_values.get(word, 0) for word in set(tokens_T1)) + sum(idf_values.get(word, 0) for word in set(tokens_T2))

    similarity_score /= (2 * denominator)
    
    return similarity_score

In [426]:
#FuzzyWuzzy Similarity

def fuzzywuzzy_similarity(q1,q2):
    return fuzz.token_set_ratio(q1, q2) / 100.00

In [427]:
#FastText

def fasttext_similarity(q1, q2):
    def get_average_embedding(sentence, model):
        embeddings = []
        for word in sentence:
            if word in model:
                embeddings.append(model[word])
        if len(embeddings) > 0:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(model.get_dimension())

    embedding_T1 = get_average_embedding(q1, ft)
    embedding_T2 = get_average_embedding(q2, ft)

    # Reshape the embeddings for cosine similarity calculation
    embedding_T1 = embedding_T1.reshape(1, -1)
    embedding_T2 = embedding_T2.reshape(1, -1)

    # Calculate cosine similarity between the sentence embeddings
    similarity_score = cosine_similarity(embedding_T1, embedding_T2)[0][0]
    return similarity_score

In [428]:
#GloVe
glove_embeddings = {}
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            word = parts[0]
            embedding = np.array(parts[1:], dtype=np.float32)
            glove_embeddings[word] = embedding

def glove_similarity(q1, q2):           
    def get_average_embedding(sentence, embeddings):
        embeddings_list = [embeddings[word] for word in sentence if word in embeddings]
        if embeddings_list:
            return np.mean(embeddings_list, axis=0)
        else:
            return np.zeros(300)
    
    q1_str = ' '.join(q1)
    q2_str = ' '.join(q2)
    
    embedding_T1 = get_average_embedding(q1_str.split(), glove_embeddings)
    embedding_T2 = get_average_embedding(q2_str.split(), glove_embeddings)


    embedding_T1 = embedding_T1.reshape(1, -1)
    embedding_T2 = embedding_T2.reshape(1, -1)

    similarity_score = cosine_similarity(embedding_T1, embedding_T2)[0][0]
    return similarity_score

In [429]:
#Word2Vec

def word2vec_similarity(q1, q2):
    def get_average_embedding(sentence, model):
        embeddings = []
        for word in sentence:
            if word in model:
                embeddings.append(model[word])
        if len(embeddings) > 0:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(model.vector_size)

    embedding_T1 = get_average_embedding(q1, w2v_model)
    embedding_T2 = get_average_embedding(q2, w2v_model)

    # Reshape the embeddings for cosine similarity calculation
    embedding_T1 = embedding_T1.reshape(1, -1)
    embedding_T2 = embedding_T2.reshape(1, -1)

    # Calculate cosine similarity
    similarity_score = cosine_similarity(embedding_T1, embedding_T2)[0][0]
    return similarity_score

In [430]:
#Calculate Similarity Scores of all 5 Methods

def calculate_similarity_scores(q1, q2):
    similarity_scores = []
    
#     mihalcea_score = mihalcea_similarity(q1, q2)
#     fuzzywuzzy_score = fuzzywuzzy_similarity(q1, q2)
        
    for i in range(len(q1)):
        q1_tokens = q1.iloc[i]  # Get the list of tokens for the current row
        q2_tokens = q2.iloc[i]  # Get the list of tokens for the current row
        mihalcea_score = mihalcea_similarity(q1_tokens, q2_tokens)
        fuzzywuzzy_score = fuzzywuzzy_similarity(q1_tokens, q2_tokens)
        fasttext_score = fasttext_similarity(q1_tokens, q2_tokens)
        glove_score = glove_similarity(q1_tokens, q2_tokens)
        word2vec_score = word2vec_similarity(q1_tokens, q2_tokens)
        similarity_scores.append([mihalcea_score, fuzzywuzzy_score, fasttext_score, glove_score, word2vec_score])
    return pd.DataFrame(similarity_scores, columns=['Mihalcea','FuzzyWuzzy','FastText','GloVe', 'Word2Vec'])

In [431]:
similarity_scores = calculate_similarity_scores(new_df['Q1_tokens'], new_df['Q2_tokens'])
similarity_scores

Unnamed: 0,Mihalcea,FuzzyWuzzy,FastText,GloVe,Word2Vec
0,0.221460,0.85,0.862519,0.878094,0.814089
1,0.224384,0.37,0.852516,0.910222,0.714067
2,0.252014,0.99,0.999985,0.998917,0.994769
3,0.316605,0.89,0.928288,0.915257,0.811866
4,0.305664,0.79,0.878342,0.927221,0.878093
...,...,...,...,...,...
2995,0.204544,0.53,0.822965,0.796477,0.536979
2996,0.309962,0.68,0.869877,0.903219,0.823236
2997,0.329448,0.84,0.977504,0.985161,0.962474
2998,0.269433,0.63,0.844196,0.915376,0.745623
