**I was able to complete most of the assignment. Some parts I was unsure how to do them**

In [1]:
import nltk
import numpy as np
import csv
from fuzzywuzzy import fuzz
from gensim.models import Word2Vec, FastText
from gensim.test.utils import common_texts
from nltk.tokenize import word_tokenize
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
genesis_ic = wn.ic(genesis, False, 0.0)

def preProcess(sentence):
    """Tokenize, remove stopwords, and clean the sentence."""
    Stopwords = list(set(nltk.corpus.stopwords.words('english')))
    words = word_tokenize(sentence)
    words = [word.lower() for word in words if word.isalpha() and word not in Stopwords] 
    return words

def get_wordnet_pos(word):
    """Map POS tag to first character for lemmatization with WordNet."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
    return tag_dict.get(tag, wn.NOUN)  

def wup(S1, S2):
    """Wu-Palmer similarity."""
    return S1.wup_similarity(S2)

def resnik(S1, S2):
    """Resnik similarity."""
    return S1.res_similarity(S2, genesis_ic)

options = {0: wup, 1: resnik}

def word_similarity(w1, w2, num):
    """Calculate similarity between two words only if they share the same POS."""
    pos1 = get_wordnet_pos(w1)
    pos2 = get_wordnet_pos(w2)

    synsets1 = wn.synsets(w1, pos=pos1)
    synsets2 = wn.synsets(w2, pos=pos2)
    
    if synsets1 and synsets2:
        S1 = synsets1[0]  
        S2 = synsets2[0]  
        try:
            similarity = options[num](S1, S2)
            if similarity:
                return round(similarity, 2)
        except nltk.corpus.reader.wordnet.WordNetError:
            return 0
    return 0

def Similarity(T1, T2, num):
    """Calculate sentence-to-sentence similarity using TF-IDF and WordNet similarity."""
    words1 = preProcess(T1)
    words2 = preProcess(T2)

    tf = TfidfVectorizer(use_idf=True)
    tf.fit_transform([' '.join(words1), ' '.join(words2)])
    
    Idf = dict(zip(tf.get_feature_names_out(), tf.idf_))
    
    Sim_score1 = 0
    Sim_score2 = 0

    for w1 in words1:
        Max = 0
        for w2 in words2:
            score = word_similarity(w1, w2, num)
            if Max < score:
                Max = score
        Sim_score1 += Max * Idf.get(w1, 0)
    Sim_score1 /= sum([Idf.get(w1, 0) for w1 in words1])

    for w2 in words2:
        Max = 0
        for w1 in words1:
            score = word_similarity(w1, w2, num)
            if Max < score:
                Max = score
        Sim_score2 += Max * Idf.get(w2, 0)
    Sim_score2 /= sum([Idf.get(w2, 0) for w2 in words2])

    Sim = (Sim_score1 + Sim_score2) / 2
    
    return round(Sim, 2)

model_w2v = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model_ft = FastText(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

def average_embedding(model, tokens):
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])

    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
        return avg_embedding
    else:
        return np.zeros(model.vector_size)

def cosine_sim(a, b):
    return cosine_similarity([a], [b])[0][0]

def fuzzywuzzy_similarity(sentence1, sentence2):
    return fuzz.ratio(sentence1, sentence2) / 100

In [3]:
file_path = 'msr_paraphrase_corpus.txt'

labels = []
similarity_scores_wup = []
similarity_scores_resnik = []
similarity_scores_w2v = []
similarity_scores_fasttext = []
similarity_scores_fuzzywuzzy = []

with open(file_path, 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader)
    
    for row in reader:
        if len(row) >= 5:
            label = int(row[0])
            sentence1 = row[3]
            sentence2 = row[4]

            labels.append(label)
            """Wu-Palmer and Resnik"""
            sim_score_wup = Similarity(sentence1, sentence2, num = 0)
            similarity_scores_wup.append(sim_score_wup)
            sim_score_resnik = Similarity(sentence1, sentence2, num = 1)
            similarity_scores_resnik.append(sim_score_resnik)

            """Word2Vec"""
            sentence1_tokens = word_tokenize(sentence1)
            sentence2_tokens = word_tokenize(sentence2)
            w2c_vector_1 = average_embedding(model_w2v, sentence1_tokens)
            w2c_vector_2 = average_embedding(model_w2v, sentence2_tokens)
            sim_score_w2v = cosine_sim(w2c_vector_1, w2c_vector_2)
            similarity_scores_w2v.append(sim_score_w2v)


            """FastText"""
            ft_vector_1 = average_embedding(model_ft, sentence1_tokens)
            ft_vector_2 = average_embedding(model_ft, sentence2_tokens)
            sim_score_ft = cosine_sim(ft_vector_1, ft_vector_2)
            similarity_scores_fasttext.append(sim_score_ft)

            """FuzzyWuzzy"""
            sim_score_fuzzywuzzy = fuzzywuzzy_similarity(sentence1, sentence2)
            similarity_scores_fuzzywuzzy.append(sim_score_fuzzywuzzy)

correlation_wup, p_value_wup = pearsonr(labels, similarity_scores_wup)
correlation_resnik, p_value_resnik = pearsonr(labels, similarity_scores_resnik)
correlation_w2v, p_value_w2v = pearsonr(labels, similarity_scores_w2v)
correlation_ft, p_value_ft = pearsonr(labels, similarity_scores_fasttext)
correlation_fuzzywuzzy, p_value_fuzzywuzzy = pearsonr(labels, similarity_scores_fuzzywuzzy)

print(f"Wup-sim Pearson correlation: {correlation_wup:.2f}, p-value: {p_value_wup:.5f}")
print(f"Resnik-sim Pearson correlation: {correlation_resnik:.2f}, p-value: {p_value_resnik:.5f}")

print(f"Word2Vec Pearson correlation: {correlation_w2v:.2f}, p-value: {p_value_w2v:.5f}")
print(f"FastText Pearson correlation: {correlation_ft:.2f}, p-value: {p_value_ft:.5f}")

print(f"FuzzyWuzzy Pearson correlation: {correlation_fuzzywuzzy:.2f}, p-value: {p_value_fuzzywuzzy:.5f}")

Wup-sim Pearson correlation: 0.27, p-value: 0.00000
Resnik-sim Pearson correlation: 0.09, p-value: 0.00000
Word2Vec Pearson correlation: 0.04, p-value: 0.01003
FastText Pearson correlation: 0.22, p-value: 0.00000
FuzzyWuzzy Pearson correlation: 0.37, p-value: 0.00000


**None of the methods provide good results based on the Pearson correlation**

In [4]:
labels = np.array(labels)
similarity_scores = np.array(similarity_scores_wup)

paraphrase_scores = similarity_scores[labels == 1]
non_paraphrase_scores = similarity_scores[labels == 0]

min_paraphrase = np.min(paraphrase_scores)
max_non_paraphrase = np.max(non_paraphrase_scores)

#Let's assign a threshold
threshold = (min_paraphrase + max_non_paraphrase) / 2
predictions = (similarity_scores >= threshold).astype(int)
conf_matrix = confusion_matrix(labels, predictions)
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[ 447  832]
 [ 468 2194]]


**True negative: 447; True positive: 2194, False negative 468, False positive 832**