In [9]:
import csv
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import pearsonr

In [13]:
def preProcess(sentence):
    """Tokenize, remove stopwords, and clean the sentence."""
    Stopwords = list(set(nltk.corpus.stopwords.words('english')))
    words = word_tokenize(sentence)
    words = [word.lower() for word in words if word.isalpha() and word not in Stopwords] 
    return words

def get_wordnet_pos(word):
    """Map POS tag to first character for lemmatization with WordNet."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
    return tag_dict.get(tag, wn.NOUN)  

def word_similarity(w1, w2):
    """Calculate similarity between two words only if they share the same POS."""
    pos1 = get_wordnet_pos(w1)
    pos2 = get_wordnet_pos(w2)

    synsets1 = wn.synsets(w1, pos=pos1)
    synsets2 = wn.synsets(w2, pos=pos2)
    
    if synsets1 and synsets2:
        S1 = synsets1[0]  
        S2 = synsets2[0]  
        try:
            similarity = S1.wup_similarity(S2)
            if similarity:
                return round(similarity, 2)
        except nltk.corpus.reader.wordnet.WordNetError:
            return 0
    return 0

def Similarity(T1, T2):
    """Calculate sentence-to-sentence similarity using TF-IDF and WordNet similarity."""
    words1 = preProcess(T1)
    words2 = preProcess(T2)

    tf = TfidfVectorizer(use_idf=True)
    tf.fit_transform([' '.join(words1), ' '.join(words2)])
    
    Idf = dict(zip(tf.get_feature_names_out(), tf.idf_))
    
    Sim_score1 = 0
    Sim_score2 = 0

    for w1 in words1:
        Max = 0
        for w2 in words2:
            score = word_similarity(w1, w2)
            if Max < score:
                Max = score
        Sim_score1 += Max * Idf.get(w1, 0)
    Sim_score1 /= sum([Idf.get(w1, 0) for w1 in words1])

    for w2 in words2:
        Max = 0
        for w1 in words1:
            score = word_similarity(w1, w2)
            if Max < score:
                Max = score
        Sim_score2 += Max * Idf.get(w2, 0)
    Sim_score2 /= sum([Idf.get(w2, 0) for w2 in words2])

    Sim = (Sim_score1 + Sim_score2) / 2
    
    return round(Sim, 2)

def read_from_csv(file_path):
    '''Read sentences and the corresponding similarity scores from a csv file'''
    sentences = []
    scores = []
    
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        next(reader) # Skip the header
        for row in reader:
            if len(row) == 3:
                sentence1, sentence2, score = row
                sentences.append((sentence1.strip(), sentence2.strip()))  # Append tuple of sentences
                scores.append(float(score.strip()))
    return sentences, scores

In [16]:
sentences, human_similarities = read_from_csv("STSS-131.csv");

computed_similarities = []
for sentence1, sentence2 in sentences:
    score = Similarity(sentence1, sentence2)
    computed_similarities.append(score)
    
print(f"List lengths: {len(sentences)}, {len(human_similarities)}, {len(computed_similarities)}")

df = pd.DataFrame({
    'Sentence 1': [s[0] for s in sentences],
    'Sentence 2': [s[1] for s in sentences],
    'Human Similarity': human_similarities,
    'Computed Similarity': computed_similarities
})

'''You can see the table in the GitHub'''
df.to_excel('similarities.xlsx', index=False)

correlation_coefficient, p_value = pearsonr(human_similarities, computed_similarities)

print(f"Pearson correlation coefficient: {correlation_coefficient:.2f}")

List lengths: 66, 66, 66
Pearson correlation coefficient: 0.55


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5
