In [1]:
import csv
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import pearsonr
import spacy
from itertools import product

nlp = spacy.load("en_core_web_md")

### Assignment 1.

In [3]:
stopwords = list(set(nltk.corpus.stopwords.words('english')))

def preProcess(sentence):
    """Tokenize, remove stopwords, and clean the sentence."""
    words = word_tokenize(sentence)
    words = [word.lower() for word in words if word.isalpha() and word not in stopwords] 
    return words

def get_wordnet_pos(word):
    """Map POS tag to first character for lemmatization with WordNet."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
    return tag_dict.get(tag, wn.NOUN)  

def word_similarity(w1, w2):
    """Calculate similarity between two words only if they share the same POS."""
    pos1 = get_wordnet_pos(w1)
    pos2 = get_wordnet_pos(w2)

    synsets1 = wn.synsets(w1, pos=pos1)
    synsets2 = wn.synsets(w2, pos=pos2)
    
    if synsets1 and synsets2:
        S1 = synsets1[0]  
        S2 = synsets2[0]  
        try:
            similarity = S1.wup_similarity(S2)
            if similarity:
                return round(similarity, 2)
        except nltk.corpus.reader.wordnet.WordNetError:
            return 0
    return 0

def sim1(T1, T2):
    """Calculate sentence-to-sentence similarity using TF-IDF and WordNet similarity."""
    words1 = preProcess(T1)
    words2 = preProcess(T2)

    tf = TfidfVectorizer(use_idf=True)
    tf.fit_transform([' '.join(words1), ' '.join(words2)])
    
    Idf = dict(zip(tf.get_feature_names_out(), tf.idf_))
    
    Sim_score1 = 0
    Sim_score2 = 0

    for w1 in words1:
        Max = 0
        for w2 in words2:
            score = word_similarity(w1, w2)
            if Max < score:
                Max = score
        Sim_score1 += Max * Idf.get(w1, 0)
    Sim_score1 /= sum([Idf.get(w1, 0) for w1 in words1])

    for w2 in words2:
        Max = 0
        for w1 in words1:
            score = word_similarity(w1, w2)
            if Max < score:
                Max = score
        Sim_score2 += Max * Idf.get(w2, 0)
    Sim_score2 /= sum([Idf.get(w2, 0) for w2 in words2])

    Sim = (Sim_score1 + Sim_score2) / 2
    
    return round(Sim, 2)

def read_from_csv(file_path):
    '''Read sentences and the corresponding similarity scores from a csv file'''
    sentences = []
    scores = []
    
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        next(reader) # Skip the header
        for row in reader:
            if len(row) == 3:
                sentence1, sentence2, score = row
                sentences.append((sentence1.strip(), sentence2.strip()))  # Append tuple of sentences
                scores.append(float(score.strip()))
    return sentences, scores

In [4]:
sentences, human_similarities = read_from_csv("STSS-131.csv");

computed_similarities_1 = []
for sentence1, sentence2 in sentences:
    score = sim1(sentence1, sentence2)
    computed_similarities_1.append(score)
    
print(f"List lengths: {len(sentences)}, {len(human_similarities)}, {len(computed_similarities_1)}")

pearson_coeff_1, p_value = pearsonr(human_similarities, computed_similarities_1)

print(f"Pearson correlation coefficient Sim1: {pearson_coeff_1:.2f}")

List lengths: 66, 66, 66
Pearson correlation coefficient Sim1: 0.55


### Assignment 2.

In [8]:
negations = {"not", "no", "never", "n't"}

# Change words to noun if possible 
def to_noun_form(tokens):
    noun_tokens = []
    for token, pos in pos_tag(tokens):
            if pos.startswith('VB') or pos.startswith('JJ') or pos.startswith('RB'):
                synsets = wn.synsets(token)
                noun_form = token
                for s in synsets:
                    for lemma in s.lemmas():
                        noun_synsets = wn.synsets(lemma.name(), wn.NOUN)
                        if noun_synsets:
                            noun_form = lemma.name()
                            break
                    if noun_form != token:
                        break
            else:
                noun_synsets = wn.synsets(token, wn.NOUN)
                noun_tokens.append(noun_synsets[0].lemmas()[0].name() if noun_synsets else token)
    return noun_tokens

# Handle negation by finding antonyms of adjectives/adverbs
def handle_negation(tokens):
    modified_tokens = []
    skip_next = False
    negated_adverbs = []
    pos_tags = pos_tag(tokens)
    for i, (token, pos) in enumerate(pos_tags):
        if token.lower() in negations:
            if i + 1 < len(pos_tags):
                next_token, next_pos = pos_tags[i + 1]
                if next_pos.startswith("JJ") or next_pos.startswith("RB"):
                    antonym = get_antonym(next_token)
                    negated_adverbs.append(antonym if antonym else next_token)
                    skip_next = True
                else:
                    modified_tokens.append(token)
        elif skip_next:
            skip_next = False
        else:
            modified_tokens.append(token)
    
    modified_tokens.extend(negated_adverbs)
    return modified_tokens

def get_antonym(token):
    for synset in wn.synsets(token):
        for lemma in synset.lemmas():
            if lemma.antonyms():
                return lemma.antonyms()[0].name()
    return None

# Clean the tokens, removes stopwords and unnecessary characters
def clean_tokens(tokens):
    return [token for token in tokens if token.isalnum() and token.lower() not in stopwords]

# Calculate Wu-Palmer simlarity for nouns
def wu_palmer_similarity(nouns1, nouns2):
    similarities = []
    for n1 in nouns1:
        synset1 = wn.synsets(n1, wn.NOUN)
        if not synset1:  
            continue
        max_similarity = 0
        for n2 in nouns2:
            synset2 = wn.synsets(n2, wn.NOUN)
            if not synset2:  
                continue
            similarity = synset1[0].wup_similarity(synset2[0]) or 0
            max_similarity = max(max_similarity, similarity)
        similarities.append(max_similarity)
    return sum(similarities) / len(similarities) if similarities else 0

# Named entity cosine similarity
def entity_similarity(named_entities1, named_entities2):
    if not (named_entities1 and named_entities2):
        return 0
    return max(ent1.similarity(ent2) for ent1 in named_entities1 for ent2 in named_entities2)

# Calculate Sim2
def sim2(sentence1, sentence2, alpha=0.5):
    doc1, doc2 = nlp(sentence1), nlp(sentence2)
    named_entities1, named_entities2 = [ent for ent in doc1.ents], [ent for ent in doc2.ents]
    
    tokens1, tokens2 = word_tokenize(sentence1), word_tokenize(sentence2)
    tokens1, tokens2 = handle_negation(tokens1), handle_negation(tokens2)
    tokens1, tokens2 = clean_tokens(to_noun_form(tokens1)), clean_tokens(to_noun_form(tokens2))
    
    if named_entities1 and named_entities2:
        entity_sim = entity_similarity(named_entities1, named_entities2)
        semantic_sim = wu_palmer_similarity(tokens1, tokens2)
        return alpha * entity_sim + (1 - alpha) * semantic_sim
    else:
        return wu_palmer_similarity(tokens1, tokens2)


# Test with 10 sentence pairs
test_pairs = [
    ("The city was noisy.", "The forest was silent."),
    ("Did you finish your homework?", "Have you completed your assignments?"),
    ("The cat sat on the warm windowsill.", "A cat rested on a cozy window ledge."),
    ("He does not like apples.", "He dislike apples."),
    ("The food was delicious.", "The meal was tasty."),
    ("The quick brown fox jumps over the lazy dog.", "A quick fox leaps over a lazy hound."),
    ("She is not happy with the results.", "She is sad with the results."),
    ("Apple Inc. released a new product.", "Google LLC announced their latest software."),
    ("He did not find the answer quickly.", "He found the answer slowly."),
    ("NASA announced a new space mission.", "The European Space Agency confirmed another mission."),
]

for i, (s1, s2) in enumerate(test_pairs):
    similarity = sim2(s1, s2)
    print(f"Sentence Pair {i+1}:")
    print(f"S1: {s1}")
    print(f"S2: {s2}")
    print(f"Similarity (Sim2): {similarity}\n")

Sentence Pair 1:
S1: The city was noisy.
S2: The forest was silent.
Similarity (Sim2): 0.13333333333333333

Sentence Pair 2:
S1: Did you finish your homework?
S2: Have you completed your assignments?
Similarity (Sim2): 0.7

Sentence Pair 3:
S1: The cat sat on the warm windowsill.
S2: A cat rested on a cozy window ledge.
Similarity (Sim2): 0.763157894736842

Sentence Pair 4:
S1: He does not like apples.
S2: He dislike apples.
Similarity (Sim2): 0.7316017316017316

Sentence Pair 5:
S1: The food was delicious.
S2: The meal was tasty.
Similarity (Sim2): 0.8333333333333334

Sentence Pair 6:
S1: The quick brown fox jumps over the lazy dog.
S2: A quick fox leaps over a lazy hound.
Similarity (Sim2): 0.7261904761904763

Sentence Pair 7:
S1: She is not happy with the results.
S2: She is sad with the results.
Similarity (Sim2): 1.0

Sentence Pair 8:
S1: Apple Inc. released a new product.
S2: Google LLC announced their latest software.
Similarity (Sim2): 0.269919322265519

Sentence Pair 9:
S1: He

### Assignment 3

In [9]:
computed_similarities_2 = []
for sentence1, sentence2 in sentences:
    score = sim2(sentence1, sentence2)
    computed_similarities_2.append(score)
    
df = pd.DataFrame({
    'Sentence 1': [s[0] for s in sentences],
    'Sentence 2': [s[1] for s in sentences],
    'Human Similarity': human_similarities,
    'Computed Similarity Sim1': computed_similarities_1,
    'Computed Similarity Sim2': computed_similarities_2
})

#'''You can see the table in the GitHub'''
#df.to_excel('similarities.xlsx', index=False)

pearson_coeff_2, p_value = pearsonr(human_similarities, computed_similarities_2)

print(f"Pearson correlation coefficient Sim1: {pearson_coeff_1:.2f}")
print(f"Pearson correlation coefficient Sim2: {pearson_coeff_2:.2f}")

Pearson correlation coefficient Sim1: 0.55
Pearson correlation coefficient Sim2: 0.31


The pearson correlation yields a worse answer. Meaning that the method of turning everything to nouns, the antonym preprocessing and using the named-entities preprocessing give us worse results than with just the preprocesses used in task 1. This may be due to the added complexity, which weren't accounted for in the human similarity judgments.
The preprocessing methods may also lead to a loss of important semantic information. Named entities are supposed to help by focusing on specific terms, but if they are not well aligned with the context of the sentences, they may add more noise rather than clarity. 