In [1]:
import gc
gc.collect()

0

In [2]:
import csv
import pandas as pd
import nltk
import spacy
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from datasets import load_dataset
from SOC_PMI.main import similarity

nlp = spacy.load("en_core_web_md")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
stopwords = list(set(nltk.corpus.stopwords.words('english')))

Duplicate Module doesn't exist


### Task 1.

In [3]:
def preProcess(sentence):
    """Preprocess a single sentence by tokenizing, converting to lowercase, and removing stop words."""
    tokenized_sentence = nltk.word_tokenize(sentence.lower())
    filtered_sentence = [word for word in tokenized_sentence if word not in stopwords]
    return filtered_sentence


def sim1(sentence_list):
    """Calculate sentence-to-sentence similarity using TF-IDF and WordNet similarity."""
    computed_similarities = []
    tf = TfidfVectorizer(use_idf=True)

    for T1, T2 in sentence_list:
        words1 = preProcess(T1)
        words2 = preProcess(T2)

        tf_matrix = tf.fit_transform([' '.join(words1), ' '.join(words2)])
        
        sim_score = cosine_similarity(tf_matrix[0:1], tf_matrix[1:2])[0][0]
        computed_similarities.append(round(sim_score, 2))

    return computed_similarities

def read_from_csv(file_path):
    """Read sentences and the corresponding similarity scores from a csv file using pandas."""
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path, delimiter=';', encoding='utf-8')
    
    # Ensure the DataFrame has the correct number of columns
    if df.shape[1] != 3:
        raise ValueError("CSV file must contain exactly 3 columns.")

    # Extract sentences and scores
    sentences = list(zip(df.iloc[:, 0].str.strip(), df.iloc[:, 1].str.strip()))
    scores = df.iloc[:, 2].astype(float).tolist()
    
    return sentences, scores

sentences_stss, human_similarities = read_from_csv("STSS-131.csv")

computed_similarities_1 = sim1(sentences_stss)

print(f"List lengths: {len(sentences_stss)}, {len(human_similarities)}, {len(computed_similarities_1)}")

pearson_coeff_1, _ = pearsonr(human_similarities, computed_similarities_1)

print(f"Pearson correlation coefficient Sim1: {pearson_coeff_1:.2f}")

List lengths: 66, 66, 66
Pearson correlation coefficient Sim1: 0.67


### Task 2.

In [4]:
negations = {"not", "no", "never"}

synset_cache = {}

def get_synsets(word):
    """Get synsets for a word using a cache to avoid recalculating."""
    if word not in synset_cache:
        synset_cache[word] = wn.synsets(word)
    return synset_cache[word]

def antonym(word):
    """Get the antonym of a word using WordNet."""
    synonyms = wn.synsets(word)
    for syn in synonyms:
        for lemma in syn.lemmas():
            if lemma.antonyms():
                return lemma.antonyms()[0].name()
    return word

def process_negation(tokens):
    """Process negation in tokens and convert adjectives/adverbs to their antonyms."""
    for i, token in enumerate(tokens):
        if token in negations:
            negation_index = i
            # Check for adjectives/adverbs after the negation
            for j in range(negation_index + 1, len(tokens)):
                if pos_tag([tokens[j]])[0][1].startswith('JJ') or pos_tag([tokens[j]])[0][1].startswith('RB'):
                    tokens[j] = antonym(tokens[j])  # Change to antonym
                    break
    return tokens

def preprocess_and_extract_nouns(sentence):
    """Preprocess sentence to extract noun entities and process negation."""
    tokens = nltk.word_tokenize(sentence.lower())
    tokens = process_negation(tokens)
    tokens = [token for token in tokens if token.isalnum() and token not in stopwords]
    
    doc = nlp(sentence)
    named_entities = [ent.text for ent in doc.ents]
    tagged_tokens = pos_tag(tokens)
    noun_tokens = []
    
    for token, tag in tagged_tokens:
        if tag.startswith('NN') or tag.startswith('VB'):
            noun_tokens.append(token)

    # Convert verbs, adjectives/adverbs to nouns using WordNet
    noun_tokens = list(set(noun_tokens))
    final_nouns = []
    for token in noun_tokens:
        synsets = wn.synsets(token)
        if synsets:
            final_nouns.append(synsets[0].lemmas()[0].name())

    return final_nouns, named_entities

def wu_palmer_similarity(nouns1, nouns2):
    """Calculate Wu-Palmer similarity for all noun pairs."""
    if not nouns1 or not nouns2:
        return 0.0
    similarity_scores = []
    for n1 in nouns1:
        for n2 in nouns2:
            syn1 = wn.synsets(n1)
            syn2 = wn.synsets(n2)
            if syn1 and syn2:
                sim_score = syn1[0].wup_similarity(syn2[0])
                if sim_score is not None:
                    similarity_scores.append(sim_score)
    return sum(similarity_scores) / len(similarity_scores) if similarity_scores else 0.0

def sim2(sentence_list):
    """Calculate similarity for a list of sentence pairs based on named entities and semantic similarity."""
    computed_similarities = []

    for S1, S2 in sentence_list:
        nouns1, named_entities1 = preprocess_and_extract_nouns(S1)
        nouns2, named_entities2 = preprocess_and_extract_nouns(S2)

        if named_entities1 and named_entities2:
            # Both sentences contain named entities
            entity_similarity = cosine_similarity(
                [nlp(' '.join(named_entities1)).vector],
                [nlp(' '.join(named_entities2)).vector]
            )[0][0]

            semantic_similarity = wu_palmer_similarity(nouns1, nouns2)
            final_similarity = 0.5 * entity_similarity + 0.5 * semantic_similarity
        elif not named_entities1 and not named_entities2:
            # No named entities in either sentence
            final_similarity = wu_palmer_similarity(nouns1, nouns2)
        else:
            # One sentence has named entities, discard named entities
            final_similarity = wu_palmer_similarity(nouns1, nouns2)

        computed_similarities.append(final_similarity)

    return computed_similarities

# Test with 10 sentence pairs
test_pairs = [
    ("The city was noisy.", "The forest was silent."),
    ("Did you finish your homework?", "Have you completed your assignments?"),
    ("The cat sat on the warm windowsill.", "A cat rested on a cozy window ledge."),
    ("He does not like apples.", "He dislike apples."),
    ("The food was delicious.", "The meal was tasty."),
    ("The quick brown fox jumps over the lazy dog.", "A quick fox leaps over a lazy hound."),
    ("She is not happy with the results.", "She is sad with the results."),
    ("Apple Inc. released a new product.", "Google LLC announced their latest software."),
    ("He did not find the answer quickly.", "He found the answer slowly."),
    ("NASA announced a new space mission.", "The European Space Agency confirmed another mission."),
]

computed_similarities = sim2(test_pairs)

for (S1, S2), sim2_score in zip(test_pairs, computed_similarities):
    print(f"Similarity between:\n'{S1}'\nand\n'{S2}'\nis: {sim2_score:.4f}\n")

Similarity between:
'The city was noisy.'
and
'The forest was silent.'
is: 0.3333

Similarity between:
'Did you finish your homework?'
and
'Have you completed your assignments?'
is: 0.4125

Similarity between:
'The cat sat on the warm windowsill.'
and
'A cat rested on a cozy window ledge.'
is: 0.3089

Similarity between:
'He does not like apples.'
and
'He dislike apples.'
is: 0.5212

Similarity between:
'The food was delicious.'
and
'The meal was tasty.'
is: 0.5417

Similarity between:
'The quick brown fox jumps over the lazy dog.'
and
'A quick fox leaps over a lazy hound.'
is: 0.3538

Similarity between:
'She is not happy with the results.'
and
'She is sad with the results.'
is: 1.0000

Similarity between:
'Apple Inc. released a new product.'
and
'Google LLC announced their latest software.'
is: 0.2584

Similarity between:
'He did not find the answer quickly.'
and
'He found the answer slowly.'
is: 0.4510

Similarity between:
'NASA announced a new space mission.'
and
'The European Spac

### Task 3

In [5]:
computed_similarities_2 = sim2(sentences_stss)

df = pd.DataFrame({
    'Sentence 1': [s[0] for s in sentences_stss],
    'Sentence 2': [s[1] for s in sentences_stss],
    'Human Similarity': human_similarities,
    'Computed Similarity Sim1': computed_similarities_1,
    'Computed Similarity Sim2': computed_similarities_2
})

#'''You can see the table in the GitHub'''
#df.to_excel('similarities.xlsx', index=False)

pearson_coeff_2, _ = pearsonr(human_similarities, computed_similarities_2)

print(f"Pearson correlation coefficient Sim1: {pearson_coeff_1:.2f}")
print(f"Pearson correlation coefficient Sim2: {pearson_coeff_2:.2f}")

Pearson correlation coefficient Sim1: 0.67
Pearson correlation coefficient Sim2: 0.32


The pearson correlation yields a worse answer. Meaning that the method of turning everything to nouns, the antonym preprocessing and using the named-entities preprocessing give us worse results than with just the preprocesses used in task 1. This may be due to the added complexity, which weren't accounted for in the human similarity judgments.
The preprocessing methods may also lead to a loss of important semantic information. Named entities are supposed to help by focusing on specific terms, but if they are not well aligned with the context of the sentences, they may add more noise rather than clarity. 

### Task 4


In [6]:
def compute_similarity_doc2vec(sentence_list, epochs=200):
    """Train a Doc2Vec model using a list of sentence pairs."""

    tagged_data = [TaggedDocument(words=preProcess(s[0]) + preProcess(s[1]), tags=[str(i)]) for i, s in enumerate(sentence_list)]

    doc2vec_model = Doc2Vec(vector_size=200, alpha=0.025, min_alpha=0.00025, min_count=1, dm=1, epochs=epochs)
    doc2vec_model.build_vocab(tagged_data)
    doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

    computed_similarities_doc2vec = []
    for sentence1, sentence2 in sentence_list:
        try:
            vec1 = doc2vec_model.infer_vector(preProcess(sentence1))
            vec2 = doc2vec_model.infer_vector(preProcess(sentence2))
            similarity = cosine_similarity([vec1], [vec2])[0][0]
            computed_similarities_doc2vec.append(similarity)
        except Exception as e:
            print(f"Error processing pair ({sentence1}, {sentence2}): {e}")

    return computed_similarities_doc2vec, doc2vec_model

computed_similarities_doc2vec, model = compute_similarity_doc2vec(sentences_stss)

pearson_coeff_doc2vec = pearsonr(human_similarities, computed_similarities_doc2vec)[0]
print(f"Pearson correlation coefficient with Doc2Vec: {pearson_coeff_doc2vec:.2f}")

Pearson correlation coefficient with Doc2Vec: 0.65


By tuning the epochs, we got a better pearson correlation. With only 100 epochs, the coefficient was 0.12. But after 200 epochs the coefficient drops again, indicating overfitting. 

In [7]:
def compute_spacy_embeddings(sentence_list):
    """Compute SpaCy embeddings for a list of sentence pairs."""
    computed_similarities = []
    for sentence1, sentence2 in sentence_list:
        # Generate embeddings using SpaCy
        vec1 = nlp(sentence1).vector
        vec2 = nlp(sentence2).vector
        # Calculate cosine similarity
        similarity = cosine_similarity([vec1], [vec2])[0][0]
        computed_similarities.append(similarity)
    return computed_similarities

computed_similarities_spacy_e = compute_spacy_embeddings(sentences_stss)
pearson_coeff_spacy_e = pearsonr(human_similarities, computed_similarities_spacy_e)[0]
print(f"Pearson correlation coefficient with SpaCy embedding: {pearson_coeff_spacy_e:.2f}")

Pearson correlation coefficient with SpaCy embedding: 0.54


In [8]:
def compute_distilbert_embeddings(sentence_list):
    """Compute DistilBERT embeddings for a list of sentence pairs."""
    computed_similarities = []
    for sentence1, sentence2 in sentence_list:
        inputs1 = tokenizer(sentence1, padding=True, truncation=True, return_tensors='pt')
        inputs2 = tokenizer(sentence2, padding=True, truncation=True, return_tensors='pt')
        
        with torch.no_grad():
            outputs1 = distilbert_model(**inputs1)
            outputs2 = distilbert_model(**inputs2)

        vec1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
        vec2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

        similarity = cosine_similarity([vec1], [vec2])[0][0]
        computed_similarities.append(similarity)
    return computed_similarities

computed_similarities_distilbert_e = compute_distilbert_embeddings(sentences_stss)
pearson_coeff_distilbert_e = pearsonr(human_similarities, computed_similarities_distilbert_e)[0]
print(f"Pearson correlation coefficient with DistilBERT embedding: {pearson_coeff_distilbert_e:.2f}")

Pearson correlation coefficient with DistilBERT embedding: 0.84


In [9]:
#def compute_similarity_use(sentence_list):
#    """Compute cosine similarity for a list of sentence pairs using the Universal Sentence Encoder."""
#    computed_similarities = []
#    
#    for sentence1, sentence2 in sentence_list:
#        embeddings = embed([sentence1, sentence2]).numpy()
#        
#        similarity = cosine_similarity(embeddings)[0, 1]
#        computed_similarities.append(similarity)
#
#   return computed_similarities
  

#computed_similarities_use = compute_similarity_use(sentences_stss)
#pearson_coeff_use, _ = pearsonr(human_similarities, computed_similarities_use)
#print(f"Pearson correlation coefficient with Universal Sentence Encoder: {pearson_coeff_use:.2f}")

### Task 5

In [10]:
ds = load_dataset("SemRel/SemRel2024", "eng")

print(ds)

datasets = ["train", "test", "dev"]

def extract_sentences_and_labels(dataset_name):
    '''Extract sentences and labels from a dataset from SemRel2024'''
    
    dataset = ds[dataset_name].shuffle(seed=42)
    dataset = dataset.select(range(200))
    
    sentences = []
    labels = []

    for item in dataset:
        sentence1 = item['sentence1'].strip()
        sentence2 = item['sentence2'].strip()
        label = float(item['label'])

        sentences.append((sentence1, sentence2))
        labels.append(label)
        
    print(f"Extracted {len(sentences)} sentence pairs from the {dataset_name} set.")
    return sentences, labels 

results = {}
stored_scores = {}

for dataset in datasets:
    '''Use all the previous methods for SemRel2024 datasets'''

    sentences, labels = extract_sentences_and_labels(dataset)

    sim1_scores = sim1(sentences)
    sim2_scores = sim2(sentences)
    doc2vec_scores, model = compute_similarity_doc2vec(sentences, 150)
    spacy_scores = compute_spacy_embeddings(sentences)
    distilbert_scores = compute_distilbert_embeddings(sentences)
    #use_scores = compute_similarity_use(sentences)

    stored_scores[dataset] = {
        'doc2vec': doc2vec_scores,
        'spacy': spacy_scores,
        'distilbert': distilbert_scores
    }

    # Calculate Pearson correlation coefficients
    sim1_corr = pearsonr(sim1_scores, labels)[0]
    sim2_corr = pearsonr(sim2_scores, labels)[0]
    doc2vec_corr = pearsonr(doc2vec_scores, labels)[0]
    spacy_corr = pearsonr(spacy_scores, labels)[0]
    distilbert_corr = pearsonr(distilbert_scores, labels)[0]
    #use_corr = pearsonr(use_scores, labels)[0]
        
    results[dataset] = {
        'sim1': sim1_corr,
        'sim2': sim2_corr,
        'doc2vec': doc2vec_corr,
        'SpaCy': spacy_corr,
        'DistilBERT': distilbert_corr
        #'use': use_corr,
    }
    
for dataset, correlations in results.items():
    print(f"{dataset.capitalize()} Results:")
    for method, corr in correlations.items():
        print(f"  {method}: {corr:.2f}")

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 5500
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2600
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 250
    })
})
Extracted 200 sentence pairs from the train set.
Extracted 200 sentence pairs from the test set.
Extracted 200 sentence pairs from the dev set.
Train Results:
  sim1: 0.62
  sim2: 0.37
  doc2vec: 0.39
  SpaCy: 0.41
  DistilBERT: 0.59
Test Results:
  sim1: 0.69
  sim2: 0.52
  doc2vec: 0.53
  SpaCy: 0.42
  DistilBERT: 0.71
Dev Results:
  sim1: 0.68
  sim2: 0.42
  doc2vec: 0.45
  SpaCy: 0.39
  DistilBERT: 0.64


### Task 6

We need to weight the different models and see which weights produce the biggest coefficient. The Distilbert seemed to give the best answer constistently so let's weight that more. SpaCy and Doc2Vec performed around as well.

In [11]:
weights = [0.25, 0.25, 0.5]

for dataset in datasets:
    '''Use all the previous methods for SemRel2024 datasets'''
    sentences, labels = extract_sentences_and_labels(dataset)
    
    doc2vec_scores = stored_scores[dataset]['doc2vec']
    spacy_scores = stored_scores[dataset]['spacy']
    distilbert_scores = stored_scores[dataset]['distilbert']
    
    ensemble_scores = [
        sum(w * sim for w, sim in zip(weights, similarities))
        for similarities in zip(doc2vec_scores, spacy_scores, distilbert_scores)
    ]
    ensemble_corr = pearsonr(ensemble_scores, labels)[0]
    results[dataset] = {
        'Ensemble': ensemble_corr
    }
    
for dataset, correlations in results.items():
    print(f"{dataset.capitalize()} Results:")
    for method, corr in correlations.items():
        print(f"  {method}: {corr:.2f}")

Extracted 200 sentence pairs from the train set.
Extracted 200 sentence pairs from the test set.
Extracted 200 sentence pairs from the dev set.
Train Results:
  Ensemble: 0.63
Test Results:
  Ensemble: 0.73
Dev Results:
  Ensemble: 0.68


This performs (slightly) better than any other method.

### Task 7 & 8

In [None]:
SOC_similarities = []
for S1, S2 in sentences_stss:
    sim_score = similarity(S1, S2) # Call the similarity from the provided repository
    SOC_similarities.append(sim_score)
    
SOC_coefficient = pearsonr(SOC_similarities, human_similarities)[0]
print(f"Pearson correlation coefficient SOC-PMI-Short-Text-Similarity-: {pearson_coeff_1:.2f}")

Pearson correlation coefficient SOC-PMI-Short-Text-Similarity-: 0.67


### Task 9 (Interface)

In [16]:
def main():
    print("Welcome to the Sentence Similarity Checker!")
    while True:
        print("\nPlease enter a pair of sentences:")
        sentence1 = input("Sentence 1: ")
        sentence2 = input("Sentence 2: ")
        
        # Call similarity functions
        sentences_pair = [(sentence1, sentence2)]
        computed_similarities_1 = sim1(sentences_pair)
        computed_similarities_2 = sim2(sentences_pair)
        computed_similarities_doc2vec, _ = compute_similarity_doc2vec(sentences_pair)
        computed_similarities_spacy = compute_spacy_embeddings(sentences_pair)
        computed_similarities_distilbert = compute_distilbert_embeddings(sentences_pair)

        # Display results
        print("You input sentences: \"", sentence1, "\" and \"", sentence2, "\"")
        print("Similarity Scores:")
        print(f"Sim1 (TF-IDF + WordNet): {computed_similarities_1[0]:.4f}")
        print(f"Sim2 (Wu-Palmer + Named Entities): {computed_similarities_2[0]:.4f}")
        print(f"Doc2Vec Similarity: {computed_similarities_doc2vec[0]:.4f}")
        print(f"SpaCy Similarity: {computed_similarities_spacy[0]:.4f}")
        print(f"DistilBERT Similarity: {computed_similarities_distilbert[0]:.4f}")
        
        cont = input("\nDo you want to check another pair? (y/n): ")
        if cont.lower() != 'y':
            break

if __name__ == "__main__":
    main()

Welcome to the Sentence Similarity Checker!

Please enter a pair of sentences:
You input sentences: " The cat lounged on the windowsill, basking in the warm sunlight. " and " The dog rested on the porch, enjoying the cool evening breeze. "
Similarity Scores:
Sim1 (TF-IDF + WordNet): 0.0000
Sim2 (Wu-Palmer + Named Entities): 0.2712
Doc2Vec Similarity: 0.9702
SpaCy Similarity: 0.8733
DistilBERT Similarity: 0.9402

Please enter a pair of sentences:
You input sentences: " The cat curled up on the couch and fell asleep. " and " The cat snuggled on the couch and drifted off to sleep. "
Similarity Scores:
Sim1 (TF-IDF + WordNet): 0.2500
Sim2 (Wu-Palmer + Named Entities): 0.2492
Doc2Vec Similarity: 0.8576
SpaCy Similarity: 0.9757
DistilBERT Similarity: 0.9659
