# IHLT final project - Semantic Textual Similarity
Nikita Belooussov and Santiago del Rey Juárez

## Introduction

In [31]:
# requires visual studios builder from https://visualstudio.microsoft.com/visual-cpp-build-tools/
!pip install contractions
!pip install num2words



You should consider upgrading via the 'C:\Users\santi\Projects\IHLT\IHLT_Project\.venv\Scripts\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\santi\Projects\IHLT\IHLT_Project\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [32]:
import csv
import os
import pickle
import string

import contractions
import nltk
import num2words
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.corpus import wordnet_ic
from nltk.corpus.reader import WordNetError
from nltk.metrics import jaccard_distance
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LassoCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('conll2000')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet_ic')
nltk.download('wordnet')

contractions.add('U.S.', 'United States')
contractions.add('U.S.A', 'United States of America')
contractions.add('E.U.', 'European Union')

#if this does not work run python -m spacy download en in terminal and restart the program running the code
nlp = spacy.load("en_core_web_sm")

brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Pac

### Remove contractions

In [33]:
def expand_contractions(s0, s1):
    s0 = contractions.fix(s0)
    s1 = contractions.fix(s1)
    return s0, s1

### Change numbers to words

In [34]:
def changeNums(s0):
    s0=s0.split()
    new_s0=[]
    for i in s0:
        if i.isdigit():
            new_s0.append(num2words.num2words(i))
        else:
            new_s0.append(i)
    s0 = ' '.join(new_s0)
    return s0

### Tokenize

In [35]:
punct = string.punctuation


def tokenize(sentence):
    return [w.lower() for w in nltk.word_tokenize(sentence) if
            not all(c in punct for c in w) and w.lower() not in stopwords.words('english')]


def name_entity_tokenization(sentence):
    doc = nlp(sentence.lower())
    with doc.retokenize() as retokenizer:
        tokens = [token for token in doc]
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end],
                              attrs={"LEMMA": " ".join([tokens[i].text for i in range(ent.start, ent.end)])})
    s0_ne = [token.text for token in doc]
    return s0_ne

### Lemmatize

In [36]:
from nltk.corpus import wordnet

wnl = nltk.stem.WordNetLemmatizer()


def lemmatize(pair):
    if pair[1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
        if pair[1][
            0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
            return wnl.lemmatize(pair[0].lower(), pos=wordnet.ADJ)
        return wnl.lemmatize(pair[0].lower(), pos=pair[1][0].lower())
    return pair[0]


def lemmatize_sentence(words):
    pairs = nltk.pos_tag(words)
    lemmas = [lemmatize(pair) for pair in pairs]
    return lemmas

### Synset similarity

In [37]:
def get_wordnet_similarity(s0, s1, method, ic):
    if s0 is not None and s1 is not None:
        if method == 'path':
            return s0.path_similarity(s1)
        elif method == 'wup':
            return s0.wup_similarity(s1)
        elif s0.pos() == s1.pos():
            if method == "lch":
                return s0.lch_similarity(s1)
            elif ic is None:
                raise ValueError("ic parameter is missing")
            elif method == "lin":
                try:
                    return s0.lin_similarity(s1, ic)
                except WordNetError:
                    return None
            elif method == 'res':
                try:
                    return s0.res_similarity(s1, ic)
                except WordNetError:
                    return None
            else:
                return None
        else:
            return None
    else:
        return None


# Dictionary used to store already computed synsets
try:
    with open('synset_dic.pkl', 'rb') as file:
        computed_synsets = pickle.load(file)
except IOError:
    computed_synsets = {}


def max_similarity(s0, s1, method, ic):
    if s0 == s1:
        return 1

    if (s0, s1, method) in computed_synsets:
        return computed_synsets[(s0, s1, method)]

    synsets0 = wordnet.synsets(s0)
    synsets1 = wordnet.synsets(s1)

    similarities = []
    for syn0 in synsets0:
        for syn1 in synsets1:
            similarity = get_wordnet_similarity(syn0, syn1, method, ic)
            if similarity is not None:
                similarities.append(similarity)

    if len(similarities) > 0:
        max_sim = max(similarities)
        computed_synsets[(s0, s1, method)] = max_sim
        return max_sim
    else:
        computed_synsets[(s0, s1, method)] = 0
        return 0


def mean_simimilarity(lemmas0, lemmas1, method, ic):
    similarity_sum = 0
    for l0 in lemmas0:
        similarity_sum += max([max_similarity(l0, l1, method, ic) for l1 in lemmas1])
    return similarity_sum / len(lemmas0)


def synset_similarity(lemmas0, lemmas1, method, ic=None):
    mean_sim0 = mean_simimilarity(lemmas0, lemmas1, method, ic)
    mean_sim1 = mean_simimilarity(lemmas1, lemmas0, method, ic)

    if mean_sim0 > 0 or mean_sim1 > 0:
        return mean_sim0 + mean_sim1 / 2
    else:
        return 0

### Lesk similarity

In [38]:
def lesk_similarity(words0, words1):
    w0_pos = nltk.pos_tag(words0)
    w1_pos = nltk.pos_tag(words1)

    s0_lesk = []
    for i in range(len(w0_pos)):
        if w0_pos[i][1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
            if w0_pos[i][1][
                0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
                s0_lesk.append(nltk.wsd.lesk(words0, w0_pos[i][0], pos=wordnet.ADJ))
            else:
                s0_lesk.append(nltk.wsd.lesk(words0, w0_pos[i][0], pos=w0_pos[i][1][0].lower()))

    s1_lesk = []
    for i in range(len(w1_pos)):
        if w1_pos[i][1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
            if w1_pos[i][1][
                0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
                s1_lesk.append(nltk.wsd.lesk(words1, w1_pos[i][0], pos=wordnet.ADJ))
            else:
                s1_lesk.append(nltk.wsd.lesk(words1, w1_pos[i][0], pos=w1_pos[i][1][0].lower()))

    return 1 - jaccard_distance(set(s0_lesk), set(s1_lesk))

### Jaccard similarity

In [39]:
def jaccard_similarity(words0, words1):
    return 1 - jaccard_distance(set(words0), set(words1))

### Synonyms


In [40]:
def calc_syn(s0, s1):
    if len(s1) < len(s0):
        s0, s1 = s1, s0
    #s0, s1 = expand_contractions(s0, s1)
    #s0 = tokenize(s0)
    #s1 = tokenize(s1)
    #s0 = lemmatize_sentence(s0)
    #s1 = lemmatize_sentence(s1)

    synonyms1 = []
    synonyms2 = []
    for i in s0:
        synonyms1 = [*synonyms1, *wordnet.synsets(i)]
    for i in s1:
        synonyms2 = [*synonyms2, *wordnet.synsets(i)]

    count = 0
    for i in synonyms1:
        if i in synonyms2:
            count = count + 1
    if (len(synonyms1) != 0) and (len(synonyms2) != 0):
        return count / len(synonyms1)
    else:
        return 0

'\ninputTexts = []\ndataPath = os.path.join(\'data\', \'train\')\nx = []\nfor filename in os.listdir(dataPath):\n    if "STS.input" in filename:\n        print(os.path.join(dataPath, filename))\n        file = os.path.join(dataPath, filename)\n        text = pd.read_csv(file, sep=\'\t\', lineterminator=\'\n\', names=[\'sentence0\', \'sentence1\'], header=None,\n                           quoting=csv.QUOTE_NONE)\n        for i in range(len(text[\'sentence0\'])):\n            metric = calc_syn(text[\'sentence0\'][i], text[\'sentence1\'][i])\n            x.append(metric)\n        inputTexts.append(text)\n#print ("avg:")\nnumSen = len(x)\nx = np.array(x)\n#print(sum(x[:, 0]) / numSen)\n#print(sum(x[:, 1]) / numSen)\n#print(sum(x[:, 2]) / numSen)\n#print(sum(x[:, 3]) / numSen)\n#print(sum(x[:, 4]) / numSen)\n#print(sum(x[:, 5]) / numSen)'

### TF IDF and cosine

In [41]:
def tf_similarity(s0, s1):
    # Generate the tf-idf vectors for the corpus
    words0 = ' '.join([str(elem) for elem in s0])
    words1 = ' '.join([str(elem) for elem in s1])

    tfvec = TfidfVectorizer()
    tfidf_matrix = tfvec.fit_transform([words0, words1])

    return cosine_similarity(tfidf_matrix, tfidf_matrix)[0, 1]

### N-Gram similarity

In [42]:
def compute_n_grams(s0, s1):
    #words1=[word for word in s0.split(" ") if word not in set(stopwords.words('english'))]
    low_size = 5
    if len(s1) < len(s0):
        s0, s1 = s1, s0
    if len(s0) < 5:
        low_size = len(s0)
    #s0, s1 = expand_contractions(s0, s1)
    #s0 = tokenize(s0)
    #s1 = tokenize(s1)
    #s0 = lemmatize_sentence(s0)
    #s1 = lemmatize_sentence(s1)
    #print (s0)
    #print (s1)
    #print("Sentence after removing stopwords:",s0)
    metrics = [0, 0, 0, 0, 0, 0]
    for i in range(2, low_size):
        n_grams1 = zip(*[s0[k:] for k in range(0, i)])
        n_grams2 = zip(*[s1[k:] for k in range(0, i)])
        count = 0

        n_grams1 = [' '.join(ngram) for ngram in n_grams1]
        n_grams2 = [' '.join(ngram) for ngram in n_grams2]
        #print (set(nGrams2))
        #print (set(nGrams1))
        for j in set(n_grams1):
            if j in set(n_grams2):
                count += 1
        if (len(n_grams1) != 0) and (len(n_grams2) != 0):
            #print(1-jaccard_distance(set(nGrams1), set(nGrams2)))
            #print (count)
            metrics[(i - 2) * 2] = count / len(set(n_grams1))
            metrics[((i - 2) * 2) + 1] = 1 - jaccard_distance(set(n_grams1), set(n_grams2))
    return metrics

'\ninputTexts = []\ndataPath = os.path.join(\'data\', \'train\')\nx = []\nfor filename in os.listdir(dataPath):\n    if "STS.input" in filename:\n        print(os.path.join(dataPath, filename))\n        file = os.path.join(dataPath, filename)\n        text = pd.read_csv(file, sep=\'\t\', lineterminator=\'\n\', names=[\'sentence0\', \'sentence1\'], header=None,\n                           quoting=csv.QUOTE_NONE)\n        for i in range(len(text[\'sentence0\'])):\n            metric = compute_n_grams(text[\'sentence0\'][i], text[\'sentence1\'][i])\n            x.append(metric)\n        inputTexts.append(text)\n#print ("avg:")\nnumSen = len(x)\nx = np.array(x)\nprint(sum(x[:, 0]) / numSen)\nprint(sum(x[:, 1]) / numSen)\nprint(sum(x[:, 2]) / numSen)\nprint(sum(x[:, 3]) / numSen)\nprint(sum(x[:, 4]) / numSen)\nprint(sum(x[:, 5]) / numSen)'

### Compute similarity

In [43]:
N_SYMBOLS = 50


def compute_similarity(x):
    features = []
    n_samples = x.shape[0]
    perc = round(0.02 * n_samples)
    counter = 0
    progress = 0
    for sentence_0, sentence_1 in x:
        sentence_0=changeNums(sentence_0)
        sentence_1=changeNums(sentence_1)
        sentence_0, sentence_1 = expand_contractions(sentence_0, sentence_1)
        words0 = tokenize(sentence_0)
        words1 = tokenize(sentence_1)
        s0_lemmas = lemmatize_sentence(words0)
        s1_lemmas = lemmatize_sentence(words1)
        s0_ne = name_entity_tokenization(sentence_0)
        s1_ne = name_entity_tokenization(sentence_1)
        n_grams_results = compute_n_grams(s0_lemmas, s1_lemmas)
        features.append([
            jaccard_similarity(words0, words1),
            jaccard_similarity(s0_lemmas, s1_lemmas),
            jaccard_similarity(s0_ne, s1_ne),
            tf_similarity(words0, words1),
            tf_similarity(s0_lemmas, s1_lemmas),
            synset_similarity(s0_lemmas, s1_lemmas, 'path'),
            synset_similarity(s0_lemmas, s1_lemmas, 'lch'),
            synset_similarity(s0_lemmas, s1_lemmas, 'wup'),
            synset_similarity(s0_lemmas, s1_lemmas, 'lin', brown_ic),
            synset_similarity(s0_lemmas, s1_lemmas, 'lin', semcor_ic),
            synset_similarity(s0_lemmas, s1_lemmas, 'res', brown_ic),
            synset_similarity(s0_lemmas, s1_lemmas, 'res', semcor_ic),
            lesk_similarity(words0, words1),
            n_grams_results[0],
            n_grams_results[1],
            n_grams_results[2],
            n_grams_results[3],
            n_grams_results[4],
            n_grams_results[5],
            calc_syn(s0_lemmas, s1_lemmas)
        ])

        progress = print_progress(counter, perc, progress)
        counter += 1

    print()
    return np.array(features, dtype=np.float64)


def print_progress(counter, perc, progress):
    if (counter % perc) == 0:
        print('<' + '#' * progress + '.' * (N_SYMBOLS - progress) + '>', end='\r')
        return progress + 1
    return progress

### Read data

In [44]:
def read_file(file_path):
    return pd.read_csv(file_path, sep='\t', lineterminator='\n', header=None,
                       quoting=csv.QUOTE_NONE)

In [45]:
#Train data
dataPath = os.path.join('data', 'train')
train_data = None
for filename in sorted(os.listdir(dataPath)):
    if "STS.input" in filename:
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if train_data is None:
            train_data = data
        else:
            train_data = np.concatenate((train_data, data))

y_train = None
for filename in sorted(os.listdir(dataPath)):
    if "STS.gs" in filename:
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if y_train is None:
            y_train = data
        else:
            y_train = np.concatenate((y_train, data))

y_train = y_train.ravel()

##Test data
dataPath = os.path.join('data', 'test-gold')
test_data = None
for filename in sorted(os.listdir(dataPath)):
    if "STS.input" in filename:
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if test_data is None:
            test_data = data
        else:
            test_data = np.concatenate((test_data, data))

y_test = None
for filename in sorted(os.listdir(dataPath)):
    if "STS.gs" in filename and "ALL" not in filename:
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if y_test is None:
            y_test = data
        else:
            y_test = np.concatenate((y_test, data))

y_test = y_test.ravel()

In [46]:
INF = np.finfo(float).max / 1e300

scaler = StandardScaler()

print('Starting computation of training data similarities')
train_features = compute_similarity(train_data)
train_features[train_features == np.inf] = INF
x_train = np.round(scaler.fit_transform(train_features), 3)
print('Finished computation of training data similarities\n')

print('Starting computation of testing data similarities')
test_features = compute_similarity(test_data)
test_features[test_features == np.inf] = INF
x_test = np.round(scaler.fit_transform(test_features), 3)
print('Finished computation of testing data similarities\n')

# We save the already computed synset similarities to speed up future runs
with open('synset_dic.pkl', 'wb') as file:
    pickle.dump(computed_synsets, file)

Starting computation of training data similarities
<#################################################.>
Finished computation of training data similarities

Starting computation of testing data similarities
<..................................................>

  temp **= 2
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2


<##################################################>
Finished computation of testing data similarities



  temp **= 2
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2


In [47]:
selected_features = [0,  # jaccard using words
                     1,  # jaccard using lemmas
                     2,  # jaccard using NEs
                     3,  # tf similarity using words
                     4,  # tf similairty using lemmas
                     5,  # path similarity
                     6,  # lch similarity
                     7,  # wup similarity
                     #  8, # lin brown similarity
                     9,  # lin semcor similarity
                     #  10, # res brown similarity
                     #  11, # res semcor similarity
                     #  12 # lesk similarity
                     13,  #n-Grams 0
                     14,  #n-Grams 1
                     15,  #n-grams 2
                     16,  #ngrams 3
                     17,  #ngrams 4
                     18,  #ngrams 5
                     #19 #synnonyms
                     ]

### Random Forest Model

In [48]:
param_grid = dict(n_estimators=np.arange(135, 145), max_features=['auto', 'sqrt'],
                  min_samples_leaf=np.arange(1, 10))

rf_regr = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=-1)
rf_regr.fit(x_train[:, selected_features], y_train)

print(f'Optimal parameters: {rf_regr.best_params_}')

Optimal parameters: {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 139}


In [49]:
# Use the forest's predict method on the test data
rf_pred = rf_regr.predict(x_test[:, selected_features])
# Calculate the absolute errors
rf_correlation = pearsonr(rf_pred, y_test)[0]
print(f'Pearson correlation using Random Forest Regressor: {rf_correlation}')

Pearson correlation using Random Forest Regressor: 0.6037846811081233


### SVR model

In [None]:
param_grid = dict(C=np.linspace(500, 550, 100), gamma=np.linspace(1e-5, 1e-2, 20))

svr_regr = GridSearchCV(SVR(), param_grid, n_jobs=-1)
svr_regr.fit(x_train[:, selected_features], y_train)

print('Optimal parameters:', svr_regr.best_params_)

In [None]:
svr_pred = svr_regr.predict(x_test[:, selected_features])
svr_correlation = pearsonr(svr_pred, y_test)[0]
print(f'Pearson correlation using Support Vector Regressor: {svr_correlation}')

### Linear KNN

In [None]:
param_grid = dict(n_neighbors=np.arange(1, 15), weights=['uniform', 'distance'],
                  metric=['minkowski', 'euclidean', 'manhattan'])

knn_regr = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1)
knn_regr.fit(x_train[:, selected_features], y_train)

print(f'Optimal parameters: {knn_regr.best_params_}')

In [None]:
knn_pred = knn_regr.predict(x_test[:, selected_features])

knn_correlation = pearsonr(knn_pred, y_test)[0]
print(f'Pearson correlation using KNN Regressor: {knn_correlation}')

### MLP Regressor

In [None]:
param_grid = dict(hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100, 1)], activation=['relu', 'tanh', 'logistic'],
                  alpha=np.linspace(1e-4, 0.1, 20), learning_rate=['constant', 'adaptive'], solver=['sgd', 'adam'])

mlp_regr = GridSearchCV(MLPRegressor(random_state=1), param_grid, n_jobs=-1)
mlp_regr.fit(x_train[:, selected_features], y_train)

print(f'Optimal parameters: {mlp_regr.best_params_}')

In [None]:
nn_pred = mlp_regr.predict(x_test[:, selected_features])

nn_correlation = pearsonr(nn_pred, y_test)[0]
print(f'Pearson correlation using MLP: {nn_correlation}')

### LASSO Regression

In [None]:
lasso_alphas = np.linspace(0, 1, 20)

lasso_regr = LassoCV(alphas=lasso_alphas, n_jobs=-1)
lasso_regr.fit(x_train[:, selected_features], y_train)

In [None]:
lasso_pred = lasso_regr.predict(x_test[:, selected_features])

lasso_correlation = pearsonr(lasso_pred, y_test)[0]
print(f'Pearson correlation using Lasso: {lasso_correlation}')

In [None]:
avg_pred = lasso_pred + nn_pred + svr_pred

avg_correlation = pearsonr(avg_pred, y_test)[0]
print(f'Pearson correlation using the average between Lasso, MLP and SVR: {avg_correlation}')