# IHLT final project - Semantic Textual Similarity
Nikita Belooussov and Santiago del Rey Juárez

## Introduction

In [181]:
# requires visual studios builder from https://visualstudio.microsoft.com/visual-cpp-build-tools/
!pip install contractions

ERROR: Invalid requirement: '#'
You should consider upgrading via the 'C:\Users\santi\Projects\IHLT\IHLT_Project\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [182]:
import csv
import os
import string

import contractions
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.corpus import wordnet_ic
from nltk.corpus.reader import WordNetError
from nltk.metrics import jaccard_distance
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('conll2000')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet_ic')
nltk.download('wordnet')

contractions.add('U.S.', 'United States')
contractions.add('U.S.A', 'United States of America')
contractions.add('E.U.', 'European Union')

#if this does not work run python -m spacy download en in terminal and restart the program running the code
nlp = spacy.load("en_core_web_sm")

brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

ModuleNotFoundError: No module named 'contractions'

### Remove contractions

In [None]:
def expand_contractions(s0,s1):
    s0=contractions.fix(s0)
    s1=contractions.fix(s1)
    return s0,s1

### Tokenize

In [None]:
punct = string.punctuation


def tokenize(sentence):
    return [w.lower() for w in nltk.word_tokenize(sentence) if
            not all(c in punct for c in w) and w.lower() not in stopwords.words('english')]


def name_entity_tokenization(sentence):
    doc = nlp(sentence.lower())
    with doc.retokenize() as retokenizer:
        tokens = [token for token in doc]
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end],
                              attrs={"LEMMA": " ".join([tokens[i].text for i in range(ent.start, ent.end)])})
    s0_ne = [token.text for token in doc]
    return s0_ne

### Lemmatize

In [None]:
from nltk.corpus import wordnet

wnl = nltk.stem.WordNetLemmatizer()


def lemmatize(pair):
    if pair[1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
        if pair[1][
            0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
            return wnl.lemmatize(pair[0].lower(), pos=wordnet.ADJ)
        return wnl.lemmatize(pair[0].lower(), pos=pair[1][0].lower())
    return pair[0]


def lemmatize_sentence(words):
    pairs = nltk.pos_tag(words)
    lemmas = [lemmatize(pair) for pair in pairs]
    return lemmas

### Synset similarity

In [None]:
def get_wordnet_similarity(s0, s1, method, ic):
    if s0 is not None and s1 is not None:
        if method == 'path':
            return s0.path_similarity(s1)
        elif method == 'wup':
            return s0.wup_similarity(s1)
        elif s0.pos() == s1.pos():
            if method == "lch":
                return s0.lch_similarity(s1)
            elif ic is None:
                raise ValueError("ic parameter is missing")
            elif method == "lin":
                try:
                    return s0.lin_similarity(s1, ic)
                except WordNetError:
                    return None
            elif method == 'res':
                try:
                    return s0.res_similarity(s1, ic)
                except WordNetError:
                    return None
            else:
                return None
        else:
            return None
    else:
        return None


# Dictionary used to store already computed synsets
computed_synsets = {}


def max_similarity(s0, s1, method, ic):
    if s0 == s1:
        return 1

    if (s0, s1, method) in computed_synsets:
        return computed_synsets[(s0, s1, method)]

    synsets0 = wordnet.synsets(s0)
    synsets1 = wordnet.synsets(s1)

    similarities = []
    for syn0 in synsets0:
        for syn1 in synsets1:
            similarity = get_wordnet_similarity(syn0, syn1, method, ic)
            if similarity is not None:
                similarities.append(similarity)

    if len(similarities) > 0:
        max_sim = max(similarities)
        computed_synsets[(s0, s1, method)] = max_sim
        return max_sim
    else:
        computed_synsets[(s0, s1, method)] = 0
        return 0


def mean_simimilarity(lemmas0, lemmas1, method, ic):
    similarity_sum = 0
    for l0 in lemmas0:
        similarity_sum += max([max_similarity(l0, l1, method, ic) for l1 in lemmas1])
    return similarity_sum / len(lemmas0)


def synset_similarity(lemmas0, lemmas1, method, ic=None):
    mean_sim0 = mean_simimilarity(lemmas0, lemmas1, method, ic)
    mean_sim1 = mean_simimilarity(lemmas1, lemmas0, method, ic)

    if mean_sim0 > 0 or mean_sim1 > 0:
        return (2 * mean_sim0 * mean_sim1) / (mean_sim0 + mean_sim0)
    else:
        return 0

### Lesk similarity

In [None]:
def lesk_similarity(words0, words1):
    w0_pos = nltk.pos_tag(words0)
    w1_pos = nltk.pos_tag(words1)

    s0_lesk = []
    for i in range(len(w0_pos)):
        if w0_pos[i][1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
            if w0_pos[i][1][
                0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
                s0_lesk.append(nltk.wsd.lesk(words0, w0_pos[i][0], pos=wordnet.ADJ))
            else:
                s0_lesk.append(nltk.wsd.lesk(words0, w0_pos[i][0], pos=w0_pos[i][1][0].lower()))

    s1_lesk = []
    for i in range(len(w1_pos)):
        if w1_pos[i][1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
            if w1_pos[i][1][
                0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
                s1_lesk.append(nltk.wsd.lesk(words1, w1_pos[i][0], pos=wordnet.ADJ))
            else:
                s1_lesk.append(nltk.wsd.lesk(words1, w1_pos[i][0], pos=w1_pos[i][1][0].lower()))

    return 1 - jaccard_distance(set(s0_lesk), set(s1_lesk))

### Jaccard similarity

In [None]:
def jaccard_similarity(words0, words1):
    return 1 - jaccard_distance(set(words0), set(words1))

### TF IDF and cosine

In [None]:
def compute_tf(s0,s1):
    #print (s0)
    #print (s1)
    s0,s1= expand_contractions(s0,s1)
    s0 = nltk.word_tokenize(s0)
    s1 = nltk.word_tokenize(s1)
    tfvec=TfidfVectorizer()



    s0= lemmatize_sentence(s0)
    s1 =lemmatize_sentence(s1)
    # Generate the tf-idf vectors for the corpus
    s0=' '.join([str(elem) for elem in s0])
    s1=' '.join([str(elem) for elem in s1])

    #print(s0)
    #print(s1)
    tfidf_matrix = tfvec.fit_transform([s0,s1])

    print(cosine_similarity(tfidf_matrix, tfidf_matrix)[0,1])
    return cosine_similarity(tfidf_matrix, tfidf_matrix)[0,1]


inputTexts=[]
dataPath=os.path.join('data', 'train')
x=[]
for filename in os.listdir(dataPath):
    if "STS.input" in filename:
        print (os.path.join(dataPath,filename))
        file=os.path.join(dataPath,filename)
        text=pd.read_csv(file, sep='\t', lineterminator='\n', names=['sentence0', 'sentence1'], header=None, quoting=csv.QUOTE_NONE)
        for i in range(len(text['sentence0'])):
            x.append(compute_tf(text['sentence0'][i],text['sentence1'][i]))
        inputTexts.append(text)
print ("avg:")
print (sum(x)/len(x))

### N-Gram similarity

In [None]:
def compute_n_grams(s0, s1):
    #words1=[word for word in s0.split(" ") if word not in set(stopwords.words('english'))]
    low_size=5
    if len(s1)<len(s0):
        s0,s1 = s1,s0
    if len(s0)<5:
        low_size=len(s0)
    s0,s1= expand_contractions(s0,s1)
    s0 = tokenize(s0)
    s1 = tokenize(s1)
    s0= lemmatize_sentence(s0)
    s1 =lemmatize_sentence(s1)
    ngrams=0
    #print (s0)
    #print (s1)
    #print("Sentence after removing stopwords:",s0)
    metrics=[0,0,0,0,0,0]
    for i in range(2,low_size):
        n_grams1=zip(*[s0[k:] for k in range(0, i)])
        n_grams2=zip(*[s1[k:] for k in range(0,i)])
        count=0

        n_grams1=[' '.join(ngram) for ngram in n_grams1]
        n_grams2=[' '.join(ngram) for ngram in n_grams2]
        #print (set(nGrams2))
        #print (set(nGrams1))
        for j in set(n_grams1):
            if j in set(n_grams2):
                count+=1
        if (len(n_grams1) != 0) and (len(n_grams2) != 0):
            #print(1-jaccard_distance(set(nGrams1), set(nGrams2)))
            #print (count)
            metrics[(i-2)*2]=count/len(set(n_grams1))
            metrics[((i-2)*2)+1]=1-jaccard_distance(set(n_grams1), set(n_grams2))
    return metrics

inputTexts=[]
dataPath=os.path.join('data', 'train')
x=[]
for filename in os.listdir(dataPath):
    if "STS.input" in filename:
        print (os.path.join(dataPath,filename))
        file=os.path.join(dataPath,filename)
        text=pd.read_csv(file, sep='\t', lineterminator='\n', names=['sentence0', 'sentence1'], header=None, quoting=csv.QUOTE_NONE)
        for i in range(len(text['sentence0'])):
            metric=compute_n_grams(text['sentence0'][i],text['sentence1'][i])
            x.append(metric)
        inputTexts.append(text)
#print ("avg:")
numSen=len(x)
x = np.array(x)
print (sum(x[:,0])/numSen)
print (sum(x[:,1])/numSen)
print (sum(x[:,2])/numSen)
print (sum(x[:,3])/numSen)
print (sum(x[:,4])/numSen)
print (sum(x[:,5])/numSen)

### Compute similarity

In [None]:
N_SYMBOLS = 99


def compute_similarity(x):
    features = []
    n_samples = x.shape[0]
    perc = round(0.02 * n_samples)
    counter = 0
    progress = 0
    for sentence_0, sentence_1 in x:
        words0 = tokenize(sentence_0)
        words1 = tokenize(sentence_1)
        s0_lemmas = lemmatize_sentence(words0)
        s1_lemmas = lemmatize_sentence(words1)
        s0_ne = name_entity_tokenization(sentence_0)
        s1_ne = name_entity_tokenization(sentence_1)

        features.append([
            jaccard_similarity(words0, words1),
            jaccard_similarity(s0_lemmas, s1_lemmas),
            jaccard_similarity(s0_ne, s1_ne),
            synset_similarity(s0_lemmas, s1_lemmas, 'path'),
            synset_similarity(s0_lemmas, s1_lemmas, 'lch'),
            synset_similarity(s0_lemmas, s1_lemmas, 'wup'),
            synset_similarity(s0_lemmas, s1_lemmas, 'lin', brown_ic),
            synset_similarity(s0_lemmas, s1_lemmas, 'lin', semcor_ic),
            synset_similarity(s0_lemmas, s1_lemmas, 'res', brown_ic),
            synset_similarity(s0_lemmas, s1_lemmas, 'res', semcor_ic),
            lesk_similarity(words0, words1)
        ])

        progress = print_progress(counter, perc, progress)

    print()
    return np.array(features, dtype=np.float64)


def print_progress(counter, perc, progress):
    if (counter % perc) == 0:
        print('<' + '#' * progress + '.' * (N_SYMBOLS - progress) + '>', end='\r')
        return progress + 1
    return progress

### Read data

In [None]:
def read_file(file_path):
    return pd.read_csv(file_path, sep='\t', lineterminator='\n', header=None,
                       quoting=csv.QUOTE_NONE)

In [166]:
#Train data
dataPath = os.path.join('data', 'train')
train_data = None
for filename in os.listdir(dataPath):
    if "STS.input" in filename:
        print(os.path.join(dataPath, filename))
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if train_data is None:
            train_data = data
        else:
            train_data = np.concatenate((train_data, data))

y_train = None
for filename in os.listdir(dataPath):
    if "STS.gs" in filename:
        print(os.path.join(dataPath, filename))
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if y_train is None:
            y_train = data
        else:
            y_train = np.concatenate((y_train, data))

y_train = y_train.ravel()

##Test data
dataPath = os.path.join('data', 'test-gold')
test_data = None
for filename in os.listdir(dataPath):
    if "STS.input" in filename:
        print(os.path.join(dataPath, filename))
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if test_data is None:
            test_data = data
        else:
            test_data = np.concatenate((test_data, data))

y_test = None
for filename in os.listdir(dataPath):
    if "STS.gs" in filename:
        print(os.path.join(dataPath, filename))
        data = read_file(os.path.join(dataPath, filename)).to_numpy()
        if y_test is None:
            y_test = data
        else:
            y_test = np.concatenate((y_test, data))

y_test = y_test.ravel()

data\train\STS.input.MSRpar.txt
data\train\STS.input.MSRvid.txt
data\train\STS.input.SMTeuroparl.txt
data\train\STS.gs.MSRpar.txt
data\train\STS.gs.MSRvid.txt
data\train\STS.gs.SMTeuroparl.txt
data\test-gold\STS.input.MSRpar.txt
data\test-gold\STS.input.MSRvid.txt
data\test-gold\STS.input.SMTeuroparl.txt
data\test-gold\STS.input.surprise.OnWN.txt
data\test-gold\STS.input.surprise.SMTnews.txt
data\test-gold\STS.gs.ALL.txt
data\test-gold\STS.gs.MSRpar.txt
data\test-gold\STS.gs.MSRvid.txt
data\test-gold\STS.gs.SMTeuroparl.txt
data\test-gold\STS.gs.surprise.OnWN.txt
data\test-gold\STS.gs.surprise.SMTnews.txt


In [180]:
INF = np.finfo(float).max / 1e200

scaler = StandardScaler()

print('Starting computation of training data similarities')
train_features = compute_similarity(train_data)
train_features[train_features == np.inf] = INF
x_train = scaler.fit_transform(train_features)
print('Finished computation of training data similarities\n')

print('Starting computation of testing data similarities')
test_features = compute_similarity(test_data)
test_features[test_features == np.inf] = INF
x_test = scaler.fit_transform(test_features)
print('Finished computation of testing data similarities\n')

Starting computation of training data similarities
<###################################################################################################>
Finished computation of training data similarities

Starting computation of testing data similarities
<###################################################################################################>
Finished computation of testing data similarities



Random Forest Model

In [177]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=6, random_state=42)
# Train the model on training data
rf.fit(x_train, y_train)

# Use the forest's predict method on the test data
predictions = rf.predict(x_test)
# Calculate the absolute errors
rf_correlation = pearsonr(predictions, y_test)[0]
print(f'Pearson correlation using Random Forest Regressor: {rf_correlation}')

Pearson correlation using Random Forest Regressor: 0.5794861352576831


SVR model

In [178]:
regr = SVR()
regr.fit(x_train, y_train)

svr_pred = regr.predict(x_test)
svr_correlation = pearsonr(svr_pred, y_test)[0]
print(f'Pearson correlation using Support Vector Regressor: {svr_correlation}')

Pearson correlation using Support Vector Regressor: 0.583534523746479


Linear KNN

In [179]:
model = KNeighborsRegressor(n_neighbors=5)

model.fit(x_train, y_train)
knn_pred = model.predict(x_test)

knn_correlation = pearsonr(knn_pred, y_test)[0]
print(f'Pearson correlation using Support Vector Regressor: {knn_correlation}')

Pearson correlation using Support Vector Regressor: 0.5674670208589774
