# IHLT final project - Semantic Textual Similarity
Nikita Belooussov and Santiago del Rey Juárez

## Introduction

In [147]:
import csv
import os
import string
from enum import Enum

import numpy as np
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.corpus import wordnet_ic
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('conll2000')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet_ic')
nltk.download('wordnet')

nlp = spacy.load("en_core_web_sm")  #if this does not work run python -m spacy download en in terminal and restart the program running the code

brown_ic = wordnet_ic.ic('ic-brown.dat')

!pip install svgling 
!pip install pycontractions # requires visual studios builder from https://visualstudio.microsoft.com/visual-cpp-build-tools/


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Pac

### Tokenize

In [148]:
punct = string.punctuation


def tokenize(sentence):
    return [w.lower() for w in nltk.word_tokenize(sentence) if
            not all(c in punct for c in w) and w.lower() not in stopwords.words('english')]


def name_entity_tokenization(sentence):
    doc = nlp(sentence.lower())
    with doc.retokenize() as retokenizer:
        tokens = [token for token in doc]
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end],
                              attrs={"LEMMA": " ".join([tokens[i].text for i in range(ent.start, ent.end)])})
    s0_ne = [token.text for token in doc]
    return s0_ne

### Lemmatize

In [149]:
from nltk.corpus import wordnet

wnl = nltk.stem.WordNetLemmatizer()


def lemmatize(pair):
    if pair[1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
        if pair[1][0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
            return wnl.lemmatize(pair[0].lower(), pos=wordnet.ADJ)
        return wnl.lemmatize(pair[0].lower(), pos=pair[1][0].lower())
    return pair[0]


def lemmatize_sentence(words):
    pairs = nltk.pos_tag(words)
    lemmas = [lemmatize(pair) for pair in pairs]
    return lemmas

### Synset similarity

In [150]:
from nltk.corpus.reader import WordNetError


def get_wordnet_similarity(s0, s1, method):
    if method == "path" and s0 is not None and s1 is not None:
        return s0.path_similarity(s1)
    elif method == "lch" and s0 is not None and s1 is not None and s0.pos() == s1.pos():
        return s0.lch_similarity(s1)
    elif method == "wup" and s0 is not None and s1 is not None:
        return s0.wup_similarity(s1)
    elif method == "lin" and s0 is not None and s1 is not None and s0.pos() == s1.pos():
        try:
            return s0.lin_similarity(s1, brown_ic)
        except WordNetError:
            return None
    else:
        return None


# Dictionary used to store already computed synsets
computed_synsets = {}


def max_similarity(s0, s1, method):
    if s0 == s1:
        return 1

    if (s0, s1, method) in computed_synsets:
        return computed_synsets[(s0, s1, method)]

    synsets0 = wordnet.synsets(s0)
    synsets1 = wordnet.synsets(s1)

    similarities = []
    for syn0 in synsets0:
        for syn1 in synsets1:
            similarity = get_wordnet_similarity(syn0, syn1, method)
            if similarity is not None:
                similarities.append(similarity)

    if len(similarities) > 0:
        max_sim = max(similarities)
        if method == 'lch':
            # If Leacock similarity divide by 3 in order to normalize the similarity
            computed_synsets[(s0, s1, method)] = max_sim / 3
            return max_sim / 3
        else:
            computed_synsets[(s0, s1, method)] = max_sim
            return max_sim
    else:
        computed_synsets[(s0, s1, method)] = 0
        return 0


def mean_simimilarity(lemmas0, lemmas1, method):
    sum = 0
    for l0 in lemmas0:
        sum += max([max_similarity(l0, l1, method) for l1 in lemmas1])
    return sum / len(lemmas0)


def synset_similarity(lemmas0, lemmas1, method):
    mean_sim0 = mean_simimilarity(lemmas0, lemmas1, method)
    mean_sim1 = mean_simimilarity(lemmas1, lemmas0, method)

    if mean_sim0 > 0 or mean_sim1 > 0:
        return (2 * mean_sim0 * mean_sim1) / (mean_sim0 + mean_sim0)
    else:
        return 0

### Lesk similarity

In [151]:
def lesk_similarity(words0, words1):
    w0_pos = nltk.pos_tag(words0)
    w1_pos = nltk.pos_tag(words1)

    s0_lesk = []
    for i in range(len(w0_pos)):
        if w0_pos[i][1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
            if w0_pos[i][1][0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
                s0_lesk.append(nltk.wsd.lesk(words0, w0_pos[i][0], pos=wordnet.ADJ))
            else:
                s0_lesk.append(nltk.wsd.lesk(words0, w0_pos[i][0], pos=w0_pos[i][1][0].lower()))

    s1_lesk = []
    for i in range(len(w1_pos)):
        if w1_pos[i][1][0] in {'N', 'V', 'J', 'R'}:  #N- noun, V- verb, J- adjective, R-adverb
            if w1_pos[i][1][0] == 'J':  #this is used due to wordnet using a different label for adjectives than one given by nltk
                s1_lesk.append(nltk.wsd.lesk(words1, w1_pos[i][0], pos=wordnet.ADJ))
            else:
                s1_lesk.append(nltk.wsd.lesk(words1, w1_pos[i][0], pos=w1_pos[i][1][0].lower()))

    return 1 - jaccard_distance(set(s0_lesk), set(s1_lesk))

### Compute similarity

In [152]:
class Methods(Enum):
    """
    Enumeration with the available similarity methods
    """
    JACCARD = 'jaccard'
    NE = 'NE'
    PATH = 'path'
    LEACOCK = 'lch'
    WU = 'wup'
    LIN = 'lin'
    LESK = 'lesk'

In [153]:
def compute_similarity(sentence_0, sentence_1, method='jaccard'):
    words0 = tokenize(sentence_0)
    words1 = tokenize(sentence_1)
    s0_lemmas = lemmatize_sentence(words0)
    s1_lemmas = lemmatize_sentence(words1)

    if method == 'jaccard':
        return 1 - jaccard_distance(set(s0_lemmas), set(s1_lemmas))

    elif method == 'NE':
        s0_ne = name_entity_tokenization(sentence_0)
        s1_ne = name_entity_tokenization(sentence_1)
        return 1 - jaccard_distance(set(s0_ne), set(s1_ne))

    elif method in ['path', 'lch', 'wup', 'lin']:
        return synset_similarity(s0_lemmas, s1_lemmas, method)

    elif method == 'lesk':
        return lesk_similarity(words0, words1)

### Read data

In [154]:
def read_file(file_path):
    return pd.read_csv(file_path, sep='\t', lineterminator='\n', names=['sentence0', 'sentence1'], header=None,
                       quoting=csv.QUOTE_NONE)

In [155]:
#path = os.path.join('data', 'train', 'STS.input.MSRpar.txt')

#df_input = read_file(path)
#df_input.head()


#Train data
inputTexts = []
dataPath = os.path.join('data', 'train')
for filename in os.listdir(dataPath):
    if "STS.input" in filename:
        print(os.path.join(dataPath, filename))
        inputTexts.append(read_file(os.path.join(dataPath, filename)))

inputGs = []
for filename in os.listdir(dataPath):
    if "STS.gs" in filename:
        print(os.path.join(dataPath, filename))
        inputGs.append(read_file(os.path.join(dataPath, filename)))

##Test data
inputTexts = []
dataPath = os.path.join('data', 'test-gold')
for filename in os.listdir(dataPath):
    if "STS.input" in filename:
        print(os.path.join(dataPath, filename))
        inputTexts.append(read_file(os.path.join(dataPath, filename)))

inputGs = []
for filename in os.listdir(dataPath):
    if "STS.gs" in filename:
        print(os.path.join(dataPath, filename))
        inputGs.append(read_file(os.path.join(dataPath, filename)))


data\train\STS.input.MSRpar.txt
data\train\STS.input.MSRvid.txt
data\train\STS.input.SMTeuroparl.txt
data\train\STS.gs.MSRpar.txt
data\train\STS.gs.MSRvid.txt
data\train\STS.gs.SMTeuroparl.txt
data\test-gold\STS.input.MSRpar.txt
data\test-gold\STS.input.MSRvid.txt
data\test-gold\STS.input.SMTeuroparl.txt
data\test-gold\STS.input.surprise.OnWN.txt
data\test-gold\STS.input.surprise.SMTnews.txt
data\test-gold\STS.gs.ALL.txt
data\test-gold\STS.gs.MSRpar.txt
data\test-gold\STS.gs.MSRvid.txt
data\test-gold\STS.gs.SMTeuroparl.txt
data\test-gold\STS.gs.surprise.OnWN.txt
data\test-gold\STS.gs.surprise.SMTnews.txt


In [156]:
# TODO: Remove testing cell
s0 = inputTexts[0].iloc[0,0]
s1 = inputTexts[0].iloc[0,1]
print(s0)
print(s1)
for method in Methods:
    print(f'Similarity using {method.value}: {compute_similarity(s0, s1, method.value)}')

The problem likely will mean corrective changes before the shuttle fleet starts flying again.
He said the problem needs to be corrected before the space shuttle fleet is cleared to fly again.
Similarity using jaccard: 0.2857142857142857
Similarity using NE: 0.28
Similarity using path: 0.6518518518518519
Similarity using lch: 0.8729193707472305
Similarity using wup: 0.8361314611314613
Similarity using lin: 0.810531749216512
Similarity using lesk: 0.0625


Random Forest Model

In [157]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=6, random_state=42)
# Train the model on training data
rf.fit(train_features, train_labels);

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)

ModuleNotFoundError: No module named 'sklearn'

SVR model

In [None]:
from sklearn import svm

regr = svm.SVR()
regr.fit(X, y)
SVR()
regr.predict([[1, 1]])

Linear KNN

In [None]:
model = neighbors.KNeighborsRegressor(n_neighbors=K)

model.fit(x_train, y_train)  #fit the model
pred = model.predict(x_test)  #make prediction on test set