In [1]:
# Some IPython magic
# Put these at the top of every notebook, here nbagg is used for interactive plots
# Automatically reload imported modules before each cell runs, so edits to local .py files are picked up
%reload_ext autoreload
%autoreload 2

# Enable multiple outputs: display the value of every top-level expression in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd

  _nan_object_mask = _nan_object_array != _nan_object_array


# Text similarity
We will compute text similarity by measuring the distance between the word-frequency vectors of two texts. To do this, we first remove punctuation from the text; for better results, we also remove stopwords.

In [2]:
from nltk import tokenize
from nltk import corpus
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

default_lemmatizer = WordNetLemmatizer()
default_stemmer = PorterStemmer()

def remove_chars(s, removals):
    """Return a copy of ``s`` with every character listed in ``removals`` deleted."""
    # The third argument of str.maketrans names the characters that are mapped
    # to None, i.e. dropped by str.translate.
    deletion_table = str.maketrans('', '', removals)
    return s.translate(deletion_table)

def tokenize(text, lmtzr = default_lemmatizer, stmr = default_stemmer):
    """Split ``text`` into lower-cased, purely alphabetic, non-stopword tokens.

    Note that this function reuses the name of the ``tokenize`` module imported
    from nltk earlier in the notebook, so it shadows that import.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    lmtzr, stmr
        Lemmatizer and stemmer objects. They are currently unused: the
        lemmatizing/stemming variants below are disabled.

    Returns
    -------
    list of str
        Tokens in their original order; duplicates are kept.
    """
    # remove punctuation
    text = remove_chars(text, string.punctuation)
    # lower-case and keep only purely alphabetic tokens (this drops numbers)
    words = [word.lower() for word in text.split() if word.isalpha()]
    # disabled variants that additionally stem or lemmatize each token:
    #     words = [stmr.stem(word.lower()) for word in text.split() if word.isalpha()]
    #     words = [lmtzr.lemmatize(word.lower()) for word in text.split() if word.isalpha()]
    # remove stopwords, but strip punctuation from the stopwords first because
    # all punctuation was removed from the text too (e.g. "don't" -> "dont").
    # The corpus fileid is the lower-case 'english'; 'English' only resolves on
    # case-insensitive filesystems.
    stopwords = {remove_chars(sw, string.punctuation) for sw in corpus.stopwords.words('english')}
    return [word for word in words if word not in stopwords]

# count word frequencies
def count_words(words, vocabulary):
    """Count how often each vocabulary word occurs in ``words``.

    Parameters
    ----------
    words : iterable of str
        The tokens to count.
    vocabulary : iterable of str
        The words to report counts for; its order defines the order of the result.

    Returns
    -------
    numpy.ndarray
        One count per vocabulary entry; words that never occur in ``words``
        get a count of 0.
    """
    # A Counter returns 0 for missing keys, so vocabulary words that are absent
    # from `words` need no explicit zero-filling pass.
    counts = Counter(words)
    return np.array([counts[word] for word in vocabulary])

def normalize(v):
    """Min-max scale ``v`` so that its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    v : array-like
        Numeric values; converted with ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        The scaled values. If all entries of ``v`` are equal the range is zero,
        so an array of zeros is returned instead of dividing 0 by 0.

    Raises
    ------
    ValueError
        If ``v`` is empty (raised by numpy's ``min``).
    """
    v = np.asarray(v)
    lowest = v.min()
    span = v.max() - lowest
    if span == 0:
        # constant input: (v - min) / 0 would yield NaN everywhere
        return np.zeros(v.shape, dtype=float)
    return (v - lowest) / span

from math import acos, sqrt
# Angular distance between two vectors:
#   arccos( (V1 . V2) / sqrt( (V1 . V1) * (V2 . V2) ) )
def compute_distance(v1, v2):
    """Return the angle, in radians, between vectors ``v1`` and ``v2``.

    Parameters
    ----------
    v1, v2 : array-like
        One-dimensional numeric vectors of the same length.

    Returns
    -------
    float
        The arccosine of the cosine similarity: 0 for vectors pointing the
        same way, pi/2 for orthogonal vectors. NaN if either vector has zero
        length, because the angle is undefined in that case.
    """
    # Convert to float before multiplying so large integer counts cannot overflow.
    norm_product = sqrt(float(np.dot(v1, v1)) * float(np.dot(v2, v2)))
    if norm_product == 0:
        return float('nan')

    cosine = float(np.dot(v1, v2)) / norm_product
    # Rounding can push the ratio slightly outside [-1, 1], which would make
    # acos raise "math domain error". Explicit comparisons leave NaN untouched.
    if cosine > 1.0:
        cosine = 1.0
    elif cosine < -1.0:
        cosine = -1.0
    return acos(cosine)

def compute_similarity(text1, text2):
    """Return the distance between the word-count vectors of two texts.

    Despite the name, the value returned is the result of ``compute_distance``
    passed through unchanged, not a similarity score.
    """
    words_a = tokenize(text1)
    words_b = tokenize(text2)

    # Sorted union of both token sets: gives both count vectors the same
    # length and the same word order.
    shared_vocabulary = sorted(set(words_a).union(words_b))

    counts_a = count_words(words_a, shared_vocabulary)
    counts_b = count_words(words_b, shared_vocabulary)
    return compute_distance(counts_a, counts_b)

In [3]:
from nltk.corpus import wordnet
# Exploratory lookup: fetch one WordNet synset by name and ask for its
# hypernyms and hyponyms. In the saved output both calls display an empty list.
# wordnet.synsets('near')
s = wordnet.synset('near.s.03')
s.hypernyms()
s.hyponyms()

[]

[]

In [8]:
# Compute the distance for the first 10 sentence pairs of the tab-separated SICK file,
# printing the intermediate values for each pair and the mean distance at the end.
# The per-pair steps are the same as in compute_similarity, written out inline so the
# vocabulary and count vectors can be printed.
# NOTE(review): the saved output below has no '<i>====...' separator lines, so it
# appears to come from an earlier version of this cell; re-run to refresh it.
df = pd.read_csv('sick/SICK.txt', delimiter='\t')
ds = []  # distances, one per sentence pair
for i in range(10):
    text1 = df['sentence_A'][i]
    text2 = df['sentence_B'][i]
    w1, w2 = tokenize(text1), tokenize(text2)
    # create a sorted common vocabulary for the two lists of words
    common_vocab = sorted(set(w1) | set(w2))
    
    # create count vectors of the same length for the two lists of words
    v1 = count_words(w1, common_vocab)
    v2 = count_words(w2, common_vocab)
    
    distance = compute_distance(v1, v2)
    ds.append(distance)
    
    # diagnostic dump: pair index, both sentences, vocabulary, count vectors, distance
    print(str(i) + '==========================')
    print(text1, '\n', text2)
    print(common_vocab)
    print(v1, '\n', v2)
    print(distance)
    print('\n\n')

# mean distance over the processed pairs
print(np.mean(ds))

A group of kids is playing in a yard and an old man is standing in the background 
 A group of boys in a yard is playing and a man is standing in the background
['background', 'boys', 'group', 'kids', 'man', 'old', 'playing', 'standing', 'yard']
[1 0 1 1 1 1 1 1 1] 
 [1 1 1 0 1 0 1 1 1]
0.6405223126794245



A group of children is playing in the house and there is no man standing in the background 
 A group of kids is playing in a yard and an old man is standing in the background
['background', 'children', 'group', 'house', 'kids', 'man', 'old', 'playing', 'standing', 'yard']
[1 1 1 1 0 1 0 1 1 0] 
 [1 0 1 0 1 1 1 1 1 1]
0.8390726214483827



The young boys are playing outdoors and the man is smiling nearby 
 The kids are playing outdoors near a man with a smile
['boys', 'kids', 'man', 'near', 'nearby', 'outdoors', 'playing', 'smile', 'smiling', 'young']
[1 0 1 0 1 1 1 0 1 1] 
 [0 1 1 1 0 1 1 1 0 0]
1.0895209528525531



The young boys are playing outdoors and the man is smiling nearby