# Text similarity

How to compute the similarity between two strings?

In [1]:
from __future__ import unicode_literals
from __future__ import division

In [2]:
# Two sample strings to compare; they share only the brand/model/capacity
# tokens ("Brastemp", "CFR45", "20L") and differ in every other word.
a = "Refrigerador Brastemp CFR45 20L frostfree"
b = "Geladeira Brastemp CFR45 20L com desgelo automático"

In [3]:
# Tokens the two strings have in common (whitespace-split, case-sensitive):
tokensA, tokensB = a.split(), b.split()
set(tokensA) & set(tokensB)

{u'20L', u'Brastemp', u'CFR45'}

In [4]:
# Jaccard index of the two token sets: number of shared tokens divided by the
# number of distinct tokens overall.
similar = len(set(tokensA).intersection(tokensB))
total = len(set(tokensA).union(tokensB))
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the former print statement is a SyntaxError on Python 3.
# On Python 2 the true division below relies on `from __future__ import division`.
print('{} similars from {} tokens: {:0.2f}% of similarity'.format(similar, total, similar/total*100))

3 similars from 9 tokens: 33.33% of similarity


In [5]:
# several other metrics. See jellyfish, fuzzywuzzy, metaphone, etc
import jellyfish
import fuzzywuzzy
import metaphone

In [6]:
# Double Metaphone encodes words by how they sound: the two spellings below
# come out with the same primary code.
# print(...) with a single argument works on both Python 2 and Python 3.
print(metaphone.doublemetaphone('caza'))
print(metaphone.doublemetaphone('casa'))

('KS', '')
('KS', '')


In [7]:
# The Jaro family of metrics (Jaro, Jaro–Winkler) is designed and best suited
# for short strings such as person names.
# NOTE(review): the call below is jellyfish.jaro_distance, i.e. the plain Jaro
# score; the Winkler-adjusted variant is exposed separately by jellyfish as
# jaro_winkler — confirm which one is intended.
jellyfish.jaro_distance(a,b)

0.6568129284234019

## Other possibilities:

* extract named features for measuring the importance of each token
* use some basic text preprocessing (lowercasing, stemming, etc.)
* remove stopwords
* weight the words using a measure of importance (TF-IDF, for example)

## Using word2vec to compute vector similarities

It is possible to use [word2vec](http://nbviewer.ipython.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb) or [gensim](https://radimrehurek.com/gensim/models/word2vec.html).

In [8]:
# read the corpus
import codecs

# The whole file is read into memory in one go; for a huge corpus this could
# be done iteratively instead, for performance.
with codecs.open('corpus.txt', encoding='utf8') as corpus_file:
    corpus = corpus_file.read()

In [9]:
# Sentence- and word-tokenize the corpus with NLTK (Portuguese settings),
# lower-casing every token. The result is a list of token lists, one per sentence.
# It may take a while to process
from nltk import sent_tokenize, word_tokenize

sentences = []
for sentence in sent_tokenize(corpus, language='portuguese'):
    words = word_tokenize(sentence, language='portuguese')
    sentences.append([w.lower() for w in words])

In [10]:
# Train a word2vec model on the tokenized sentences.
# It may take a while to train
from gensim.models import Word2Vec

# Hyper-parameters are spelled out one per line; see the gensim Word2Vec
# documentation for their exact meaning in the installed version.
model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=5,
    workers=8,
)
# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalized vectors to save memory — confirm for the installed version.
model.init_sims(replace=True)

In [11]:
# Ask the trained model for the words closest to 'geladeira'; the result is a
# list of (word, score) pairs, highest score first.
model.most_similar('geladeira')

[(u'duplex', 0.7450528144836426),
 (u'frost', 0.7416204214096069),
 (u'refrigerador', 0.7210657000541687),
 (u'df80', 0.7155492901802063),
 (u'crb39', 0.7043531537055969),
 (u'dw42x', 0.7033456563949585),
 (u'geladeira/refrigerador', 0.6923708319664001),
 (u'inverse', 0.6831749677658081),
 (u'dt52x', 0.6795977354049683),
 (u'smeg', 0.6782817840576172)]

In [12]:
def _text_vector(tokens, w2v_model):
    """Sum the word2vec vectors of the tokens known to ``w2v_model``.

    Args:
        tokens: iterable of token strings; tokens missing from
            ``w2v_model.vocab`` are skipped.
        w2v_model: object supporting ``token in w2v_model.vocab`` and
            ``w2v_model[token]`` (here, the trained Word2Vec model).

    Returns:
        The sum of the vectors of all in-vocabulary tokens.

    Raises:
        ValueError: if no token is in the vocabulary. A bare ``sum([])`` would
            silently return the integer 0 instead of a vector in that case.
    """
    known = [w2v_model[token] for token in tokens if token in w2v_model.vocab]
    if not known:
        raise ValueError('no token found in the word2vec vocabulary: {!r}'.format(tokens))
    return sum(known)


# The model was trained on lower-cased tokens, so lower-case before the lookup.
tokensA = [t.lower() for t in tokensA]
vectorsA = _text_vector(tokensA, model)

tokensB = [t.lower() for t in tokensB]
vectorsB = _text_vector(tokensB, model)

In [13]:
from nltk.cluster.util import cosine_distance

# The summed vectors are compared via 1 - cosine_distance.
# NOTE(review): abs() hides the sign of that value; confirm this is intended.
# print(...) with a single argument works on both Python 2 and Python 3.
print('Similarity: {}'.format(abs(1 - cosine_distance(vectorsA, vectorsB))))

Similarity: 0.857831723319
