# Similarity Measurement

You are provided with a document corpus:

In [31]:
document0 = 'preliminary finding in cancer research'
document1 = 'novel cancer research findings'
document2 = 'new research to heal cancer'
document3 = 'healing novel cancer research'
corpus = [document0, document1, document2, document3]

## Compute the Jaccard score for each of the pair of documents

In [32]:
def jaccord_score(doc1, doc2): 
    words_1 = doc1.split()
    words_2 = doc2.split()
    unique_words_1 = list(set(words_1))
    unique_words_2 = list(set(words_2))
    
    intersection = [value for value in unique_words_1 if value in unique_words_2]
    union = list(set(unique_words_1 + unique_words_2))
    return round(len(intersection) / len(union),3)

In [33]:
for i in range(len(corpus)):
    for j in range(len(corpus)):
        if i<=j:
            print('Jaccard score for document',i, ' and document',j,' is: ', jaccord_score(corpus[i],corpus[j]))

Jaccard score for document 0  and document 0  is:  1.0
Jaccard score for document 0  and document 1  is:  0.286
Jaccard score for document 0  and document 2  is:  0.25
Jaccard score for document 0  and document 3  is:  0.286
Jaccard score for document 1  and document 1  is:  1.0
Jaccard score for document 1  and document 2  is:  0.286
Jaccard score for document 1  and document 3  is:  0.6
Jaccard score for document 2  and document 2  is:  1.0
Jaccard score for document 2  and document 3  is:  0.286
Jaccard score for document 3  and document 3  is:  1.0


## Term frequency vector

In [34]:
def get_unique_words(corpus):
    words = []
    for doc in corpus:
        words += doc.split()
        
    return list(set(words))

In [35]:
unique_words = get_unique_words(corpus)
unique_words

['finding',
 'cancer',
 'findings',
 'healing',
 'new',
 'to',
 'novel',
 'preliminary',
 'in',
 'heal',
 'research']

In [36]:
def term_frequency(term, document):
    """ the number of occurences of given term in given document"""
    count = 0
    for word in document.split():
        if word == term:
            count += 1
    return count

In [37]:
for word in unique_words:
    print('tf(',word,', doc0): ', term_frequency(word, corpus[0]))

tf( finding , doc0):  1
tf( cancer , doc0):  1
tf( findings , doc0):  0
tf( healing , doc0):  0
tf( new , doc0):  0
tf( to , doc0):  0
tf( novel , doc0):  0
tf( preliminary , doc0):  1
tf( in , doc0):  1
tf( heal , doc0):  0
tf( research , doc0):  1


In [38]:
def tf_vector(document, corpus):
    """ Calculates the term frequency vector. """   
    unique_terms = get_unique_words(corpus)
    
    result = []
    for term in unique_terms:
        result += [term_frequency(term, document)]
            
    return result

In [39]:
print('unique words are: \n', unique_words, '\n')
for i in range(len(corpus)):
    print('tf_vector(doc',i,'): ', tf_vector(corpus[i], corpus))

unique words are: 
 ['finding', 'cancer', 'findings', 'healing', 'new', 'to', 'novel', 'preliminary', 'in', 'heal', 'research'] 

tf_vector(doc 0 ):  [1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1]
tf_vector(doc 1 ):  [0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1]
tf_vector(doc 2 ):  [0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1]
tf_vector(doc 3 ):  [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1]


## TF-IDF document and query representation

In [40]:
def document_frequency(corpus, term):
    """ number of documents that contains given term """
    count = 0
    for doc in corpus:
        words = doc.split()
        if term in words:
            count += 1
    return count

In [41]:
for word in unique_words:
    print('df(',word,'): ', document_frequency(corpus, word))

df( finding ):  1
df( cancer ):  4
df( findings ):  1
df( healing ):  1
df( new ):  1
df( to ):  1
df( novel ):  2
df( preliminary ):  1
df( in ):  1
df( heal ):  1
df( research ):  4


In [42]:
def inversed_document_frequency(corpus, term):
    """ logarythm from the number of documents divided by document frequency """
    import math
    df = document_frequency(corpus, term)
    return round(math.log10(len(corpus)/df),3)

In [43]:
for word in unique_words:
    print('idf(',word,'): ', inversed_document_frequency(corpus, word))

idf( finding ):  0.602
idf( cancer ):  0.0
idf( findings ):  0.602
idf( healing ):  0.602
idf( new ):  0.602
idf( to ):  0.602
idf( novel ):  0.301
idf( preliminary ):  0.602
idf( in ):  0.602
idf( heal ):  0.602
idf( research ):  0.0


In [44]:
def tf_idf_weight(term, document, corpus):
    return round(term_frequency(term, document) * inversed_document_frequency(corpus, term), 3)

In [45]:
tf_idf_weight('novel', corpus[0], corpus)

0.0

In [46]:
def tf_idf_vector(document, corpus):
    unique_words = get_unique_words(corpus)
    
    result = []
    for w in unique_words:
        result += [tf_idf_weight(w, document, corpus)]
        
    return result

In [47]:
for i in range(len(corpus)):
    print('tf_idf vector for document',i,' is: ', tf_idf_vector(corpus[i], corpus))

tf_idf vector for document 0  is:  [0.602, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.602, 0.602, 0.0, 0.0]
tf_idf vector for document 1  is:  [0.0, 0.0, 0.602, 0.0, 0.0, 0.0, 0.301, 0.0, 0.0, 0.0, 0.0]
tf_idf vector for document 2  is:  [0.0, 0.0, 0.0, 0.0, 0.602, 0.602, 0.0, 0.0, 0.0, 0.602, 0.0]
tf_idf vector for document 3  is:  [0.0, 0.0, 0.0, 0.602, 0.0, 0.0, 0.301, 0.0, 0.0, 0.0, 0.0]


### queries

In [48]:
q1 = "novel novel preliminary new finding"
q2 = "healing research"

In [49]:
print('tf_idf_vector(q1): ', tf_idf_vector(q1, corpus))
print('tf_idf_vector(q2): ', tf_idf_vector(q2, corpus))

tf_idf_vector(q1):  [0.602, 0.0, 0.0, 0.0, 0.602, 0.0, 0.602, 0.602, 0.0, 0.0, 0.0]
tf_idf_vector(q2):  [0.0, 0.0, 0.0, 0.602, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


## Cosine similarity

In [50]:
def vector_length(vector):
    import math
    result = 0
    for el in vector:
        result += math.pow(el, 2)
    return math.sqrt(result)

In [51]:
def cosine_similarity(document, query, corpus):
    result = 0
    d_vec = tf_idf_vector(document, corpus)
    q_vec = tf_idf_vector(query, corpus)
    
    for i in range(len(d_vec)):
        result += d_vec[i] * q_vec[i]
    
    return round(result / (vector_length(d_vec)* vector_length(q_vec)), 3)

In [52]:
import operator
print('Ranking for 1st query:')

ranking = dict()
for i in range(len(corpus)):
    ranking[i] = [cosine_similarity(corpus[i], q1, corpus)]
ranking = sorted(ranking.items(), key=operator.itemgetter(1), reverse=True)

print(['doc' + str(d) + '(' + str(score) + ')' for d, score in ranking])

Ranking for 1st query:
['doc0([0.577])', 'doc2([0.289])', 'doc1([0.224])', 'doc3([0.224])']


In [53]:
import operator
print('Ranking for 2nd query:')

ranking = dict()
for i in range(len(corpus)):
    ranking[i] = [cosine_similarity(corpus[i], q2, corpus)]
ranking = sorted(ranking.items(), key=operator.itemgetter(1), reverse=True)

print(['doc' + str(d) + '(' + str(score) + ')' for d, score in ranking])

Ranking for 2nd query:
['doc3([0.894])', 'doc0([0.0])', 'doc1([0.0])', 'doc2([0.0])']
