In [1]:
import pandas as pd
from math import log10, sqrt
import numpy as np

In [2]:
d0 = "cats runs behind rats"
d1 = "dogs runs behind cats"
d2 = "rats runs cats"
d3 = "behind runs cats dogs"
q = "behind rats"



corpus = [d0, d1, d2, d3, q]
word_bag =set(d0.split() + d1.split() + d2.split() + d3.split() + q.split())

In [3]:
corpus, word_bag, len(word_bag)

(['cats runs behind rats',
  'dogs runs behind cats',
  'rats runs cats',
  'behind runs cats dogs',
  'behind rats'],
 {'behind', 'cats', 'dogs', 'rats', 'runs'},
 5)

# Creating Dataframe

### Calculating the term frequency of the corpus  

In [4]:
def term_frequency(word_bag, corpus):
    tf_df = pd.DataFrame(index = list(word_bag), columns = ["d0", "d1", "d2", "d3", "q"])
    for doc, col in zip(corpus, tf_df.columns):
        for word in word_bag:
            tf_df[col][word] = doc.count(word)
    
    return tf_df                     

In [5]:
tf_df= term_frequency(word_bag, corpus)

In [6]:
tf_df

Unnamed: 0,d0,d1,d2,d3,q
behind,1,1,0,1,1
runs,1,1,1,1,0
rats,1,0,1,0,1
dogs,0,1,0,1,0
cats,1,1,1,1,0


In [7]:
tf_idf_df = pd.DataFrame(index = list(word_bag), columns = ["d0", "d1", "d2", "d3", "q"])
document_frequencies = {}
tf_idf_df

Unnamed: 0,d0,d1,d2,d3,q
behind,,,,,
runs,,,,,
rats,,,,,
dogs,,,,,
cats,,,,,


# Calculating Document Frequency

In [8]:
def document_frequency():
    for term in word_bag:
        for doc in corpus[:-1]:
            if term not in document_frequencies.keys():
                document_frequencies[term] = 1
            elif term in doc:
                document_frequencies[term] += 1
                

In [9]:
document_frequency()
document_frequencies

{'behind': 3, 'runs': 4, 'rats': 2, 'dogs': 3, 'cats': 4}

# TF_IDF

In [10]:
def tf_idf():
    for term in word_bag:
        tf_idf_df.loc[term] = tf_df.loc[term] * log10((len(corpus)-1)/ document_frequencies[term])

In [11]:
tf_idf()
tf_idf_df

Unnamed: 0,d0,d1,d2,d3,q
behind,0.124939,0.124939,0.0,0.124939,0.124939
runs,0.0,0.0,0.0,0.0,0.0
rats,0.30103,0.0,0.30103,0.0,0.30103
dogs,0.0,0.124939,0.0,0.124939,0.0
cats,0.0,0.0,0.0,0.0,0.0


## Rochio Feedback

In [19]:
def Rochio_Feedback(alpha, beta, gamma):
    q_updates = (alpha * tf_idf_df["q"]) + (beta * tf_idf_df["d1"]) - (gamma * tf_idf_df["d3"])
    return q_updates

In [20]:
Rochio_Feedback(1, 0.8, 0.1)

behind     0.212396
runs              0
rats        0.30103
dogs      0.0874571
cats              0
dtype: object

In [54]:
Rochio_Feedback(1, 0.1, 0.9)

cats              0
dogs      -0.099951
runs              0
rats        0.30103
behind    0.0249877
dtype: object

In [55]:
Rochio_Feedback(1, 1, 1)

cats             0
dogs             0
runs             0
rats       0.30103
behind    0.124939
dtype: object

# Jaccard Similarity

In [56]:
# Q-2 : Jaccard similarity
D1 = "this is a text about web science"
D2 = "web science is covering the analysis of text corpa"
D3 = "scientific methods are used to analyze web pages"

def jaccard_similarity(list1, list2):
    set1 = set(list1)
    set2 = set(list2)
    return len(set1.intersection(set2)) / len(set1.union(set2))
    
print("Jaccard similarity between D1 & D2 is: "+str(jaccard_similarity(D1.split(),D2.split())))
print("Jaccard similarity between D1 & D3 is: "+str(jaccard_similarity(D1.split(),D3.split())))
print("Jaccard similarity between D2 & D3 is: "+str(jaccard_similarity(D2.split(),D3.split())))

Jaccard similarity between D1 & D2 is: 0.3333333333333333
Jaccard similarity between D1 & D3 is: 0.07142857142857142
Jaccard similarity between D2 & D3 is: 0.0625


# Cosine Similarity

In [22]:
from collections import Counter
# Q-4 : Cosine Similarity
D1 = "preliminary findings in corona research"
D2 = "novel corona research findings"
D3 = "new research to corona healing"
D4 = "healing novel corona research"
q = "novel novel preliminary new research"

def str_to_vec(string):
    # count the characters in string
    words_count = Counter(string.split())
    # precomputes a set of the different characters
    words_set = set(words_count)
    # precomputes the "length" of the word vector
    vector_length = sqrt(sum(count*count for count in words_count.values()))
    return (words_count, words_set, vector_length)

def cosine_dist(v1, v2):
    # using intersection() to find common characters.
    common = v1[1].intersection(v2[1])
    # returning the cosine distance
    return sum((v1[0][word])*(v2[0][word]) for word in common)/(v1[2]*v2[2])

print("Cosine Similarity between D1 & D2 is: "+str(cosine_dist(str_to_vec(D1),str_to_vec(D2))))
print("Cosine Similarity between D1 & D3 is: "+str(cosine_dist(str_to_vec(D1),str_to_vec(D3))))
print("Cosine Similarity between Q & D1 is: "+str(cosine_dist(str_to_vec(q),str_to_vec(D1))))

Cosine Similarity between D1 & D2 is: 0.6708203932499369
Cosine Similarity between D1 & D3 is: 0.3999999999999999
Cosine Similarity between Q & D1 is: 0.33806170189140655


# Model the document as Vectors

In [66]:
D0 = "linear venn venn artificial"
D1 = "linear artificial scikit artificial regression artificial"
D2 = "scikit regression intelligence regression"
D3 = "artificial venn tandem intelligence artificial"
D4 = "regression scikit regression"

In [67]:
s0 = D0.split()
s1 = D1.split()
s2 = D2.split()
s3 = D3.split()
s4 = D4.split()

# 1.2 Term Frequency vector for each of the documents

In [68]:
Total = s0+s1+s2+s3+s4

In [69]:
def remove_duplicates(D):
    return list(dict.fromkeys(D))

In [70]:
Corpus = remove_duplicates(Total)

In [71]:
Corpus

['linear',
 'venn',
 'artificial',
 'scikit',
 'regression',
 'intelligence',
 'tandem']

In [72]:
def TF_Vectors(Corpus,Document):
    vector = []
    for i in range (len(Corpus)):
        if Corpus[i] in Document:
            count = Document.count(Corpus[i])
            vector.append(count)
        else:
            vector.append(0)    
    return vector

In [73]:
Vector0 = TF_Vectors(Corpus,s0)
Vector1 = TF_Vectors(Corpus,s1)
Vector2 = TF_Vectors(Corpus,s2)
Vector3 = TF_Vectors(Corpus,s3)
Vector4 = TF_Vectors(Corpus,s4)

In [74]:
Vectors = {
    "Vector0": Vector0,
    "Vector1": Vector1,
    "Vector2": Vector2,
    "Vector3": Vector3,
    "Vector4": Vector4
}

In [75]:
df = pd.DataFrame(Vectors, index=Corpus)

# 1.3 Rank the documents using the Cosine Similarity (using the tf-idf weighting of terms)

# document frequency

In [76]:
docfreq = []
for i in range(len(Corpus)):
    
    count = 0
    if list(df.loc[Corpus[i]]) != 0:
        count = np.count_nonzero(list(df.loc[Corpus[i]]))
        docfreq.append(count)
    else:
        docfreq.append(count)

In [77]:
docfreq

[2, 2, 3, 3, 3, 2, 1]

In [78]:
DocumentFrequency = {
   
    "DocumentFrequency": docfreq
}

In [79]:
df1 = pd.DataFrame(DocumentFrequency, index=Corpus)

In [80]:
idf=[]
N=4
for i in range(len(docfreq)):
    idf.append(np.log10(N/docfreq[i]))

In [81]:
# 'linear',
#  'venn',
#  'artificial',
#  'scikit',
#  'regression',
#  'intelligence',
#  'tandem'
idf

[0.3010299956639812,
 0.3010299956639812,
 0.12493873660829993,
 0.12493873660829993,
 0.12493873660829993,
 0.3010299956639812,
 0.6020599913279624]

In [82]:
Index = {
   
    "DocumentFrequency": docfreq,
    "InverseDocumentFrequency": idf
}

In [83]:
df2 = pd.DataFrame(Index, index=Corpus)

In [84]:
def TF_IDF(TF,IDF):
    tf_idf=[]
    for i in range(len(TF)):
        tf_idf.append(TF[i]*IDF[i])
    return tf_idf

In [86]:
d0 = TF_IDF(Vector0,idf)
d1 = TF_IDF(Vector1,idf)
d2 = TF_IDF(Vector2,idf)
d3 = TF_IDF(Vector3,idf)
d4 = TF_IDF(Vector4,idf)

In [87]:
TF_IDF_Vectors = {
    "d0": d0,
    "d1": d1,
    "d2": d2,
    "d3": d3,
    "d4": d4
}

In [88]:
df3 = pd.DataFrame(TF_IDF_Vectors, index=Corpus)
df3

Unnamed: 0,d0,d1,d2,d3,d4
linear,0.30103,0.30103,0.0,0.0,0.0
venn,0.60206,0.0,0.0,0.30103,0.0
artificial,0.124939,0.374816,0.0,0.249877,0.0
scikit,0.0,0.124939,0.124939,0.0,0.124939
regression,0.0,0.124939,0.249877,0.0,0.249877
intelligence,0.0,0.0,0.30103,0.30103,0.0
tandem,0.0,0.0,0.0,0.60206,0.0
