In [4]:
import math
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages requested by this notebook. Each call hits the
# network unless the package is already present locally; the value of the last
# call is what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apurv\AppData\Roaming\nltk_data...
[nltk_data] Error downloading 'punkt' from
[nltk_data]     <https://raw.githubusercontent.com/nltk/nltk_data/gh-
[nltk_data]     pages/packages/tokenizers/punkt.zip>:   <urlopen error
[nltk_data]     [Errno 2] No such file or directory>
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\apurv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\apurv\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
# Documents: the four passages that make up the corpus.
docs = [
    "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee.",
    "Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18.",
    "People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India.",
    "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy",
]

# Query: the free-text search string.
query = "Maha Shivratri will be celebrated on February 18."

In [13]:
# NOTE(review): repeats the `import nltk` and the 'punkt' download that the
# notebook's first cell already performs.
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apurv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Preprocessing
# One WordNetLemmatizer instance, created once and reused by preprocess() for
# every token.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn raw text into a list of lemmas.

    The text is lower-cased, split with ``word_tokenize`` and every resulting
    token is passed through the shared ``lemmatizer``. No token is filtered
    out, so whatever the tokenizer yields is kept.

    Args:
        text: The string to normalise.

    Returns:
        list: One lemma per token, in token order.
    """
    lowered = text.lower()
    return list(map(lemmatizer.lemmatize, word_tokenize(lowered)))

# Lemma lists: one per document (in corpus order), plus one for the query.
doc_tokens = list(map(preprocess, docs))
query_tokens = preprocess(query)

In [15]:
# Vocabulary of Terms
# Every distinct token found in any document; the query does not contribute.
all_tokens_set = {token for tokens in doc_tokens for token in tokens}
print("Vocabulary of terms:", sorted(all_tokens_set), sep="\n")

Vocabulary of terms:
[',', '.', '18', 'a', 'accompanied', 'and', 'at', 'auspicious', 'awake', 'be', 'beautiful', 'blessing', 'by', 'celebrate', 'celebrated', 'celebrates', 'chant', 'considered', 'dance', 'devotee', 'energy', 'epitome', 'etc', 'every', 'evil', 'fast', 'february', 'fervour', 'festival', 'folk', 'for', 'from', 'glee', 'grandeur', 'grandness', 'happiness', 'he', 'hindu', 'his', 'hold', 'hope', 'important', 'in', 'india', 'is', 'it', 'keep', 'lord', 'lot', 'maha', 'mantra', 'million', 'momentous', 'most', 'negative', 'night', 'occasion', 'of', 'on', 'one', 'people', 'pomp', 'power', 'powerful', 'pray', 'prayer', 'prosperity', 'protects', 'shiva', 'shivratri', 'significance', 'since', 'song', 'special', 'spirit', 'stay', 'the', 'this', 'time', 'to', 'very', 'will', 'with', 'year']


In [16]:
# Term Frequency
# For each document: raw token counts, then every count divided by the
# document's total number of tokens.
doc_word_counts = [Counter(tokens) for tokens in doc_tokens]
doc_term_frequency = []
for word_count in doc_word_counts:
    # The total is identical for every word of a document, so compute it once
    # per document rather than once per word.
    total_terms = sum(word_count.values())
    frequencies = {word: count / total_terms for word, count in word_count.items()}
    doc_term_frequency.append(frequencies)

In [17]:
# Inverse Document Frequency
def inverse_document_frequency(term, all_docs_tokens):
    """Natural-log inverse document frequency of ``term``.

    Args:
        term: The token to look up.
        all_docs_tokens: Sequence of per-document token collections; a
            document counts when ``term in doc`` is true.

    Returns:
        ``log(number of documents / number of documents containing term)``,
        or ``0`` when no document contains the term.
    """
    containing_docs = [doc for doc in all_docs_tokens if term in doc]
    if not containing_docs:
        # Unseen term: avoid dividing by zero and give it no weight.
        return 0
    return math.log(len(all_docs_tokens) / len(containing_docs))

# IDF weight of every vocabulary term, computed against the document corpus.
# Each document is converted to a set first: `term in list` rescans the whole
# document for every vocabulary term, while set membership is a hash lookup.
# The conversion changes neither the number of documents nor which terms each
# one contains, so the resulting weights are the same.
doc_token_sets = [set(tokens) for tokens in doc_tokens]
doc_inverse_document_frequency = {token: inverse_document_frequency(token, doc_token_sets) for token in all_tokens_set}

In [18]:
# print results
# First the per-document term-frequency dictionaries; enumerate() is zero-based,
# so 1 is added to number the documents from 1.
print("Term frequency:")
for i, frequencies in enumerate(doc_term_frequency):
    print(f"Document {i+1}: {frequencies}")
    
# Then every vocabulary term next to its IDF weight, in dictionary order.
print("Inverse document frequency:")
for term, idf in doc_inverse_document_frequency.items():
    print(f"{term}: {idf}")

Term frequency:
Document 1: {'every': 0.023809523809523808, 'year': 0.047619047619047616, 'maha': 0.023809523809523808, 'shivratri': 0.023809523809523808, 'is': 0.047619047619047616, 'celebrated': 0.023809523809523808, 'with': 0.047619047619047616, 'a': 0.07142857142857142, 'lot': 0.047619047619047616, 'of': 0.09523809523809523, 'pomp': 0.023809523809523808, 'and': 0.047619047619047616, 'grandeur': 0.023809523809523808, '.': 0.047619047619047616, 'it': 0.023809523809523808, 'considered': 0.023809523809523808, 'to': 0.023809523809523808, 'be': 0.023809523809523808, 'very': 0.023809523809523808, 'special': 0.023809523809523808, 'time': 0.023809523809523808, 'the': 0.023809523809523808, 'since': 0.023809523809523808, 'million': 0.023809523809523808, 'people': 0.023809523809523808, 'celebrate': 0.023809523809523808, 'this': 0.023809523809523808, 'momentous': 0.023809523809523808, 'occasion': 0.023809523809523808, 'fervour': 0.023809523809523808, 'glee': 0.023809523809523808}
Document 2: {'

In [19]:
# cosine similarity
def cosine_similarity(doc_vector, query_vector):
    """Cosine of the angle between two sparse term -> weight mappings.

    Args:
        doc_vector: Mapping of term to weight for the document.
        query_vector: Mapping of term to weight for the query.

    Returns:
        The dot product over the terms both mappings share, divided by the
        product of the two Euclidean norms; ``0`` when that product is not
        greater than zero.
    """
    # Only terms present in both mappings can contribute to the dot product.
    shared_terms = set(doc_vector) & set(query_vector)
    dot_product = sum(doc_vector[term] * query_vector[term] for term in shared_terms)

    doc_norm = math.sqrt(sum(weight ** 2 for weight in doc_vector.values()))
    query_norm = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
    magnitude = doc_norm * query_norm

    return dot_product / magnitude if magnitude > 0 else 0

# TF-IDF vector per document: each term frequency scaled by the term's IDF.
doc_vectors = []
for frequencies in doc_term_frequency:
    vector = {term: frequency * doc_inverse_document_frequency[term] for term, frequency in frequencies.items()}
    doc_vectors.append(vector)

# TF-IDF vector for the query. Counter tallies every term in a single pass
# (list.count would rescan the query once per token). A query term that never
# occurs in the documents has no IDF entry; it gets weight 0 via .get(), the
# same value inverse_document_frequency returns for an unseen term, instead of
# raising KeyError.
query_term_counts = Counter(query_tokens)
query_length = len(query_tokens)
query_vector = {
    term: count / query_length * doc_inverse_document_frequency.get(term, 0)
    for term, count in query_term_counts.items()
}

def cosine_similarity(doc_vector, query_vector):
    """Cosine similarity between two sparse term -> weight mappings.

    NOTE(review): this re-defines the ``cosine_similarity`` declared earlier
    in the same cell. Being the later definition, it is the one in effect for
    the calls that follow it.

    Args:
        doc_vector: Mapping of term to weight for the document.
        query_vector: Mapping of term to weight for the query.

    Returns:
        float: Dot product over the shared terms divided by the product of
        the two Euclidean norms, or ``0.0`` when that product is not greater
        than zero (for example when either mapping is empty or all-zero).
    """
    # Walk the query's terms in dict (insertion) order instead of iterating a
    # set intersection: set order for strings changes with hash randomisation,
    # which lets the floating-point sum differ in its last bits between runs.
    dot_product = sum(
        weight * doc_vector[term]
        for term, weight in query_vector.items()
        if term in doc_vector
    )
    doc_norm = math.sqrt(sum(weight ** 2 for weight in doc_vector.values()))
    query_norm = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
    denominator = doc_norm * query_norm
    if denominator > 0:
        return dot_product / denominator
    # Keep the return type a float on the degenerate path as well.
    return 0.0

# Score each document vector against the query vector; the list keeps the
# documents' order.
similarities = [cosine_similarity(doc_vector, query_vector) for doc_vector in doc_vectors]
print()  # blank line ahead of the report
for i, score in enumerate(similarities):
    # enumerate() is zero-based; documents are reported starting from 1.
    print(f"Document {i+1} cosine similarity score: {score}")


Document 1 cosine similarity score: 0.012525376910438596
Document 2 cosine similarity score: 0.2011140729233289
Document 3 cosine similarity score: 0.009560317414142573
Document 4 cosine similarity score: 0.19430731013853395


In [20]:
# Stand-alone copies of the four corpus passages, passed to Jaccard_Similarity below.
# NOTE(review): these repeat the strings held in `docs`; doc4 ends with a full
# stop here, whereas the last entry of `docs` does not.
doc1 = "Every year Maha Shivratri is celebrated with a lot of pomp and grandeur. It is considered to be a very special time of the year since millions of people celebrate this momentous occasion with a lot of fervour and glee."
doc2 = "Lord Shiva devotees celebrate this occasion with a lot of grandness. It is accompanied by folk dances, songs, prayers, chants, mantras etc. This year, the beautiful occasion of Maha Shivratri will be celebrated on February 18."
doc3 = "People keep a fast on this Maha shivratri, stay awake at night and pray to the lord for blessings, happiness, hope and prosperity. This festival holds a lot of significance and is considered to be one of the most important festivals in India."
doc4 = "The festival of Maha Shivratri will be celebrated on February 18 and is a very auspicious festival. This Hindu festival celebrates the power of Lord Shiva. Lord Shiva protects his devotees from negative and evil spirits. He is the epitome of powerful and auspicious energy."
     


In [21]:
def Jaccard_Similarity(doc1, doc2):
    """Jaccard similarity of the word sets of two texts.

    Each text is lower-cased and split on whitespace; the score is the size
    of the intersection of the two sets of unique words divided by the size
    of their union.

    Args:
        doc1: First text.
        doc2: Second text.

    Returns:
        float: Score between 0.0 and 1.0. When neither text contains a word
        the union is empty and 0.0 is returned instead of dividing by zero.
    """
    # Unique, case-folded words of each text.
    words_doc1 = set(doc1.lower().split())
    words_doc2 = set(doc2.lower().split())

    union = words_doc1 | words_doc2
    if not union:
        # Both texts are empty or whitespace-only: nothing to compare.
        return 0.0

    intersection = words_doc1 & words_doc2
    return len(intersection) / len(union)

In [26]:
# Word-set overlap between doc1 and doc2.
Jaccard_Similarity(doc1,doc2)

0.2857142857142857

In [27]:
# Word-set overlap between doc2 and doc3.
Jaccard_Similarity(doc2,doc3)

0.1694915254237288

In [28]:
# Word-set overlap between doc3 and doc4.
Jaccard_Similarity(doc3,doc4)


0.18333333333333332

In [29]:
# Word-set overlap between doc4 and doc1.
Jaccard_Similarity(doc4,doc1)

0.2037037037037037