In [48]:
!pip install nltk
!pip install gensim

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m


In [49]:
import nltk
import numpy as np
# Fetch the 'punkt' tokenizer data used by the NLTK tokenizers below.
# NOTE(review): newer NLTK releases may expect the 'punkt_tab' resource instead — verify against the installed version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/nasim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
"""Word tokenizations which basically splits a sentence into words"""

from nltk.tokenize import word_tokenize 

data = "The sun rises in the East and sets in the west."
print(word_tokenize(data))

['The', 'sun', 'rises', 'in', 'the', 'East', 'and', 'sets', 'in', 'the', 'west', '.']


In [51]:
"""Sentence tokenization is needed to count the average words per sentence. So we need to use to calculate the ratio"""

from nltk.tokenize import sent_tokenize

data = "Earth is not a perfect sphere. As Earth spins, gravity points toward the center of our planet (assuming for explanation's sake that Earth is a perfect sphere), and a centrifugal force pushes outward."
print(sent_tokenize(data))

['Earth is not a perfect sphere.', "As Earth spins, gravity points toward the center of our planet (assuming for explanation's sake that Earth is a perfect sphere), and a centrifugal force pushes outward."]


In [52]:
"""File opening and sentence tokenization"""

import nltk, gensim
from nltk.tokenize import word_tokenize, sent_tokenize 


file_docs = []

with open('demofile1.txt') as f:
    tokens = sent_tokenize(f.read())
    for line in tokens: 
        file_docs.append(line)
        
print("Number of documents:", len(file_docs))
print(file_docs)

Number of documents: 3
['Mars is the fourth planet in our solar system.', 'It is second-smallest planet in the Solar System after Mercury.', 'Saturn is yellow planet.']


In [53]:
"""The next step is to tokenize into words and create a dictionary. It is important to convert the tokens into unique ids 
which then allows Genism to create a Dictionary object that maps each word to unique id"""

gen_docs = [[w.lower() for w in word_tokenize(text)] for text in file_docs]
print(gen_docs[0])

dictionary = gensim.corpora.Dictionary(gen_docs)
print(dictionary.token2id)

['mars', 'is', 'the', 'fourth', 'planet', 'in', 'our', 'solar', 'system', '.']
{'.': 0, 'fourth': 1, 'in': 2, 'is': 3, 'mars': 4, 'our': 5, 'planet': 6, 'solar': 7, 'system': 8, 'the': 9, 'after': 10, 'it': 11, 'mercury': 12, 'second-smallest': 13, 'saturn': 14, 'yellow': 15}


In [54]:
"""Bag of words. (Lists the number of times each word occurs in the sentence)"""
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


In [55]:
"""TFIDF(Term Frequency and Inverse Document Frequency)
It is a measure of originality of a word by comparing  the no of times a word appears in a doc with the number of docs the word appears in.
Basically words that occur more frequently across the documents get smaller weights"""

tf_idf = gensim.models.TfidfModel(corpus)
for doc in tf_idf[corpus]:
    print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])


[['fourth', 0.53], ['in', 0.2], ['mars', 0.53], ['our', 0.53], ['solar', 0.2], ['system', 0.2], ['the', 0.2]]
[['in', 0.17], ['solar', 0.17], ['system', 0.17], ['the', 0.17], ['after', 0.47], ['it', 0.47], ['mercury', 0.47], ['second-smallest', 0.47]]
[['saturn', 0.71], ['yellow', 0.71]]


In [70]:
"""Creating Similariy Object 
It builds an index for a given set of documents. The index is split into several smaller sub indexes and saved into disk."""

sims = gensim.similarities.Similarity('indexdir/', tf_idf[corpus], num_features=len(dictionary))
print(sims)

Similarity index with 3 documents in 0 shards (stored under indexdir/)


In [71]:
"""Create  query document 
need to calculate how similar is the query document to each document in the index. """

query_docs = []

with open("demofile2.txt") as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        query_docs.append(line)
        
print("Number of documents:", len(query_docs))

for line in query_docs:
    q_doc = [w.lower() for w in word_tokenize(line)]
    q_doc_bow = dictionary.doc2bow(q_doc)
    print(q_doc_bow)


Number of documents: 1
[(0, 1), (3, 1), (6, 1), (9, 2), (14, 1)]


In [73]:
"""Document similarities to query
As can be seen the third document is most similar """
q_doc_tf_idf = tf_idf[q_doc_bow]
print('Comparing Result:', sims[q_doc_tf_idf]) 

Comparing Result: [0.11641413 0.10281226 0.56890744]
