In [2]:
!pip install nltk
!pip install gensim

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.


In [35]:
import nltk
import numpy as np
# Download NLTK's 'punkt' package (the data the tokenizers used in the cells
# below are expected to need -- confirm against the NLTK docs for your version).
# As the last expression of the cell, the call's return value is displayed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nasim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
"""Word tokenization: split a sentence into its individual word and punctuation tokens."""

from nltk.tokenize import word_tokenize 

data = "The sun rises in the East and sets in the west."
print(word_tokenize(data))

['The', 'sun', 'rises', 'in', 'the', 'East', 'and', 'sets', 'in', 'the', 'west', '.']


In [10]:
"""Sentence tokenization: split a text into its sentences.

It is needed to count the sentences when calculating the average number of
words per sentence (a words-to-sentences ratio)."""

from nltk.tokenize import sent_tokenize

data = "Earth is not a perfect sphere. As Earth spins, gravity points toward the center of our planet (assuming for explanation's sake that Earth is a perfect sphere), and a centrifugal force pushes outward."
print(sent_tokenize(data))

['Earth is not a perfect sphere.', "As Earth spins, gravity points toward the center of our planet (assuming for explanation's sake that Earth is a perfect sphere), and a centrifugal force pushes outward."]


In [21]:
"""Load the input file and split its text into sentences.

Each sentence becomes one entry ("document") of `file_docs`."""

import gensim
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


# Pass the encoding explicitly so decoding does not depend on the platform's
# default locale encoding.
with open('document.txt', encoding='utf-8') as f:
    # NOTE: despite its name, `tokens` holds whole sentences, not words.
    tokens = sent_tokenize(f.read())

# sent_tokenize already yields one string per sentence, so copy the result in
# one step instead of appending element by element.
file_docs = list(tokens)

print("Number of documents:", len(file_docs))
print(file_docs)

Number of documents: 4
['Vehicles with no human driver on board were previously tested by a select group of Waymo customers, before the pandemic halted the service.', 'The driver-free cars will initially be available only to friends and family of those early Waymo adopters, but will extend to all app users within weeks.', 'The service comes two years later than originally promised.', 'The cars will be remotely monitored, with vehicle operators intervening in extreme circumstances - such as a road ahead being closed.']


In [30]:
"""Tokenize every document into lowercase words, then build a Gensim Dictionary.

The Dictionary object maps each distinct token to a unique integer id."""

gen_docs = [list(map(str.lower, word_tokenize(text))) for text in file_docs]
print(gen_docs[0])

dictionary = gensim.corpora.Dictionary(gen_docs)
print(dictionary.token2id)

['vehicles', 'with', 'no', 'human', 'driver', 'on', 'board', 'were', 'previously', 'tested', 'by', 'a', 'select', 'group', 'of', 'waymo', 'customers', ',', 'before', 'the', 'pandemic', 'halted', 'the', 'service', '.']
{',': 0, '.': 1, 'a': 2, 'before': 3, 'board': 4, 'by': 5, 'customers': 6, 'driver': 7, 'group': 8, 'halted': 9, 'human': 10, 'no': 11, 'of': 12, 'on': 13, 'pandemic': 14, 'previously': 15, 'select': 16, 'service': 17, 'tested': 18, 'the': 19, 'vehicles': 20, 'waymo': 21, 'were': 22, 'with': 23, 'adopters': 24, 'all': 25, 'and': 26, 'app': 27, 'available': 28, 'be': 29, 'but': 30, 'cars': 31, 'driver-free': 32, 'early': 33, 'extend': 34, 'family': 35, 'friends': 36, 'initially': 37, 'only': 38, 'those': 39, 'to': 40, 'users': 41, 'weeks': 42, 'will': 43, 'within': 44, 'comes': 45, 'later': 46, 'originally': 47, 'promised': 48, 'than': 49, 'two': 50, 'years': 51, '-': 52, 'ahead': 53, 'as': 54, 'being': 55, 'circumstances': 56, 'closed': 57, 'extreme': 58, 'in': 59, 'inter

In [32]:
"""Bag of words: turn each tokenized document into a list of (token id, count) pairs."""
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1)]


In [36]:
"""TF-IDF (Term Frequency - Inverse Document Frequency)

A measure of how distinctive a word is: it compares the number of times the
word appears in a document with the number of documents the word appears in.
Words that occur in many of the documents get smaller weights.

Each printed row lists [token, weight rounded to 2 decimals] for one document."""

tf_idf = gensim.models.TfidfModel(corpus)
for doc in tf_idf[corpus]:
    print([[dictionary[term_id], np.around(weight, decimals=2)] for term_id, weight in doc])


[[',', 0.05], ['a', 0.12], ['before', 0.24], ['board', 0.24], ['by', 0.24], ['customers', 0.24], ['driver', 0.24], ['group', 0.24], ['halted', 0.24], ['human', 0.24], ['no', 0.24], ['of', 0.12], ['on', 0.24], ['pandemic', 0.24], ['previously', 0.24], ['select', 0.24], ['service', 0.12], ['tested', 0.24], ['vehicles', 0.24], ['waymo', 0.12], ['were', 0.24], ['with', 0.12]]
[[',', 0.04], ['of', 0.1], ['waymo', 0.1], ['adopters', 0.21], ['all', 0.21], ['and', 0.21], ['app', 0.21], ['available', 0.21], ['be', 0.1], ['but', 0.21], ['cars', 0.1], ['driver-free', 0.21], ['early', 0.21], ['extend', 0.21], ['family', 0.21], ['friends', 0.21], ['initially', 0.21], ['only', 0.21], ['those', 0.21], ['to', 0.42], ['users', 0.21], ['weeks', 0.21], ['will', 0.21], ['within', 0.21]]
[['service', 0.19], ['comes', 0.37], ['later', 0.37], ['originally', 0.37], ['promised', 0.37], ['than', 0.37], ['two', 0.37], ['years', 0.37]]
[[',', 0.05], ['a', 0.12], ['with', 0.12], ['be', 0.12], ['cars', 0.12], ['wil