In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt sentence
# tokenizer models, WordNet (for the lemmatizer) and the stop-word lists.
# 'punkt_tab' is the Punkt data package that newer NLTK releases load for
# sent_tokenize; older releases load 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sample passage that serves as the input document for the sentence
# splitting and preprocessing cells that follow.
text = '''William Shakespeare was an English poet and playwright who is considered one of the greatest writers to ever use the English language. He is also the most famous playwright in the world, with his plays being translated in over 50 languages and performed across the globe for audiences of all ages. Known colloquially as "The Bard" or "The Bard of Avon," Shakespeare was also an actor and the creator of the Globe Theatre, a historical theatre, and company that is visited by hundreds of thousands of tourists every year.
His works span tragedy, comedy, and historical works, both in poetry and prose. And although the man is the most-recognized playwright in the world, very little of his life is actually known. No known autobiographical letters or diaries have survived to modern day, and with no surviving descendants, Shakespeare is a figure both of magnificent genius and mystery. 
'''

### Sentence Splitting

In [None]:
# Split the raw passage into a list of sentence strings using NLTK's
# sentence tokenizer (imported in the notebook's import cell), then
# preview at most the first five sentences.
sentences = sent_tokenize(text)
print(sentences[:5])

['Gregor Samsa wakes up one morning to find himself transformed into a "monstrous vermin".', 'He initially considers the transformation to be temporary and slowly ponders the consequences of this metamorphosis.', 'Stuck on his back and unable to get up and leave the bed, Gregor reflects on his job as a traveling salesman and cloth merchant, which he characterizes as being full of "temporary and constantly changing human relationships, which never come from the heart".', "He sees his employer as a despot and would quickly quit his job if he were not his family's sole breadwinner and working off his bankrupt father's debts.", "While trying to move Gregor finds that his office manager, the chief clerk, has shown up to check on him, indignant about Gregor's unexcused absence."]


### Tokenization

In [None]:
# Whitespace tokenization: words[i] holds the tokens of sentences[i],
# obtained by splitting the sentence on runs of whitespace.
words = [sentence.split() for sentence in sentences]
print(words[:2])

[['Gregor', 'Samsa', 'wakes', 'up', 'one', 'morning', 'to', 'find', 'himself', 'transformed', 'into', 'a', '"monstrous', 'vermin".'], ['He', 'initially', 'considers', 'the', 'transformation', 'to', 'be', 'temporary', 'and', 'slowly', 'ponders', 'the', 'consequences', 'of', 'this', 'metamorphosis.']]


### Removing Punctuation

In [None]:
# Strip punctuation from every token while keeping the per-sentence grouping.
# new_words[i] is the cleaned token list for sentence i.
new_words = []
for sentence_tokens in words:
    cleaned_tokens = []
    for token in sentence_tokens:
        # Remove every character that is neither a word character nor whitespace.
        cleaned = re.sub(r'[^\w\s]', '', token)
        # A token made up only of punctuation (e.g. a lone dash) becomes an
        # empty string; test the cleaned token itself so such empties are
        # dropped instead of being kept as '' entries.
        if cleaned != '':
            cleaned_tokens.append(cleaned)
    new_words.append(cleaned_tokens)
new_words[:2]

[['Gregor',
  'Samsa',
  'wakes',
  'up',
  'one',
  'morning',
  'to',
  'find',
  'himself',
  'transformed',
  'into',
  'a',
  'monstrous',
  'vermin'],
 ['He',
  'initially',
  'considers',
  'the',
  'transformation',
  'to',
  'be',
  'temporary',
  'and',
  'slowly',
  'ponders',
  'the',
  'consequences',
  'of',
  'this',
  'metamorphosis']]

### Stop Word Removal

In [None]:
# Remove English stop words from each sentence's punctuation-free tokens.
stop_words = set(stopwords.words('english'))
# Tokens still carry their original casing at this stage (lower-casing
# happens in a later cell), while the stop-word list is lowercase, so the
# membership test is done on the lower-cased token. Otherwise capitalized
# stop words such as 'He' or 'While' would be kept.
stop_removed_arr = [
    [token for token in sentence_tokens if token.lower() not in stop_words]
    for sentence_tokens in new_words
]
words = stop_removed_arr
words[:2]

[['Gregor',
  'Samsa',
  'wakes',
  'one',
  'morning',
  'find',
  'transformed',
  'monstrous',
  'vermin'],
 ['He',
  'initially',
  'considers',
  'transformation',
  'temporary',
  'slowly',
  'ponders',
  'consequences',
  'metamorphosis']]

### Lower Casing

In [None]:
# Case normalization: lower-case every token, sentence by sentence.
words = [
    [token.lower() for token in sentence_tokens]
    for sentence_tokens in words
]
words

[['gregor',
  'samsa',
  'wakes',
  'one',
  'morning',
  'find',
  'transformed',
  'monstrous',
  'vermin'],
 ['he',
  'initially',
  'considers',
  'transformation',
  'temporary',
  'slowly',
  'ponders',
  'consequences',
  'metamorphosis'],
 ['stuck',
  'back',
  'unable',
  'get',
  'leave',
  'bed',
  'gregor',
  'reflects',
  'job',
  'traveling',
  'salesman',
  'cloth',
  'merchant',
  'characterizes',
  'full',
  'temporary',
  'constantly',
  'changing',
  'human',
  'relationships',
  'never',
  'come',
  'heart'],
 ['he',
  'sees',
  'employer',
  'despot',
  'would',
  'quickly',
  'quit',
  'job',
  'familys',
  'sole',
  'breadwinner',
  'working',
  'bankrupt',
  'fathers',
  'debts'],
 ['while',
  'trying',
  'move',
  'gregor',
  'finds',
  'office',
  'manager',
  'chief',
  'clerk',
  'shown',
  'check',
  'indignant',
  'gregors',
  'unexcused',
  'absence'],
 ['gregor',
  'attempts',
  'communicate',
  'manager',
  'family',
  'hear',
  'behind',
  'door',
  'i

### Lemmatization

In [None]:
# Lemmatize every token with the WordNet lemmatizer (imported in the
# notebook's import cell). lemmatize() is called without a part-of-speech
# argument, so each token is processed with the lemmatizer's default POS.
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_word = [
    [wordnet_lemmatizer.lemmatize(token) for token in sentence_tokens]
    for sentence_tokens in words
]
words = lemmatized_word
words

[['gregor',
  'samsa',
  'wake',
  'one',
  'morning',
  'find',
  'transformed',
  'monstrous',
  'vermin'],
 ['he',
  'initially',
  'considers',
  'transformation',
  'temporary',
  'slowly',
  'ponders',
  'consequence',
  'metamorphosis'],
 ['stuck',
  'back',
  'unable',
  'get',
  'leave',
  'bed',
  'gregor',
  'reflects',
  'job',
  'traveling',
  'salesman',
  'cloth',
  'merchant',
  'characterizes',
  'full',
  'temporary',
  'constantly',
  'changing',
  'human',
  'relationship',
  'never',
  'come',
  'heart'],
 ['he',
  'see',
  'employer',
  'despot',
  'would',
  'quickly',
  'quit',
  'job',
  'family',
  'sole',
  'breadwinner',
  'working',
  'bankrupt',
  'father',
  'debt'],
 ['while',
  'trying',
  'move',
  'gregor',
  'find',
  'office',
  'manager',
  'chief',
  'clerk',
  'shown',
  'check',
  'indignant',
  'gregors',
  'unexcused',
  'absence'],
 ['gregor',
  'attempt',
  'communicate',
  'manager',
  'family',
  'hear',
  'behind',
  'door',
  'incomprehe

### Stemming

In [None]:
# Reduce every token to its Porter stem, sentence by sentence.
# PorterStemmer is already imported in the notebook's import cell, so it is
# not imported a second time here.
porter = PorterStemmer()
stemmed = [
    [porter.stem(token) for token in sentence_tokens]
    for sentence_tokens in words
]
words = stemmed
words

[['gregor',
  'samsa',
  'wake',
  'one',
  'morn',
  'find',
  'transform',
  'monstrou',
  'vermin'],
 ['he',
  'initi',
  'consid',
  'transform',
  'temporari',
  'slowli',
  'ponder',
  'consequ',
  'metamorphosi'],
 ['stuck',
  'back',
  'unabl',
  'get',
  'leav',
  'bed',
  'gregor',
  'reflect',
  'job',
  'travel',
  'salesman',
  'cloth',
  'merchant',
  'character',
  'full',
  'temporari',
  'constantli',
  'chang',
  'human',
  'relationship',
  'never',
  'come',
  'heart'],
 ['he',
  'see',
  'employ',
  'despot',
  'would',
  'quickli',
  'quit',
  'job',
  'famili',
  'sole',
  'breadwinn',
  'work',
  'bankrupt',
  'father',
  'debt'],
 ['while',
  'tri',
  'move',
  'gregor',
  'find',
  'offic',
  'manag',
  'chief',
  'clerk',
  'shown',
  'check',
  'indign',
  'gregor',
  'unexcus',
  'absenc'],
 ['gregor',
  'attempt',
  'commun',
  'manag',
  'famili',
  'hear',
  'behind',
  'door',
  'incomprehens',
  'vocal'],
 ['gregor', 'labori', 'drag', 'across', 'floor'

### Inverted Index

In [None]:
# Build an inverted index over the preprocessed sentences.
#   inverted_index_temp: term -> ascending list of ids of the sentences
#                        that contain the term (each id listed once)
#   inverted_index:      (term, number of sentences containing the term)
#                        -> that same postings list
inverted_index_temp = dict()
inverted_index = dict()
# The loop variable is named doc_id rather than `id` so the builtin id()
# is not shadowed for the rest of the kernel session.
for doc_id, tokens in enumerate(words):
    for term in tokens:
        postings = inverted_index_temp.setdefault(term, [])
        # Sentences are visited in ascending order, so a term that repeats
        # inside the current sentence would show up as the last posting;
        # skip it so every sentence id is recorded only once per term.
        if not postings or postings[-1] != doc_id:
            postings.append(doc_id)

# Re-key each postings list by (term, frequency), where the frequency is the
# length of the de-duplicated postings list.
for term, postings in inverted_index_temp.items():
    inverted_index[(term, len(postings))] = postings
inverted_index

{('absenc', 1): [4],
 ('across', 1): [6],
 ('apart', 1): [7],
 ('attempt', 1): [5],
 ('back', 2): [2, 8],
 ('bankrupt', 1): [3],
 ('bed', 1): [2],
 ('behind', 1): [5],
 ('breadwinn', 1): [3],
 ('chang', 1): [2],
 ('character', 1): [2],
 ('check', 1): [4],
 ('chief', 1): [4],
 ('clerk', 2): [4, 7],
 ('cloth', 1): [2],
 ('come', 1): [2],
 ('commun', 1): [5],
 ('consequ', 1): [1],
 ('consid', 1): [1],
 ('constantli', 1): [2],
 ('debt', 1): [3],
 ('despot', 1): [3],
 ('door', 2): [5, 6],
 ('doorway', 1): [8],
 ('drag', 1): [6],
 ('drive', 1): [8],
 ('employ', 1): [3],
 ('famili', 3): [3, 5, 8],
 ('father', 2): [3, 8],
 ('find', 2): [0, 4],
 ('flee', 1): [7],
 ('floor', 1): [6],
 ('full', 1): [2],
 ('get', 2): [2, 8],
 ('gregor', 8): [0, 2, 4, 4, 5, 6, 7, 8],
 ('he', 2): [1, 3],
 ('hear', 1): [5],
 ('heart', 1): [2],
 ('horrifi', 1): [8],
 ('human', 1): [2],
 ('incomprehens', 1): [5],
 ('indign', 1): [4],
 ('initi', 1): [1],
 ('injur', 1): [8],
 ('job', 2): [2, 3],
 ('labori', 1): [6],
 ('l