N.SRI ABHINAV
AM.EN.U4CSE19137

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from tqdm import tqdm

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
text = '''William Shakespeare was an English poet and playwright who is considered one of the greatest writers to ever use the English language. He is also the most famous playwright in the world, with his plays being translated in over 50 languages and performed across the globe for audiences of all ages. Known colloquially as "The Bard" or "The Bard of Avon," Shakespeare was also an actor and the creator of the Globe Theatre, a historical theatre, and company that is visited by hundreds of thousands of tourists every year.
His works span tragedy, comedy, and historical works, both in poetry and prose. And although the man is the most-recognized playwright in the world, very little of his life is actually known. No known autobiographical letters or diaries have survived to modern day, and with no surviving descendants, Shakespeare is a figure both of magnificent genius and mystery. 
'''

### Sentence Splitting

In [4]:
from nltk import sent_tokenize

sentences = sent_tokenize(text)
print(sentences[:5])

['William Shakespeare was an English poet and playwright who is considered one of the greatest writers to ever use the English language.', 'He is also the most famous playwright in the world, with his plays being translated in over 50 languages and performed across the globe for audiences of all ages.', 'Known colloquially as "The Bard" or "The Bard of Avon," Shakespeare was also an actor and the creator of the Globe Theatre, a historical theatre, and company that is visited by hundreds of thousands of tourists every year.', 'His works span tragedy, comedy, and historical works, both in poetry and prose.', 'And although the man is the most-recognized playwright in the world, very little of his life is actually known.']


### Tokenization

In [5]:
words = [0]*len(sentences)
for ind,sentence in enumerate(sentences):
    w = sentence.split()
    words[ind] = w
print(words[:2])

[['William', 'Shakespeare', 'was', 'an', 'English', 'poet', 'and', 'playwright', 'who', 'is', 'considered', 'one', 'of', 'the', 'greatest', 'writers', 'to', 'ever', 'use', 'the', 'English', 'language.'], ['He', 'is', 'also', 'the', 'most', 'famous', 'playwright', 'in', 'the', 'world,', 'with', 'his', 'plays', 'being', 'translated', 'in', 'over', '50', 'languages', 'and', 'performed', 'across', 'the', 'globe', 'for', 'audiences', 'of', 'all', 'ages.']]


### Removing Punctuations

In [6]:
new_words = []
for word in words:
    temp = []
    for w in word:
        temp_word = re.sub(r'[^\w\s]', '', w)
        if temp != '':
            temp.append(temp_word)
    new_words.append(temp)
new_words[:2]

[['William',
  'Shakespeare',
  'was',
  'an',
  'English',
  'poet',
  'and',
  'playwright',
  'who',
  'is',
  'considered',
  'one',
  'of',
  'the',
  'greatest',
  'writers',
  'to',
  'ever',
  'use',
  'the',
  'English',
  'language'],
 ['He',
  'is',
  'also',
  'the',
  'most',
  'famous',
  'playwright',
  'in',
  'the',
  'world',
  'with',
  'his',
  'plays',
  'being',
  'translated',
  'in',
  'over',
  '50',
  'languages',
  'and',
  'performed',
  'across',
  'the',
  'globe',
  'for',
  'audiences',
  'of',
  'all',
  'ages']]

### Stop Word Removal

In [7]:
stop_words = set(stopwords.words('english'))
stop_removed_arr = list()
for word in new_words:
    stop_removed_arr.append([w for w in word if not w in stop_words])
words = stop_removed_arr
words[:2]

[['William',
  'Shakespeare',
  'English',
  'poet',
  'playwright',
  'considered',
  'one',
  'greatest',
  'writers',
  'ever',
  'use',
  'English',
  'language'],
 ['He',
  'also',
  'famous',
  'playwright',
  'world',
  'plays',
  'translated',
  '50',
  'languages',
  'performed',
  'across',
  'globe',
  'audiences',
  'ages']]

### Lower Casing

In [8]:
temp = list()
for word in words:
    temp.append([w.lower() for w in word])
words = temp
words

[['william',
  'shakespeare',
  'english',
  'poet',
  'playwright',
  'considered',
  'one',
  'greatest',
  'writers',
  'ever',
  'use',
  'english',
  'language'],
 ['he',
  'also',
  'famous',
  'playwright',
  'world',
  'plays',
  'translated',
  '50',
  'languages',
  'performed',
  'across',
  'globe',
  'audiences',
  'ages'],
 ['known',
  'colloquially',
  'the',
  'bard',
  'the',
  'bard',
  'avon',
  'shakespeare',
  'also',
  'actor',
  'creator',
  'globe',
  'theatre',
  'historical',
  'theatre',
  'company',
  'visited',
  'hundreds',
  'thousands',
  'tourists',
  'every',
  'year'],
 ['his',
  'works',
  'span',
  'tragedy',
  'comedy',
  'historical',
  'works',
  'poetry',
  'prose'],
 ['and',
  'although',
  'man',
  'mostrecognized',
  'playwright',
  'world',
  'little',
  'life',
  'actually',
  'known'],
 ['no',
  'known',
  'autobiographical',
  'letters',
  'diaries',
  'survived',
  'modern',
  'day',
  'surviving',
  'descendants',
  'shakespeare',
  'fi

### Lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_word = list()
for word in words:
    lemmatized_word.append([wordnet_lemmatizer.lemmatize(w) for w in word])
words = lemmatized_word
words

[['william',
  'shakespeare',
  'english',
  'poet',
  'playwright',
  'considered',
  'one',
  'greatest',
  'writer',
  'ever',
  'use',
  'english',
  'language'],
 ['he',
  'also',
  'famous',
  'playwright',
  'world',
  'play',
  'translated',
  '50',
  'language',
  'performed',
  'across',
  'globe',
  'audience',
  'age'],
 ['known',
  'colloquially',
  'the',
  'bard',
  'the',
  'bard',
  'avon',
  'shakespeare',
  'also',
  'actor',
  'creator',
  'globe',
  'theatre',
  'historical',
  'theatre',
  'company',
  'visited',
  'hundred',
  'thousand',
  'tourist',
  'every',
  'year'],
 ['his',
  'work',
  'span',
  'tragedy',
  'comedy',
  'historical',
  'work',
  'poetry',
  'prose'],
 ['and',
  'although',
  'man',
  'mostrecognized',
  'playwright',
  'world',
  'little',
  'life',
  'actually',
  'known'],
 ['no',
  'known',
  'autobiographical',
  'letter',
  'diary',
  'survived',
  'modern',
  'day',
  'surviving',
  'descendant',
  'shakespeare',
  'figure',
  'magn

### Stemming

In [10]:
from nltk.stem.porter import PorterStemmer

stemmed = list()
porter = PorterStemmer()
for word in words:
    stemmed.append([porter.stem(w) for w in word])
words = stemmed
words

[['william',
  'shakespear',
  'english',
  'poet',
  'playwright',
  'consid',
  'one',
  'greatest',
  'writer',
  'ever',
  'use',
  'english',
  'languag'],
 ['he',
  'also',
  'famou',
  'playwright',
  'world',
  'play',
  'translat',
  '50',
  'languag',
  'perform',
  'across',
  'globe',
  'audienc',
  'age'],
 ['known',
  'colloqui',
  'the',
  'bard',
  'the',
  'bard',
  'avon',
  'shakespear',
  'also',
  'actor',
  'creator',
  'globe',
  'theatr',
  'histor',
  'theatr',
  'compani',
  'visit',
  'hundr',
  'thousand',
  'tourist',
  'everi',
  'year'],
 ['hi',
  'work',
  'span',
  'tragedi',
  'comedi',
  'histor',
  'work',
  'poetri',
  'prose'],
 ['and',
  'although',
  'man',
  'mostrecogn',
  'playwright',
  'world',
  'littl',
  'life',
  'actual',
  'known'],
 ['no',
  'known',
  'autobiograph',
  'letter',
  'diari',
  'surviv',
  'modern',
  'day',
  'surviv',
  'descend',
  'shakespear',
  'figur',
  'magnific',
  'geniu',
  'mysteri']]

### Inverted Index

In [11]:
inverted_index_temp = dict()
inverted_index = dict()
for id,word in enumerate(words):
    for  w in word:
        if w not in inverted_index_temp.keys():
            inverted_index_temp[w] = [id]
        else:
            inverted_index_temp[w].append(id)

for key in inverted_index_temp.keys():
    k = (key,len(inverted_index_temp[key]))
    inverted_index[k] = inverted_index_temp[key]
inverted_index

{('50', 1): [1],
 ('across', 1): [1],
 ('actor', 1): [2],
 ('actual', 1): [4],
 ('age', 1): [1],
 ('also', 2): [1, 2],
 ('although', 1): [4],
 ('and', 1): [4],
 ('audienc', 1): [1],
 ('autobiograph', 1): [5],
 ('avon', 1): [2],
 ('bard', 2): [2, 2],
 ('colloqui', 1): [2],
 ('comedi', 1): [3],
 ('compani', 1): [2],
 ('consid', 1): [0],
 ('creator', 1): [2],
 ('day', 1): [5],
 ('descend', 1): [5],
 ('diari', 1): [5],
 ('english', 2): [0, 0],
 ('ever', 1): [0],
 ('everi', 1): [2],
 ('famou', 1): [1],
 ('figur', 1): [5],
 ('geniu', 1): [5],
 ('globe', 2): [1, 2],
 ('greatest', 1): [0],
 ('he', 1): [1],
 ('hi', 1): [3],
 ('histor', 2): [2, 3],
 ('hundr', 1): [2],
 ('known', 3): [2, 4, 5],
 ('languag', 2): [0, 1],
 ('letter', 1): [5],
 ('life', 1): [4],
 ('littl', 1): [4],
 ('magnific', 1): [5],
 ('man', 1): [4],
 ('modern', 1): [5],
 ('mostrecogn', 1): [4],
 ('mysteri', 1): [5],
 ('no', 1): [5],
 ('one', 1): [0],
 ('perform', 1): [1],
 ('play', 1): [1],
 ('playwright', 3): [0, 1, 4],
 ('poe