In [1]:
import re
import nltk

In [2]:
paragraph = """Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen â¦ thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency â¦ my entire team. I have to thank everyone from the very onset of my career â¦ To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are."""

In [3]:
paragraph

'Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen â\x80¦ thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency â\x80¦ my entire team. I have to thank everyone from the very onset of my career â\x80¦ To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are.'

In [4]:
datasets = nltk.sent_tokenize(paragraph)

In [5]:
datasets

['Thank you all so very much.',
 'Thank you to the Academy.',
 'Thank you to all of you in this room.',
 'I have to congratulate the other incredible nominees this year.',
 'The Revenant was the product of the tireless efforts of an unbelievable cast and crew.',
 'First off, to my brother in this endeavor, Mr. Tom Hardy.',
 'Tom, your talent on screen can only be surpassed by your friendship off screen â\x80¦ thank you for creating a transcendent cinematic experience.',
 'Thank you to everybody at Fox and New Regency â\x80¦ my entire team.',
 'I have to thank everyone from the very onset of my career â\x80¦ To my parents; none of this would be possible without you.',
 'And to my friends, I love you dearly; you know who you are.']

In [6]:
len(datasets)

10

In [7]:
for i in range(len(datasets)):
    datasets[i] = datasets[i].lower()
    datasets[i] = re.sub(r'\W', ' ',datasets[i])
    datasets[i] = re.sub(r'\s+', ' ',datasets[i])
    datasets[i] = re.sub(r'\s+$', '',datasets[i])

In [8]:
datasets

['thank you all so very much',
 'thank you to the academy',
 'thank you to all of you in this room',
 'i have to congratulate the other incredible nominees this year',
 'the revenant was the product of the tireless efforts of an unbelievable cast and crew',
 'first off to my brother in this endeavor mr tom hardy',
 'tom your talent on screen can only be surpassed by your friendship off screen â thank you for creating a transcendent cinematic experience',
 'thank you to everybody at fox and new regency â my entire team',
 'i have to thank everyone from the very onset of my career â to my parents none of this would be possible without you',
 'and to my friends i love you dearly you know who you are']

In [9]:
word2count = {}
for data in datasets:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] +=1

In [10]:
word2count

{'thank': 6,
 'you': 10,
 'all': 2,
 'so': 1,
 'very': 2,
 'much': 1,
 'to': 8,
 'the': 6,
 'academy': 1,
 'of': 5,
 'in': 2,
 'this': 4,
 'room': 1,
 'i': 3,
 'have': 2,
 'congratulate': 1,
 'other': 1,
 'incredible': 1,
 'nominees': 1,
 'year': 1,
 'revenant': 1,
 'was': 1,
 'product': 1,
 'tireless': 1,
 'efforts': 1,
 'an': 1,
 'unbelievable': 1,
 'cast': 1,
 'and': 3,
 'crew': 1,
 'first': 1,
 'off': 2,
 'my': 5,
 'brother': 1,
 'endeavor': 1,
 'mr': 1,
 'tom': 2,
 'hardy': 1,
 'your': 2,
 'talent': 1,
 'on': 1,
 'screen': 2,
 'can': 1,
 'only': 1,
 'be': 2,
 'surpassed': 1,
 'by': 1,
 'friendship': 1,
 'â': 3,
 'for': 1,
 'creating': 1,
 'a': 1,
 'transcendent': 1,
 'cinematic': 1,
 'experience': 1,
 'everybody': 1,
 'at': 1,
 'fox': 1,
 'new': 1,
 'regency': 1,
 'entire': 1,
 'team': 1,
 'everyone': 1,
 'from': 1,
 'onset': 1,
 'career': 1,
 'parents': 1,
 'none': 1,
 'would': 1,
 'possible': 1,
 'without': 1,
 'friends': 1,
 'love': 1,
 'dearly': 1,
 'know': 1,
 'who': 1,
 'are

In [11]:
import heapq

In [12]:
freq_words = heapq.nlargest(15,word2count,key=word2count.get)

In [13]:
freq_words

['you',
 'to',
 'thank',
 'the',
 'of',
 'my',
 'this',
 'i',
 'and',
 'â',
 'all',
 'very',
 'in',
 'have',
 'off']

In [14]:
X = []
for data in datasets:
    vector = []
    for word in freq_words:
        if word in nltk.word_tokenize(data):
            vector.append(1)
        else:
            vector.append(0)
    X.append(vector)

In [15]:
X

[[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
 [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1],
 [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
 [1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0],
 [1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]]

In [16]:
import numpy as np

In [17]:
X = np.asarray(X)

In [18]:
X.shape

(10, 15)

In [19]:
X

array([[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]])