### Bag of words using NLTK

In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize
import re, heapq
import numpy as np

In [2]:
sent = """Here’s a sample of reviews about a particular horror movie:

Review 1: This movie is very scary and long
Review 2: This movie is not scary and is slow
Review 3: This movie is spooky and good
You can see that there are some contrasting reviews about the movie as well as the length and pace of the movie. Imagine looking at a thousand reviews like these. Clearly, there is a lot of interesting insights we can draw from them and build upon them to gauge how well the movie performed.

However, as we saw above, we cannot simply give these sentences to a machine learning model and ask it to tell us whether a review was positive or negative. We need to perform certain text preprocessing steps.

Bag-of-Words and TF-IDF are two examples of how to do this. Let’s understand them in detail.

 

Creating Vectors from Text
Can you think of some techniques we could use to vectorize a sentence at the beginning? The basic requirements would be:

It should not result in a sparse matrix since sparse matrices result in high computation cost
We should be able to retain most of the linguistic information present in the sentence
Word Embedding is one such technique where we can represent the text using vectors. The more popular forms of word embeddings are:

BoW, which stands for Bag of Words
TF-IDF, which stands for Term Frequency-Inverse Document Frequency
Now, let us see how we can represent the above movie reviews as embeddings and get them ready for a machine learning model."""

In [3]:
data = nltk.sent_tokenize(sent)

In [4]:
data

['Here’s a sample of reviews about a particular horror movie:\n\nReview 1: This movie is very scary and long\nReview 2: This movie is not scary and is slow\nReview 3: This movie is spooky and good\nYou can see that there are some contrasting reviews about the movie as well as the length and pace of the movie.',
 'Imagine looking at a thousand reviews like these.',
 'Clearly, there is a lot of interesting insights we can draw from them and build upon them to gauge how well the movie performed.',
 'However, as we saw above, we cannot simply give these sentences to a machine learning model and ask it to tell us whether a review was positive or negative.',
 'We need to perform certain text preprocessing steps.',
 'Bag-of-Words and TF-IDF are two examples of how to do this.',
 'Let’s understand them in detail.',
 'Creating Vectors from Text\nCan you think of some techniques we could use to vectorize a sentence at the beginning?',
 'The basic requirements would be:\n\nIt should not result in

In [5]:
for i in range(len(data)):
    data[i] = data[i].lower()
    data[i] = re.sub(r'\W',' ',data[i])
    data[i] = re.sub(r'\s+',' ',data[i])

In [6]:
data

['here s a sample of reviews about a particular horror movie review 1 this movie is very scary and long review 2 this movie is not scary and is slow review 3 this movie is spooky and good you can see that there are some contrasting reviews about the movie as well as the length and pace of the movie ',
 'imagine looking at a thousand reviews like these ',
 'clearly there is a lot of interesting insights we can draw from them and build upon them to gauge how well the movie performed ',
 'however as we saw above we cannot simply give these sentences to a machine learning model and ask it to tell us whether a review was positive or negative ',
 'we need to perform certain text preprocessing steps ',
 'bag of words and tf idf are two examples of how to do this ',
 'let s understand them in detail ',
 'creating vectors from text can you think of some techniques we could use to vectorize a sentence at the beginning ',
 'the basic requirements would be it should not result in a sparse matrix s

In [7]:
word2count = {}
for i in data:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [8]:
word2count

{'here': 1,
 's': 2,
 'a': 9,
 'sample': 1,
 'of': 9,
 'reviews': 4,
 'about': 2,
 'particular': 1,
 'horror': 1,
 'movie': 8,
 'review': 4,
 '1': 1,
 'this': 4,
 'is': 6,
 'very': 1,
 'scary': 2,
 'and': 8,
 'long': 1,
 '2': 1,
 'not': 3,
 'slow': 1,
 '3': 1,
 'spooky': 1,
 'good': 1,
 'you': 2,
 'can': 6,
 'see': 2,
 'that': 1,
 'there': 2,
 'are': 3,
 'some': 2,
 'contrasting': 1,
 'the': 11,
 'as': 4,
 'well': 2,
 'length': 1,
 'pace': 1,
 'imagine': 1,
 'looking': 1,
 'at': 2,
 'thousand': 1,
 'like': 1,
 'these': 2,
 'clearly': 1,
 'lot': 1,
 'interesting': 1,
 'insights': 1,
 'we': 8,
 'draw': 1,
 'from': 2,
 'them': 4,
 'build': 1,
 'upon': 1,
 'to': 7,
 'gauge': 1,
 'how': 3,
 'performed': 1,
 'however': 1,
 'saw': 1,
 'above': 2,
 'simply': 1,
 'give': 1,
 'sentences': 1,
 'machine': 2,
 'learning': 2,
 'model': 2,
 'ask': 1,
 'it': 2,
 'tell': 1,
 'us': 2,
 'whether': 1,
 'was': 1,
 'positive': 1,
 'or': 1,
 'negative': 1,
 'need': 1,
 'perform': 1,
 'certain': 1,
 'text': 3

In [9]:
freq_words = heapq.nlargest(100,word2count, key=word2count.get)

In [10]:
freq_words

['the',
 'a',
 'of',
 'movie',
 'and',
 'we',
 'to',
 'is',
 'can',
 'reviews',
 'review',
 'this',
 'as',
 'them',
 'in',
 'not',
 'are',
 'how',
 'text',
 'for',
 's',
 'about',
 'scary',
 'you',
 'see',
 'there',
 'some',
 'well',
 'at',
 'these',
 'from',
 'above',
 'machine',
 'learning',
 'model',
 'it',
 'us',
 'bag',
 'words',
 'tf',
 'idf',
 'let',
 'vectors',
 'sentence',
 'be',
 'should',
 'result',
 'sparse',
 'word',
 'represent',
 'embeddings',
 'which',
 'stands',
 'frequency',
 'here',
 'sample',
 'particular',
 'horror',
 '1',
 'very',
 'long',
 '2',
 'slow',
 '3',
 'spooky',
 'good',
 'that',
 'contrasting',
 'length',
 'pace',
 'imagine',
 'looking',
 'thousand',
 'like',
 'clearly',
 'lot',
 'interesting',
 'insights',
 'draw',
 'build',
 'upon',
 'gauge',
 'performed',
 'however',
 'saw',
 'simply',
 'give',
 'sentences',
 'ask',
 'tell',
 'whether',
 'was',
 'positive',
 'or',
 'negative',
 'need',
 'perform',
 'certain',
 'preprocessing',
 'steps']

In [11]:
import heapq
freq_words = heapq.nlargest(100, word2count, key=word2count.get)
freq_words

['the',
 'a',
 'of',
 'movie',
 'and',
 'we',
 'to',
 'is',
 'can',
 'reviews',
 'review',
 'this',
 'as',
 'them',
 'in',
 'not',
 'are',
 'how',
 'text',
 'for',
 's',
 'about',
 'scary',
 'you',
 'see',
 'there',
 'some',
 'well',
 'at',
 'these',
 'from',
 'above',
 'machine',
 'learning',
 'model',
 'it',
 'us',
 'bag',
 'words',
 'tf',
 'idf',
 'let',
 'vectors',
 'sentence',
 'be',
 'should',
 'result',
 'sparse',
 'word',
 'represent',
 'embeddings',
 'which',
 'stands',
 'frequency',
 'here',
 'sample',
 'particular',
 'horror',
 '1',
 'very',
 'long',
 '2',
 'slow',
 '3',
 'spooky',
 'good',
 'that',
 'contrasting',
 'length',
 'pace',
 'imagine',
 'looking',
 'thousand',
 'like',
 'clearly',
 'lot',
 'interesting',
 'insights',
 'draw',
 'build',
 'upon',
 'gauge',
 'performed',
 'however',
 'saw',
 'simply',
 'give',
 'sentences',
 'ask',
 'tell',
 'whether',
 'was',
 'positive',
 'or',
 'negative',
 'need',
 'perform',
 'certain',
 'preprocessing',
 'steps']

In [12]:
x = []
for d in data:
    vector = []
    for word in freq_words:
        if word in nltk.word_tokenize(d):
            vector.append(1)
        else:
            vector.append(0)

x.append(vector)
x = np.asarray(x)
x

array([[1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### Bag of words using Tensorflow

In [13]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
sentence = ["""Here’s a sample of reviews about a particular horror movie:

Review 1: This movie is very scary and long
Review 2: This movie is not scary and is slow
Review 3: This movie is spooky and good
You can see that there are some contrasting reviews about the movie as well as the length and pace of the movie. Imagine looking at a thousand reviews like these. Clearly, there is a lot of interesting insights we can draw from them and build upon them to gauge how well the movie performed.

However, as we saw above, we cannot simply give these sentences to a machine learning model and ask it to tell us whether a review was positive or negative. We need to perform certain text preprocessing steps.

Bag-of-Words and TF-IDF are two examples of how to do this. Let’s understand them in detail.

 

Creating Vectors from Text
Can you think of some techniques we could use to vectorize a sentence at the beginning? The basic requirements would be:

It should not result in a sparse matrix since sparse matrices result in high computation cost
We should be able to retain most of the linguistic information present in the sentence
Word Embedding is one such technique where we can represent the text using vectors. The more popular forms of word embeddings are:

BoW, which stands for Bag of Words
TF-IDF, which stands for Term Frequency-Inverse Document Frequency
Now, let us see how we can represent the above movie reviews as embeddings and get them ready for a machine learning model."""]
def print_bow(sentence: str) -> None:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentence)
    sequences = tokenizer.texts_to_sequences(sentence)
    word_index = tokenizer.word_index
    bow = {}
    for key in word_index:
        for key in word_index:
            bow[key] = sequences[0].count(word_index[key])
        print(bow)
        print("Bag of Word sentence 1: \n",bow)
        print("we found ", len(word_index), "unique tokens.")
print_bow(sentence)



{'the': 11, 'a': 9, 'of': 9, 'movie': 8, 'and': 8, 'we': 8, 'to': 7, 'is': 6, 'can': 5, 'reviews': 4, 'review': 4, 'this': 4, 'as': 4, 'them': 4, 'in': 4, 'are': 3, 'how': 3, 'text': 3, 'for': 3, 'about': 2, 'scary': 2, 'not': 2, 'you': 2, 'see': 2, 'there': 2, 'some': 2, 'well': 2, 'at': 2, 'these': 2, 'from': 2, 'above': 2, 'machine': 2, 'learning': 2, 'model': 2, 'it': 2, 'us': 2, 'bag': 2, 'words': 2, 'tf': 2, 'idf': 2, 'vectors': 2, 'sentence': 2, 'be': 2, 'should': 2, 'result': 2, 'sparse': 2, 'word': 2, 'represent': 2, 'embeddings': 2, 'which': 2, 'stands': 2, 'frequency': 2, 'here’s': 1, 'sample': 1, 'particular': 1, 'horror': 1, '1': 1, 'very': 1, 'long': 1, '2': 1, 'slow': 1, '3': 1, 'spooky': 1, 'good': 1, 'that': 1, 'contrasting': 1, 'length': 1, 'pace': 1, 'imagine': 1, 'looking': 1, 'thousand': 1, 'like': 1, 'clearly': 1, 'lot': 1, 'interesting': 1, 'insights': 1, 'draw': 1, 'build': 1, 'upon': 1, 'gauge': 1, 'performed': 1, 'however': 1, 'saw': 1, 'cannot': 1, 'simply': 

{'the': 11, 'a': 9, 'of': 9, 'movie': 8, 'and': 8, 'we': 8, 'to': 7, 'is': 6, 'can': 5, 'reviews': 4, 'review': 4, 'this': 4, 'as': 4, 'them': 4, 'in': 4, 'are': 3, 'how': 3, 'text': 3, 'for': 3, 'about': 2, 'scary': 2, 'not': 2, 'you': 2, 'see': 2, 'there': 2, 'some': 2, 'well': 2, 'at': 2, 'these': 2, 'from': 2, 'above': 2, 'machine': 2, 'learning': 2, 'model': 2, 'it': 2, 'us': 2, 'bag': 2, 'words': 2, 'tf': 2, 'idf': 2, 'vectors': 2, 'sentence': 2, 'be': 2, 'should': 2, 'result': 2, 'sparse': 2, 'word': 2, 'represent': 2, 'embeddings': 2, 'which': 2, 'stands': 2, 'frequency': 2, 'here’s': 1, 'sample': 1, 'particular': 1, 'horror': 1, '1': 1, 'very': 1, 'long': 1, '2': 1, 'slow': 1, '3': 1, 'spooky': 1, 'good': 1, 'that': 1, 'contrasting': 1, 'length': 1, 'pace': 1, 'imagine': 1, 'looking': 1, 'thousand': 1, 'like': 1, 'clearly': 1, 'lot': 1, 'interesting': 1, 'insights': 1, 'draw': 1, 'build': 1, 'upon': 1, 'gauge': 1, 'performed': 1, 'however': 1, 'saw': 1, 'cannot': 1, 'simply': 

In [14]:
keras.__version__

'2.4.0'