# Bag of words example code

In [1]:
vocab = {}         # word -> integer id; shared by every call to bag_of_words
word_encoding = 1  # next unused integer id (ids start at 1)


def bag_of_words(text):
    """Encode `text` as a bag of words: word id -> number of occurrences.

    The text is lower-cased and split on whitespace. Words that are not yet in
    the module-level `vocab` are assigned the next free id (`word_encoding`),
    so ids stay consistent across successive calls.

    Args:
        text: The string to encode.

    Returns:
        A dict mapping each word's integer id to how many times that word
        occurs in `text`. Empty or whitespace-only text yields an empty dict.
    """
    global word_encoding

    bag = {}
    # split() without a separator collapses runs of whitespace and ignores
    # leading/trailing blanks. split(" ") would instead produce empty-string
    # tokens for "" or "a  b", and "" would end up in the vocabulary.
    for word in text.lower().split():
        if word not in vocab:
            vocab[word] = word_encoding
            word_encoding += 1
        encoding = vocab[word]
        bag[encoding] = bag.get(encoding, 0) + 1
    return bag

# Demo: encode one sentence, then show its bag and the vocabulary built so far.
text = "this is a test to see if this test will work is is test a a"
bag = bag_of_words(text)
print("Bag of words", bag, sep="\n")
print("Vocabulary", vocab, sep="\n")

Bag of words
{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
Vocabulary
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


# Sentiment analysis
    * e.g. deciding whether a movie review is positive, negative, or neutral

In [2]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [3]:
# Shared settings for the cells that follow
vocab_size = 88584 # upper bound on distinct word ids; passed to load_data as num_words and to the Embedding layer
# every review is padded/truncated to this many ids before it reaches the model
maxlen = 250
# NOTE(review): batch_size is not passed to model.fit in the cells shown here - confirm whether it should be
batch_size = 64

# Load the IMDB reviews (already encoded as lists of integer word ids) together with their labels
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = vocab_size)

In [4]:
# Peek at one raw training review: a list of integer word ids (lengths differ between reviews)
print(train_data[1])

[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 10156, 4, 1153, 9, 194, 775, 7, 8255, 11596, 349, 2637, 148, 605, 15358, 8003, 15, 123, 125, 68, 23141, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 36893, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 25249, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 46151, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]


## Preprocessing
    * The records have different lengths. To feed the data to the model, every record must have the same length, so each record is padded or truncated to `maxlen`.

In [5]:
# Bring every review to exactly `maxlen` ids: shorter reviews are zero-padded,
# longer ones are truncated (pad_sequences defaults).
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data, maxlen=maxlen)
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data, maxlen=maxlen)

# Create the model

In [6]:
# Three-layer binary sentiment classifier, assembled one layer at a time.
model = tf.keras.Sequential()

# Embedding: the reviews are already integer ids, but this layer maps each id
# to a trainable 32-dimensional vector so that ids carry learned meaning.
model.add(tf.keras.layers.Embedding(vocab_size, 32))

# LSTM with 32 units: reads the embedded sequence and emits a single
# 32-dimensional summary vector per review.
model.add(tf.keras.layers.LSTM(32))

# One sigmoid unit: squashes the output into the 0..1 range so it can be read
# as the probability that the review is positive.
model.add(tf.keras.layers.Dense(1, activation = "sigmoid"))

In [7]:
# Print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


# Training the model

In [8]:
# Step 1 - compile: binary cross-entropy matches the single sigmoid output;
# accuracy is tracked under the metric name 'acc'.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Step 2 - train for 5 epochs. validation_split holds back 20% of the training
# data: it is not trained on, only used to validate the model after each epoch.
# NOTE(review): the batch_size constant defined earlier is not passed here, so
# Keras' default batch size applies - confirm this is intended.
history = model.fit(
    train_data,
    train_labels,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate the model

In [9]:
# Score the trained model on the held-out test set; yields [loss, accuracy] for the compiled loss/metric
results = model.evaluate(test_data, test_labels)



In [10]:
# Show the test-set loss and accuracy returned by model.evaluate
print(results)

[0.5035121440887451, 0.846560001373291]


# Making predictions
    * When making predictions, we need to make sure that the input text is first converted to integers and then to a vector, so that the trained model can understand it.

In [11]:
# Word -> integer id lookup for the IMDB vocabulary; used below to encode raw text
word_index = imdb.get_word_index() # the vocabulary

In [24]:
def encode_text(text):
    """Turn a raw review string into a fixed-length array of word ids.

    The text is tokenised with Keras' `text_to_word_sequence`, each token is
    looked up in `word_index` (tokens missing from it become 0), and the id
    list is padded/truncated to `maxlen` entries.

    NOTE(review): `imdb.load_data` shifts word ids by `index_from` (3 by
    default), while `get_word_index` returns unshifted ids, so the ids produced
    here may not line up with the ids the model was trained on - confirm
    against the Keras IMDB dataset documentation.
    """
    words = keras.preprocessing.text.text_to_word_sequence(text)
    ids = [word_index.get(word, 0) for word in words]
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences=[ids], maxlen=maxlen)
    # pad_sequences works on a batch; unwrap the single row we passed in
    return padded[0]

In [13]:
# Demo of keras.preprocessing.text.text_to_word_sequence:
# with lower=False the original casing is kept, while punctuation is still stripped.
sample_text = 'This is a sample sentence.'
x = keras.preprocessing.text.text_to_word_sequence(sample_text, lower=False, split=' ')
print(x)

['This', 'is', 'a', 'sample', 'sentence']


In [14]:
# Encode an example sentence; `encoded` is reused by the decoding demo further down
text = "That movie was just amazing, so amazing"
encoded = encode_text(text)
# The word ids sit at the end of the array; everything before them is zero padding
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [15]:
# Invert the vocabulary so an integer id can be mapped back to its word
reverse_word_index = {index: word for word, index in word_index.items()}

In [16]:
def decode_integers(integers):
    """Convert a sequence of word ids back into a space-separated string.

    Ids equal to 0 (the padding value) are skipped; every other id is looked
    up in `reverse_word_index`.
    """
    padding_id = 0
    words = [reverse_word_index[i] for i in integers if i != padding_id]
    return " ".join(words)

In [17]:
# Round-trip check: decoding the encoded sentence should give back its (lower-cased) words
print(decode_integers(encoded))

that movie was just amazing so amazing


In [25]:
# Prediction function

def predict(text):
    """Run the trained model on one raw review and report its score.

    The text is encoded with `encode_text`, wrapped into a batch of one, and
    passed to `model.predict`. The prediction for that single review is
    printed and also returned, so callers can threshold it
    (e.g. `predict(text) >= 0.5`).

    Args:
        text: Raw review text.

    Returns:
        The model output for this review (`model.predict(...)[0]`).
    """
    encoded_text = encode_text(text)
    # The model expects a batch, so place the single encoded review in row 0.
    # Size the batch from `maxlen` (the length encode_text pads to) rather than
    # a hard-coded 250, so changing the setting cannot break this function.
    pred = np.zeros((1, maxlen))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])
    return result[0]

In [38]:
import math

# Score the example sentence that was used in the encoding demo
text = "That movie was just amazing, so amazing"
predicted_value = predict(text)

# Thresholding sketch, left disabled. NOTE(review): math.floor() maps every
# probability below 1.0 to 0, so this would label almost everything negative;
# compare the raw probability against 0.5 instead if this is revived.
# if math.floor(predicted_value) >= 0.5:
#     print("Positive review")
# else:
#     print("Negative review")

# Two hand-written reviews with opposite sentiment
positive_review = "That movie was! really loved it and would great watch it again because it was amazingly great"
negative_review = "that movie really sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"

predict(positive_review)
predict(negative_review)


[0.8425161]
[0.9073368]
[0.07161315]
