# Sentiment Classifier using Python

1. Preprocess the data.
2. Convert the English text to numerical representations.
3. Prepare it to be fed as input to our deep learning model built with GRUs.

## Importing the modules :

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Seed TensorFlow's global random generator so runs are repeatable.
tf.random.set_seed(42)

## Loading the dataset :

In [2]:
# Load the IMDB movie-review dataset through TensorFlow Datasets.
# as_supervised=True -> each element is a (review, label) pair;
# with_info=True     -> also return the dataset metadata object (`info`).
import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
print(datasets.keys())

dict_keys(['test', 'train', 'unsupervised'])


In [3]:
# Number of examples in the train and test splits, read from the dataset metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)
print(train_size, test_size)

25000 25000


## Exploring the dataset :

In [4]:
# Peek at the first two batches of two reviews each (four reviews in total).
sample_batches = datasets["train"].batch(2).take(2)
for X_batch, y_batch in sample_batches:
    reviews = X_batch.numpy()
    labels = y_batch.numpy()
    for review, label in zip(reviews, labels):
        # Reviews arrive as raw bytes; decode and show only the first 200 characters.
        snippet = review.decode("utf-8")[:200]
        sentiment = " = Positive" if label else " = Negative"
        print("Review : ", snippet, "...")
        print("Label : ", label, sentiment)
        print()

Review :  This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label :  0  = Negative

Review :  I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label :  0  = Negative

Review :  Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...
Label :  0  = Negative

Review :  This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...
Label :  1  = Positive



## Defining the preprocess function :

In [5]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Steps, applied to every review in the batch:
      1. keep only the first 300 positions of the string (tf.strings.substr
         counts in bytes unless told otherwise);
      2. replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a space;
      3. replace every character that is not an ASCII letter with a space;
      4. split on whitespace into word tokens.

    Args:
        X_batch: string tensor holding one review per entry.
        y_batch: labels for the batch; passed through untouched.

    Returns:
        A ``(tokens, y_batch)`` tuple where ``tokens`` is a dense string tensor
        with one row per review, right-padded with ``b"<pad>"`` up to the
        longest review in the batch.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z]", b" ")
    tokens = tf.strings.split(letters_only)
    # Splitting yields rows of different lengths; pad them into a rectangular tensor.
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [6]:
# Try the function on the last batch left over from the exploration loop above.
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 59), dtype=string, numpy=
 array([[b'Mann', b'photographs', b'the', b'Alberta', b'Rocky',
         b'Mountains', b'in', b'a', b'superb', b'fashion', b'and',
         b'Jimmy', b'Stewart', b'and', b'Walter', b'Brennan', b'give',
         b'enjoyable', b'performances', b'as', b'they', b'always',
         b'seem', b'to', b'do', b'But', b'come', b'on', b'Hollywood',
         b'a', b'Mountie', b'telling', b'the', b'people', b'of',
         b'Dawson', b'City', b'Yukon', b'to', b'elect', b'themselves',
         b'a', b'marshal', b'yes', b'a', b'marshal', b'and', b'to', b'e',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
        [b'This', b'is', b'the', b'kind', b'of', b'film', b'for', b'a',
         b'snowy', b'Sunday', b'afternoon', b'when', b'the', b'rest',
         b'of', b'the', b'world', b'can', b'go', b'ahead', b'with',
         b'its', b'own', b'business', b'as', b'you', b'descend', b'into',

## Constructing the Vocabulary :

In [7]:
from collections import Counter

In [8]:
# Count how often every token occurs across the preprocessed training reviews.
vocabulary = Counter()
train_batches = datasets["train"].batch(2).map(preprocess)
for X_batch, y_batch in train_batches:
    # Flatten the (reviews, words) token matrix row by row and tally every
    # token in one call; padding tokens are counted as well.
    vocabulary.update(X_batch.numpy().ravel())

In [9]:
# The five most frequent tokens together with their counts.
vocabulary.most_common(5)

[(b'<pad>', 65828),
 (b'the', 61156),
 (b'a', 38569),
 (b'of', 33984),
 (b'and', 33432)]

In [10]:
# Number of distinct tokens seen in the training reviews.
len(vocabulary)

49739

## Truncating the Vocabulary :

In [11]:
# Keep only the `vocab_size` most useful tokens; everything else will be
# treated as out-of-vocabulary.
vocab_size = 10000

# The Embedding layer used later is created with mask_zero=True, so id 0 must
# be the padding token. Pin b"<pad>" to index 0 explicitly instead of relying
# on it happening to be the most frequent token, then fill the remaining
# slots with the most frequent real words (most frequent first).
pad_token = b"<pad>"  # must match the default_value used in preprocess()
frequent_words = [word for word, count in vocabulary.most_common()
                  if word != pad_token]
truncated_vocabulary = [pad_token] + frequent_words[:vocab_size - 1]

## Creating a lookup table :

In [12]:
# Pair every kept word with its position in truncated_vocabulary; these
# positions become the integer ids fed to the model.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)

In [13]:
# Words that are not in the vocabulary are assigned to one of these extra
# out-of-vocabulary buckets, whose ids start right after the known word ids.
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [14]:
# Three ordinary words plus one made-up word, to see how an unknown word is encoded.
sample_words = tf.constant([b"This movie was faaaaaantastic".split()])
table.lookup(sample_words)

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   24,    12,    13, 10053]])>

## Creating the Final Train and Test sets :

In [15]:
def encode_words(X_batch, y_batch):
    """Replace every word token in the batch with its integer id.

    Args:
        X_batch: string tensor of word tokens, as produced by ``preprocess``.
        y_batch: labels for the batch; passed through untouched.

    Returns:
        A ``(ids, y_batch)`` tuple where ``ids`` holds the result of looking
        each token up in the module-level ``table``.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

# Training pipeline: repeat the split endlessly, batch 32 reviews at a time,
# tokenize, encode to ids, and prefetch one batch ahead of the consumer.
train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [16]:
# Test pipeline: one pass over the test split in batches of 1000 reviews,
# tokenized and encoded exactly like the training data.
test_set = (
    datasets["test"]
    .batch(1000)
    .map(preprocess)
    .map(encode_words)
)

In [17]:
# Pull a single encoded batch to sanity-check what the pipeline produces.
X_batch, y_batch = next(iter(train_set.take(1)))
print(X_batch)
print(y_batch)

tf.Tensor(
[[  24   13   31 ...    0    0    0]
 [   5   23   73 ...    0    0    0]
 [3454 6898    1 ...    0    0    0]
 ...
 [  24   12  122 ...    0    0    0]
 [1681 3980  451 ...    0    0    0]
 [3361 4259    5 ...    0    0    0]], shape=(32, 63), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


## Building the Model :

In [18]:
embed_size = 128  # dimensionality of each word embedding vector

model = keras.models.Sequential()
# Ids range over the truncated vocabulary plus the OOV buckets, so the
# embedding table needs one row for each of them. mask_zero=True makes the
# downstream layers skip id 0; input_shape=[None] accepts any sequence length.
model.add(keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
    mask_zero=True,
    input_shape=[None],
))
# The first GRU returns its output at every time step so the second GRU
# receives a full sequence; the second GRU returns only its final output.
model.add(keras.layers.GRU(4, return_sequences=True))
model.add(keras.layers.GRU(2))
# Single sigmoid unit: one value in (0, 1) per review.
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [19]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is the tracked metric.
model.compile(loss = 'binary_crossentropy', optimizer="adam", metrics = ["accuracy"])

## Training and Testing the Model :

In [20]:
import time
# Wall-clock timestamp taken just before training; compared with a later one to time the run.
start = time.time()

In [21]:
# train_set repeats indefinitely, so steps_per_epoch decides where an epoch ends:
# train_size // 32 batches of 32 reviews is roughly one pass over the training split.
model.fit(train_set, steps_per_epoch=train_size//32, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fdde5d40220>

In [22]:
# Seconds elapsed since `start` was recorded, i.e. the duration of training.
end = time.time()
elapsed_seconds = end - start
print("Time of execution : ", elapsed_seconds)

Time of execution :  99.370929479599


In [23]:
# Evaluate on the test pipeline; the result lists the loss followed by the accuracy metric.
model.evaluate(test_set)



[0.549174427986145, 0.7524799704551697]