In [2]:
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt 
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.compat.v1.keras.layers import Flatten

2.1.0


# Load the data

In [9]:
imdb = tfds.load("imdb_reviews",as_supervised= True)

In [10]:
imdb

{'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [11]:
train_set= imdb["train"]
test_set = imdb["test"]


In [12]:
train_sentence = []
train_label = []
test_sentence = []
test_label = []

In [13]:
for sentence, label in test_set:
    test_sentence.append(str(sentence.numpy()))
    test_label.append(label.numpy())

In [14]:
for sentence, label in train_set:
    train_sentence.append(str(sentence.numpy()))
    train_label.append(label.numpy())

In [15]:
print(len(train_sentence), len(train_label), len(test_sentence), len(test_label))

25000 25000 25000 25000


In [16]:
print(type(train_sentence[0]), type(train_label), type(test_sentence[0]), type(test_label))

<class 'str'> <class 'list'> <class 'str'> <class 'list'>


In [17]:
train_labels = np.array(train_label)
test_labels = np.array(test_label)

In [18]:
print(type(train_label), type(test_label))

<class 'list'> <class 'list'>


# Hyperparameters

In [29]:
vocab_size = 1000
drop_embed = 0.2 
embedding_dim = 16
max_length = 120
num_epoch =5
trunc_type = "post"
oov_tok = "<OOV>"
n_lstm = 256
drop_lstm = 0.2

# Preprocess data

In [20]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

In [21]:
tokenizer.fit_on_texts(train_sentence)

In [22]:
train_to_sequences_sentence = tokenizer.texts_to_sequences(train_sentence)

In [23]:
test_to_sequences_sentence = tokenizer.texts_to_sequences(test_sentence)

In [24]:
print(train_to_sequences_sentence[0])

[59, 12, 14, 35, 439, 400, 18, 174, 29, 1, 9, 33, 1, 1, 42, 496, 1, 197, 25, 88, 156, 19, 12, 211, 340, 29, 70, 248, 213, 9, 486, 62, 70, 88, 116, 99, 24, 1, 12, 1, 657, 777, 12, 18, 7, 35, 406, 1, 178, 1, 426, 2, 92, 1, 140, 72, 149, 55, 2, 1, 1, 72, 229, 70, 1, 16, 1, 1, 1, 1, 1, 1, 3, 40, 1, 119, 1, 17, 1, 14, 163, 19, 4, 1, 927, 1, 9, 4, 18, 13, 14, 1, 5, 102, 148, 1, 11, 240, 692, 13, 44, 25, 101, 39, 12, 1, 1, 39, 1, 1, 52, 409, 11, 99, 1, 874, 145, 10]


# Padding

In [25]:
train_padded = pad_sequences(train_to_sequences_sentence, maxlen= max_length, truncating=trunc_type)

In [26]:
test_padded = pad_sequences(test_to_sequences_sentence,maxlen= max_length, truncating=trunc_type)

In [27]:
print(train_padded.shape, test_padded.shape)

(25000, 120) (25000, 120)


# Developing the Model

In [30]:
model = Sequential([
    Embedding(input_dim=vocab_size,output_dim=embedding_dim, input_length=max_length),
    SpatialDropout1D(drop_embed),
    Bidirectional(LSTM(units=n_lstm, dropout=drop_lstm)),
    Flatten(),
    Dense(1,activation="sigmoid")
])



In [31]:
model.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 120, 16)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 512)               559104    
_________________________________________________________________
flatten (Flatten)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 513       
Total params: 575,617
Trainable params: 575,617
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.compile(loss = "binary_crossentropy",
             optimizer = "adam",
             metrics = ["accuracy"])



In [33]:
model.fit(train_padded,train_labels,
         epochs = num_epoch,
         validation_data = (test_padded, test_labels))



Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fb7005b8690>

In [116]:
sentence_1 = ["This movie is the worst thing ever I watched",
           "I hate the actor in this movie",
              "This movie is fucking awful",
              "This movie is the most terrible movie i have watched with horrible actor",
           "Do not watch this movie because this movie is really bad",
           "Fuck this movie. It is really bad",
           "This movie is the best"]

In [117]:
sentence_1_text_to_sequence = tokenizer.texts_to_sequences(sentence_1)

In [119]:
sentence_1_sequence_padded = pad_sequences(sentence_1_text_to_sequence,maxlen = max_length)

In [121]:
output_probabilities= model.predict_proba(sentence_1_sequence_padded)
output_probabilities

array([[0.1393053 ],
       [0.42358458],
       [0.12012677],
       [0.06273458],
       [0.25805876],
       [0.31871018],
       [0.64907765]], dtype=float32)

In [122]:
for i in range(len(output_probabilities)):
    print("\n")
    print(sentence_1[i], "\n", list(map('{:.2f}%'.format,output_probabilities[i])))



This movie is the worst thing ever I watched 
 ['0.14%']


I hate the actor in this movie 
 ['0.42%']


This movie is fucking awful 
 ['0.12%']


This movie is the most terrible movie i have watched with horrible actor 
 ['0.06%']


Do not watch this movie because this movie is really bad 
 ['0.26%']


Fuck this movie. It is really bad 
 ['0.32%']


This movie is the best 
 ['0.65%']
