In [None]:
import tensorflow_datasets as tfds

In [None]:
# Load the IMDB reviews dataset. as_supervised=True makes each example a
# (text, label) pair; with_info=True additionally returns the dataset metadata.
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

In [None]:
import numpy as np

# Pick the train and test splits out of the mapping returned by tfds.load.
train_data = imdb["train"]
test_data = imdb["test"]

In [None]:
# Materialise both splits into plain Python lists of review texts and labels.
train_sentences = []
test_sentences = []

train_labels = []
test_labels = []

# s.numpy() is a `bytes` object. Calling str() on it yields its repr
# ("b'...'" with escaped quotes), which injects a spurious "b" token and
# split-off "'s" / "'t" fragments into the vocabulary. Decode to real text.
for s, l in train_data:
  train_sentences.append(s.numpy().decode("utf-8"))
  train_labels.append(l.numpy())

for s, l in test_data:
  test_sentences.append(s.numpy().decode("utf-8"))
  test_labels.append(l.numpy())

In [None]:
# Sanity check: number of training reviews collected.
len(train_sentences)

25000

In [None]:
# Sanity check: there must be exactly one label per training review.
len(train_labels)

25000

In [None]:
# Convert the label lists into NumPy arrays before handing them to model.fit.
train_labels, test_labels = (
    np.array(split_labels) for split_labels in (train_labels, test_labels)
)

In [None]:
# Text/embedding hyperparameters shared by the tokenizer, padding and model.
vocab_size = 10_000   # passed to the tokenizer as num_words and to the token embedding as input_dim
embedding_dim = 64    # width of each token / position embedding vector
max_length = 140      # every review is padded or truncated to this many tokens

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the vocabulary from the training reviews only. Words not kept by the
# tokenizer are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)

In [None]:
# Preview only the first 20 vocabulary entries (word_index is ordered by
# frequency rank). Displaying the whole dict floods the cell output.
dict(list(tokenizer.word_index.items())[:20])

{'<OOV>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'br': 8,
 'in': 9,
 'it': 10,
 'i': 11,
 'this': 12,
 'that': 13,
 'was': 14,
 'as': 15,
 'for': 16,
 'with': 17,
 'movie': 18,
 'but': 19,
 'film': 20,
 "'s": 21,
 'on': 22,
 'you': 23,
 'not': 24,
 'are': 25,
 'his': 26,
 'he': 27,
 'have': 28,
 'be': 29,
 'one': 30,
 'all': 31,
 'at': 32,
 'by': 33,
 'they': 34,
 'an': 35,
 'who': 36,
 'so': 37,
 'from': 38,
 'like': 39,
 'her': 40,
 "'t": 41,
 'or': 42,
 'just': 43,
 'there': 44,
 'about': 45,
 'out': 46,
 "'": 47,
 'has': 48,
 'if': 49,
 'some': 50,
 'what': 51,
 'good': 52,
 'more': 53,
 'very': 54,
 'when': 55,
 'she': 56,
 'up': 57,
 'can': 58,
 'b': 59,
 'time': 60,
 'no': 61,
 'even': 62,
 'my': 63,
 'would': 64,
 'which': 65,
 'story': 66,
 'only': 67,
 'really': 68,
 'see': 69,
 'their': 70,
 'had': 71,
 'were': 72,
 'me': 73,
 'well': 74,
 'we': 75,
 'than': 76,
 'much': 77,
 'been': 78,
 'get': 79,
 'bad': 80,
 'will': 81,
 'people': 82,
 'do': 83,

In [None]:
# Map every training review to its list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [None]:
# Bring every training sequence to exactly max_length tokens: longer reviews
# lose their tail, shorter ones are zero-padded at the end.
padded_train_sequences = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding="post",
    truncating="post",
)

In [None]:
# Sanity check: expect (number of reviews, max_length).
padded_train_sequences.shape

(25000, 140)

In [None]:
# Encode and pad the test reviews with the tokenizer fitted on the training
# set, using the same padding/truncation settings as for the training data.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
padded_test_sequences = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding="post",
    truncating="post",
)

In [None]:
# Sanity check: expect (number of reviews, max_length).
padded_test_sequences.shape

(25000, 140)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers

In [None]:
# Transformer block hyperparameters.
num_heads = 6   # number of attention heads
ff_dim = 140    # hidden width of the block's feed-forward network

In [None]:
class TransformersBlock(layers.Layer):
  """Transformer encoder block: self-attention followed by a feed-forward net.

  Each of the two sub-layers is followed by dropout, a residual addition and
  layer normalisation.

  Args:
    embedding_dim: Passed to the attention layer as `key_dim` and used as the
      output width of the feed-forward net so the residual additions line up
      with the input.
    num_heads: Number of attention heads.
    ff_dims: Width of the hidden (ReLU) feed-forward layer.
    rate: Dropout rate used after the attention and feed-forward sub-layers.
  """

  def __init__(self, embedding_dim, num_heads, ff_dims, rate=0.1):
    super().__init__()
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
    self.ffn = Sequential([Dense(ff_dims, activation="relu"), Dense(embedding_dim),])
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

  def call(self, inputs, training=None):
    """Apply the block to `inputs` and return a tensor of the same shape.

    Args:
      inputs: Sequence tensor whose last axis has size `embedding_dim`.
      training: Forwarded to the dropout layers. Defaults to None so the layer
        can be called without the flag (a required positional `training`
        raises TypeError whenever Keras does not inject one).
    """
    # Self-attention: the sequence attends to itself (query == value).
    attn_output = self.att(inputs, inputs)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)
    # Position-wise feed-forward sub-layer with its own residual connection.
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)

In [None]:
import tensorflow as tf
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_length: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embedding_dim: Width of both embedding vectors.
    """

    def __init__(self, max_length, vocab_size, embedding_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.pos_emb = layers.Embedding(input_dim=max_length, output_dim=embedding_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the per-position embeddings."""
        # Derive the position range from the input's own last-axis length rather
        # than from the notebook-level `max_length` global, so the layer does
        # not depend on (or break with) outside state.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings have no batch axis, so the sum broadcasts
        # them across the batch.
        return x + positions

In [None]:
from tensorflow import keras

# Functional-API model:
# token+position embeddings -> one Transformer block -> pooled classifier head.
inputs = layers.Input(shape=(max_length,))
embedding_layer = TokenAndPositionEmbedding(max_length, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformerblock = TransformersBlock(embedding_dim, num_heads, ff_dim)
x = transformerblock(x)
# Average over the sequence axis to get one vector per review.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
# ReLU gives the hidden layer a non-linearity; without an activation it and the
# output layer are just two stacked linear maps.
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two-way softmax over the two label values.
outputs = layers.Dense(2, activation='softmax')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 140)]             0         
                                                                 
 token_and_position_embeddi  (None, 140, 64)           648960    
 ng_2 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformers_block_2 (Tran  (None, 140, 64)           117900    
 sformersBlock)                                                  
                                                                 
 global_average_pooling1d_2  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_8 (Dropout)         (None, 64)                0   

In [None]:
# The labels are integer class ids, hence the sparse categorical cross-entropy
# against the 2-unit softmax output.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=["accuracy"],
)

# Train for 5 epochs, evaluating on the padded test split after each epoch.
history = model.fit(
    padded_train_sequences,
    train_labels,
    validation_data=(padded_test_sequences, test_labels),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Preprocess a hand-written review with the same tokenizer and padding
# settings used for the training and test data.
test_sen = ["this movie is very interesting"]
test_seq = tokenizer.texts_to_sequences(test_sen)
padded_text_seq = pad_sequences(
    test_seq,
    maxlen=max_length,
    padding="post",
    truncating="post",
)

In [None]:
import numpy as np
# Class probabilities for the single review in the batch.
prediction = model.predict(padded_text_seq)[0]

# Report the class with the highest probability; index 1 is treated as positive.
choice = np.argmax(prediction)
print("This is positive review" if choice == 1 else "This is negative review")