In [1]:
data = [
    'Dil Dil Pakistan',
    'Jan Jan Pakistan',
    'Dil Dil Pakistan',
    'Jan Jan Pakistan',
    'Aesi zameen or asman',
    'in kay siwa jana kaha',
    'Barhti rahay yeh dosti',
    'chalta rahay yeh karwan',
    'Dil Dil Pakistan',
    'Jan Jan Pakistan'
]

### Integer tokenizer and how it works

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokens = Tokenizer()

In [3]:
tokens.fit_on_texts(data)

In [4]:
# Gives us the index of each word
tokens.word_index

{'dil': 1,
 'pakistan': 2,
 'jan': 3,
 'rahay': 4,
 'yeh': 5,
 'aesi': 6,
 'zameen': 7,
 'or': 8,
 'asman': 9,
 'in': 10,
 'kay': 11,
 'siwa': 12,
 'jana': 13,
 'kaha': 14,
 'barhti': 15,
 'dosti': 16,
 'chalta': 17,
 'karwan': 18}

In [5]:
# Count each word how many time it came
tokens.word_counts

OrderedDict([('dil', 6),
             ('pakistan', 6),
             ('jan', 6),
             ('aesi', 1),
             ('zameen', 1),
             ('or', 1),
             ('asman', 1),
             ('in', 1),
             ('kay', 1),
             ('siwa', 1),
             ('jana', 1),
             ('kaha', 1),
             ('barhti', 1),
             ('rahay', 2),
             ('yeh', 2),
             ('dosti', 1),
             ('chalta', 1),
             ('karwan', 1)])

In [6]:
# Gives us the sentence but in numbers which are index like how they were tokenized
sequences = tokens.texts_to_sequences(data)
sequences

[[1, 1, 2],
 [3, 3, 2],
 [1, 1, 2],
 [3, 3, 2],
 [6, 7, 8, 9],
 [10, 11, 12, 13, 14],
 [15, 4, 5, 16],
 [17, 4, 5, 18],
 [1, 1, 2],
 [3, 3, 2]]

In [7]:
# Tells us about total sentences
tokens.document_count

10

In [8]:
from keras.utils import pad_sequences

pad_sequences(sequences,padding='pre')

array([[ 0,  0,  1,  1,  2],
       [ 0,  0,  3,  3,  2],
       [ 0,  0,  1,  1,  2],
       [ 0,  0,  3,  3,  2],
       [ 0,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [ 0, 15,  4,  5, 16],
       [ 0, 17,  4,  5, 18],
       [ 0,  0,  1,  1,  2],
       [ 0,  0,  3,  3,  2]])

### Training an RNN model using imdb data

In [30]:
from keras.datasets import imdb

In [31]:
data = imdb.load_data()
(X_train,Y_train),(X_test,Y_test) = data

In [32]:
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [33]:
len(Y_train)

25000

In [34]:
len(X_test)

25000

In [35]:
len(X_train[0])

218

In [36]:
len(X_train[1])

189

In [37]:
Y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [38]:
X_train = pad_sequences(X_train)
X_test = pad_sequences(X_test)

In [39]:
len(X_train[0])

2494

In [40]:
len(X_train[1])

2494

In [41]:
len(X_train[0])

2494

In [42]:
X_train.shape

(25000, 2494)

In [43]:
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN

In [44]:
model_imdb = Sequential()

model_imdb.add(SimpleRNN(32,input_shape=(2494,1)))
model_imdb.add(Dense(1,activation='sigmoid'))

model_imdb.summary()

In [45]:
model_imdb.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model_imdb.fit(X_train,Y_train, epochs=10, validation_data=(X_test,Y_test),batch_size=32)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 542ms/step - accuracy: 0.5018 - loss: 0.7097 - val_accuracy: 0.5057 - val_loss: 0.6935
Epoch 2/10
[1m374/782[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m3:19[0m 490ms/step - accuracy: 0.5031 - loss: 0.6940