### USING INTEGER ENCODING METHOD

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [2]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
x_train
# this dataset is already tokenized and words are already integer encoded in the entire vocabulary as well

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [4]:
len(x_train[0])

218

In [5]:
len(x_train[1])

189

In [6]:
# now we will shape the reviews and make them of equal length (padding)
from tensorflow.keras.utils import pad_sequences
x_train = pad_sequences(x_train, padding='post', maxlen = 50)
x_test = pad_sequences(x_test, padding='post', maxlen = 50)
# maxlen is 50 means this will trim every sentence to a length of 50, and rest of the information is loosed

In [7]:
x_train.shape

(25000, 50)

In [8]:
x_train

array([[ 2071,    56,    26, ...,    19,   178,    32],
       [ 8255,     5, 25249, ...,    16,   145,    95],
       [  215,    28,   610, ...,     7,   129,   113],
       ...,
       [    4,    65,   496, ...,     4,  3586, 22459],
       [   13,    18,    31, ...,    12,     9,    23],
       [ 7585,     8,  2197, ...,   204,   131,     9]], dtype=int32)

In [9]:
model = Sequential()

model.add(SimpleRNN(32, input_shape=(50,1), return_sequences=False))
# 50 is the time steps(one review sentence of 50 integer encoded numbers) and 1 is input features(out of those 50 numbers one will be the input number)
# there will be 32 nodes in the hidden layer
# return sequence is false means the output from previous timestep will only act as input for the present and not go out(will remain in the system) as we dont need to know the sentiment after every word, rather we need it after every sentence
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 32)                1088      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1121 (4.38 KB)
Trainable params: 1121 (4.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
model.compile(optimizer='Adam', loss='binary_crossentropy',metrics=['accuracy'])


In [11]:
model.fit(x_train,y_train, epochs = 5,validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d11c87d2740>

### USING EMBEDDING METHOD

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Create the Sequential model
model = Sequential()

# Add an Embedding layer with input_dim=10000, output_dim=2, and input_length=50
model.add(Embedding(input_dim=10000, output_dim=2, input_length=50))

# Add a SimpleRNN layer with 32 units
model.add(SimpleRNN(32))

# Add a Dense layer with 1 unit and sigmoid activation
model.add(Dense(1, activation='sigmoid'))

# Print the model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 2)             20000     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21153 (82.63 KB)
Trainable params: 21153 (82.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [16]:
print("x_train shape before reshaping:", x_train.shape)
print("x_test shape before reshaping:", x_test.shape)

x_train shape before reshaping: (25000, 50, 1)
x_test shape before reshaping: (25000, 50, 1)


In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Example data: replace this with your actual data
# Assuming x_train and x_test have shape (num_samples, timesteps)
# and y_train, y_test are the corresponding labels
num_samples = 1000
timesteps = 50
vocab_size = 10000

# Generate dummy data for illustration
x_train = np.random.randint(vocab_size, size=(num_samples, timesteps))
y_train = np.random.randint(2, size=(num_samples,))
x_test = np.random.randint(vocab_size, size=(200, timesteps))
y_test = np.random.randint(2, size=(200,))

# Reshape input data to have an extra dimension for features
# In this case, we assume each timestep has one feature, so the shape will be (num_samples, timesteps, 1)
x_train = x_train.reshape((x_train.shape[0], timesteps, 1))
x_test = x_test.reshape((x_test.shape[0], timesteps, 1))

# Verify the shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=2, input_length=timesteps))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Fit the model
history = model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))


x_train shape: (1000, 50, 1)
y_train shape: (1000,)
x_test shape: (200, 50, 1)
y_test shape: (200,)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 2)             20000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21153 (82.63 KB)
Trainable params: 21153 (82.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
