<a href="https://colab.research.google.com/github/mrishikreddy/nlp/blob/main/Assignment6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split


In [4]:
# (a) Preprocessing of the Data
# Load data from keras.datasets
max_words = 10000  # Limit to 10,000 most frequent words
max_len = 200  # Maximum length of each sequence

# Load IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_words)

# Pad sequences to ensure uniform length
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)


In [5]:
# (b) Divide data into training and testing data set
# Here, we are using the pre-split data from IMDB
# To further split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# (c) Build the Gated Recurrent Units (GRU) Model
embedding_dim = 128  # Dimension of embedding layer
gru_units = 64  # Number of GRU units


In [6]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(GRU(units=gru_units, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))  # For binary classification (positive/negative sentiment)



In [7]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# (d) Training the GRU Model
batch_size = 64
epochs = 5

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 330ms/step - accuracy: 0.6902 - loss: 0.5427 - val_accuracy: 0.8442 - val_loss: 0.3609
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 331ms/step - accuracy: 0.9063 - loss: 0.2362 - val_accuracy: 0.8782 - val_loss: 0.3029
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 316ms/step - accuracy: 0.9431 - loss: 0.1606 - val_accuracy: 0.8748 - val_loss: 0.3203
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 324ms/step - accuracy: 0.9620 - loss: 0.1103 - val_accuracy: 0.8740 - val_loss: 0.3717
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 319ms/step - accuracy: 0.9758 - loss: 0.0760 - val_accuracy: 0.8690 - val_loss: 0.4426


In [9]:
# (e) Text Generation Using the Trained Model
# For text generation, you need a trained model that can generate sequences.
# Here, we'll implement a simple method for text generation based on the GRU model:
def generate_text(model, tokenizer, seed_text, max_sequence_len):
    for _ in range(50):  # Generate 50 words
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == np.argmax(predicted):
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Example: Generate text (Note: Requires a text-based model and tokenizer)
# tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(your_text_data)
# print(generate_text(model, tokenizer, seed_text="This movie", max_sequence_len=max_len))

# (f) Evaluate Model’s accuracy
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.8593 - loss: 0.4825
Test Accuracy: 85.92%
