# Informative System using a RNN Model

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

In [None]:

# Sample documents for initial training (starting with two documents)
documents_initial = [
    "This is the first document.",
    "Here is the second document."
]


In [None]:

# Tokenize the initial documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents_initial)


In [None]:

# Convert text to sequences
sequences_initial = tokenizer.texts_to_sequences(documents_initial)


In [None]:
sequences_initial

In [None]:

# Pad sequences
max_sequence_length = max(len(seq) for seq in sequences_initial)
padded_sequences_initial = pad_sequences(
    sequences_initial, maxlen=max_sequence_length, padding='post', truncating='post')


In [None]:

# Create input data (X) and target data (y) for initial training
X_train_initial = padded_sequences_initial[:, :-1]
# Target is not one-hot encoded in this approach
y_train_initial = padded_sequences_initial[:, 1:]


In [None]:
X_train_initial

In [None]:
y_train_initial

In [None]:

# Define the initial RNN model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50


In [None]:

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
          input_length=max_sequence_length - 1))
# Return sequences for training purposes
model.add(LSTM(units=100, return_sequences=True))
model.add(Dense(units=vocab_size, activation='softmax'))

# Compile the initial model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# Define a ModelCheckpoint callback to save the model during training
checkpoint_callback = ModelCheckpoint(
    "initial_model_checkpoint.h5", save_best_only=True)


In [None]:

# Train the initial model and include the checkpoint callback
model.fit(X_train_initial, y_train_initial, epochs=50,
          batch_size=1, callbacks=[checkpoint_callback])


In [None]:

# Save the entire initial model (including architecture, optimizer, and weights)
#model.save("initial_model.h5")

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model.save('initial_model.h5')

In [52]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np


In [53]:

# Sample documents
documents = [
    "Document 1 is about machine learning.",
    "Document 2 talks about natural language processing.",
    "Document 3 covers recurrent neural networks."
]


In [55]:

# Tokenizing the documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)
total_words = len(tokenizer.word_index) + 1


In [56]:

# Creating input sequences and labels
input_sequences = []
for line in documents:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.append(token_list)


In [None]:

max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences, maxlen=max_sequence_length, padding='pre')
x, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)


In [68]:
x

array([[ 0,  1,  3,  4,  2,  5],
       [ 1,  7,  8,  2,  9, 10],
       [ 0,  1, 12, 13, 14, 15]])

In [69]:
y

array([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.]], dtype=float32)

In [58]:

# Build the model
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])


In [59]:

# Train the model
model.fit(x, y, epochs=50, verbose=2)


Epoch 1/50
1/1 - 3s - loss: 2.8378 - accuracy: 0.0000e+00 - 3s/epoch - 3s/step
Epoch 2/50
1/1 - 0s - loss: 2.8241 - accuracy: 0.6667 - 0s/epoch - 0s/step
Epoch 3/50
1/1 - 0s - loss: 2.8105 - accuracy: 1.0000 - 23ms/epoch - 23ms/step
Epoch 4/50
1/1 - 0s - loss: 2.7966 - accuracy: 1.0000 - 0s/epoch - 0s/step
Epoch 5/50
1/1 - 0s - loss: 2.7821 - accuracy: 1.0000 - 9ms/epoch - 9ms/step
Epoch 6/50
1/1 - 0s - loss: 2.7669 - accuracy: 1.0000 - 13ms/epoch - 13ms/step
Epoch 7/50
1/1 - 0s - loss: 2.7506 - accuracy: 1.0000 - 10ms/epoch - 10ms/step
Epoch 8/50
1/1 - 0s - loss: 2.7329 - accuracy: 1.0000 - 7ms/epoch - 7ms/step
Epoch 9/50
1/1 - 0s - loss: 2.7136 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
Epoch 10/50
1/1 - 0s - loss: 2.6924 - accuracy: 1.0000 - 4ms/epoch - 4ms/step
Epoch 11/50
1/1 - 0s - loss: 2.6689 - accuracy: 1.0000 - 18ms/epoch - 18ms/step
Epoch 12/50
1/1 - 0s - loss: 2.6426 - accuracy: 1.0000 - 8ms/epoch - 8ms/step
Epoch 13/50
1/1 - 0s - loss: 2.6132 - accuracy: 1.0000 - 0s/epoch

<keras.src.callbacks.History at 0x180195ff210>

In [62]:

# Save the model
model.save('my_rnn_model.keras')


In [63]:

# Load the saved model
loaded_model = load_model('my_rnn_model.keras')


In [70]:

def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = tf.keras.preprocessing.sequence.pad_sequences(
            [token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted_probs)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [75]:

def reinforce_model(feedback_text, target_word, model, tokenizer, max_sequence_len, train_model=True):
    x = tokenizer.texts_to_sequences([feedback_text])[0]
    x = tf.keras.preprocessing.sequence.pad_sequences(
        [x], maxlen=max_sequence_len-1, padding='pre')
    y = tokenizer.texts_to_sequences([target_word])[0]
    y = tf.keras.utils.to_categorical(
        y, num_classes=len(tokenizer.word_index) + 1)

    if len(x) != len(y):
        raise ValueError(
            "Mismatch in the number of samples between x and y. Make sure they have the same number of samples.")

    if train_model:
        model.fit(x, y, epochs=1, verbose=0)



In [76]:

# Example usage to generate text
generated_text = generate_text(
    "Document 1 is about", 5, loaded_model, tokenizer, max_sequence_length)
print("Generated Text:", generated_text)


Generated Text: Document 1 is about learning learning learning learning processing


In [77]:

# User provides feedback
user_feedback_text = "Document 1 is about"
user_target_word = "machine"


In [78]:

# Reinforce the model based on user feedback (with training)
reinforce_model(user_feedback_text, user_target_word, loaded_model,
                tokenizer, max_sequence_length, train_model=True)


In [79]:


# Generate text again after reinforcement
generated_text_after_feedback = generate_text(
    "Document 1 is about", 5, loaded_model, tokenizer, max_sequence_length)
print("Generated Text after feedback (with training):",
      generated_text_after_feedback)

# Reinforce the model based on user feedback (without training)
reinforce_model(user_feedback_text, user_target_word, loaded_model,
                tokenizer, max_sequence_length, train_model=False)

# Generate text again after reinforcement (without training)
generated_text_after_feedback_no_training = generate_text(
    "Document 1 is about", 5, loaded_model, tokenizer, max_sequence_length)
print("Generated Text after feedback (without training):",
      generated_text_after_feedback_no_training)

Generated Text after feedback (with training): Document 1 is about learning learning learning processing processing
Generated Text after feedback (without training): Document 1 is about learning learning learning processing processing
