In [None]:
import PyPDF2
import numpy as np
from tensorflow.keras.models import Sequential, load_model, save_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string made of each page's extracted text joined with no
        separator. A page whose extraction yields nothing contributes an
        empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page, and treat a
        # None/empty extraction result as "" so one unreadable page cannot
        # raise TypeError and abort the whole document.
        return "".join(page.extract_text() or "" for page in reader.pages)


def generate_question_and_answer(model, tokenizer, seed_text, max_sequence_len, num_words_to_generate):
    """Extend `seed_text` by greedily predicting one word at a time.

    Note: a later cell in this notebook defines a function with the same
    name, which replaces this one once that cell has run.

    Args:
        model: Object whose `predict(batch, verbose=0)` returns one row of
            per-word scores for each input row.
        tokenizer: Object exposing `texts_to_sequences` and `word_index`.
        seed_text: Text to start from; predicted words are appended to it.
        max_sequence_len: Length of the training windows (inputs plus the
            label token). The model itself is fed `max_sequence_len - 1`
            tokens, matching how X is sliced from the windows.
        num_words_to_generate: Number of prediction steps to run.

    Returns:
        `seed_text` followed by the predicted words, each preceded by a
        single space. A predicted index with no vocabulary entry appends
        an empty word.
    """
    # Invert the vocabulary once instead of scanning it for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words_to_generate):
        # Encode the text generated so far
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad/truncate to the model's input width (one less than the
        # window length, because the last window token is the label).
        encoded = pad_sequences(
            [encoded], maxlen=max_sequence_len - 1, truncating='pre')
        # `predict_classes` no longer exists on Keras models; take the
        # arg-max over the predicted scores instead.
        scores = model.predict(encoded, verbose=0)[0]
        predicted_index = int(np.argmax(scores))
        # Feed the prediction back in for the next step
        seed_text += " " + index_to_word.get(predicted_index, "")
    return seed_text



In [None]:

# Source PDFs for the corpus ("file2.pdf" is currently left out)
pdf1_path = "file1.pdf"
pdf3_path = "file3.pdf"

# Pull the text out of each document, in order
pdf1_text, pdf3_text = (
    extract_text_from_pdf(path) for path in (pdf1_path, pdf3_path)
)

# Corpus = file1 text immediately followed by file3 text
combined_text = "".join([pdf1_text, pdf3_text])


In [None]:
# Show only the start of the corpus; echoing the full text floods the cell
# output and bloats the saved notebook.
combined_text[:1000]

In [None]:

# Fit a word-level tokenizer on the whole corpus (treated as one document)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([combined_text])

# Encode that single document as a flat list of word indices
(sequences,) = tokenizer.texts_to_sequences([combined_text])


In [None]:

# Slide a fixed-length window over the token stream to build training rows.
# Each window holds max_sequence_len - 1 input tokens followed by 1 label.
max_sequence_len = 200

# Fail with a clear message instead of an opaque IndexError on the slicing
# below when the corpus cannot fill even one window.
if len(sequences) < max_sequence_len:
    raise ValueError(
        f"Corpus has {len(sequences)} tokens; at least {max_sequence_len} "
        "are needed to build one training window.")

# `+ 1` so the final full window (the one ending on the last token) is kept.
input_sequences = np.array([
    sequences[start:start + max_sequence_len]
    for start in range(len(sequences) - max_sequence_len + 1)
])

# Every token but the last is model input; the last token is the label
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Model hyperparameters
embedding_dim = 100
# One more than the number of distinct words known to the tokenizer
vocab_size = 1 + len(tokenizer.word_index)

# Embedding -> two stacked bidirectional LSTMs -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,
                    input_length=max_sequence_len - 1))
# First recurrent layer returns the full sequence so the second can consume it
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(vocab_size, activation='softmax'))


In [None]:

# Integer word-index labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

# Fit on the (input window, next word) pairs
model.fit(x=X, y=y, epochs=10, batch_size=128)


In [None]:

# Persist the trained network to disk
model.save('pdf_rnn_model.keras')


In [None]:

# Restore the network from the file written by the save cell
model = load_model(filepath='pdf_rnn_model.keras')

In [None]:
def generate_question_and_answer(model, tokenizer, seed_text, max_sequence_len, num_words_to_generate):
    """Extend `seed_text` by greedily predicting one word at a time.

    Args:
        model: Object whose `predict(batch, verbose=0)` returns one row of
            per-word probabilities for each input row.
        tokenizer: Object exposing `texts_to_sequences` and `word_index`.
        seed_text: Text to start from; predicted words are appended to it.
        max_sequence_len: Length of the training windows (inputs plus the
            label token); the model is fed `max_sequence_len - 1` tokens.
        num_words_to_generate: Number of prediction steps to run.

    Returns:
        The seed text followed by the predicted words (each preceded by a
        single space), with every non-ASCII character dropped -- including
        any that were present in the seed itself. A predicted index with
        no vocabulary entry appends an empty word.
    """
    # Invert the vocabulary once; the mapping does not change between
    # steps, so there is no need to scan `word_index` for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words_to_generate):
        # Encode the text generated so far
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad/truncate to the model's input width
        encoded = pad_sequences(
            [encoded], maxlen=max_sequence_len - 1)
        # Probabilities for the next word (single-row batch)
        y_pred_probs = model.predict(encoded, verbose=0)[0]
        # Greedy choice: the most probable word index
        predicted_word_index = int(np.argmax(y_pred_probs))
        # Feed the prediction back in for the next step
        seed_text += " " + index_to_word.get(predicted_word_index, "")
    # Strip anything outside ASCII from the final text
    return seed_text.encode('ascii', 'ignore').decode('ascii')

In [None]:

# Ask the user for the starting text and how many words to append
seed_text = input("Enter a seed text to generate a question: ")
num_words_to_generate = int(input("Enter the number of words to generate in the question: "))

# Run greedy generation with the trained model
generated_question = generate_question_and_answer(
    model=model,
    tokenizer=tokenizer,
    seed_text=seed_text,
    max_sequence_len=max_sequence_len,
    num_words_to_generate=num_words_to_generate,
)

# Show the result
print("Generated Question:", generated_question)

In [None]:
# Write the model out as an .h5 file, then read it straight back in
model.save('pdf_rnn_model.h5')
model = load_model(filepath='pdf_rnn_model.h5')

# Ask the user for the starting text and how many words to append
seed_text = input("Enter a seed text to generate a question: ")
num_words_to_generate = int(input("Enter the number of words to generate in the question: "))

# Run greedy generation with the reloaded model
generated_question = generate_question_and_answer(
    model=model,
    tokenizer=tokenizer,
    seed_text=seed_text,
    max_sequence_len=max_sequence_len,
    num_words_to_generate=num_words_to_generate,
)

# Show the result
print("Generated Question:", generated_question)