In [8]:
!pip install gensim pypdf



In [9]:
import numpy as np
from pypdf import PdfReader
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [10]:
# Step 1: Read the PDF and extract text
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from all pages of a PDF file.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Extracted text from the PDF.
    """
    text = ""
    reader = PdfReader(pdf_path)
    for page_num in range(len(reader.pages)):
      text += reader.pages[page_num].extract_text()

    return text

In [11]:
# Step 2: Preprocess the Text
def preprocess_text(corpus):
    """
    Tokenizes the corpus and pads the sequences.

    Args:
    corpus (list of str): List of sentences.

    Returns:
    tuple: tokenizer (Tokenizer object), padded_sequences (np.array), word_index (dict), max_len (int)
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)
    word_index = tokenizer.word_index
    max_len = max(len(seq) for seq in sequences)

    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

    return tokenizer, padded_sequences, word_index, max_len

In [12]:
# Step 3: Create the Word2Vec Model
def train_word2vec_model(corpus_tokens, embedding_dim=300):
    """
    Trains a Word2Vec model on the tokenized corpus.

    Args:
    corpus_tokens (list of list of str): Tokenized sentences.
    embedding_dim (int): Dimension of the word vectors.

    Returns:
    Word2Vec: Trained Word2Vec model.
    """
    word2vec_model = Word2Vec(corpus_tokens, vector_size=embedding_dim, window=5, min_count=1, workers=4)
    return word2vec_model

In [13]:
# Step 4: Create the Embedding Matrix
def create_embedding_matrix(word_index, word2vec_model, embedding_dim=300):
    """
    Creates an embedding matrix from the Word2Vec model.

    Args:
    word_index (dict): Dictionary mapping words to their integer index.
    word2vec_model (Word2Vec): Trained Word2Vec model.
    embedding_dim (int): Dimension of the word vectors.

    Returns:
    tuple: embedding_matrix (np.array), vocab_size (int)
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in word_index.items():
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]

    return embedding_matrix, vocab_size

In [18]:
# Step 5: Build the RNN Model
def build_rnn_model(vocab_size, embedding_dim, max_len, embedding_matrix):
    """
    Builds the RNN model for next word prediction.

    Args:
    vocab_size (int): Size of the vocabulary.
    embedding_dim (int): Dimension of the word vectors.
    max_len (int): Maximum length of the input sequences.
    embedding_matrix (np.array): Embedding matrix.

    Returns:
    Sequential: Compiled RNN model.
    """
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len-1, trainable=False),
        LSTM(64),
        Dense(vocab_size, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    return model

In [15]:
# Step 6: Create Input-Output Pairs for Next Word Prediction
def create_sequences(tokenizer, corpus, max_len):
    """
    Creates input-output pairs for next word prediction.

    Args:
    tokenizer (Tokenizer): Fitted tokenizer.
    corpus (list of str): List of sentences.
    max_len (int): Maximum length of the input sequences.

    Returns:
    tuple: input_sequences (np.array), output_words (np.array)
    """
    sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            sequences.append(n_gram_sequence)

    sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

    input_sequences, output_words = sequences[:,:-1], sequences[:,-1]
    return input_sequences, output_words

In [19]:
# Example usage
pdf_path = '/content/2407.12220v1.pdf'  # Replace with your actual PDF file path

# Extract text from the PDF
text = extract_text_from_pdf(pdf_path)

# Split the text into sentences for processing
corpus = text.split('.')

# Tokenize the corpus for Word2Vec
corpus_tokens = [sentence.lower().split() for sentence in corpus]

# Preprocess the text
tokenizer, padded_sequences, word_index, max_len = preprocess_text(corpus)

# Train the Word2Vec model
word2vec_model = train_word2vec_model(corpus_tokens)

# Create the embedding matrix
embedding_matrix, vocab_size = create_embedding_matrix(word_index, word2vec_model)

# Build the RNN model
model = build_rnn_model(vocab_size, 300, max_len, embedding_matrix)

# Create input-output pairs for next word prediction
input_sequences, output_words = create_sequences(tokenizer, corpus, max_len)
output_words = np.array(output_words).reshape(-1, 1)

# Print the model summary
model.summary()

# The model is now ready for training


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 174, 300)          1743000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                93440     
                                                                 
 dense_1 (Dense)             (None, 5810)              377650    
                                                                 
Total params: 2214090 (8.45 MB)
Trainable params: 471090 (1.80 MB)
Non-trainable params: 1743000 (6.65 MB)
_________________________________________________________________


In [20]:
model.fit(input_sequences, output_words, epochs=30, batch_size=32)  # Uncomment and add target data for training


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7ccf4c4d0100>

In [21]:
import numpy as np

def predict_next_word(model, tokenizer, text, max_len):
    """
    Predicts the next word for a given text using the trained model.

    Args:
    model (Sequential): Trained RNN model.
    tokenizer (Tokenizer): Fitted tokenizer.
    text (str): Input text for prediction.
    max_len (int): Maximum length of input sequences.

    Returns:
    str: Predicted next word.
    """
    # Tokenize the input text
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Pad the token list
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')

    # Predict the next word
    predicted_probabilities = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted_probabilities, axis=-1)[0]

    # Convert the predicted word index to the actual word
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word

def generate_text(model, tokenizer, seed_text, max_len, num_words):
    """
    Generates text by predicting the next word for a given seed text.

    Args:
    model (Sequential): Trained RNN model.
    tokenizer (Tokenizer): Fitted tokenizer.
    seed_text (str): Initial text to start the prediction.
    max_len (int): Maximum length of input sequences.
    num_words (int): Number of words to generate.

    Returns:
    str: Generated text.
    """
    text = seed_text
    for _ in range(num_words):
        next_word = predict_next_word(model, tokenizer, text, max_len)
        text += " " + next_word
    return text

# Example usage
seed_text = " While the most contamination is likely to happen at pre/post training"
generated_text = generate_text(model, tokenizer, seed_text, max_len, 10)
print(generated_text)


 While the most contamination is likely to happen at pre/post training al the test set is the test set is the
