### Importing the required libraries

In [1]:
# -Import the required libraries-
import re
import nltk
import PyPDF2
import pickle
import string
import requests
import numpy as np
from io import BytesIO
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


2023-09-14 04:39:13.586316: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Fetching data from PDF

In [2]:
def fetch_text_from_pdf(pdf_link, timeout=30):
    """Download a PDF and return the text extracted from all of its pages.

    Parameters
    ----------
    pdf_link : str
        URL of the PDF document to download.
    timeout : float or tuple, optional
        Timeout (seconds) passed to ``requests.get`` so that an unresponsive
        server cannot hang the notebook. Defaults to 30.

    Returns
    -------
    str or None
        The concatenated text of every page, or ``None`` when the response
        is not a PDF or any error occurs. Errors are printed, not raised,
        so one bad link does not abort the whole run.
    """
    try:
        # -Download the PDF file from the provided link-
        response = requests.get(pdf_link, timeout=timeout)
        response.raise_for_status()

        # -Check if the response content type is PDF-
        # Servers may append parameters ("application/pdf; charset=binary")
        # or vary the letter case, so compare only the normalised media type.
        content_type = response.headers.get('content-type') or ''
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type != 'application/pdf':
            print("The fetched content is not a PDF.")
            return None

        # -Wrap the downloaded bytes in a file-like stream for the reader-
        pdf_stream = BytesIO(response.content)

        # -Extract text from each page of the PDF-
        # Newer PyPDF2 releases expose PdfReader / .pages / extract_text();
        # PyPDF2 3.0 removed the legacy camelCase API. Prefer the new API and
        # fall back to the legacy one only when the new one is absent.
        if hasattr(PyPDF2, 'PdfReader'):
            pdf_reader = PyPDF2.PdfReader(pdf_stream)
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        else:
            pdf_reader = PyPDF2.PdfFileReader(pdf_stream)
            page_texts = [
                pdf_reader.getPage(page_num).extractText()
                for page_num in range(pdf_reader.numPages)
            ]

        # A page without extractable text may yield None/empty; treat as "".
        return "".join(page_text or "" for page_text in page_texts)

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip.
        print(f"Error fetching text from PDF: {e}")
        return None
# -Function to process text-
def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps: drop non-ASCII characters, lowercase, tokenize with NLTK's
    ``word_tokenize``, then remove English stopwords and every token made
    up solely of punctuation characters.

    Parameters
    ----------
    text : str or None
        Raw text (e.g. extracted from a PDF).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when ``text`` is
        ``None``.
    """
    if text is None:
        return ""  # -Return an empty string if text is None-

    # -Drop every character that cannot be represented in ASCII-
    text = text.encode('ascii', 'ignore').decode('ascii')
    # -TODO: Add preprocess steps as per data, Convert the text to lowercase-
    text = text.lower()
    # -Tokenize the text into individual words-
    tokens = word_tokenize(text)
    # -Remove stopwords and punctuation from the tokens-
    stop_words = set(stopwords.words('english'))
    # Stripping all punctuation leaves "" for tokens that are *only*
    # punctuation. This also catches multi-character tokens such as "``",
    # "''", "..." and "--", which a single-character membership test misses.
    tokens = [
        token for token in tokens
        if token not in stop_words and token.strip(string.punctuation)
    ]
    return " ".join(tokens)

# -Collect and preprocess data from the PDFs-
corpus = []  # -Preprocessed text of every book that was fetched successfully-
loaded_books = []  # -Metadata of those same books, index-aligned with `corpus`-

# -Source documents (AI / machine-learning texts) with their PDF links-
books = [
    {"title": "Language Models are Few-Shot Learners", 
     "author": "Tom B. Brown", "pdf_link": "https://arxiv.org/pdf/2005.14165.pdf"},
    {"title": "Explorations in Artificial Intelligence and Machine Learning", 
     "author": "Prof. Roberto V. Zicari", "pdf_link": "https://www.routledge.com/rsc/downloads/AI_FreeBook.pdf"},
     #{"title": "Artificial Intelligence A Modern Approach Third Edition",
     #"author": "Stuart J. Russell and Peter Norvig",
     #"pdf_link": "https://people.engr.tamu.edu/guni/csce421/files/AI_Russell_Norvig.pdf"}
     # Add more books to the list
]


for book in books:
    pdf_link = book["pdf_link"]
    text = fetch_text_from_pdf(pdf_link)

    if text is not None:
        processed_text = preprocess_text(text)
        corpus.append(processed_text)  # -Append the preprocessed text for each book to the corpus list-
        # Record which book this entry belongs to: a failed download adds
        # nothing to `corpus`, so `books[i]` and `corpus[i]` can diverge.
        loaded_books.append(book)

# -Print a summary of the books that actually made it into the corpus-
for i, book in enumerate(loaded_books):
    print(f"Book {i + 1} - Title: {book['title']}, Author: {book['author']}")
    #print(corpus[i])  # -Print the preprocessed text for each book-
    print("\n")


Book 1 - Title: Language Models are Few-Shot Learners, Author: Tom B. Brown


Book 2 - Title: Explorations in Artificial Intelligence and Machine Learning, Author: Prof. Roberto V. Zicari




### Training the model

In [3]:
# -Define hyperparameters for the custom language model:
#  - vocab_size: The size of the vocabulary, which determines the number of unique words the model can work with.
#  - embedding_dim: The dimensionality of word embeddings (vector representations) for words in the vocabulary.
#  - max_seq_length: The maximum length of input sequences that the model will accept during training and generation.
#  - lstm_units: The number of LSTM (Long Short-Term Memory) units in the model's hidden layers.
#  - output_units: The number of units in the output layer, which matches the vocabulary size for text generation.

vocab_size = 10000
embedding_dim = 128
max_seq_length = 50
lstm_units = 256
output_units = vocab_size

# -Initialize a Tokenizer with a specified vocabulary size and an out-of-vocabulary (OOV) token.-
# -Then, fit the Tokenizer on the provided 'corpus' to build a vocabulary and prepare text data for tokenization.-

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)

# -Converting text to sequences (tokenization)-
X_sequences = tokenizer.texts_to_sequences(corpus)

# -Create training sequences (X_train) and labels (y_train) for text generation-
# Each sample is the window of tokens ending at position i. pad_sequences
# below uses truncating='pre', i.e. it keeps only the LAST max_seq_length
# tokens of every sample, so slicing to that window here yields exactly the
# same padded array while avoiding the quadratic memory cost of storing
# every full prefix seq[:i+1] of a book-length token list.
sequences = []
for seq in X_sequences:
    for i in range(1, len(seq)):
        window_start = max(0, i + 1 - max_seq_length)
        sequences.append(seq[window_start:i + 1])

# Fail early with a clear message instead of an obscure error during training.
if not sequences:
    raise ValueError(
        "No training sequences were produced: 'corpus' is empty or contains "
        "fewer than two tokens per document."
    )

# -Creating pad sequences-
X_padded = pad_sequences(sequences, maxlen=max_seq_length, padding='pre', truncating='pre')

# -Splitting the data into training and validation sets-
# - 80% for training and 20% for validation (sequential split, no shuffling)
split_ratio = 0.8
split_index = int(len(X_padded) * split_ratio)

# The last token of every padded row is the label; the preceding
# max_seq_length - 1 tokens are the model input.
X_train = X_padded[:split_index, :-1]
y_train = X_padded[:split_index, -1]

X_val = X_padded[split_index:, :-1]
y_val = X_padded[split_index:, -1]

# -Assemble the network one layer at a time-
model = Sequential()
# Token ids -> dense vectors; inputs are the max_seq_length - 1 context tokens.
model.add(Embedding(vocab_size, embedding_dim, input_length=max_seq_length-1))
# Single recurrent layer that summarises the context window.
model.add(LSTM(lstm_units))
# Probability distribution over the vocabulary for the next token.
model.add(Dense(output_units, activation='softmax'))

# -Configure the optimiser/loss, then train with the held-out validation split-
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)


2023-09-14 04:40:41.204035: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-09-14 04:40:52.175057: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-09-14 04:40:52.176774: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-09-14 04:40:52.1

Epoch 1/10


2023-09-14 04:40:52.720236: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-09-14 04:40:52.722563: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-09-14 04:40:52.724112: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-09-14 04:45:22.941885: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-09-14 04:45:22.943507: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-09-14 04:45:22.944938: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
