In [3]:
# Importing the required libraries
import PyPDF2
import requests
from io import BytesIO
import string
import requests
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk


### Fetching data from PDF

In [5]:
def _extract_pdf_text(pdf_bytes):
    """Return the concatenated text of every page of an in-memory PDF."""
    pdf_stream = BytesIO(pdf_bytes)

    # PyPDF2 3.x no longer supports the legacy PdfFileReader / numPages /
    # getPage / extractText names, so use the newer PdfReader API whenever
    # the installed version provides it.
    if hasattr(PyPDF2, "PdfReader"):
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # `or ""` keeps the join safe if a page yields no text at all.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Older PyPDF2 releases only ship the legacy API.
    pdf_reader = PyPDF2.PdfFileReader(pdf_stream)
    return "".join(
        pdf_reader.getPage(page_num).extractText()
        for page_num in range(pdf_reader.numPages)
    )


def fetch_text_from_pdf(pdf_link, timeout=30):
    """Download a PDF and return the text extracted from all of its pages.

    Args:
        pdf_link: URL of the PDF document to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell
            indefinitely.

    Returns:
        The text of every page concatenated into one string, or ``None`` if
        the response is not served as ``application/pdf`` or if any error
        occurs while downloading or parsing (the error is printed).
    """
    try:
        # Download the PDF file from the provided link
        response = requests.get(pdf_link, timeout=timeout)
        response.raise_for_status()

        # Compare only the media type: servers may append parameters
        # (e.g. "application/pdf; charset=binary") or vary the letter case.
        content_type = response.headers.get('content-type') or ''
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type != 'application/pdf':
            print("The fetched content is not a PDF.")
            return None

        return _extract_pdf_text(response.content)

    except Exception as e:
        # Best-effort: report the problem and let the caller skip this PDF.
        print(f"Error fetching text from PDF: {e}")
        return None

def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps: drop non-ASCII characters, lowercase, tokenize with NLTK's
    ``word_tokenize``, then remove English stopwords and punctuation tokens.

    Note: ``word_tokenize`` and ``stopwords.words`` rely on NLTK data being
    installed; this function does not download it.

    Args:
        text: The raw text to clean, or ``None``.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when ``text``
        is ``None``.
    """
    if text is None:
        return ""  # Nothing to process

    # Encoding to ASCII with 'ignore' silently drops every non-ASCII character.
    ascii_text = text.encode('ascii', 'ignore').decode('ascii')
    lowered = ascii_text.lower()

    # Tokenize the text into individual words
    tokens = word_tokenize(lowered)

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Drop a token when it is a stopword or consists solely of punctuation
    # characters. Checking character-by-character also removes multi-character
    # punctuation tokens (e.g. "...", "--", or the `` / '' quote tokens a
    # tokenizer can emit), which a plain `token not in punctuation` test misses.
    kept_tokens = [
        token
        for token in tokens
        if token not in stop_words
        and not all(char in punctuation for char in token)
    ]
    return " ".join(kept_tokens)

# Build the corpus: download each PDF, clean its text, and collect the results.
corpus = []  # One preprocessed string per successfully fetched PDF

# Source documents: title, author, and a direct PDF link for each entry.
books = [
    {
        "title": "Language Models are Few-Shot Learners",
        "author": "Tom B. Brown",
        "pdf_link": "https://arxiv.org/pdf/2005.14165.pdf",
    },
    {
        "title": "Explorations in Artificial Intelligence and Machine Learning",
        "author": "Prof. Roberto V. Zicari",
        "pdf_link": "https://www.routledge.com/rsc/downloads/AI_FreeBook.pdf",
    },
    # Add more books to the list
]

for book in books:
    pdf_link = book["pdf_link"]
    text = fetch_text_from_pdf(pdf_link)

    # fetch_text_from_pdf returns None on failure; such books are skipped,
    # so `corpus` can end up shorter than `books`.
    if text is None:
        continue

    processed_text = preprocess_text(text)
    corpus.append(processed_text)

# Print a summary entry for every listed book (whether or not it was fetched).
for i, book in enumerate(books):
    print(f"Book {i + 1} - Title: {book['title']}, Author: {book['author']}")
    # To inspect the cleaned text, print the matching `corpus` entry here;
    # note that corpus[i] lines up with books[i] only if no fetch failed.
    print("\n")


Book 1 - Title: Language Models are Few-Shot Learners, Author: Tom B. Brown


Book 2 - Title: Explorations in Artificial Intelligence and Machine Learning, Author: Prof. Roberto V. Zicari


