In [4]:
import pymupdf
import pytesseract
from pdfminer.high_level import extract_text
from PIL import Image, ImageOps, ImageFilter
import cv2  # For multilingual OCR
import os
import numpy as np
import fitz

In [5]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [6]:
doc = pymupdf.open(r"D:\SEM 5\RAG\sample_pdfs\en\Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.pdf") 
out = open("output.txt", "wb")
for page in doc: 
    text = page.get_text().encode("utf8") 
    out.write(text) 
    out.write(bytes((12,)))
out.close()

In [7]:
def extract_text_from_pdf(pdf_path, language = 'eng'):
    try:
        doc = pymupdf.open(pdf_path)
        text = ""
        for page in doc:
            page_text = page.get_text()
            text += page_text
            if page_text:
                text += page_text
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None
    


In [8]:
# Process the PDFs

def process_pdf_folder(input_folder, output_folder, language = 'eng'):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(input_folder, filename)
            print(f"Found PDF: {pdf_path}...")

            text = extract_text_from_pdf(pdf_path)

            if text is None:
                print(f'No text found in {filename}, Applying OCR using pytesseract...')
                text = ocr_from_pdf(pdf_path, language)

            output_filename = os.path.splitext(filename)[0] + '.txt'
            output_path = os.path.join(output_folder, output_filename)

            with open(output_path, 'w', encoding='utf-8') as file_out:
                file_out.write(text)

            print(f"Saved text to {output_path}")

In [9]:
# English PDFs

eng_pdfs= r"D:\SEM 5\RAG\sample_pdfs\en"
output_folder_en = r"D:\SEM 5\RAG\converted_files\en"

process_pdf_folder(eng_pdfs, output_folder_en, language = 'eng')

Found PDF: D:\SEM 5\RAG\sample_pdfs\en\Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.pdf...
Saved text to D:\SEM 5\RAG\converted_files\en\Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\en\Reboot_Leadership_and_the_Art_of.pdf...
Saved text to D:\SEM 5\RAG\converted_files\en\Reboot_Leadership_and_the_Art_of.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\en\The Alchemist by Paulo Coelho-1.pdf...
Saved text to D:\SEM 5\RAG\converted_files\en\The Alchemist by Paulo Coelho-1.txt


In [10]:
# Bengali PDFs

bengali_pdfs= r"D:\SEM 5\RAG\sample_pdfs\bn"
output_folder_bn = r"D:\SEM 5\RAG\converted_files\bn"

process_pdf_folder(bengali_pdfs, output_folder_bn, language = 'ben')

Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\15092024_142.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\15092024_142.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\471 (TO).pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\471 (TO).txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\AP Ramjan.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\AP Ramjan.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\NEC-14.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\NEC-14.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\Research Nirdeshika.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\Research Nirdeshika.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\আহম্মেদNOC.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\আহম্মেদNOC.txt


In [11]:
# Urdu PDFs

urdu_pdfs= r"D:\SEM 5\RAG\sample_pdfs\ur"
output_folder_ur = r"D:\SEM 5\RAG\converted_files\ur"

process_pdf_folder(urdu_pdfs, output_folder_ur, language = 'urd')

Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\12-Rabiul-Awal-2024.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\12-Rabiul-Awal-2024.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\Extension-of-Ahdoc-Employees.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\Extension-of-Ahdoc-Employees.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\fasana-e-ajaib final.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\fasana-e-ajaib final.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\Notification-for-Other-Nationals.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\Notification-for-Other-Nationals.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\shora e rampur.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\shora e rampur.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\Solidarity-Day.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\Solidarity-Day.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\حیات جاوید، سوانح سر سید احمد خاں.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\حیات جاوید، سوانح سر سید احمد خا

In [12]:
# Chinese PDFs

chinese_pdfs= r"D:\SEM 5\RAG\sample_pdfs\zh"
output_folder_zh = r"D:\SEM 5\RAG\converted_files\zh"

process_pdf_folder(chinese_pdfs, output_folder_zh, language = 'chi_sim')


Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\1553a07b-9f53-4e8b-9987-ae714000b95b.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\1553a07b-9f53-4e8b-9987-ae714000b95b.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\98aab034-f8d7-4f6e-9a0c-b52c12f55ce7.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\98aab034-f8d7-4f6e-9a0c-b52c12f55ce7.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\ec2def2f-cc7b-44f3-87d1-24dc82f3a0ca.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\ec2def2f-cc7b-44f3-87d1-24dc82f3a0ca.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\P020230313555181904759.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\P020230313555181904759.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\P020230907694757200665.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\P020230907694757200665.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\P020230907695746624812.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\P020230907695746624812.txt


---

## Vector Embeddings

In [13]:
from sentence_transformers import SentenceTransformer
import faiss

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')
text_file_folder = r"D:\SEM 5\RAG\converted_files"
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Iterate through text files and generate embeddings
text_data = []
file_names = []

for root, dirs, files in os.walk(text_file_folder):
    for file_name in files:
        if file_name.endswith('.txt'):  # Check for text files
            file_path = os.path.join(root, file_name)
            text = read_text_file(file_path)
            text_data.append(text)  # Store the text content
            file_names.append(file_name)
 


In [15]:
embeddings = model.encode(text_data, convert_to_numpy= True)
print(embeddings.shape)

embeddings = np.array(embeddings)
embeddings = embeddings/np.linalg.norm(embeddings, axis=1, keepdims=True)

(22, 384)


In [16]:
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print(f"Indexed {len(embeddings)} text files with {dim} dimensions")

Indexed 22 text files with 384 dimensions


In [17]:
faiss.write_index(index, 'faiss_index.bin')

## Retrieval

NameError: name 'documents' is not defined

In [19]:
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.chat_models import ChatOllama

# Load the FAISS index
index = faiss.read_index('faiss_index.bin')

# Load the text files
text_files = [os.path.join(text_file_folder, file_name) for file_name in file_names]

In [20]:
local_model = 'mistral'
llm = ChatOllama(model_name=local_model, temperature=0.2, max_tokens=1024, top_p=0.95, repetition_penalty=1.2)

  llm = ChatOllama(model_name=local_model, temperature=0.2, max_tokens=1024, top_p=0.95, repetition_penalty=1.2)


In [33]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [21]:
query = "What are the objectives of Budget 2024?"
query_embedding = model.encode([query], convert_to_numpy=True)
query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

# Search the FAISS index for the nearest neighbors to the query
k = 5  # Number of nearest neighbors to retrieve
distances, indices = index.search(query_embedding, k)

# Display the results
print(f"Top {k} results for the query '{query}':")
for i in range(k):
    print(f"File: {file_names[indices[0][i]]}, Distance: {distances[0][i]}")

Top 5 results for the query 'What are the objectives of Budget 2024?':
File: 15092024_142.txt, Distance: 1.7089343070983887
File: 471 (TO).txt, Distance: 1.7089343070983887
File: AP Ramjan.txt, Distance: 1.7089343070983887
File: 12-Rabiul-Awal-2024.txt, Distance: 1.7089343070983887
File: Extension-of-Ahdoc-Employees.txt, Distance: 1.7089343070983887


In [23]:
from google.api_core.exceptions import InternalServerError
from dotenv import load_dotenv
import genai

load_dotenv()

# Get the GOOGLE_API_KEY from the .env file
API_KEY = os.getenv('GOOGLE_API_KEY')

# Configure genai with the API key
genai.configure(api_key=API_KEY)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

ModuleNotFoundError: No module named 'genai'

In [None]:
def get_output(query,pdfd):
    model = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=arj_API,

                                   temperature=0.3)

    prompt_template = """" Provide the extract all the text and image text inside the content 
      Context: \n {context}?\n

      Question: \n {question} \n

      Answer:

     """

    prompt = PromptTemplate(

        template=prompt_template, input_variables=["context", "question"]

    )
    
    data= texto(pdfd)
    model = genai.GenerativeModel(model_name="gemini-pro")
    responses = model.generate_content(prompt.format(context=data, question=(query)))
    print(responses.text)
    return responses.text
input= input("Enter the Query  : ")
query = input
get_output(query,"") 