In [2]:
!pip install pymupdf pytesseract pillow ollama langchain sentence-transformers faiss-cpu numpy



Collecting ollama
  Using cached ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain
  Using cached langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp39-cp39-win_amd64.whl.metadata (4.5 kB)
Collecting numpy
  Downloading numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting httpx<0.29,>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic<3.0.0,>=2.9.0 (from ollama)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain)
  Using cached langchain_core-0.3.35-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.6 (from langchain)
  Using cached langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langc

In [3]:
import ollama

# Smoke test: send a single prompt through ollama's 'mistral' model and
# print the text of the reply.
response = ollama.generate(model='mistral', prompt='Hello, how are you?')
print(response['response'])


 I am just a computer program, so I don't have feelings or emotions. But I am here to help answer your questions and provide information! How can I assist you today?

In this conversation, "I" refers to the artificial intelligence system you are talking to, while "you" refers to the user of the system.


In [6]:
import fitz
import pytesseract
from PIL import Image
import io
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def get_text_from_pdf(file_path, langs=('eng', 'hin', 'ben', 'chi_sim')):
    """Extract the text of a PDF, falling back to OCR for pages without text.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.
        langs: Tesseract language codes tried in order for a page that needs
            OCR; the first code whose OCR call does not raise is used for
            that page and the remaining codes are skipped.

    Returns:
        One string holding the text of all pages concatenated in page order.
        A page for which every OCR attempt fails contributes nothing; a
        warning is emitted for it so the gap is not silent.
    """
    import warnings

    page_texts = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(file_path) as pdf_doc:
        for page_no, pg in enumerate(pdf_doc, start=1):
            pg_text = pg.get_text()
            if pg_text.strip():
                page_texts.append(pg_text)
                continue

            # No usable embedded text: render the page and OCR the image.
            img_data = pg.get_pixmap()
            img = Image.open(io.BytesIO(img_data.tobytes()))
            for lang in langs:
                try:
                    page_texts.append(pytesseract.image_to_string(img, lang=lang))
                    break
                except Exception:
                    # Best effort: e.g. a missing language pack; try the next
                    # code. KeyboardInterrupt/SystemExit still propagate.
                    continue
            else:
                warnings.warn(
                    f"OCR failed for page {page_no} of {file_path} "
                    f"with every language in {list(langs)}; page skipped."
                )
    return "".join(page_texts)

def split_text_into_chunks(text_data):
    """Split ``text_data`` into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    1000, an overlap of 200, and a separator list covering paragraph and line
    breaks plus the danda, the CJK full stop, the Arabic question mark, "!"
    and "?".

    Returns:
        Whatever ``split_text`` returns for ``text_data``.
    """
    boundary_marks = ["\n\n", "\n", "।", "。", "؟", "!", "?"]
    chunker = RecursiveCharacterTextSplitter(
        separators=boundary_marks,
        chunk_overlap=200,
        chunk_size=1000,  # larger chunks keep more context together
    )
    pieces = chunker.split_text(text_data)
    return pieces

# Shared module-level encoder: build_vector_index() and
# DocumentQASystem.get_answer() both call embedding_model.encode(), so chunks
# and queries are embedded by the same model. Instantiated once, when this
# cell/module runs.
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def build_vector_index(text_chunks):
    """Embed ``text_chunks`` and store the vectors in a FAISS L2 index.

    Args:
        text_chunks: Sequence of strings to embed with ``embedding_model``.

    Returns:
        ``(vector_index, embeddings)``: the populated ``faiss.IndexFlatL2``
        and the 2-D float32 array of vectors that was added to it.

    Raises:
        ValueError: If ``text_chunks`` is empty. Without this check the
            failure surfaces as an IndexError on ``embeddings.shape[1]``.
    """
    if len(text_chunks) == 0:
        raise ValueError("Cannot build a vector index from an empty list of text chunks.")

    # Normalize to C-contiguous float32 before handing the vectors to FAISS.
    embeddings = np.ascontiguousarray(embedding_model.encode(text_chunks), dtype=np.float32)
    dim = embeddings.shape[1]
    vector_index = faiss.IndexFlatL2(dim)
    vector_index.add(embeddings)
    return vector_index, embeddings

class DocumentQASystem:
    """Answer questions about a document set by retrieval plus an LLM call.

    The nearest chunks to a query are looked up in ``vector_index`` and passed
    as context to an ollama model, which writes the answer.
    """

    def __init__(self, vector_index, text_chunks, model='llama3'):
        """Store the search index, its chunks and the model name.

        Args:
            vector_index: Index exposing ``search(query_vectors, k)``; result
                ids are used as positions into ``text_chunks``.
            text_chunks: List of chunk strings, in the order they were indexed.
            model: Name of the ollama model used to generate answers.
        """
        self.vector_index = vector_index
        self.text_chunks = text_chunks
        self.model = model

    def generate_detailed_response(self, context, query):
        """Ask the configured model to answer ``query`` from ``context`` only.

        Args:
            context: Text inserted into the prompt as the sole evidence.
            query: The user's question.

        Returns:
            The ``'response'`` field of the ``ollama.generate`` result.
        """
        prompt = f"""You are an expert document analyst. Generate a comprehensive, detailed response to the query 
        using ONLY the provided context. Your response should be well-structured with clear sections and bullet points 
        where appropriate. Include relevant details, themes, and supporting evidence from the context.
        
        Query: {query}
        Context: {context}
        
        Response:"""

        response = ollama.generate(
            model=self.model,
            prompt=prompt,
            options={
                'temperature': 0.3,
                'num_predict': 1500,
                'top_k': 50,
                'top_p': 0.9
            }
        )
        return response['response']

    def get_answer(self, query, top_k=5):
        """Retrieve up to ``top_k`` chunks nearest to ``query`` and answer from them.

        Args:
            query: The user's question.
            top_k: Maximum number of chunks to use as context.

        Returns:
            The generated answer text.
        """
        chunk_count = len(self.text_chunks)
        # Never ask for more neighbours than exist: the index pads missing
        # results with -1, which would index the LAST chunk and repeat it.
        k = min(top_k, chunk_count)
        selected = []
        if k > 0:
            query_embed = embedding_model.encode([query])
            _, closest_indices = self.vector_index.search(query_embed, k)
            selected = [
                self.text_chunks[idx]
                for idx in closest_indices[0]
                if 0 <= idx < chunk_count  # drop padding / out-of-range ids
            ]
        # With nothing to retrieve, the model is still called, with empty context.
        relevant_text = "\n\n".join(selected)
        return self.generate_detailed_response(relevant_text, query)

def handle_multiple_pdfs(pdf_files):
    """Extract, chunk and index the text of several PDFs together.

    Each file is announced on stdout, read with ``get_text_from_pdf`` and
    split with ``split_text_into_chunks``. The chunks of all files are pooled
    in input order and indexed by a single ``build_vector_index`` call.

    Returns:
        ``(vector_index, chunks)``: the index and the pooled chunk list.
    """
    pooled_chunks = []
    for pdf_path in pdf_files:
        print(f"Extracting text from {pdf_path}...")
        pooled_chunks += split_text_into_chunks(get_text_from_pdf(pdf_path))
    index, _embeddings = build_vector_index(pooled_chunks)
    return index, pooled_chunks

# Demo driver: index the PDFs below, then print a generated answer for each
# query.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path; point this at your own
    # PDF(s) before running.
    pdf_files = [
        r'C:\Users\Mayank\Desktop\proj\data\The Alchemist by Paulo Coelho-1.pdf'
    ]

    vector_index, text_chunks = handle_multiple_pdfs(pdf_files)
    qa_system = DocumentQASystem(vector_index, text_chunks)

    queries = [
        "Provide a comprehensive summary of the document including key themes, character development, and symbolic elements.",
        "Analyze the main character's journey and its philosophical implications.",
        "Explain the document's central message and its relevance to contemporary readers.",
        "Describe the major plot points and their significance in the narrative structure.",
        "What personal growth does the protagonist experience throughout the story?",
    ]

    # Kept inside the guard: `queries` and `qa_system` only exist when the
    # guard body ran, so a loop outside it raises NameError on import.
    for q in queries:
        print(f"\nQuestion: {q}")
        print(f"Answer: {qa_system.get_answer(q)}\n")
        print("―" * 60)

Extracting text from C:\Users\Mayank\Desktop\proj\data\The Alchemist by Paulo Coelho-1.pdf...

Question: Provide a comprehensive summary of the document including key themes, character development, and symbolic elements.
Answer: **Comprehensive Summary of the Document**

The document tells the story of a boy's journey with an alchemist in search of treasure. The narrative explores themes of destiny, self-discovery, and the importance of listening to one's heart.

**Key Themes:**

1. **Destiny**: The concept of destiny is central to the story. The alchemist reveals that the boy has discovered his own destiny, which is what he has always wanted to accomplish.
2. **Self-Discovery**: The narrative emphasizes the importance of self-discovery and listening to one's heart. The boy must learn to trust his instincts and understand his own desires.
3. **The Power of Simple Things**: The document highlights the value of simple things, such as the Emerald Tablet, which contains profound wisdom.

*