<a href="https://colab.research.google.com/github/navin1111/CAPSTONE_PROJECT/blob/main/final_run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio langchain-community litellm PyPDF2 torch pypdf faiss-cpu pdf2image pytesseract transformers
!apt-get install -q -y poppler-utils tesseract-ocr libtesseract-dev

Reading package lists...
Building dependency tree...
Reading state information...
libtesseract-dev is already the newest version (4.1.1-2.1build1).
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
import os
from getpass import getpass

# Prompt for the key without echoing it. Surrounding whitespace (common when the key is
# pasted) is stripped, and an empty value is rejected here instead of failing later
# inside the first LLM request.
os.environ['GEMINI_API_KEY'] = getpass("Enter your GEMINI API Key:").strip()
if not os.environ['GEMINI_API_KEY']:
    raise ValueError("GEMINI_API_KEY must not be empty.")


Enter your GEMINI API Key:··········


In [None]:
# Keep a module-level copy of the key; raises KeyError if the prompt cell above was not run.
# NOTE(review): no other visible cell reads `api_key` — confirm whether it is still needed.
api_key = os.environ['GEMINI_API_KEY']


In [None]:
import pytesseract
import gradio as gr
import os
import tempfile
from pdf2image import convert_from_path
from langchain.text_splitter import CharacterTextSplitter
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch
import concurrent.futures
from litellm import completion
from typing import List, Dict
from google.colab import files
import base64
import time
import os

# Point pytesseract at the Tesseract binary installed by the apt-get cell (Colab location).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Load the embedding tokenizer and model once at module level; PDFChatbot's embedding
# methods read the `tokenizer` and `model` globals defined here.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

class CharacterTextSplitterWithPageNumbers(CharacterTextSplitter):
    """Fixed-width character splitter that keeps each chunk's source page number.

    Args:
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.
        **kwargs: Forwarded unchanged to ``CharacterTextSplitter``.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200, **kwargs):
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_documents(self, documents):
        """Split page dicts into overlapping chunks tagged with their page number.

        Args:
            documents: Iterable of dicts with a ``'content'`` string and a
                ``'page_number'`` value.

        Returns:
            A list of ``{'content': str, 'page_number': ...}`` dicts in page order.
            Pages whose content is empty contribute no chunks.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``;
                the sliding window could then never advance.
        """
        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )

        chunks = []
        for doc in documents:
            page_num = doc['page_number']
            content = doc['content']

            # Slide by index instead of re-slicing the remaining text on every pass.
            for start in range(0, len(content), step):
                chunks.append({
                    'content': content[start:start + self.chunk_size],
                    'page_number': page_num,
                })
                # Once a window reaches the end of the page, any further window would
                # lie entirely inside this one, so stop instead of emitting a duplicate tail.
                if start + self.chunk_size >= len(content):
                    break

        return chunks

class PDFChatbot:
    """OCR a PDF, index its text chunks with FAISS, and answer questions via Gemini.

    Attributes are populated by ``process_pdf``; until it succeeds ``index`` is None
    and ``get_answer_from_pdf`` asks the user to upload a document.
    """

    def __init__(self):
        # FAISS index over the chunk embeddings; None until a PDF has been processed.
        self.index = None
        # List of {'content', 'page_number'} chunk dicts, aligned with the index rows.
        self.texts = None
        # Not assigned by any method in this class; kept for interface compatibility.
        self.current_pdf_path = None

    def extract_text_from_image(self, image, page_num):
        """OCR one page image.

        Args:
            image: Page image accepted by ``pytesseract.image_to_string``.
            page_num: Zero-based page index.

        Returns:
            Dict with the recognised ``'content'`` and a one-based ``'page_number'``.
        """
        text = pytesseract.image_to_string(image)
        return {'content': text, 'page_number': page_num + 1}

    def get_text_from_file_tesseract(self, file_path):
        """Render every page of the PDF at ``file_path`` and OCR the pages in parallel.

        Returns:
            List of page dicts (see ``extract_text_from_image``) ordered by page number.
        """
        images = convert_from_path(file_path)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Executor.map yields results in input order, so pages come back already
            # sorted by page number even though the OCR calls run concurrently.
            return list(executor.map(self.extract_text_from_image, images, range(len(images))))

    def _embed_text(self, text):
        """Return the mean-pooled last-hidden-state embedding of ``text`` as a numpy array."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True,
                           padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def get_embeddings(self, texts):
        """Embed each chunk dict in ``texts``.

        Returns:
            List of ``{'embedding': numpy array, 'page_number': ...}`` dicts in input order.
        """
        return [
            {'embedding': self._embed_text(text['content']), 'page_number': text['page_number']}
            for text in texts
        ]

    @staticmethod
    def _existing_path(file_obj):
        """Return an on-disk path for ``file_obj`` if it already refers to one, else None."""
        if isinstance(file_obj, (str, os.PathLike)):
            return os.fspath(file_obj)
        # Some upload wrappers expose the stored file through ``.name`` rather than
        # through readable contents; use that path when it points at a real file.
        name = getattr(file_obj, 'name', None)
        if isinstance(name, str) and os.path.isfile(name):
            return name
        return None

    def process_pdf(self, file_obj):
        """OCR, chunk, embed and index an uploaded PDF.

        Args:
            file_obj: A filesystem path, raw PDF bytes, or a file-like object
                (with ``read()`` and/or a ``name`` pointing at the file on disk).

        Returns:
            A human-readable status string. Failures are reported in the string
            (prefixed with "Error processing PDF:") rather than raised, and leave
            any previously built index untouched.
        """
        try:
            source_path = self._existing_path(file_obj)
            if source_path is not None:
                documents = self.get_text_from_file_tesseract(source_path)
            else:
                # Materialise in-memory uploads in a temp dir that is removed as soon
                # as OCR is done, instead of leaking one directory per upload.
                with tempfile.TemporaryDirectory() as temp_dir:
                    temp_path = os.path.join(temp_dir, "uploaded.pdf")
                    if isinstance(file_obj, (bytes, bytearray)):
                        data = file_obj
                    else:
                        data = file_obj.read()
                    with open(temp_path, 'wb') as f:
                        f.write(data)
                    documents = self.get_text_from_file_tesseract(temp_path)

            # Without any recognised text there is nothing to embed; report that
            # explicitly instead of failing later on an empty embedding matrix.
            if not any(doc['content'].strip() for doc in documents):
                return "Error processing PDF: no text could be extracted from the document."

            # Split text into chunks
            text_splitter = CharacterTextSplitterWithPageNumbers(chunk_size=1000, chunk_overlap=200)
            texts = text_splitter.split_documents(documents)

            # Generate embeddings
            embeddings = self.get_embeddings(texts)
            embedding_vectors = np.array([emb['embedding'] for emb in embeddings], dtype=np.float32)

            # Create FAISS index
            dimension = embedding_vectors.shape[1]
            index = faiss.IndexFlatL2(dimension)
            index.add(embedding_vectors)

            # Publish both together only after everything succeeded, so a failure
            # above can never pair new chunks with a stale index.
            self.texts = texts
            self.index = index

            return "PDF processed successfully! You can now ask questions about the document."
        except Exception as e:
            return f"Error processing PDF: {str(e)}"

    def find_most_similar_document(self, query):
        """Return ``(content, page_number)`` of the indexed chunk nearest to ``query``.

        Requires ``process_pdf`` to have succeeded so that ``index`` and ``texts`` are set.
        """
        query_embedding = self._embed_text(query)

        D, I = self.index.search(np.array([query_embedding], dtype=np.float32), k=1)
        most_similar_idx = int(I[0][0])
        best_chunk = self.texts[most_similar_idx]
        return best_chunk['content'], best_chunk['page_number']

    def get_response(self, query):
        """Generates a response using the Gemini Pro model.

        Returns:
            The text of the first choice, or an empty string when the response
            carries no content.
        """
        response = completion(
            model="gemini/gemini-pro",
            messages=[{"role": "user", "content": query}]
        )
        content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
        # Normalise a missing/None content to "" so callers can always format a string.
        return content or ""

    def get_answer_from_pdf(self, query):
        """Answer ``query`` from the most similar chunk of the processed PDF.

        Returns:
            ``(answer, page_number)``; ``("Please upload a PDF first!", 0)`` when no
            PDF has been indexed yet.
        """
        if self.index is None:
            return "Please upload a PDF first!", 0

        most_similar_document_content, page_number = self.find_most_similar_document(query)
        prompt = f"Based on the following content:\n\n{most_similar_document_content}\n\nAnswer the following question: {query}"
        answer = self.get_response(prompt)
        return answer, page_number



In [None]:
def create_ui():
    """Build the Gradio Blocks app for uploading a PDF and chatting about it.

    A single ``PDFChatbot`` instance is created per call and shared by the
    upload and chat handlers defined inside.

    Returns:
        The configured ``gr.Blocks`` object (not yet launched).
    """
    chatbot = PDFChatbot()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# PDF Question Answering System")

        with gr.Row():
            with gr.Column(scale=1):
                file_output = gr.File(label="Upload PDF")
                status_output = gr.Textbox(label="Status", placeholder="Upload a PDF to begin...")

            with gr.Column(scale=2):
                chatbot_ui = gr.Chatbot(label="Chat History")
                msg = gr.Textbox(label="Ask a question about the PDF", placeholder="Type your question here...")
                clear = gr.Button("Clear")

        def user_query(message, history):
            """Append the (question, answer) pair to the chat and clear the input box."""
            # The Clear button writes None into the Chatbot; depending on the Gradio
            # version the next submit may then receive None instead of a list.
            if history is None:
                history = []

            # Ignore blank submissions instead of sending an empty question to the LLM.
            if not message or not message.strip():
                return "", history

            if chatbot.index is None:
                response = "Please upload a PDF first!"
                history.append((message, response))
                return "", history

            answer, page_number = chatbot.get_answer_from_pdf(message)
            response = f"{answer}\n\nFound on Page: {page_number}"
            history.append((message, response))
            return "", history

        def process_pdf(file):
            """Index the uploaded file and return a status message for the Status box."""
            if file is None:
                return "Please upload a PDF file."
            try:
                result = chatbot.process_pdf(file)
                return result
            except Exception as e:
                return f"Error processing PDF: {str(e)}"

        msg.submit(user_query, [msg, chatbot_ui], [msg, chatbot_ui])
        file_output.upload(process_pdf, file_output, status_output)
        clear.click(lambda: None, None, chatbot_ui, queue=False)

    return demo

In [None]:
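# The Gemini API key is supplied through the getpass prompt earlier in this notebook.
# SECURITY: a literal API key was previously pasted on the next line. It has been redacted;
# revoke that key, and never store credentials in the notebook (not even in comments).
#os.environ['GEMINI_API_KEY'] = "<your-api-key>"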

In [None]:
# Build the interface and start serving it.
# share=True publishes a temporary public *.gradio.live URL for the app;
# debug=True keeps this cell running so errors and logs stay visible until interrupted.
demo = create_ui()
demo.launch(debug=True, share=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c731e48e6112a6b88c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c731e48e6112a6b88c.gradio.live




In [None]:
!git init


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [None]:
# Drop any existing 'origin' remote, then register the GitHub repository as 'origin'.
!git remote remove origin
!git remote add origin https://github.com/navin1111/CAPSTONE_PROJECT.git


In [None]:
# Stage everything under the current directory.
# NOTE(review): per the `git status` output below this also stages the runtime's
# .config/ logs and .gradio — consider adding only the project files instead.
!git add .


In [None]:
# Commit the staged files.
# NOTE(review): the recorded run failed with "Author identity unknown"; git's user.name
# and user.email must be configured before this cell can succeed.
!git commit -m "Initial commit with PDF processing and chatbot code"


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@27739749087e.(none)')


In [None]:
# Push the 'main' branch to 'origin' and set it as upstream.
# NOTE(review): the recorded run failed ("src refspec main does not match any") because
# the commit cell above had not produced a commit yet.
!git push -u origin main


error: src refspec main does not match any
[31merror: failed to push some refs to 'https://github.com/navin1111/CAPSTONE_PROJECT.git'
[m

In [None]:
# Show the current branch and what is staged, to diagnose the failed commit/push above.
!git status

On branch main

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	[32mnew file:   .config/.last_opt_in_prompt.yaml[m
	[32mnew file:   .config/.last_survey_prompt.yaml[m
	[32mnew file:   .config/.last_update_check.json[m
	[32mnew file:   .config/active_config[m
	[32mnew file:   .config/config_sentinel[m
	[32mnew file:   .config/configurations/config_default[m
	[32mnew file:   .config/default_configs.db[m
	[32mnew file:   .config/gce[m
	[32mnew file:   .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db[m
	[32mnew file:   .config/logs/2024.11.06/14.22.28.082325.log[m
	[32mnew file:   .config/logs/2024.11.06/14.22.49.463793.log[m
	[32mnew file:   .config/logs/2024.11.06/14.23.01.961464.log[m
	[32mnew file:   .config/logs/2024.11.06/14.23.02.977294.log[m
	[32mnew file:   .config/logs/2024.11.06/14.23.15.846674.log[m
	[32mnew file:   .config/logs/2024.11.06/14.23.16.565928.log[m
	[32mnew file:   .gradio