In [54]:
import os
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import faiss
import re

# Step 1: Extract data from PDF
def extract_text_with_headings(pdf_path):
    reader = PdfReader(pdf_path)
    data = []
    for page in reader.pages:
        text = page.extract_text()
        lines = text.split("\n")
        for line in lines:
            if re.match(r"^[A-Z][A-Z\s]+$", line.strip()):  # Heading (uppercase assumption)
                data.append({"heading": line.strip(), "content": ""})
            elif data:
                data[-1]["content"] += line.strip() + " "
    return data

# Step 2: Process text into meaningful chunks
def chunk_by_headings(data, max_chunk_size=500):
    """Cut each section into heading-prefixed chunks of bounded word count.

    Args:
        data: Iterable of dicts, each with a "heading" and a "content" entry.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        list[str]: Chunks in section order. Each chunk is the section heading,
        a newline, then up to ``max_chunk_size`` words joined by single spaces.
        A section whose content has no words contributes no chunk.
    """
    def _pieces():
        for entry in data:
            title = entry["heading"]
            tokens = entry["content"].split()
            # Walk the word list in consecutive, non-overlapping windows.
            for start in range(0, len(tokens), max_chunk_size):
                window = " ".join(tokens[start:start + max_chunk_size])
                yield f"{title}\n{window}"

    return list(_pieces())

# Step 3: Embed and store the data
def store_in_faiss_advanced(chunks, faiss_index_path="faiss_index"):
    """Embed text chunks and persist them in a FAISS inner-product index.

    Args:
        chunks: Non-empty sequence of strings to embed. Vector ``i`` in the
            index corresponds to ``chunks[i]``.
        faiss_index_path: File path the index is written to.

    Returns:
        tuple: ``(faiss_index_path, chunks)`` - the path that was written and
        the same ``chunks`` object that was passed in.

    Raises:
        ValueError: If ``chunks`` is empty. There is then no embedding
            dimension to build an index from.
    """
    # Fail early with a clear message instead of an opaque IndexError on
    # ``embeddings.shape[1]`` after the model has already been loaded.
    if len(chunks) == 0:
        raise ValueError(
            "store_in_faiss_advanced() needs at least one chunk; nothing to index."
        )

    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode(chunks, normalize_embeddings=True)  # Normalize embeddings
    # Hand FAISS a C-contiguous float32 matrix regardless of what encode() returned.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product for normalized vectors
    index.add(embeddings)

    faiss.write_index(index, faiss_index_path)
    return faiss_index_path, chunks

def re_rank_results(chunks, query, top_k=3):
    """Re-order candidate chunks by cosine similarity to the query.

    Both the query and the chunks are embedded with the
    ``multi-qa-MiniLM-L6-cos-v1`` sentence-transformer model, and the chunks
    are sorted by descending similarity.

    Args:
        chunks: Sequence of candidate strings (may be empty).
        query: The question text.
        top_k: Maximum number of chunks to return.

    Returns:
        tuple: ``(top_chunks, scores)`` where ``top_chunks`` is a list of at
        most ``top_k`` chunks, best first, and ``scores`` is a NumPy array
        with the matching similarity values. For empty ``chunks`` the result
        is ``([], empty array)``.
    """
    # Nothing to rank: return early rather than loading the model and then
    # failing inside cosine_similarity on an empty embedding matrix.
    if len(chunks) == 0:
        return [], np.array([])

    model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
    query_embedding = model.encode([query], normalize_embeddings=True)

    # Compute cosine similarity of the single query row against every chunk
    embeddings = model.encode(chunks, normalize_embeddings=True)
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # argsort is ascending, so reverse it before taking the top-k
    ranked_indices = np.argsort(similarities)[::-1][:top_k]
    return [chunks[i] for i in ranked_indices], similarities[ranked_indices]
    
def query_faiss_with_reranking(index, query, chunks, top_k=5, rerank_top_k=3):
    """Retrieve candidate chunks from a FAISS index, then re-rank them.

    Args:
        index: FAISS index to search; vector ``i`` must correspond to
            ``chunks[i]``.
        query: The question text.
        chunks: Sequence of chunk strings aligned with the index.
        top_k: Number of nearest neighbours requested from FAISS.
        rerank_top_k: Number of chunks kept after re-ranking (default 3,
            the value that was previously hard-coded).

    Returns:
        list[str]: The re-ranked chunks, best first; ``[]`` when the index
        returned no valid neighbour.
    """
    # The query is embedded with the same model name used when the index was built.
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    query_embedding = model.encode([query], normalize_embeddings=True)
    _, indices = index.search(np.asarray(query_embedding, dtype=np.float32), top_k)

    # FAISS pads the label row with -1 when it finds fewer than top_k
    # neighbours. Indexing chunks with -1 would silently return the LAST chunk
    # as if it were a hit, so negative labels are dropped.
    retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]
    if not retrieved_chunks:
        return []

    reranked_chunks, _ = re_rank_results(retrieved_chunks, query, top_k=rerank_top_k)
    return reranked_chunks
    


In [55]:

# Step 4: Use OpenAI GPT for generating answers
def get_answer_with_openai(context, query, openai_api_key,
                           model="gpt-3.5-turbo", max_tokens=300, temperature=0.7):
    """Ask an OpenAI chat model to answer ``query`` using only ``context``.

    The context is echoed to stdout before the request is sent.

    NOTE(review): ``OpenAI`` is not imported in the visible cells;
    ``from openai import OpenAI`` has to have run before this is called.

    Args:
        context: Text the model is told to base its answer on.
        query: The user's question.
        openai_api_key: API key handed to the ``OpenAI`` client.
        model: Chat model name (default ``"gpt-3.5-turbo"``).
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature for the completion.

    Returns:
        str: The first choice's message text with surrounding whitespace
        removed, or ``""`` when the API returns no text content.
    """
    print("Context provided for this query is: ", context)
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
    )

    # Construct the conversation messages
    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context only."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}\nAnswer:"}
    ]

    # Call OpenAI Chat Completion API
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature
    )

    # message.content can be None; fall back to "" so .strip() cannot raise.
    content = response.choices[0].message.content
    return (content or "").strip()


    

def main(pdf_path="cant_hurt_me.pdf"):
    """Run the PDF question-answering pipeline once, interactively.

    The PDF is split into heading-based chunks, embedded into a FAISS index
    on disk, and the index is read back. The user is prompted for a question,
    the best-matching chunks are joined into a context, and the OpenAI answer
    is printed.

    Args:
        pdf_path: PDF to index (defaults to the book used in this notebook).

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset
            or empty.
    """
    # Take the key from the environment instead of embedding it in the
    # notebook, and check it before the slow parsing/embedding work starts.
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before running main()."
        )

    # Step 1: Extract and store data
    data = extract_text_with_headings(pdf_path)
    chunks = chunk_by_headings(data)
    faiss_index_path, formatted_chunks = store_in_faiss_advanced(chunks)

    # Step 2: Load FAISS and query
    index = faiss.read_index(faiss_index_path)
    user_query = input("Enter your question: ")
    relevant_chunks = query_faiss_with_reranking(index, user_query, formatted_chunks)

    # Step 3: Use OpenAI to get the answer
    context = "\n".join(relevant_chunks)
    answer = get_answer_with_openai(context, user_query, openai_api_key)

    print(f"\n Answer: {answer}")

# Entry point: run the pipeline when this code is executed directly (as a
# script or as a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter your question:  Tell me about david goggins


Context provided for this query is:  ABOUT THE AUTHOR
DAVID GOGGINS is a retired Navy SEAL and the only member of the U.S. Armed Forces ever to complete SEAL training, U.S. Army Ranger School, and Air Force Tactic al Air Controller training. Goggins has competed in more than sixty ultra-marathons , triathl ons, and ultra- triathlons , setting new course records and regularly placing in the top five. A forme r Guinness World Record holder for completing 4,030 pull-ups in seventeen hours, he’s a much -sought -after public speaker who’ s shared his story with the staffs of Fortune 500 companies , professional sports teams, and hundreds of thousands of students across the country . OceanofPDF .com
CHAPTER FIVE
carry their boat and log by themselves. He admitted he was fighting his own demons on that beach. That his foundation was cracked. “I was an insecure person with low self esteem trying to grind an axe,” he said, “and my own ego, arrogance, and insecurity made my own life more difficu

In [61]:
pip install pytesseract 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from PIL import Image
import pytesseract

# Path to Tesseract executable (only needed on Windows, update as per your installation)
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def perform_ocr(image_path):
    """
    Perform OCR on the given image file.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: Extracted text from the image. Failures are not raised: if the
        image cannot be opened or the OCR call fails, the returned string is
        "Error during OCR: " followed by the exception message, so callers
        that need to tell success from failure must check for that prefix.
    """
    try:
        # The context manager closes the underlying file handle as soon as
        # OCR is done, including when image_to_string raises.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        # Deliberate best-effort contract: report the problem as text.
        return f"Error during OCR: {e}"

# Example Usage
if __name__ == "__main__":
    # Demo run: OCR one local image and show whatever perform_ocr() returns
    # (which is an error message rather than text when OCR fails).
    image_path = "/Users/Aman-India/Desktop/personal/temp.png"  # Replace with your image file path
    text = perform_ocr(image_path)
    print("Extracted Text:", text, sep="\n")


Extracted Text:
Error during OCR: tesseract is not installed or it's not in your PATH. See README file for more information.
