In [3]:
import camelot
import pandas as pd
import numpy as np
from pathlib import Path

In [12]:
pdf_folder = Path("pdfs")                           # The datasets stored
output_csv_folder = Path("tables_csv")              # Extracted tables data in csv
output_csv_folder.mkdir(exist_ok=True)

In [13]:
all_tables_text = []
sources = []

In [14]:
pdf_files = list(pdf_folder.glob("*.pdf"))
print(f"found {len(pdf_files)} PDFs to process.\n")

found 1 PDFs to process.



In [15]:
for pdf_file in pdf_files:                                          # Extracting tables using camelot
    print(f"processing PDF : {pdf_file.name}")
    try:
        tables = camelot.read_pdf(str(pdf_file),pages="all",flavor="lattice")
        for i, table in enumerate(tables):
            csv_path = output_csv_folder/f"{pdf_file.stem}_table{i+1}.csv"
            table.to_csv(str(csv_path))

            df = table.df                                          # Convert table data into a DataFrame
            table_text = df.to_string(index=False)                 # Convert DataFrame to plain text for embedding
            all_tables_text.append(table_text)
            sources.append(f"{pdf_file.name} - Table{i+1} ")
        print(f" extracted {len(tables)} tables")
    except Exception as e:
        print(f"   Failed to process {pdf_file.name}: {e}")
        
print(f"\n Extracted total {len(all_tables_text)} tables.\n")        

processing PDF : foo.pdf
 extracted 1 tables

 Extracted total 1 tables.



In [16]:
from sentence_transformers import SentenceTransformer




In [17]:
model = SentenceTransformer("all-MiniLM-L6-v2")                         # Encode all table texts into numerical vector embeddings
embeddings = model.encode(all_tables_text,convert_to_numpy=True,show_progress_bar=True)
embedding_dim = embeddings.shape[1]
print(f"Embedding dimension: {embedding_dim}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding dimension: 384


In [18]:
import faiss

In [19]:
index = faiss.IndexFlatL2(embedding_dim)                      # Initialize a FAISS index (L2 distance) for fast vector similarity search
index.add(embeddings)                                         # Add all embeddings to the FAISS index
print(f" FAISS index built with {index.ntotal} vectors.\n")

 FAISS index built with 1 vectors.



In [20]:
import google.generativeai as genai

In [35]:
genai.configure(api_key = "AIzaSyC6uE-JcK8Ybu3sQA4jpYX12LEJWysB2HU")     # Configure the Gemini API with my API key

def query_table(query, k=3):
    query_emb = model.encode([query],convert_to_numpy=True)               # Convert the query into an embedding vector
    distances,indices = index.search(query_emb,k)                         # Retrieve the top-k most similar tables from FAISS
    context = "\n\n".join([all_tables_text[i] for i in indices[0]])
    used_sources = [sources[i] for i in indices[0]]

     # Build the prompt for the LLM (Gemini)
    prompt = f"""
You are an AI assistant helping to answer questions from extracted tables.                   
Use the following tables as your context and provide a clear, concise answer.

Context:
{context}

Question: {query}
Answer:
"""
    model_gemini = genai.GenerativeModel("gemini-2.5-pro")                
    response = model_gemini.generate_content(prompt)
    
    return response.text.strip(), used_sources
     
query = "Which cycle has the highest Percent Fuel Savings?"                   # Example query
answer, used_sources = query_table(query)

print("\n Answer:\n", answer)
print("\n Sources used:", used_sources)


 Answer:
 Cycle 4171_1 has the highest Percent Fuel Savings, with a 58.1% savings from Improved Speed.

 Sources used: ['foo.pdf - Table1 ', 'foo.pdf - Table1 ', 'foo.pdf - Table1 ']
