### Filename: 05_Add_File_Summary_As_Input.ipynb.ipynb


In [12]:
# ==========================================
# RAG Pipeline for Credit Card PDFs
# ==========================================
# This notebook demonstrates a RAG workflow:
# 1. Read PDF files
# 2. Generate summaries using Gemini-2.5-flash
# 3. Chunk text and store in ChromaDB
# 4. Retrieve relevant chunks using hybrid scoring
# 5. Generate concise answer with explanation using Gemini
# ==========================================


In [4]:
pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 2.4 MB/s eta 0:00:05
   --- ------------------------------------ 1.0/11.0 MB 3.0 MB/s eta 0:00:04
   ------ --------------------------------- 1.8/11.0 MB 3.2 MB/s eta 0:00:03
   -------- ------------------------------- 2.4/11.0 MB 3.1 MB/s eta 0:00:03
   ---------- ----------------------------- 2.9/11.0 MB 3.0 MB/s eta 0:00:03
   ------------- -------------------------- 3.7/11.0 MB 3.1 MB/s eta 0:00:03
   --------------- ------------------------ 4.2/11.0 MB 3.1 MB/s eta 0:00:03
   ----------------- ------------

In [79]:
import os
import pandas as pd
import pdfplumber
from pathlib import Path

import chromadb
from chromadb.utils import embedding_functions
# import openai
import google.generativeai as genai

from dotenv import load_dotenv
load_dotenv()

from datetime import datetime
today = datetime.now().strftime("%B %d, %Y")



In [None]:
# ==========================================
# READ FILES
# ==========================================

In [57]:

# Go one level up from Notebook/ → into Data/Cards
BASE_FOLDER = Path("..") / "Data" / "Cards"

def load_pdf_files(data_folder=BASE_FOLDER):
    """
    Load all PDF files and extract text.
    Returns a dictionary: {file_name: full_text}
    """
    files_text = {}
    for file_name in BASE_FOLDER.rglob("*.pdf"):
        file_path = os.path.join(file_name)
        full_text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
        files_text[str(file_name)] = full_text
        print(f"Loaded {str(file_name)} ({len(full_text.split())} words)")
    return files_text

pdf_texts = load_pdf_files()

Loaded ..\Data\Cards\Kohls\20250930\Cashback.pdf (105 words)
Loaded ..\Data\Cards\Discover\20250930\Cashback.pdf (990 words)
Loaded ..\Data\Cards\Citi\20250930\Additional Document.pdf (1492 words)


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Cannot set gray stroke color because /'P133' is an invalid float value
Cannot set gray non-stroke color because /'P133' is an invalid float value
Cannot set gray stroke color because /'P153' is an invalid float value
Cannot set gray non-stroke color because /'P153' is an invalid float value
Cannot set gray stroke color because /'P13' is an invalid float value
Cannot set gray non-stroke color because /'P13' is an invalid float value
Cannot set gray stroke color because /'P14' is an invalid float value
Cannot set gray non-stroke color because /'P14' is an invalid float value
Cannot set gray stroke color because /'P15' is an invalid float value
Cannot set gray non-stroke color because /'P15' is an invalid float value
Cannot set gray stroke color because /'P16' is an invalid float value
Cannot set gray non-stroke color because /

Loaded ..\Data\Cards\Citi\20250930\Cashback.pdf (1643 words)
Loaded ..\Data\Cards\BOA Allegiant\20250930\Cashback.pdf (253 words)
Loaded ..\Data\Cards\Apple\20250930\Cashback.pdf (902 words)
Loaded ..\Data\Cards\Amex\20250930\Cashback.pdf (9109 words)


In [None]:
# ==========================================
# Store to ChromaDB (Chunking + Summaries)
# ==========================================

In [89]:
os.getcwd()

'c:\\Users\\soumy\\OneDrive\\Documents\\IntelligentCardSelectorEngine\\notebooks'

In [63]:
# -----------------------------
# Config
# -----------------------------
COLLECTION_NAME = "credit_cards"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
API_KEY = os.getenv("GOOGLE_API_KEY")

In [72]:
# -----------------------------
# Initialize LLM client for Gemini
# -----------------------------
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
llm_model = genai.GenerativeModel("gemini-2.5-flash")

In [73]:
llm_model

genai.GenerativeModel(
    model_name='models/gemini-2.5-flash',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

In [64]:
# -----------------------------
# Initialize Chroma and Embedding
# -----------------------------
client = chromadb.Client()
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn
)

In [74]:
# -----------------------------
# Helper functions
# -----------------------------
def chunk_text(text, chunk_size=300, overlap=100):
    """Split text into chunks of chunk_size words with overlap."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

def summarize_text(text, model=llm_model):
    """Generate a concise summary using Gemini-2.5-flash."""
    prompt = f"Summarize the following information in 3-5 sentences:\n\n{text}"
     # Generate response
    response = model.generate_content(
        [
            {"role": "model", "parts": "You summarize credit card benefits clearly and concisely."},
            {"role": "user", "parts": prompt}
        ],
        generation_config=genai.types.GenerationConfig(
            max_output_tokens=300,
            temperature=0.7
        )
    )
    summary = response.text.strip()
    return summary

def add_pdf_to_chroma(file_name, text):
    """Store PDF text and summary into ChromaDB using chunks."""
    # 1. Generate summary
    summary = summarize_text(text)
    summary_chunks = chunk_text(summary)
    for i, chunk in enumerate(summary_chunks):
        collection.add(
            documents=[chunk],
            metadatas=[{"file_name": file_name, "type": "summary_chunk", "chunk_index": i}],
            ids=[f"{file_name}_summary_chunk_{i}"]
        )
    # 2️. Add row chunks
    text_chunks = chunk_text(text)
    for j, chunk in enumerate(text_chunks):
        collection.add(
            documents=[chunk],
            metadatas=[{"file_name": file_name, "type": "row_chunk", "chunk_index": j}],
            ids=[f"{file_name}_row_chunk_{j}"]
        )
    print(f"Inserted {file_name} into ChromaDB ({len(text_chunks)} chunks + {len(summary_chunks)} summary chunks).")

In [75]:
# -----------------------------
# Insert all PDFs
# -----------------------------
for fname, text in pdf_texts.items():
    add_pdf_to_chroma(fname, text)

Inserted ..\Data\Cards\Kohls\20250930\Cashback.pdf into ChromaDB (1 chunks + 1 summary chunks).
Inserted ..\Data\Cards\Discover\20250930\Cashback.pdf into ChromaDB (5 chunks + 1 summary chunks).
Inserted ..\Data\Cards\Citi\20250930\Additional Document.pdf into ChromaDB (8 chunks + 1 summary chunks).
Inserted ..\Data\Cards\Citi\20250930\Cashback.pdf into ChromaDB (9 chunks + 1 summary chunks).
Inserted ..\Data\Cards\BOA Allegiant\20250930\Cashback.pdf into ChromaDB (2 chunks + 1 summary chunks).
Inserted ..\Data\Cards\Apple\20250930\Cashback.pdf into ChromaDB (5 chunks + 1 summary chunks).
Inserted ..\Data\Cards\Amex\20250930\Cashback.pdf into ChromaDB (46 chunks + 1 summary chunks).


In [82]:
# ==========================================
# Retrieve (Hybrid Scoring)
# ==========================================
import numpy as np

def retrieve_with_hybrid_scoring(query, top_k=5, summary_weight=0.3, row_weight=0.7):
    """
    Retrieve and rank both summary + row chunks across all PDFs.
    Returns top chunks with hybrid score.
    """
    # Retrieve summary chunks
    summary_results = collection.query(
        query_texts=[query],
        n_results=top_k*2,
        include=["documents", "distances"],
        where={"type": "summary_chunk"}
    )
    # Retrieve row chunks
    row_results = collection.query(
        query_texts=[query],
        n_results=top_k*2,
        include=["documents", "distances"],
        where={"type": "row_chunk"}
    )

    summary_docs, summary_scores = summary_results['documents'][0], summary_results['distances'][0]
    row_docs, row_scores = row_results['documents'][0], row_results['distances'][0]

    # Normalize scores to 0-1 similarity
    def normalize(scores):
        if not scores: return []
        arr = np.array(scores)
        return 1 - (arr - arr.min()) / (arr.max() - arr.min() + 1e-9)

    summary_sim = normalize(summary_scores) * summary_weight
    row_sim = normalize(row_scores) * row_weight

    # Combine and sort
    combined = [(doc, score) for doc, score in zip(summary_docs, summary_sim)]
    combined += [(doc, score) for doc, score in zip(row_docs, row_sim)]
    combined_sorted = sorted(combined, key=lambda x: x[1], reverse=True)

    return combined_sorted[:top_k]

In [83]:
# Example query
query = "Which card should I use for groceries to get the best cashback?"
top_chunks = retrieve_with_hybrid_scoring(query)
for i, (chunk, score) in enumerate(top_chunks):
    print(f"{i+1}. Score={score:.3f}\n{chunk}\n")


1. Score=0.700
Nike and Walgreens when you use Apple Card with Apple Pay.* Every time you pay with Apple Card using Apple Pay. On all purchases where Apple Pay isn’t accepted. Apply now› Apple Card Apply now No fees. Not even hidden ones. We want to make it easier to pay down your balance, not harder. So Apple Card doesn’t have any fees. Apply now› When you buy things with the Apple-designed titanium card, you get unlimited Daily Cash back on everything you buy. For apps and websites that don’t yet take Apple Pay, there’s a virtual card number in the Wallet app. And Mastercard is our global payment network, so you can use Apple Card all over the world. Apply now› 1. You can choose to direct Daily Cash to a Savings account or to an Apple Cash account. If you do not have either set up to receive your Daily Cash, it can be applied as statement credit upon request. Apple Card is issued and Savings accounts are provided by Goldman Sachs Bank USA, Salt Lake City Branch, Member FDIC. Daily Ca

In [87]:
# ==========================================
# Summarized Output (Gemini Answer + Explanation)
# ==========================================
def answer_query_with_explanation(query, top_k=5):
    """
    End-to-end RAG:
    - Retrieve top chunks using hybrid scoring
    - Generate concise answer with explanation using Gemini
    """
    top_chunks = retrieve_with_hybrid_scoring(query, top_k=top_k)
    context_text = "\n\n".join([chunk for chunk, _ in top_chunks])

    summary_prompt = (
        f"Today's date is {today}.\n"
        f"Use this date as context when answering.\n\n"
        f"Use the following information to answer the question concisely.\n\n"
        f"Information:\n{context_text}\n\n"
        f"Question: {query}\n\n"
        f"Explain why this information is relevant when giving the answer.\n"
        f"Answer:"
        f"Provide a short, clear recommendation if applicable. Also point it as bullet points.\n"
        f"Not too many words"
        f"If multiple cards are mentioned, order them based on relevance and keep it concise"
       
    )

    # Create the model (Gemini 2.5 Flash)
    model = genai.GenerativeModel("gemini-2.5-flash")

    # Generate response
    response = model.generate_content(
        [
            {"role": "model", "parts": "You summarize credit card benefits clearly and concisely."},
            {"role": "user", "parts": summary_prompt}
        ],
        generation_config=genai.types.GenerationConfig(
            max_output_tokens=300,
            temperature=0.7
        )
    )

    # Extract text
    summary = response.text.strip()
    
    return summary

In [88]:
if __name__ == "__main__":
    question = "Which card should I use for groceries to get the best cashback?"
    summary = answer_query_with_explanation(question)
    print(summary)

Here's the best card for groceries based on the information provided:

*   **Unnamed Card (3% cashback):** This card offers a consistent 3% cash back on the first $6,000 of eligible purchases at U.S. supermarkets each calendar year (then 1%).
    *   **Relevance:** This is a clear, guaranteed cashback rate specifically for groceries/supermarkets, making it a reliable choice for your grocery spending as of October 01, 2025.

*   **Discover it® Cash Back Credit Card (Potential 5% cashback):** While the text mentions "grocery stores" as a potential 5% rotating category, the specific categories for October 01, 2025 (Q4 2025) are not provided. If groceries become a 5% category, this card would offer the highest cashback, but it requires activation and is subject to a quarterly maximum. Otherwise, it earns 1% on all other purchases.
    *   **Relevance:** You would need to check Discover's current rotating categories for Q4 2025 to see if groceries are included for the 5% bonus. If not, the 