RAG with Milvus Demo 

In [1]:
# langchain packages
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from langchain_huggingface import HuggingFaceEndpoint

In [2]:
import os

In [12]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Using cached torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence_transformers)
  Using cached scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting Pillow (from sentence_transformers)
  Using cached pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading regex-2025.9.18-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->

In [3]:
# Load the documents we want to prompt an LLM about
from langchain_community.document_loaders import PyPDFLoader

# Step 1: Data Loading
## Connect to the source of data.
## Extract text from the file.
## Review and update metadata information.
## Clean or transform the data.

In [5]:
import os

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Folder that is scanned for PDFs. Set the RAG_DOCS_DIR environment variable to
# run the notebook on another machine; the default keeps the original location.
FOLDER_PATH = os.environ.get("RAG_DOCS_DIR", "/Users/pg47711/RAG Demo/docs/")

loader = DirectoryLoader(
    FOLDER_PATH,
    glob="**/*.pdf",           # use "*.pdf" for non-recursive
    loader_cls=PyPDFLoader,    # PyPDFLoader yields one Document per PDF page
    use_multithreading=True,   # speeds up loading
)
docs = loader.load()

# Stop here if nothing was found: every later cell (metadata, embeddings,
# Milvus insert) assumes at least one page.
if not docs:
    raise FileNotFoundError(f"No PDF pages were loaded from {FOLDER_PATH!r}")

source_files = {d.metadata["source"] for d in docs}
print(f"✅ Loaded {len(docs)} pages from {len(source_files)} files")

✅ Loaded 1037 pages from 20 files


## Extract more metadata

In [15]:
import os
import re

# Lower-case month names and abbreviations mapped to their calendar quarter (1-4).
# Entries are generated quarter by quarter, so iteration order runs jan -> december.
MONTH_TO_Q = {
    month_name: quarter_number
    for quarter_number, month_names in (
        (1, ("jan", "january", "feb", "february", "mar", "march")),
        (2, ("apr", "april", "may", "jun", "june")),
        (3, ("jul", "july", "aug", "august", "sep", "sept", "september")),
        (4, ("oct", "october", "nov", "november", "dec", "december")),
    )
    for month_name in month_names
}

def parse_filename_fields(source_path: str):
    """
    Extract year, quarter, and company/ticker from a PDF filename.

    Only the basename (without extension) of ``source_path`` is inspected.

    Examples handled:
      - '2023 Q1 AMZN.pdf'
      - 'AMZN_Q1_2023.pdf'
      - '2023-Q2-MSFT 10-Q.pdf'
      - 'Q3 2024 Google.pdf'
      - 'Amazon 2023Q1.pdf'       (year glued to the quarter token)
      - '2024-04-30 Meta Q2.pdf'  (date prefix; quarter deduced from the month
                                   when no explicit quarter token is present)

    Args:
        source_path: Path or filename of the PDF. ``None`` or "" is accepted.

    Returns:
        dict with keys:
          - "file_year":    int, first 19xx/20xx year found, or 0 if none.
          - "file_quarter": "Q1".."Q4", or "" if it cannot be determined.
          - "company":      what is left of the name after removing the date,
                            year, quarter and report-type words; falls back to
                            the whole sanitized name if nothing is left.
    """
    fn = os.path.basename(source_path or "")
    name, _ = os.path.splitext(fn)
    # Normalize separators to spaces
    s = re.sub(r"[_\-]+", " ", name)
    s = re.sub(r"\s+", " ", s).strip()

    # Split a year that is glued to a quarter token ("2023Q1", "1Q2023").
    # Without this there is no word boundary between the two, so neither the
    # year nor the quarter pattern below can match.
    s = re.sub(r"\b((?:19|20)\d{2})(Q[1-4]|[1-4]Q)\b", r"\1 \2", s, flags=re.IGNORECASE)
    s = re.sub(r"\b(Q[1-4]|[1-4]Q)((?:19|20)\d{2})\b", r"\1 \2", s, flags=re.IGNORECASE)

    # Numeric date such as '2024-04-30' (hyphens are already spaces here)
    m_date = re.search(r"\b((?:19|20)\d{2}) (0[1-9]|1[0-2]) (0[1-9]|[12]\d|3[01])\b", s)

    # Year
    year = None
    m_year = re.search(r"\b(19|20)\d{2}\b", s)
    if m_year:
        year = int(m_year.group(0))

    # Quarter (Q1/Q 1/Quarter 1/1Q)
    quarter = None
    # Try Q patterns first
    m_q = re.search(r"\bQ[ \-]?([1-4])\b", s, flags=re.IGNORECASE) or \
          re.search(r"\b([1-4])[ \-]?Q\b", s, flags=re.IGNORECASE) or \
          re.search(r"\bQuarter[ \-]?([1-4])\b", s, flags=re.IGNORECASE)
    if m_q:
        quarter = f"Q{m_q.group(1)}"
    else:
        # Fallback 1: infer from a month name
        for mon, q in MONTH_TO_Q.items():
            if re.search(rf"\b{mon}\b", s, flags=re.IGNORECASE):
                quarter = f"Q{q}"
                break
        # Fallback 2: infer from the month of a numeric date
        if quarter is None and m_date:
            quarter = f"Q{(int(m_date.group(2)) - 1) // 3 + 1}"

    # Remove tokens (date/year/quarter/common noise) to isolate company
    rem = s
    if m_date:
        # Drop the whole date first so its month/day digits do not end up in
        # the company name.
        rem = rem[:m_date.start()] + " " + rem[m_date.end():]
    if m_year:
        rem = re.sub(rf"\b{m_year.group(0)}\b", " ", rem)
    rem = re.sub(r"\bQ[ \-]?[1-4]\b|\b[1-4][ \-]?Q\b|\bQuarter[ \-]?[1-4]\b",
                 " ", rem, flags=re.IGNORECASE)

    # Remove common report terms. Form types allow an optional space/hyphen
    # because '10-Q' has already become '10 Q' in the normalization above.
    rem = re.sub(
        r"\b(?:10[ \-]?[QK]|Form|Report|Earnings|Quarterly|Annual|Statement|Results|Filing)\b",
        " ", rem, flags=re.IGNORECASE,
    )

    company = re.sub(r"\s+", " ", rem).strip()

    # If nothing is left after stripping, fall back to the sanitized name
    if not company and s:
        company = s

    return {
        "file_year": year or 0,
        "file_quarter": quarter or "",
        "company": company or "",
    }

# Smoke check on a known filename; the returned dict is shown as the cell output.
sample_source = "/Users/pg47711/RAG Demo/docs/2023 Q1 AMZN.pdf"
parse_filename_fields(sample_source)
# Expected output: {'file_year': 2023, 'file_quarter': 'Q1', 'company': 'AMZN'}

{'file_year': 2023, 'file_quarter': 'Q1', 'company': 'AMZN'}

In [16]:
from datetime import datetime

def normalize_pdf_metadata(md: dict) -> dict:
    """Flatten one page's raw PDF metadata into a fixed set of typed fields.

    Every key is always present and has a non-None value (str or int), so the
    result can be stored column-wise without null handling.

    Args:
        md: Metadata dict of a loaded page. Keys read: "source",
            "creationdate"/"CreationDate", "title", "author", "creator",
            "producer", "subject", "keywords", "total_pages", "page",
            "page_label". Missing keys are tolerated.

    Returns:
        dict with:
          - the original fields as strings/ints ("" / 0 when missing);
          - "page": the page index as given (0 is a valid first page), or -1
            when the key is missing;
          - "creationdate": ISO-8601 string when the raw value parses,
            otherwise the raw value as a string;
          - "creation_ts": epoch seconds, or 0 when the date does not parse;
          - "company", "file_year", "file_quarter": parsed from the filename;
          - "year", "quarter": filename values when present, otherwise the
            values derived from the creation date.
    """
    source = md.get("source") or ""
    filename = os.path.basename(source) if source else ""

    # Parse fields from filename
    name_fields = parse_filename_fields(source)
    file_year = int(name_fields["file_year"] or 0)
    file_quarter = name_fields["file_quarter"] or ""
    company = name_fields["company"] or ""

    # Creation date parsing (best effort: an unparseable value is kept as text)
    creation_raw = md.get("creationdate") or md.get("CreationDate") or ""
    creation_iso, creation_ts, year_meta, quarter_meta = "", 0, 0, ""

    if creation_raw:
        try:
            if isinstance(creation_raw, datetime):
                dt = creation_raw
            else:
                # fromisoformat() on older Pythons rejects a trailing "Z"
                dt = datetime.fromisoformat(str(creation_raw).replace("Z", "+00:00"))
            # NOTE: a datetime without a UTC offset is interpreted in the
            # machine's local time zone by timestamp().
            parsed_ts = int(dt.timestamp())
        except (ValueError, OverflowError, OSError):
            # Not ISO-8601 (or outside the platform's timestamp range):
            # keep the raw text and leave the derived fields at their defaults.
            creation_iso = str(creation_raw)
        else:
            creation_iso = dt.isoformat()
            creation_ts = parsed_ts
            year_meta = dt.year
            quarter_meta = f"Q{(dt.month - 1)//3 + 1}"

    # Page index: only a missing value maps to -1. Testing with `or -1` would
    # also turn the legitimate first page (index 0) into -1.
    page_raw = md.get("page")
    page = -1 if page_raw is None or page_raw == "" else int(page_raw)

    # Choose a canonical year/quarter: prefer filename if present, else metadata
    canonical_year = file_year or year_meta or 0
    canonical_quarter = file_quarter or quarter_meta or ""

    return {
        # original metadata
        "source": source,
        "filename": filename,
        "title": str(md.get("title") or ""),
        "author": str(md.get("author") or ""),
        "creator": str(md.get("creator") or ""),
        "producer": str(md.get("producer") or ""),
        "subject": str(md.get("subject") or ""),
        "keywords": str(md.get("keywords") or ""),
        "total_pages": int(md.get("total_pages") or 0),
        "page": page,
        "page_label": str(md.get("page_label") or ""),
        "creationdate": creation_iso,
        "creation_ts": int(creation_ts),

        # derived fields (filter-friendly)
        "company": company,
        "file_year": file_year,
        "file_quarter": file_quarter,

        # canonical fields to filter on
        "year": canonical_year,
        "quarter": canonical_quarter,
    }

In [18]:
# docs: list of LangChain Document objects, one per PDF page (from the loader cell)
metas = [normalize_pdf_metadata(d.metadata) for d in docs]

# Optional: text content to store alongside metadata
texts = [d.page_content for d in docs]

# Quick peek at one record. The index is clamped so the cell also runs on a
# corpus with fewer pages than SAMPLE_INDEX.
SAMPLE_INDEX = 1000
if metas:
    print("Sample normalized metadata:", metas[min(SAMPLE_INDEX, len(metas) - 1)])

Sample normalized metadata: {'source': '/Users/pg47711/RAG Demo/docs/2023 Q1 AMZN.pdf', 'filename': '2023 Q1 AMZN.pdf', 'title': '0001018724-23-000008', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'subject': 'Form 10-Q filed on 2023-04-28 for the period ending 2023-03-31', 'keywords': '0001018724-23-000008; ; 10-Q', 'total_pages': 52, 'page': 15, 'page_label': '16', 'creationdate': '2023-04-28T06:12:55-04:00', 'creation_ts': 1682676775, 'company': 'AMZN', 'file_year': 2023, 'file_quarter': 'Q1', 'year': 2023, 'quarter': 'Q1'}


In [23]:
# Column-oriented view of the page records: one list per field, all in the
# same row order as `metas` / `texts` (adjust the set of columns to your schema).
def _metadata_column(field):
    """Return the values of one metadata field for every record in `metas`."""
    return [record[field] for record in metas]


text_col = texts

# Raw PDF metadata
source_col = _metadata_column("source")
filename_col = _metadata_column("filename")
title_col = _metadata_column("title")
author_col = _metadata_column("author")
creator_col = _metadata_column("creator")
producer_col = _metadata_column("producer")
subject_col = _metadata_column("subject")
keywords_col = _metadata_column("keywords")
total_pages_col = _metadata_column("total_pages")
page_col = _metadata_column("page")
page_label_col = _metadata_column("page_label")
creationdate_col = _metadata_column("creationdate")
creation_ts_col = _metadata_column("creation_ts")

# Filename-derived and canonical filter fields
company_col = _metadata_column("company")
file_year_col = _metadata_column("file_year")
file_quarter_col = _metadata_column("file_quarter")
year_col = _metadata_column("year")
quarter_col = _metadata_column("quarter")


## Create Embeddings 

In [29]:
# SentenceTransformer is not imported by any earlier cell, so import it here
# to keep the notebook runnable from a fresh kernel.
from sentence_transformers import SentenceTransformer

EMBED_MODEL = "all-MiniLM-L6-v2"  # 384-dim, fast and solid
embedder = SentenceTransformer(EMBED_MODEL)
DIM = embedder.get_sentence_embedding_dimension()

# normalize_embeddings=True returns unit-length vectors, so inner product and
# cosine similarity give the same ranking.
# NOTE(review): each whole page is embedded as a single text; the model may
# truncate long inputs at its maximum sequence length — confirm, and consider
# splitting pages into smaller chunks before encoding.
vectors = embedder.encode(text_col, normalize_embeddings=True, show_progress_bar=True).tolist()


Batches: 100%|██████████| 33/33 [00:01<00:00, 16.74it/s]


## Milvus Load 

In [30]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

MILVUS_HOST = "localhost"  # If your endpoint is literally 'local_host', change this accordingly
MILVUS_PORT = "19530"
COLLECTION_NAME = "sec_filings"

# Size limit of every VARCHAR field. Milvus checks max_length against the
# string's byte length, so the same numbers are used below to trim values
# before insert.
VARCHAR_MAX_LENGTH = {
    "text": 8192,
    "source": 1024,
    "filename": 512,
    "title": 512,
    "author": 512,
    "creator": 512,
    "producer": 512,
    "subject": 512,
    "keywords": 1024,
    "page_label": 64,
    "creationdate": 64,
    "company": 128,
    "file_quarter": 8,
    "quarter": 8,
}


def _fit_varchar(value, max_bytes):
    """Return `value` trimmed so that its UTF-8 encoding is at most `max_bytes` bytes.

    Non-string values become "". A multi-byte character that would be split by
    the cut is dropped entirely, so the result is always valid UTF-8.
    """
    if not isinstance(value, str):
        return ""
    encoded = value.encode("utf-8")
    if len(encoded) <= max_bytes:
        return value
    return encoded[:max_bytes].decode("utf-8", errors="ignore")


# 1) Connect
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

# 2) Optional: drop collection if it exists (for clean reruns).
#    This deletes all data previously stored under COLLECTION_NAME.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

# 3) Define schema (field names must match the keys of `raw_columns` below)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),

    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=DIM),

    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["text"]),
    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["source"]),
    FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["filename"]),

    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["title"]),
    FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["author"]),
    FieldSchema(name="creator", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["creator"]),
    FieldSchema(name="producer", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["producer"]),
    FieldSchema(name="subject", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["subject"]),
    FieldSchema(name="keywords", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["keywords"]),

    FieldSchema(name="total_pages", dtype=DataType.INT64),
    FieldSchema(name="page", dtype=DataType.INT64),
    FieldSchema(name="page_label", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["page_label"]),

    FieldSchema(name="creationdate", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["creationdate"]),
    FieldSchema(name="creation_ts", dtype=DataType.INT64),

    # Filename-derived and canonical filters
    FieldSchema(name="company", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["company"]),
    FieldSchema(name="file_year", dtype=DataType.INT64),
    FieldSchema(name="file_quarter", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["file_quarter"]),
    FieldSchema(name="year", dtype=DataType.INT64),
    FieldSchema(name="quarter", dtype=DataType.VARCHAR, max_length=VARCHAR_MAX_LENGTH["quarter"]),
]

schema = CollectionSchema(fields, description="Sec filings pages with text, embeddings, and rich metadata")
col = Collection(name=COLLECTION_NAME, schema=schema)

# 4) Create a vector index.
#    The embeddings were encoded with normalize_embeddings=True, so inner
#    product (IP) ranks results exactly like cosine similarity.
#    nlist scales with the number of rows (about 4 * sqrt(n), capped at the
#    previous value of 1024): with ~1k vectors, 1024 clusters would hold about
#    one vector each and a search probing a few clusters would see almost no data.
n = len(vectors)
nlist = max(1, min(1024, int(4 * n ** 0.5)))
index_params = {
    "metric_type": "IP",
    "index_type": "IVF_FLAT",  # for larger scale consider HNSW or IVF_SQ8
    "params": {"nlist": nlist},
}
col.create_index(field_name="embedding", index_params=index_params)

# 5) Prepare data: one list per schema field (the auto-id primary key is omitted)
raw_columns = {
    "embedding": vectors,
    "text": text_col,
    "source": source_col,
    "filename": filename_col,
    "title": title_col,
    "author": author_col,
    "creator": creator_col,
    "producer": producer_col,
    "subject": subject_col,
    "keywords": keywords_col,
    "total_pages": total_pages_col,
    "page": page_col,
    "page_label": page_label_col,
    "creationdate": creationdate_col,
    "creation_ts": creation_ts_col,
    "company": company_col,
    "file_year": file_year_col,
    "file_quarter": file_quarter_col,
    "year": year_col,
    "quarter": quarter_col,
}

# Sanity check: all column lengths must match
assert all(len(values) == n for values in raw_columns.values()), \
    "Column length mismatch — check your input lists."

# Trim every VARCHAR column to its byte budget so one long value cannot make
# the whole insert fail; other columns are passed through unchanged.
insert_columns = {
    name: (
        [_fit_varchar(value, VARCHAR_MAX_LENGTH[name]) for value in values]
        if name in VARCHAR_MAX_LENGTH
        else values
    )
    for name, values in raw_columns.items()
}
text_col_trunc = insert_columns["text"]

# 6) Insert column-wise, in schema order (skip auto-id field)
mr = col.insert([insert_columns[field.name] for field in fields if field.name != "id"])
col.flush()
print(f"✅ Inserted entities: {mr.insert_count}")

# 7) Load collection for search
col.load()
print("✅ Collection loaded for search")

✅ Inserted entities: 1037
✅ Collection loaded for search
