In [1]:
# Print the path of the Python interpreter running this kernel, which is a quick
# way to confirm that the expected virtual environment is active.
import sys
print(sys.executable)


c:\Users\ngunupud\Desktop\Toy_Agent\.venv\Scripts\python.exe


In [2]:
# Sanity-check the installed pinecone package: show which file it was imported
# from and list the top-level names containing "Pine" or "Spec" (at most 30).
# NOTE(review): `inspect` is imported here but not used in this cell.
import pinecone, inspect
print("pinecone module file:", getattr(pinecone, "__file__", None))
print("pinecone contents sample:", [x for x in dir(pinecone) if "Pine" in x or "Spec" in x][:30])


pinecone module file: c:\Users\ngunupud\Desktop\Toy_Agent\.venv\Lib\site-packages\pinecone\__init__.py
pinecone contents sample: ['ByocSpec', 'Pinecone', 'PineconeApiAttributeError', 'PineconeApiException', 'PineconeApiKeyError', 'PineconeApiTypeError', 'PineconeApiValueError', 'PineconeAsyncio', 'PineconeConfig', 'PineconeConfigurationError', 'PineconeException', 'PineconeProtocolError', 'PodSpec', 'PodSpecDefinition', 'ServerlessSpec', 'ServerlessSpecDefinition']


In [3]:
from pathlib import Path
import re, hashlib
from typing import Dict, Any, List, Optional, Tuple

from tqdm import tqdm
from pypdf import PdfReader
from docx import Document
from striprtf.striprtf import rtf_to_text

import os


In [4]:
import os
from pathlib import Path

# Folder scanned for source documents, plus the Pinecone index name and the
# namespace used for every upsert and query in this notebook.
DATA_DIR = Path("./realistic_docs_hr")
INDEX_NAME = "test-agent"
NAMESPACE = "toy-agent"

# The API key comes from the environment; stop immediately if it is missing.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
assert PINECONE_API_KEY, "Set env var PINECONE_API_KEY first"

# Only the key length is printed, never the key itself.
print("✅ Key detected (length):", len(PINECONE_API_KEY))
# NOTE(review): a missing DATA_DIR is only reported here, not treated as an error.
print("✅ DATA_DIR exists:", DATA_DIR.exists(), DATA_DIR.resolve())


✅ Key detected (length): 75
✅ DATA_DIR exists: True C:\Users\ngunupud\Desktop\Toy_Agent\realistic_docs_hr


In [5]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client and list the index names as a connectivity check.
pc = Pinecone(api_key=PINECONE_API_KEY)
print("✅ Connected. Indexes:", pc.list_indexes().names())


✅ Connected. Indexes: ['test-agent']


In [6]:
# Reuse the index when it already exists; otherwise create a serverless
# cosine index first. Either way, finish by opening a handle to it.
existing = pc.list_indexes().names()

if INDEX_NAME in existing:
    print("✅ Using existing index:", INDEX_NAME)
else:
    pc.create_index(
        name=INDEX_NAME,
        # The dimension has to match the output size of the embedding model
        # used below (llama-text-embed-v2 in this setup).
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("✅ Created index:", INDEX_NAME)

index = pc.Index(INDEX_NAME)


✅ Using existing index: test-agent


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Gather every supported document type found directly inside DATA_DIR, sorted by path.
files = sorted(
    path
    for pattern in ("*.pdf", "*.docx", "*.rtf")
    for path in DATA_DIR.glob(pattern)
)
print("✅ Found files:", len(files))
print("✅ Example file:", next((f.name for f in files), "NONE"))


✅ Found files: 15
✅ Example file: HR-001_Code_of_Conduct.pdf


In [8]:
def normalize_text(text: str) -> str:
    """Return `text` with NUL characters replaced by spaces, every run of
    whitespace collapsed to a single space, and both ends trimmed."""
    without_nuls = text.replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nuls).strip()

def sha256_text(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of `text` encoded as UTF-8.

    Characters that cannot be encoded (for example lone surrogates) are
    dropped before hashing rather than raising an error.
    """
    encoded = text.encode("utf-8", errors="ignore")
    hasher = hashlib.sha256()
    hasher.update(encoded)
    return hasher.hexdigest()

def load_pdf_text(path: Path) -> str:
    """Extract the text of every page of the PDF at `path`, joined with newlines.

    A page whose extraction yields nothing contributes an empty string, so
    the number of joined segments always equals the number of pages.
    """
    page_texts = []
    for page in PdfReader(str(path)).pages:
        page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)

def load_docx_text(path: Path) -> str:
    """Return the non-empty paragraph texts of the .docx file at `path`, one per line.

    Only `Document.paragraphs` is read.
    NOTE(review): text held in tables, headers or footers is presumably not
    included; confirm against the python-docx documentation.
    """
    paragraphs = Document(str(path)).paragraphs
    non_empty = (para.text for para in paragraphs if para.text)
    return "\n".join(non_empty)

def load_rtf_text(path: Path) -> str:
    """Read the RTF file at `path` and return its plain-text content.

    The file is decoded as UTF-8 and undecodable bytes are dropped before the
    markup is handed to `rtf_to_text`.
    """
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        rtf_source = handle.read()
    return rtf_to_text(rtf_source)

def infer_doc_id_from_filename(name: str) -> str:
    """Derive a document id from a file name.

    The first occurrence of `HR-` followed by three digits is used when
    present; otherwise the id is whatever precedes the first underscore
    (the whole name when there is no underscore).
    """
    match = re.search(r"(HR-\d{3})", name)
    if match is None:
        return name.split("_")[0]
    return match.group(1)

def load_file(path: Path) -> Tuple[str, Dict[str, Any]]:
    """Load one document and return `(normalized_text, metadata)`.

    The loader is chosen from the lower-cased file extension. The metadata
    holds the inferred document id, the file name, the source type, a SHA-256
    hash of the normalized text and its length.

    Raises:
        ValueError: if the extension is not .pdf, .docx or .rtf.
    """
    # Extension -> (loader function, source_type label stored in metadata).
    loaders = {
        ".pdf": (load_pdf_text, "pdf"),
        ".docx": (load_docx_text, "docx"),
        ".rtf": (load_rtf_text, "rtf"),
    }

    ext = path.suffix.lower()
    if ext not in loaders:
        raise ValueError(f"Unsupported file type: {ext}")

    loader, source_type = loaders[ext]
    text = normalize_text(loader(path))

    return text, {
        "doc_id": infer_doc_id_from_filename(path.name),
        "file_name": path.name,
        "source_type": source_type,
        "doc_hash": sha256_text(text),
        "text_len": len(text),
    }

print("✅ load_file ready")


✅ load_file ready


In [9]:
# Load and normalize every discovered file. Documents with fewer than 50
# characters of extracted text (for example scanned PDFs with no text layer)
# are left out, and their names are reported instead of being dropped silently.
docs = []
skipped = []
for fp in tqdm(files, desc="Loading files"):
    text, meta = load_file(fp)
    if len(text) < 50:
        skipped.append(fp.name)
        continue
    docs.append({"text": text, "meta": meta})

print("✅ Loaded docs:", len(docs))
if skipped:
    print("⚠️ Skipped (fewer than 50 characters of text):", skipped)

# Fail with a clear message instead of an IndexError on docs[0] below.
assert docs, "No usable documents were loaded; check DATA_DIR and the file contents"

print("✅ Example meta:", docs[0]["meta"])
print("✅ Preview:", docs[0]["text"][:200])


Loading files: 100%|██████████| 15/15 [00:00<00:00, 79.58it/s]

✅ Loaded docs: 15
✅ Example meta: {'doc_id': 'HR-001', 'file_name': 'HR-001_Code_of_Conduct.pdf', 'source_type': 'pdf', 'doc_hash': '6d4c86134330795833509f87d3feae03edc308b3346146d3306e6409b371cebc', 'text_len': 1435}
✅ Preview: Document ID: HR-001 Topic: HR Policies Title: Code of Conduct Version: 1.2 Author: K. Patel Department: HR Operations Effective Date: 2025-12-28 1. Purpose This policy defines expectations and procedu





In [10]:
def chunk_text(text: str, max_chars: int = 900, overlap: int = 150) -> List[str]:
    """Split `text` into overlapping character windows.

    The text is normalized first, then cut into windows of at most
    `max_chars` characters. Each window starts `overlap` characters before
    the end of the previous one. Windows that are empty after stripping are
    omitted.

    Args:
        text: The text to split.
        max_chars: Maximum window length in characters; must be positive.
        overlap: Characters shared by consecutive windows; must satisfy
            `0 <= overlap < max_chars`.

    Returns:
        The list of chunk strings, in document order (empty for empty text).

    Raises:
        ValueError: if `max_chars` or `overlap` is out of range. Without this
            check an `overlap >= max_chars` (or a non-positive `max_chars`)
            keeps the window start from ever advancing, so the loop below
            would never terminate.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")

    text = normalize_text(text)
    chunks = []
    start = 0
    n = len(text)

    while start < n:
        end = min(n, start + max_chars)
        c = text[start:end].strip()
        if c:
            chunks.append(c)

        if end == n:
            break
        # end == start + max_chars here and overlap < max_chars, so the next
        # start is strictly greater than the current one.
        start = end - overlap

    return chunks

# Split every loaded document into overlapping chunks and attach the metadata
# that is stored alongside each vector later on.
chunks = []
for d in docs:
    meta = d["meta"]
    doc_id = meta["doc_id"]

    for idx, ch in enumerate(chunk_text(d["text"], max_chars=900, overlap=150)):
        # Skip fragments shorter than 50 characters.
        if len(ch) < 50:
            continue

        # ID = document id + chunk position + first 10 hex characters of the
        # chunk's SHA-256, so the same text at the same position always maps
        # to the same ID.
        chunk_id = f"{doc_id}__{idx:04d}__{sha256_text(ch)[:10]}"
        chunks.append({
            "id": chunk_id,
            "text": ch,
            "doc_id": doc_id,
            "chunk_index": idx,
            "file_name": meta["file_name"],
            "source_type": meta["source_type"],
        })

print("✅ Total chunks:", len(chunks))
# NOTE(review): chunks[0] raises IndexError if no chunk was produced.
print("✅ Sample chunk id:", chunks[0]["id"])


✅ Total chunks: 30
✅ Sample chunk id: HR-001__0000__47e77d311b


In [11]:
# Number of chunks embedded and upserted per request.
BATCH = 32

def batched(lst, n):
    """Yield consecutive slices of `lst`, each holding at most `n` items.

    The final slice is shorter when `len(lst)` is not a multiple of `n`.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)

# Running count of vectors sent to the index.
total = 0

# The batches are materialized into a list so the progress bar has a known length.
for batch in tqdm(list(batched(chunks, BATCH)), desc="Embed+Upsert"):
    # ---- EMBEDDING happens here (Pinecone-hosted model) ----
    # Chunks are embedded with input_type "passage".
    embedded = pc.inference.embed(
        model="llama-text-embed-v2",
        inputs=[b["text"] for b in batch],
        parameters={"input_type": "passage"}
    )

    vectors = [e.values for e in embedded.data]

    to_upsert = []
    # NOTE(review): zip stops at the shorter sequence; this assumes the embed
    # response holds one entry per input, in input order — confirm.
    for b, vec in zip(batch, vectors):
        # The chunk text is kept in the metadata so query results can return
        # it without a second lookup.
        meta = {
            "doc_id": b["doc_id"],
            "chunk_index": b["chunk_index"],
            "text": b["text"],
            "file_name": b["file_name"],
            "source_type": b["source_type"],
        }
        to_upsert.append((b["id"], vec, meta))

    index.upsert(vectors=to_upsert, namespace=NAMESPACE)
    total += len(to_upsert)

print("✅ Upserted vectors:", total)


Embed+Upsert: 100%|██████████| 1/1 [00:09<00:00,  9.57s/it]

✅ Upserted vectors: 30





In [12]:
# Retrieval smoke test: embed one sample question, fetch the 3 nearest chunks
# from the namespace and print score, document id, file name and a text preview.
q = "What is the purpose of this policy?"
# The question is embedded with input_type "query".
q_emb = pc.inference.embed(
    model="llama-text-embed-v2",
    inputs=[q],
    parameters={"input_type": "query"}
)

res = index.query(
    vector=q_emb.data[0].values,
    top_k=3,
    include_metadata=True,
    namespace=NAMESPACE
)

for m in res["matches"]:
    md = m["metadata"]
    print(m["score"], md.get("doc_id"), md.get("file_name"))
    # Show only the first 120 characters of the stored chunk text.
    print("  ", md.get("text", "")[:120])


0.310680151 HR-004 HR-004_Remote_Work_Policy.pdf
   Document ID: HR-004 Topic: HR Policies Title: Remote Work Policy Version: 1.5 Author: J. Smith Department: Talent Acquis
0.270581543 HR-002 HR-002_Attendance_and_Punctuality.pdf
   Document ID: HR-002 Topic: HR Policies Title: Attendance and Punctuality Version: 1.7 Author: S. Ahmed Department: Peopl
0.25663358 HR-013 HR-013_Training_and_Development.docx
   Document ID: HR-013 Topic: HR Policies Title: Training and Development Version: 1.3 Author: A. Rao Department: HRBP Effe


In [None]:
import os
from openai import OpenAI

# Read the OpenAI key from the environment, the same way the Pinecone key is
# handled, instead of writing a credential into the notebook. Notebooks are
# routinely shared and committed, so a literal key here would leak.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
assert OPENAI_API_KEY, "Set env var OPENAI_API_KEY first"

openai_client = OpenAI(api_key=OPENAI_API_KEY)
print("✅ OpenAI client ready")


✅ OpenAI client ready


In [18]:
def build_context(matches, max_chars: int = 6000, max_chunk_chars: int = 1200) -> str:
    """Assemble retrieved matches into one context string for the prompt.

    Each match becomes a block made of a `[doc_id=... file=...]` header line
    followed by at most `max_chunk_chars` characters of the chunk text.
    Blocks are joined with a `---` separator line. Matches are consumed in
    the given order, and assembly stops at the first block that would push
    the result past `max_chars`.

    Args:
        matches: Iterable of mapping-like matches exposing `.get("metadata")`;
            a missing or empty metadata entry is treated as an empty dict.
        max_chars: Upper bound on the length of the returned string,
            separators included.
        max_chunk_chars: Per-chunk cap applied to the text of each match.

    Returns:
        The joined context, or an empty string when no block fits.
    """
    separator = "\n\n---\n\n"
    parts = []
    used = 0

    for m in matches:
        md = m.get("metadata") or {}
        text = md.get("text") or ""
        doc_id = md.get("doc_id")
        file_name = md.get("file_name")

        header = f"[doc_id={doc_id} file={file_name}]"
        block = header + "\n" + text[:max_chunk_chars]

        # Every block after the first also costs one separator; counting it
        # keeps the final string within max_chars.
        cost = len(block) + (len(separator) if parts else 0)
        if used + cost > max_chars:
            break

        parts.append(block)
        used += cost

    return separator.join(parts)


In [19]:
def rag_answer(query: str, top_k: int = 5) -> str:
    """Answer `query` from the indexed documents (retrieve, then generate).

    The query is embedded with the Pinecone-hosted model, the `top_k` nearest
    chunks are fetched from the namespace, and their text is passed as context
    to the OpenAI chat model, which is instructed to answer only from that
    context.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve.

    Returns:
        The model's answer, or a fixed fallback sentence when the index
        returns no matches.
    """
    # Retrieval: embed the question, then look up the nearest stored chunks.
    query_embedding = pc.inference.embed(
        model="llama-text-embed-v2",
        inputs=[query],
        parameters={"input_type": "query"},
    )
    search_result = index.query(
        vector=query_embedding.data[0].values,
        top_k=top_k,
        include_metadata=True,
        namespace=NAMESPACE,
    )

    hits = search_result.get("matches", [])
    if not hits:
        return "I couldn't find relevant information in the documents."

    # Generation: the retrieved text is the only knowledge the model may use.
    context = build_context(hits)
    system_message = {
        "role": "system",
        "content": (
            "You are a helpful assistant for HR policy Q&A. "
            "Use ONLY the provided context. "
            "If the answer is not in the context, say: "
            "\"I don't know based on the provided documents.\""
        ),
    }
    user_message = {
        "role": "user",
        "content": f"CONTEXT:\n{context}\n\nQUESTION:\n{query}\n",
    }

    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0.2,
    )
    return completion.choices[0].message.content


In [23]:
# End-to-end demo: retrieve context for a free-form question and print the generated answer.
print(rag_answer("anything i can knw about Employee onboarding?"))


The "Employee Onboarding" policy defines expectations and procedures for onboarding within the organization. It applies to all employees, interns, contractors, and temporary staff. Employees must comply with the onboarding requirements, and non-compliance may result in corrective action.

The procedure for onboarding includes:

1. **Request / Initiation**: Requests must be submitted via the HR portal or an approved communication channel.
2. **Review and Approval**: The employee's manager and HR Compliance must review requests where applicable.
3. **Execution**: Approved actions are implemented within standard timelines; exceptions require documentation.
4. **Recordkeeping**: HR maintains records according to retention guidelines.
5. **Monitoring and Auditing**: HR performs periodic checks and may audit related activities for adherence.
6. **Exceptions**: Exceptions require written approval from HR Compliance and the relevant department head.

There are also FAQs and examples provided, 

Exceptions require written approval from HR Compliance and the relevant department head.
