In [72]:
import reportlab
import qdrant_client
import sentence_transformers
import pandas
import pypdf
import pinecone

In [73]:
from pathlib import Path

# Folder holding the source documents; the loading cell globs *.pdf, *.txt and *.json in it.
DATA_DIR = Path("./realistic_drug_docs")   # change if your folder name differs
# NOTE(review): COLLECTION_NAME is not read by any cell visible in this notebook.
COLLECTION_NAME = "drug_rag_kb"

# Qdrant connection settings.
# NOTE(review): no visible cell constructs a QdrantClient or reads these two values;
# the vectors are written to Pinecone further down. Confirm whether they are still needed.
USE_QDRANT_SERVER = True                  # recommended
QDRANT_URL = "http://localhost:6333"      # if server; else ignored


In [74]:
import os, re, json, hashlib
from typing import Dict, Any, List, Optional, Tuple

from tqdm import tqdm
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.http import models as qm


In [75]:
def infer_drug_id_from_filename(name: str) -> Optional[str]:
    """Return the first ``D`` + three-digit token found in ``name``.

    The search is unanchored, so the token may appear anywhere in the string.
    Returns ``None`` when no such token exists.
    """
    hit = re.search(r"(D\d{3})", name)
    if hit is None:
        return None
    return hit.group(1)

def infer_doc_type_from_filename(name: str) -> str:
    """Classify a file by name.

    A case-insensitive ``truth`` or ``briefingbook`` substring wins (checked in
    that order). Otherwise the lower-cased extension without its dot is used,
    and ``"unknown"`` when the name has no extension.
    """
    lowered = name.lower()
    for marker in ("truth", "briefingbook"):
        if marker in lowered:
            return marker
    extension = Path(name).suffix.lower().lstrip(".")
    return extension if extension else "unknown"

def normalize_text(text: str) -> str:
    """Replace NUL characters with spaces, collapse every whitespace run to a
    single space, and trim both ends."""
    without_nuls = text.replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nuls).strip()

def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    Characters that cannot be encoded are dropped (``errors="ignore"``) rather
    than raising.
    """
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8", errors="ignore"))
    return digest.hexdigest()


In [76]:
def load_pdf_text(path: Path) -> str:
    """Extract text from every page of the PDF at ``path``.

    Pages are joined with newlines; a page whose ``extract_text()`` result is
    falsy contributes an empty string.
    """
    pages = PdfReader(str(path)).pages
    return "\n".join(page.extract_text() or "" for page in pages)

def load_txt_text(path: Path) -> str:
    """Read ``path`` as UTF-8 text, silently dropping undecodable bytes."""
    with path.open(encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def load_json_text(path: Path) -> str:
    """Flatten selected fields of a JSON file into ``key: value`` text lines.

    Only a fixed list of known keys is rendered, in that fixed order. Scalars
    become ``key: value``; non-empty lists become a ``key:`` header followed by
    ``- item`` lines (dict items are rendered as ``k=v, k=v``); dicts become a
    ``key:`` header followed by their entries with a two-space prefix on the
    key. ``None`` values and empty lists are skipped. If none of the known keys
    produce output, the whole document is returned pretty-printed instead.
    """
    payload = json.loads(path.read_text(encoding="utf-8", errors="ignore"))

    def render(label, value):
        # Returns the lines for one field instead of appending to shared state.
        if value is None:
            return []
        if isinstance(value, (str, int, float, bool)):
            return [f"{label}: {value}"]
        if isinstance(value, list):
            if not value:
                return []
            rendered = [f"{label}:"]
            for entry in value:
                if isinstance(entry, dict):
                    pairs = ", ".join(f"{name}={val}" for name, val in entry.items())
                    rendered.append(f"- {pairs}")
                else:
                    rendered.append(f"- {entry}")
            return rendered
        if isinstance(value, dict):
            rendered = [f"{label}:"]
            for sub_key, sub_value in value.items():
                rendered.extend(render(f"  {sub_key}", sub_value))
            return rendered
        return [f"{label}: {str(value)}"]

    # Pull common keys if present, in this order.
    wanted_fields = (
        "drug_id", "drug_name", "indication", "modality", "mechanism", "moa_short",
        "formulation", "route", "regimen", "trial_id", "phase", "population",
        "key_inclusion", "key_exclusion", "results", "safety", "monitoring",
        "version", "doc_date",
    )
    out_lines = []
    for field in wanted_fields:
        if field in payload:
            out_lines.extend(render(field, payload[field]))

    # Nothing recognised: fall back to the raw document so the file is not lost.
    if not out_lines:
        out_lines = [json.dumps(payload, indent=2)]
    return "\n".join(out_lines)

def load_file(path: Path) -> Tuple[str, Dict[str, Any]]:
    """Load one document and return ``(normalized_text, metadata)``.

    The loader is picked from the lower-cased extension (``.pdf``, ``.txt``,
    ``.json``); anything else is read as UTF-8 text and tagged ``"unknown"``.
    The metadata records the file name, source type, drug id and doc type
    inferred from the file name, the resolved path, and the SHA-256 hash and
    length of the normalized text.
    """
    known_kinds = {".pdf": "pdf", ".txt": "txt", ".json": "json"}
    source_type = known_kinds.get(path.suffix.lower(), "unknown")

    if source_type == "pdf":
        raw = load_pdf_text(path)
    elif source_type == "txt":
        raw = load_txt_text(path)
    elif source_type == "json":
        raw = load_json_text(path)
    else:
        raw = path.read_text(encoding="utf-8", errors="ignore")

    cleaned = normalize_text(raw)
    file_name = path.name
    return cleaned, {
        "file_name": file_name,
        "source_type": source_type,
        "drug_id": infer_drug_id_from_filename(file_name),
        "doc_type": infer_doc_type_from_filename(file_name),
        "abs_path": str(path.resolve()),
        "doc_hash": sha256_text(cleaned),
        "text_len": len(cleaned),
    }


In [77]:
# Stop early with a readable message when the input folder is missing.
assert DATA_DIR.exists(), f"DATA_DIR not found: {DATA_DIR.resolve()}"

# Collect the three supported file types (top level of DATA_DIR only), sorted by path.
files = sorted(
    found
    for pattern in ("*.pdf", "*.txt", "*.json")
    for found in DATA_DIR.glob(pattern)
)
print("Found:", len(files), "files")

# Keep only documents with at least 50 characters of normalized text.
docs = []
for fp in tqdm(files, desc="Loading files"):
    text, meta = load_file(fp)
    if len(text) >= 50:
        docs.append({"text": text, "meta": meta})

print("Loaded docs:", len(docs))


Found: 30 files


Loading files: 100%|██████████| 30/30 [00:00<00:00, 106.10it/s]

Loaded docs: 30





In [78]:
# Collapse documents whose normalized text hashes to the same doc_hash.
# The first file seen supplies the text and metadata; every file sharing the hash
# is recorded in source_types / source_files.
dedup: Dict[str, Dict[str, Any]] = {}

for d in docs:
    h = d["meta"]["doc_hash"]
    entry = dedup.setdefault(h, {
        "text": d["text"],
        "meta": d["meta"],
        "source_types": [],
        "source_files": [],
    })
    entry["source_types"].append(d["meta"]["source_type"])
    entry["source_files"].append(d["meta"]["file_name"])

dedup_docs = list(dedup.values())
print("Before dedup:", len(docs))
print("After dedup :", len(dedup_docs))

# Show one example. Position 12 is used when it exists; the index is clamped so a
# smaller corpus no longer raises IndexError, and an empty corpus prints nothing.
if dedup_docs:
    example = dedup_docs[min(12, len(dedup_docs) - 1)]
    print("Example sources:", example["meta"]["drug_id"], example["source_types"])


Before dedup: 30
After dedup : 30
Example sources: D005 ['pdf']


In [79]:
def chunk_text(text: str, max_chars: int = 900, overlap: int = 150) -> List[str]:
    """Split ``text`` into overlapping fixed-width character windows.

    The text is normalized first, then cut into windows of at most
    ``max_chars`` characters. Each window starts ``overlap`` characters before
    the end of the previous one. Windows are stripped, and empty ones dropped.
    Cuts are purely positional, so a window may begin or end mid-word.

    Args:
        text: Raw text to split.
        max_chars: Maximum window width in characters; must be positive.
        overlap: Characters shared between consecutive windows; must satisfy
            ``0 <= overlap < max_chars``.

    Returns:
        The list of chunks in document order (empty for blank input).

    Raises:
        ValueError: If ``max_chars`` or ``overlap`` is out of range. Without
            this check an ``overlap >= max_chars`` leaves the window start
            stuck in place and the loop never terminates.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, got overlap={overlap}, max_chars={max_chars}"
        )

    text = normalize_text(text)
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(n, start + max_chars)
        c = text[start:end].strip()
        if c:
            chunks.append(c)
        if end == n:
            break
        # end == start + max_chars here, so this always moves forward and stays >= 0.
        start = end - overlap
    return chunks

# One payload per chunk of every deduplicated document. Chunks shorter than
# 50 characters after normalization are dropped; chunk_index keeps the chunk's
# original position within its document.
chunk_payloads = []
for d in dedup_docs:
    chunks = chunk_text(d["text"])
    meta = d["meta"]
    for i, ch in enumerate(chunks):
        ch = normalize_text(ch)
        if len(ch) >= 50:
            chunk_payloads.append(dict(
                drug_id=meta.get("drug_id"),
                doc_type=meta.get("doc_type"),
                chunk_index=i,
                text=ch,
                chunk_hash=sha256_text(ch),
                source_types=d["source_types"],
                source_files=d["source_files"],
            ))

print("Chunks:", len(chunk_payloads))


Chunks: 180


In [80]:
# Peek at the last payload. Indexing from the end (instead of a fixed position)
# keeps this cell working when the corpus size changes; shows nothing if empty.
chunk_payloads[-1] if chunk_payloads else None

{'drug_id': 'D010',
 'doc_type': 'truth',
 'chunk_index': 2,
 'text': 'ations: - Known hypersensitivity to active substance or excipients. monitoring: - Baseline labs per protocol (CBC, CMP) - Periodic assessment of liver enzymes - Clinical monitoring for infections version: v2.9 doc_date: 2025-12-26',
 'chunk_hash': 'b68f13b2954e2a3ae8d7f1c9f51c38786f4364ccc9994e80a594475e5aed4c5c',
 'source_types': ['json'],
 'source_files': ['D010_Juvencor_TRUTH.json']}

In [None]:
import os
from getpass import getpass

# Never paste the key into the notebook: a value already present in the
# environment is kept, otherwise it is requested interactively without echo.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [82]:
from openai import OpenAI

# OpenAI client constructed with no explicit arguments.
client = OpenAI()

EMBED_MODEL = "text-embedding-3-small"  # good + cheaper
# NOTE(review): EMBED_DIM is not read by any cell visible here, and 1536 differs from
# the 1024-dimension Pinecone index created further down; the upsert cell embeds with
# Pinecone's "llama-text-embed-v2" rather than EMBED_MODEL. Confirm which path is intended.
EMBED_DIM = 1536  # for text-embedding-3-small (dimension depends on model)

def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed ``texts`` with ``EMBED_MODEL`` in a single batched API call.

    Args:
        texts: Strings to embed.

    Returns:
        One embedding per entry of the response's ``data`` list; an empty list
        when ``texts`` is empty.
    """
    # Skip the network round-trip for an empty batch.
    # NOTE(review): the endpoint is expected to reject an empty input list — confirm.
    if not texts:
        return []
    # batch call
    resp = client.embeddings.create(model=EMBED_MODEL, input=texts)
    return [d.embedding for d in resp.data]


In [83]:
import pinecone
from pinecone import Pinecone


In [None]:
from pinecone import Pinecone, ServerlessSpec
import os
from getpass import getpass

# Connect. The API key is taken from the PINECONE_API_KEY environment variable,
# falling back to a no-echo prompt, so no secret is stored in the notebook.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "))

# Target index; it is created below only when it does not already exist.
INDEX_NAME = "test-agent"
# IMPORTANT: must equal the length of the vectors that get upserted into the index.
# NOTE(review): presumably 1024 is the output size of the embedding model used in the
# upsert cell ("llama-text-embed-v2") — confirm against the model documentation.
INDEX_DIMENSION = 1024

existing = pc.list_indexes().names()
print("Existing indexes:", existing)

if INDEX_NAME not in existing:
    pc.create_index(
        name=INDEX_NAME,
        dimension=INDEX_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(INDEX_NAME)
print("Using Pinecone index:", INDEX_NAME)


Existing indexes: ['test-agent']
Using Pinecone index: test-agent


In [85]:
# Number of items handed to each batch.
BATCH = 64

def batched(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items;
    the final slice may be shorter."""
    yield from (lst[lo:lo + n] for lo in range(0, len(lst), n))


In [86]:
# Debug inspection of `chunks`.
# NOTE(review): at this point `chunks` is whatever the chunking loop above last assigned,
# i.e. the string chunks of the final document only (hence the recorded length of 3),
# not the full corpus. chunks[0] raises IndexError if that list is empty.
print(type(chunks))
print(len(chunks))
print(type(chunks[0]))
print(str(chunks[0])[:120])  # first 120 characters only


<class 'list'>
3
<class 'str'>
drug_id: D010 drug_name: Juvencor indication: Rare-G modality: enzyme replacement mechanism: gene expression silencer mo


In [87]:
# Debug inspection of the deduplicated documents: container type and size, the keys
# of one record and of its metadata, and the drug id of the first record.
# dedup_docs[0] raises IndexError if no documents were loaded.
print(type(dedup_docs), len(dedup_docs))
print(dedup_docs[0].keys())
print(dedup_docs[0]["meta"].keys())
print(dedup_docs[0]["meta"].get("drug_id"))


<class 'list'> 30
dict_keys(['text', 'meta', 'source_types', 'source_files'])
dict_keys(['file_name', 'source_type', 'drug_id', 'doc_type', 'abs_path', 'doc_hash', 'text_len'])
D001


In [88]:
# Rebuild `chunks` as a list of records (dicts). Before this cell the name held
# plain strings left over from the chunking loop.
chunks = []

for d in dedup_docs:
    meta = d["meta"]
    # Fall back to placeholders so ids and metadata never contain None.
    drug_id = meta.get("drug_id") or "UNK"
    doc_type = meta.get("doc_type") or "doc"

    for idx, ch in enumerate(chunk_text(d["text"], max_chars=900, overlap=150)):
        ch = normalize_text(ch)
        if len(ch) >= 50:
            # The id is derived from content: drug, doc type, and the first
            # 16 hex characters of the chunk text hash.
            chunk_id = f"{drug_id}_{doc_type}_{sha256_text(ch)[:16]}"
            chunks.append(dict(
                id=chunk_id,
                text=ch,
                drug_id=drug_id,
                doc_type=doc_type,
                chunk_index=idx,
                source_types=d.get("source_types", []),
                source_files=d.get("source_files", []),
            ))

print("Rebuilt chunks:", len(chunks))
print("Sample:", chunks[0]["id"], type(chunks[0]), chunks[0]["drug_id"])


Rebuilt chunks: 180
Sample: D001_briefingbook_65ddfe10bfae8599 <class 'dict'> D001


In [90]:
from tqdm import tqdm

BATCH = 64
NAMESPACE = "toy-agent"

def batched(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

total = 0

# Embed each batch of chunk records with Pinecone inference, then upsert the
# vectors with their metadata into NAMESPACE of the index.
for batch in tqdm(list(batched(chunks, BATCH)), desc="Embed+Upsert"):
    embedded = pc.inference.embed(
        model="llama-text-embed-v2",
        inputs=[b["text"] for b in batch],
        parameters={"input_type": "passage"}
    )

    # The response exposes one item per input under .data, each with .values.
    vectors = [e.values for e in embedded.data]

    # zip() would silently drop records on a short response; fail loudly instead
    # so ids can never be paired with the wrong or missing vectors.
    if len(vectors) != len(batch):
        raise RuntimeError(
            f"Embedding count mismatch: sent {len(batch)} texts, got {len(vectors)} vectors"
        )

    to_upsert = []
    for b, vec in zip(batch, vectors):
        meta = {
            "drug_id": b["drug_id"],
            "doc_type": b["doc_type"],
            "chunk_index": b["chunk_index"],
            "text": b["text"],
            "source_types": b["source_types"],
            "source_files": b["source_files"],
        }
        to_upsert.append((b["id"], vec, meta))

    index.upsert(vectors=to_upsert, namespace=NAMESPACE)
    # Counts records submitted in this batch, not a server-confirmed figure.
    total += len(to_upsert)

print("Upserted vectors:", total)


Embed+Upsert: 100%|██████████| 3/3 [00:27<00:00,  9.07s/it]

Upserted vectors: 180



