In [4]:
import reportlab


In [8]:
# More realistic synthetic pharma-style document generator:
# - 10 drugs
# - each drug: 10-page PDF + TXT + JSON "truth"
# - results are internally consistent across pages (same trial IDs, endpoints, AE rates)

import os, json, math, random, zipfile
from datetime import date
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple

# If reportlab isn't installed in your Colab, uncomment:
# !pip -q install reportlab

from reportlab.lib.pagesizes import LETTER
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch

# Seed the global RNG so the sequence of random draws below is repeatable.
random.seed(7)

# Output folder. The default is the Colab location used originally; set the
# DRUG_DOCS_OUT_DIR environment variable to redirect output when the notebook
# runs somewhere "/content" is not the right place (e.g. a local machine).
OUT_DIR = os.environ.get("DRUG_DOCS_OUT_DIR") or "/content/realistic_drug_docs"
os.makedirs(OUT_DIR, exist_ok=True)

# The archive sits next to the output folder: same path with ".zip" appended.
# Derived from OUT_DIR so the two locations cannot drift apart.
ZIP_PATH = OUT_DIR.rstrip("/\\") + ".zip"

# ------------ Synthetic "drug program" generator ------------

@dataclass
class TrialArm:
    """One arm of the synthetic trial (make_program creates an active arm and a placebo arm)."""
    name: str  # display label: "<drug_name> <dose> <regimen>" for the active arm, "Placebo" for control
    n: int     # arm size; make_program splits n_total between the two arms
    dose: str  # "<dose> <regimen>" for the active arm, "matching regimen" for placebo

@dataclass
class TrialResult:
    """Efficacy read-out of the synthetic trial; most fields are pre-formatted display strings."""
    endpoint_primary: str            # primary endpoint description
    endpoint_secondary: str          # secondary endpoint description
    primary_effect: str              # formatted effect, either "T% vs C% (Δ D%)" or "LS mean difference ..."
    primary_pvalue: float            # raw p-value; rendered through fmt_p() when written to pages
    primary_ci: str                  # formatted interval "[lo, hi]" (shown as the 95% CI on the efficacy page)
    secondary_effect: str            # one-sentence interpretation of the secondary endpoint
    discontinuation_rate_pct: float  # overall discontinuation rate, in percent

@dataclass
class SafetyProfile:
    """Safety summary of the synthetic program: adverse-event rates plus label-style text."""
    common_ae: List[Tuple[str, float]]          # (AE name, rate in %); page 1 quotes the first three names
    serious_ae_rate_pct: float                  # serious AE rate, in percent
    discontinuation_ae_rate_pct: float          # discontinuation due to AE, in percent
    warnings: List[str]                         # warning/precaution sentences
    contraindications: List[str]                # contraindication sentences

@dataclass
class DrugProgram:
    """
    Everything known about one synthetic drug program.

    Built by make_program(); rendered to PDF/TXT pages and dumped as the
    "truth" JSON through dataclasses.asdict().
    """
    drug_id: str              # short identifier such as "D001"; also embedded in trial_id and file names
    drug_name: str            # display name of the drug
    indication: str           # condition the drug is developed for
    modality: str             # e.g. "small molecule", "monoclonal antibody"; constrains the route
    mechanism: str            # short mechanism label
    moa_short: str            # one-sentence mechanism-of-action summary
    formulation: str          # derived from route: tablet / prefilled syringe / vial for infusion
    route: str                # "oral", "subcutaneous" or "intravenous"
    regimen: str              # "<dose> <schedule>", e.g. "100 mg QD"
    trial_id: str             # synthetic trial identifier from make_trial_id()
    phase: str                # "Phase 2" or "Phase 3"
    arms: List[TrialArm]      # active arm followed by placebo arm
    population: str           # one-sentence description of the enrolled population
    key_inclusion: List[str]  # inclusion criteria, one sentence each
    key_exclusion: List[str]  # exclusion criteria, one sentence each
    results: TrialResult      # efficacy read-out
    safety: SafetyProfile     # safety read-out
    monitoring: List[str]     # monitoring recommendations
    version: str              # document version, formatted "v<major>.<minor>"
    doc_date: str             # ISO date string of the day the program was generated

def clamp(x, lo, hi):
    """Limit x to the range bounded by lo and hi and return the result."""
    # Apply the upper bound first, then the lower bound (lo wins if lo > hi).
    capped = x if x < hi else hi
    return capped if capped > lo else lo

def fmt_p(p):
    """
    Format a p-value for display.

    Values below 0.0001 become the literal "<0.0001"; values below 0.001 are
    shown with four decimals; everything else with three.
    """
    if p < 0.0001:
        text = "<0.0001"
    else:
        decimals = 4 if p < 0.001 else 3
        text = f"{p:.{decimals}f}"
    return text

def make_trial_id(drug_id: str):
    """
    Return a synthetic trial identifier of the form "AMGN-<drug_id>-<number>".

    The number is one random pick from a fixed set, so the call consumes one
    draw from the global random generator.
    """
    # Example result: AMGN-D001-201
    sponsor_prefix = "AMGN"  # placeholder prefix only; the IDs are synthetic
    trial_number = random.choice([201, 202, 301, 302, 401])
    return "{}-{}-{}".format(sponsor_prefix, drug_id, trial_number)

def make_program(drug_id: str, drug_name: str, indication: str, modality: str) -> DrugProgram:
    """
    Build one synthetic DrugProgram from the global random generator.

    Args:
        drug_id: short identifier (e.g. "D001"); embedded in the trial ID.
        drug_name: display name; used in the active arm's label.
        indication: condition name; used in the population sentence.
        modality: drug class; restricts which administration routes are allowed.

    Returns:
        A fully populated DrugProgram (arms, efficacy results, safety profile,
        monitoring text, version and today's date).

    The order of the random draws below determines the generated values for a
    given seed, so new draws should be appended rather than inserted.
    """
    # Some controlled variability:
    moa_bank = [
        ("selective pathway inhibitor", "Inhibits a disease-relevant signaling pathway to reduce downstream activation."),
        ("targeted receptor antagonist", "Blocks receptor-mediated signaling to reduce disease activity."),
        ("ligand neutralizing antibody", "Neutralizes a circulating ligand to reduce pathway activation."),
        ("bispecific immune modulator", "Engages two targets to rebalance immune signaling."),
        ("gene expression silencer", "Reduces target mRNA to decrease protein expression.")
    ]
    mech, moa_short = random.choice(moa_bank)

    # Draw a default route first, then override it for modalities that only
    # make sense with specific routes.
    route = random.choice(["oral", "subcutaneous", "intravenous"])
    if modality in ["monoclonal antibody", "bispecific antibody", "antibody-drug conjugate", "enzyme replacement"]:
        route = random.choice(["intravenous", "subcutaneous"])
    if modality in ["small molecule"]:
        route = "oral"
    if modality in ["siRNA therapy"]:
        route = random.choice(["subcutaneous", "intravenous"])

    formulation = {
        "oral": "tablet",
        "subcutaneous": "prefilled syringe",
        "intravenous": "vial for infusion"
    }[route]

    if route == "oral":
        regimen = random.choice(["QD", "BID"])
        dose = random.choice(["50 mg", "100 mg", "150 mg"])
    elif route == "subcutaneous":
        regimen = random.choice(["QW", "Q2W", "Q4W"])
        dose = random.choice(["100 mg", "200 mg", "300 mg"])
    else:
        regimen = random.choice(["Q3W", "Q4W", "Q8W"])
        dose = random.choice(["3 mg/kg", "5 mg/kg", "10 mg/kg"])

    # Trial set-up
    trial_id = make_trial_id(drug_id)
    phase = random.choice(["Phase 2", "Phase 3"])
    n_total = random.randint(220, 520)
    n_treat = int(n_total * random.uniform(0.48, 0.55))
    n_ctrl = n_total - n_treat  # remainder, so the two arms always sum to n_total
    arms = [
        TrialArm(name=f"{drug_name} {dose} {regimen}", n=n_treat, dose=f"{dose} {regimen}"),
        TrialArm(name="Placebo", n=n_ctrl, dose="matching regimen")
    ]

    population = f"Adults with moderate-to-severe {indication} with inadequate response to standard therapy."

    key_incl = [
        "Confirmed diagnosis per protocol definition",
        "Baseline disease activity above threshold",
        "Stable background therapy for ≥4 weeks"
    ]
    key_excl = [
        "Severe uncontrolled comorbidity (per protocol)",
        "Recent major surgery within 12 weeks",
        "Known hypersensitivity to components"
    ]

    # Efficacy results (synthetic but consistent)
    endpoint_primary = random.choice([
        "Change from baseline in Symptom Score at Week 12",
        "Proportion achieving Response-50 at Week 16",
        "Mean change in Composite Index at Week 24"
    ])
    endpoint_secondary = random.choice([
        "Durable response at Week 24",
        "Quality-of-life improvement at Week 12",
        "Time to flare through Week 24"
    ])

    # Create plausible effect sizes:
    # We'll format as either difference in mean or proportion difference.
    # One flag drives both the effect formatting and the p-value model below,
    # instead of re-detecting the branch from the formatted display string.
    is_proportion = "Proportion" in endpoint_primary
    if is_proportion:
        treat_rate = clamp(random.uniform(0.45, 0.70), 0.35, 0.80)
        ctrl_rate  = clamp(treat_rate - random.uniform(0.12, 0.28), 0.10, 0.60)
        effect = treat_rate - ctrl_rate
        primary_effect = f"{treat_rate*100:.1f}% vs {ctrl_rate*100:.1f}% (Δ {effect*100:.1f}%)"
        ci_lo = (effect - 0.08) * 100
        ci_hi = (effect + 0.08) * 100
        primary_ci = f"[{ci_lo:.1f}%, {ci_hi:.1f}%]"
    else:
        # mean difference
        md = random.uniform(-2.2, -0.8)  # improvement negative (synthetic convention)
        primary_effect = f"LS mean difference {md:.2f} (Drug − Placebo)"
        primary_ci = f"[{md-0.40:.2f}, {md+0.40:.2f}]"

    # Baseline p-value; this draw always happens, whichever endpoint was picked.
    p = clamp(abs(random.gauss(0.01, 0.02)), 0.00005, 0.20)
    if is_proportion:
        # Proportion endpoints tie p to the effect: bigger delta -> smaller p (rough)
        p = clamp(0.08 - abs(effect)*0.18 + random.uniform(-0.01, 0.02), 0.00005, 0.20)
    # NOTE(review): the interval above never contains zero (fixed half-width
    # smaller than the smallest effect), yet p can still come out above 0.05.
    # Decide whether the interval or the p-value model should change.

    # Secondary effect (simple)
    secondary_effect = random.choice([
        "Numerically favored active arm; supportive trend",
        "Met nominal significance in hierarchical testing",
        "Did not meet prespecified significance threshold"
    ])

    disc_rate = clamp(random.uniform(3.0, 12.0), 1.0, 18.0)

    results = TrialResult(
        endpoint_primary=endpoint_primary,
        endpoint_secondary=endpoint_secondary,
        primary_effect=primary_effect,
        primary_pvalue=p,
        primary_ci=primary_ci,
        secondary_effect=secondary_effect,
        discontinuation_rate_pct=disc_rate
    )

    # Safety: AEs with rates; keep plausible ordering
    ae_pool = [
        "Headache", "Nausea", "Fatigue", "Diarrhea", "Upper respiratory infection",
        "Injection-site reaction", "Rash", "Dizziness", "Arthralgia", "Elevated ALT"
    ]
    random.shuffle(ae_pool)
    common = []
    base = random.uniform(6, 18)
    for i in range(6):
        rate = clamp(base - i*random.uniform(0.8, 2.2) + random.uniform(-1.0, 1.0), 2.0, 22.0)
        common.append((ae_pool[i], rate))
    # The per-item noise can break the downward trend, so sort explicitly:
    # the summary page quotes the first three entries as the common AEs.
    common.sort(key=lambda pair: pair[1], reverse=True)

    serious_rate = clamp(random.uniform(1.0, 5.5), 0.5, 8.0)
    # Discontinuations due to AEs are a subset of all discontinuations, so the
    # AE-specific rate must not exceed the overall rate reported in results.
    disc_ae_rate = min(clamp(random.uniform(1.0, 6.0), 0.5, 10.0), disc_rate)

    warnings = [
        "Monitor for hypersensitivity reactions.",
        "Assess for infection risk in susceptible patients.",
        "Consider hepatic monitoring if clinically indicated."
    ]
    contraind = ["Known hypersensitivity to active substance or excipients."]

    monitoring = [
        "Baseline labs per protocol (CBC, CMP)",
        "Periodic assessment of liver enzymes",
        "Clinical monitoring for infections"
    ]

    version = f"v{random.randint(1,3)}.{random.randint(0,9)}"
    doc_date = date.today().isoformat()  # not seeded: changes from day to day

    return DrugProgram(
        drug_id=drug_id,
        drug_name=drug_name,
        indication=indication,
        modality=modality,
        mechanism=mech,
        moa_short=moa_short,
        formulation=formulation,
        route=route,
        regimen=f"{dose} {regimen}",
        trial_id=trial_id,
        phase=phase,
        arms=arms,
        population=population,
        key_inclusion=key_incl,
        key_exclusion=key_excl,
        results=results,
        safety=SafetyProfile(
            common_ae=common,
            serious_ae_rate_pct=serious_rate,
            discontinuation_ae_rate_pct=disc_ae_rate,
            warnings=warnings,
            contraindications=contraind
        ),
        monitoring=monitoring,
        version=version,
        doc_date=doc_date
    )


# One (drug_id, drug_name, indication, modality) tuple per synthetic program.
DRUG_LIST = [
    ("D001", "Amegena", "Condition-X", "small molecule"),
    ("D002", "Bromilix", "Condition-Y", "monoclonal antibody"),
    ("D003", "Cytovara", "Condition-Z", "peptide"),
    ("D004", "Dermavax", "Autoimmune-A", "bispecific antibody"),
    ("D005", "Erythron", "Hematology-B", "small molecule"),
    ("D006", "Fenvora", "Oncology-C", "antibody-drug conjugate"),
    ("D007", "Glucomet", "Metabolic-D", "small molecule"),
    ("D008", "Hepatrel", "Liver-E", "siRNA therapy"),
    ("D009", "Immunara", "Immunology-F", "monoclonal antibody"),
    ("D010", "Juvencor", "Rare-G", "enzyme replacement"),
]

# Programs are built in list order, which fixes the sequence of random draws.
programs = [
    make_program(drug_id, drug_name, indication, modality)
    for drug_id, drug_name, indication, modality in DRUG_LIST
]

# ------------ PDF rendering helpers ------------

def draw_header_footer(c: canvas.Canvas, prog: DrugProgram, page_num: int, total_pages: int, title: str):
    """
    Draw the page furniture for one page: header line, section title, footer
    line and the two horizontal separator rules.
    """
    page_w, page_h = LETTER
    x_left = 0.75 * inch
    x_right = page_w - 0.75 * inch
    y_header = page_h - 0.65 * inch
    y_footer_rule = 0.65 * inch

    # Header: document name on the left, phase and trial on the right.
    c.setFont("Helvetica-Bold", 12)
    c.drawString(x_left, y_header, f"{prog.drug_name} ({prog.drug_id}) — Synthetic Briefing Book")
    c.setFont("Helvetica", 10)
    c.drawRightString(x_right, y_header, f"{prog.phase} | Trial {prog.trial_id}")

    # Section title directly below the header.
    c.setFont("Helvetica-Bold", 11)
    c.drawString(x_left, y_header - 18, title)

    # Footer text sits just under the bottom rule.
    y_footer_text = y_footer_rule - 10
    c.setFont("Helvetica", 9)
    c.drawString(x_left, y_footer_text, f"Version {prog.version} | Date {prog.doc_date} | Page {page_num}/{total_pages}")
    c.drawRightString(x_right, y_footer_text, "Synthetic training material (not real medical guidance).")

    # Thin rules: one under the header block, one above the footer.
    c.setLineWidth(0.5)
    c.line(x_left, y_header - 26, x_right, y_header - 26)
    c.line(x_left, y_footer_rule, x_right, y_footer_rule)

def draw_wrapped(c: canvas.Canvas, text: str, x: float, y: float, max_width_chars: int = 100, line_height: int = 13):
    """
    Draw text wrapped at max_width_chars characters, one wrapped line per row,
    starting at (x, y) and moving down by line_height each row.

    Returns the y position just below the last line drawn.
    """
    import textwrap
    cursor_y = y
    for segment in textwrap.wrap(text, width=max_width_chars):
        c.drawString(x, cursor_y, segment)
        cursor_y = cursor_y - line_height
    return cursor_y

def draw_bullets(c: canvas.Canvas, items: List[str], x: float, y: float, max_width_chars: int = 98, line_height: int = 13):
    """
    Draw each item as a bullet point, wrapping at max_width_chars characters.

    The first wrapped line carries the bullet; continuation lines are indented
    by 12 units. An item that wraps to nothing still takes up one blank row.
    Returns the y position just below the last row.
    """
    import textwrap
    for item in items:
        pieces = textwrap.wrap(item, width=max_width_chars)
        if not pieces:
            # Nothing to draw: leave one empty row.
            y -= line_height
            continue
        head, *continuation = pieces
        c.drawString(x, y, f"• {head}")
        y -= line_height
        for piece in continuation:
            c.drawString(x + 12, y, piece)
            y -= line_height
    return y

def draw_table(c: canvas.Canvas, headers: List[str], rows: List[List[str]], x: float, y: float, col_widths: List[int], row_h: int = 16):
    """
    Draw a plain text table: a bold header row followed by one row per entry
    of rows, each row row_h below the previous one.

    Column i starts at x plus the sum of the preceding col_widths. Returns the
    y position just below the last row.
    """
    def emit_row(cells, baseline, as_text):
        # Lay the cells out left to right, advancing by each column's width.
        cursor_x = x
        for cell, col_w in zip(cells, col_widths):
            c.drawString(cursor_x, baseline, as_text(cell))
            cursor_x += col_w

    # very simple table (no fancy styling)
    c.setFont("Helvetica-Bold", 10)
    emit_row(headers, y, lambda label: label)
    y -= row_h

    c.setFont("Helvetica", 10)
    for record in rows:
        emit_row(record, y, str)  # body cells are stringified before drawing
        y -= row_h
    return y

def make_page_texts(prog: DrugProgram) -> List[Tuple[str, str, Dict]]:
    """
    Compose the ten briefing-book pages for one program.

    Returns a list of (page_title, body_text, structured_data) tuples in page
    order. structured_data is a small dict whose "type" key names the kind of
    page (optional helper for knowing what the page contains).
    """
    def compose(*lines: str) -> str:
        # Join with newlines; a trailing "" argument yields a trailing newline.
        return "\n".join(lines)

    def bulleted(entries) -> str:
        return "\n".join(f"- {entry}" for entry in entries)

    res = prog.results
    safety = prog.safety
    p_text = fmt_p(res.primary_pvalue)
    leading_aes = ", ".join(name for name, _ in safety.common_ae[:3])
    arm_lines = "\n".join(f"- Arm: {arm.name}, n={arm.n}" for arm in prog.arms)

    # page 1: Overview + key messages
    summary = compose(
        f"Indication: {prog.indication}",
        f"Modality: {prog.modality}",
        f"Formulation/Route: {prog.formulation} / {prog.route}",
        f"Planned regimen (synthetic): {prog.regimen}",
        "",
        "Key messages:",
        f"- {prog.drug_name} is being developed for {prog.indication}.",
        f"- Primary endpoint: {res.endpoint_primary}.",
        f"- Primary result: {res.primary_effect}, p={p_text}.",
        f"- Safety: common AEs include {leading_aes}.",
        "",
    )

    # page 2: Program background
    background = compose(
        f"{prog.indication} is associated with persistent symptoms and variable response to standard therapy.",
        "This synthetic briefing book is structured to support retrieval-augmented generation (RAG) practice.",
        "",
        "Document governance (synthetic):",
        f"- Document version: {prog.version}",
        f"- Program identifier: {prog.trial_id}",
        "- Intended use: internal training / prototype only",
        "",
    )

    # page 3: MOA
    moa = compose(
        f"Proposed mechanism: {prog.mechanism}.",
        f"MOA summary: {prog.moa_short}",
        "",
        "Biology notes (synthetic):",
        "- Target pathway is assumed disease-relevant for training purposes.",
        "- Biomarker shifts are described as supportive evidence in later pages.",
        "",
    )

    # page 4: Study design (no trailing newline)
    design = compose(
        f"Trial: {prog.trial_id} ({prog.phase}), randomized, double-blind, placebo-controlled (synthetic).",
        f"Population: {prog.population}",
        "",
        "Arms:",
        arm_lines,
        "",
        "Key inclusion criteria:",
        bulleted(prog.key_inclusion),
        "",
        "Key exclusion criteria:",
        bulleted(prog.key_exclusion),
    )

    # page 5: Endpoints & estimands
    endpoints = compose(
        f"Primary endpoint: {res.endpoint_primary}",
        f"Secondary endpoint: {res.endpoint_secondary}",
        "",
        "Analysis notes (synthetic):",
        "- Primary analysis uses an intention-to-treat estimand.",
        "- Missing data handled via multiple imputation (illustrative).",
        "- Multiplicity control via hierarchical testing (illustrative).",
        "",
    )

    # page 6: Efficacy results
    efficacy = compose(
        "Primary outcome:",
        f"- Result: {res.primary_effect}",
        f"- 95% CI: {res.primary_ci}",
        f"- p-value: {p_text}",
        "",
        "Secondary outcome:",
        f"- Interpretation: {res.secondary_effect}",
        "",
        "Discontinuations:",
        f"- Overall discontinuation rate (synthetic): {res.discontinuation_rate_pct:.1f}%",
        "",
    )

    # page 7: Safety summary (the renderers append the AE table)
    safety_summary = compose(
        "Safety overview (synthetic):",
        f"- Serious AE rate: {safety.serious_ae_rate_pct:.1f}%",
        f"- Discontinuation due to AE: {safety.discontinuation_ae_rate_pct:.1f}%",
        "",
        "Common adverse events listed below.",
        "",
    )

    # page 8: Dosing & administration
    dosing = compose(
        f"Route: {prog.route}",
        f"Formulation: {prog.formulation}",
        f"Regimen: {prog.regimen}",
        "",
        "Administration notes (synthetic):",
        "- Missed dose: take as soon as remembered unless near next scheduled dose.",
        "- Storage: controlled room temperature unless specified otherwise.",
        "- Concomitant therapy: per protocol allowances.",
        "",
    )

    # page 9: Warnings/contraindications/monitoring (no trailing newline)
    warnings_page = compose(
        "Contraindications:",
        bulleted(safety.contraindications),
        "",
        "Warnings/Precautions:",
        bulleted(safety.warnings),
        "",
        "Monitoring recommendations (synthetic):",
        bulleted(prog.monitoring),
    )

    # page 10: Appendix / traceability / abbreviations (no trailing newline)
    appendix = compose(
        "Abbreviations:",
        "- AE: Adverse event",
        "- SAE: Serious adverse event",
        "- TEAE: Treatment-emergent adverse event",
        "- ITT: Intention-to-treat",
        "",
        "Traceability fields (synthetic):",
        f"- Drug ID: {prog.drug_id}",
        f"- Trial ID: {prog.trial_id}",
        f"- Document version: {prog.version}",
        f"- Date: {prog.doc_date}",
        "",
        "Note: This document is synthetic and intended only for RAG/agent practice.",
    )

    return [
        ("1. Executive Summary & Key Messages", summary, {"type": "summary"}),
        ("2. Background & Unmet Need", background, {"type": "background"}),
        ("3. Mechanism of Action", moa, {"type": "moa"}),
        ("4. Study Design", design, {"type": "design"}),
        ("5. Endpoints & Analysis Overview", endpoints, {"type": "endpoints"}),
        ("6. Efficacy Results", efficacy, {"type": "efficacy"}),
        ("7. Safety Summary (TEAEs)", safety_summary, {"type": "safety"}),
        ("8. Dosing & Administration", dosing, {"type": "dosing"}),
        ("9. Contraindications, Warnings & Monitoring", warnings_page, {"type": "warnings"}),
        ("10. Appendix: Abbreviations & Traceability", appendix, {"type": "appendix"}),
    ]

def render_pdf(prog: DrugProgram, out_path: str):
    """
    Render the program's briefing book as a LETTER-size PDF at out_path.

    One PDF page is produced per entry returned by make_page_texts(). Each
    page gets the shared header/footer, then its body drawn line by line from
    the top; the page whose metadata type is "safety" additionally gets the
    common-TEAE table below its body.

    Args:
        prog: program to render.
        out_path: destination file path of the PDF.
    """
    c = canvas.Canvas(out_path, pagesize=LETTER)
    _, height = LETTER
    left = 0.75 * inch
    y_start = height - 1.35 * inch

    pages = make_page_texts(prog)
    # Take the footer's "Page i/N" total from the pages actually produced, so
    # it cannot go stale if make_page_texts() gains or loses a page.
    total_pages = len(pages)

    for idx, (title, body, meta) in enumerate(pages, start=1):
        draw_header_footer(c, prog, idx, total_pages, title)

        y = y_start
        c.setFont("Helvetica", 10)

        # Body text with some structure
        for line in body.split("\n"):
            if line.strip() == "":
                # Blank line: just add vertical space.
                y -= 10
                continue
            # headings inside page
            if line.endswith(":") or line.startswith("Key messages") or line.startswith("Primary outcome") or line.startswith("Secondary outcome"):
                c.setFont("Helvetica-Bold", 10)
                c.drawString(left, y, line)
                c.setFont("Helvetica", 10)
                y -= 14
            else:
                c.drawString(left, y, line)
                y -= 13

            # Stop before running into the footer; any remaining body lines
            # of this page are not drawn.
            if y < 1.1 * inch:
                break

        # Add safety table on page 7
        if meta.get("type") == "safety":
            y -= 10
            c.setFont("Helvetica-Bold", 10)
            c.drawString(left, y, "Common TEAEs (synthetic)")
            y -= 16

            headers = ["Adverse Event", "Rate (%)"]
            rows = [[ae, f"{pct:.1f}"] for ae, pct in prog.safety.common_ae]
            y = draw_table(c, headers, rows, left, y, col_widths=[320, 100], row_h=16)

        c.showPage()

    c.save()

def render_txt(prog: DrugProgram, out_path: str):
    """
    Write the program's briefing book as UTF-8 plain text to out_path.

    The file starts with a two-line document header, followed by one section
    per page from make_page_texts(): a "[PAGE i/N] title" banner, a dashed
    rule and the page body. The page whose metadata type is "safety" is
    followed by the list of common TEAEs with their rates.

    Args:
        prog: program to render.
        out_path: destination file path of the text file.
    """
    pages = make_page_texts(prog)
    # Use the real page count in the banner rather than a fixed 10, so it
    # stays right if make_page_texts() gains or loses a page.
    total_pages = len(pages)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"{prog.drug_name} ({prog.drug_id}) — Synthetic Briefing Book\n")
        f.write(f"Trial: {prog.trial_id} | {prog.phase} | Version {prog.version} | Date {prog.doc_date}\n")
        f.write("="*80 + "\n\n")
        for i, (title, body, meta) in enumerate(pages, start=1):
            f.write(f"[PAGE {i}/{total_pages}] {title}\n")
            f.write("-"*80 + "\n")
            f.write(body + "\n\n")
            if meta.get("type") == "safety":
                f.write("Common TEAEs (synthetic):\n")
                for ae, pct in prog.safety.common_ae:
                    f.write(f"- {ae}: {pct:.1f}%\n")
                f.write("\n")
            f.write("\n")

# Generate all docs: one PDF, one TXT and one truth JSON per program.
generated = []
for prog in programs:
    base = f"{prog.drug_id}_{prog.drug_name}".replace(" ", "_")
    pdf_path, txt_path, json_path = (
        os.path.join(OUT_DIR, base + suffix)
        for suffix in ("_BriefingBook.pdf", "_BriefingBook.txt", "_TRUTH.json")
    )

    render_pdf(prog, pdf_path)
    render_txt(prog, txt_path)

    # The JSON "truth" file is the dataclass tree dumped as-is.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(asdict(prog), f, indent=2)

    generated.append((pdf_path, txt_path, json_path))

# Zip: every generated file is stored flat, under its base name only.
with zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED) as zf:
    for bundle in generated:
        for member in bundle:
            zf.write(member, arcname=os.path.basename(member))

print(
    f"Generated {len(generated)} drugs × (PDF+TXT+JSON).",
    f"Folder: {OUT_DIR}",
    f"ZIP: {ZIP_PATH}",
    sep="\n",
)


Generated 10 drugs × (PDF+TXT+JSON).
Folder: /content/realistic_drug_docs
ZIP: /content/realistic_drug_docs.zip


In [7]:
import os
# Show the current working directory, then whether a "/content" root exists.
print(os.getcwd(), os.path.exists("/content"), sep="\n")


c:\Users\ngunupud\Desktop\Toy_Agent
True
