In [None]:
# Install libraries
!pip -q install -U transformers accelerate sentence-transformers faiss-cpu pypdf pymupdf

# Import standard utilities
import os, re, json, math, textwrap, random
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple

# Import scientific stack
import numpy as np
import torch

# Import PDF readers
from pypdf import PdfReader
import fitz  # PyMuPDF

# Import vector search
import faiss
from sentence_transformers import SentenceTransformer

# Import LLM runtime
from transformers import AutoTokenizer, AutoModelForCausalLM

# Seed every random number generator in use so repeated runs stay comparable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Pick the runtime device; the CUDA generators are only seeded when a GPU exists
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
    torch.cuda.manual_seed_all(SEED)
print(f"DEVICE = {DEVICE} | torch = {torch.__version__} | numpy = {np.__version__}")

# Inference-only notebook: turn autograd off globally, allow TF32 matmul on GPU only
torch.set_grad_enabled(False)
torch.backends.cuda.matmul.allow_tf32 = DEVICE == "cuda"

# Workspace folder shared by the following cells
WORKDIR = "/content/research_agent"
os.makedirs(WORKDIR, exist_ok=True)
print("WORKDIR =", WORKDIR)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.2/328.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h



DEVICE = cpu | torch = 2.9.0+cpu | numpy = 2.0.2
WORKDIR = /content/research_agent


In [None]:
# Create work folder, define sources, retry downloads with browser headers, validate PDFs, write manifest
import os, json, hashlib, subprocess
from pathlib import Path
import requests

# Workspace folder that receives every downloaded PDF and the manifest (created if missing)
WORKDIR = Path("/content/research_agent")
WORKDIR.mkdir(parents=True, exist_ok=True)

# Browser-like identity: HEADERS is sent by dl_requests, UA is also handed to curl by dl_curl
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
HEADERS = {"User-Agent": UA, "Accept": "application/pdf,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9", "Referer": "https://www.google.com/"}

# One entry per document: "name" is the file name inside WORKDIR,
# "urls" lists candidate download URLs that the download loop tries in order.
SOURCES = [
  {"name":"who_school_based_violence_prevention_handbook.pdf",
   "urls":["https://apps.who.int/iris/bitstream/handle/10665/324930/9789241515542-eng.pdf?sequence=1&isAllowed=y"]},

  {"name":"unicef_unesco_behind_the_numbers_school_violence_bullying.pdf",
   "urls":["https://www.unicef.org/media/66496/file/Behind-the-Numbers.pdf",
           "https://www.unicef.org/media/66496/file/Behind-the-Numbers.pdf?download=1"]},

  {"name":"unicef_cyberbullying_leaflet_en.pdf",
   "urls":["https://www.unicef.org/egypt/media/806/file/Cyberbullying-Leaflet-EN.pdf",
           "https://www.unicef.org/egypt/media/806/file/Cyberbullying-Leaflet-EN.pdf?download=1"]},

  {"name":"kidpower_bullying_qa.pdf",
   "urls":["https://kidpowercs.org/wp-content/uploads/2017/09/Bullying-QA.pdf"]},

  {"name":"cci_anger_coping_strategies.pdf",
   "urls":["https://www.cci.health.wa.gov.au/-/media/CCI/Mental-Health-Professionals/Interpersonal/Interpersonal---Information-Sheets/Interpersonal-Information-Sheet---02---Anger-Coping-Strategies.pdf"]},

  {"name":"cci_assert_yourself_module_01.pdf",
   "urls":["https://www.cci.health.wa.gov.au/~/media/CCI/Consumer-Modules/Assert-Yourself/Assert-Yourself---01---What-is-Assertiveness.pdf"]},

  {"name":"cci_what_me_worry_overview_of_worry.pdf",
   "urls":["https://www.cci.health.wa.gov.au/~/media/CCI/Consumer-Modules/What-Me-Worry/What-Me-Worry---02---Overview-of-Worry.pdf"]},

  {"name":"samhsa_anger_management_workbook.pdf",
   "urls":["https://library.samhsa.gov/sites/default/files/anger_management_workbook_508_compliant.pdf"]},
]

def sha1_12(p: Path) -> str:
  """Return the first 12 hex characters of the SHA-1 digest of file *p*.

  The file is consumed in 1 MiB reads so large PDFs are never held in memory.
  """
  digest = hashlib.sha1()
  with p.open("rb") as stream:
    while True:
      block = stream.read(1024 * 1024)
      if not block:
        break
      digest.update(block)
  return digest.hexdigest()[:12]

def looks_like_pdf(p: Path) -> bool:
  """Return True when *p* exists, is at least 10,000 bytes and starts with ``%PDF``."""
  if not p.exists():
    return False
  # Tiny files are rejected outright (they are treated as error pages / stubs)
  if p.stat().st_size < 10_000:
    return False
  with p.open("rb") as stream:
    magic = stream.read(4)
  return magic == b"%PDF"

def dl_requests(url: str, out: Path) -> bool:
  """Download *url* to *out* with ``requests``; return True only for a valid PDF.

  The body is streamed into a sibling ``<name>.part`` file, and that file is
  promoted to *out* only after it passes ``looks_like_pdf``. A blocked or
  HTML response therefore never overwrites *out*, and no partial file is left
  behind when the transfer fails.

  Args:
    url: Address to fetch (sent with the module-level ``HEADERS``).
    out: Final destination path.

  Returns:
    True if *out* now holds a file accepted by ``looks_like_pdf``, else False.
    Any error (network, HTTP status other than 200, filesystem) yields False.
  """
  out_tmp = out.with_suffix(out.suffix + ".part")
  try:
    with requests.get(url, headers=HEADERS, stream=True, timeout=60) as r:
      if r.status_code != 200:
        return False
      with out_tmp.open("wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 256):
          if chunk:
            f.write(chunk)
    # Validate before promoting so junk never replaces the destination
    if not looks_like_pdf(out_tmp):
      return False
    out_tmp.replace(out)
    return True
  except Exception:
    # Deliberate best effort: the caller falls back to curl / the next URL
    return False
  finally:
    # After a successful replace the temp file is gone; otherwise remove leftovers
    try:
      out_tmp.unlink(missing_ok=True)
    except OSError:
      pass

def dl_curl(url: str, out: Path) -> bool:
  """Download *url* to *out* with the ``curl`` binary; return True for a valid PDF.

  curl is invoked with an argument list (no shell), so characters such as
  ``&``, ``$`` or quotes inside the URL or path are passed through literally
  instead of being interpreted by bash. ``--fail`` stops curl from saving the
  body of an HTTP error page.

  Returns:
    The result of ``looks_like_pdf(out)`` after the transfer; False when curl
    cannot be started.
  """
  argv = [
    "curl", "-L", "--fail",
    "-A", UA,
    "-H", "Accept: application/pdf",
    "-o", str(out),
    url,
  ]
  try:
    subprocess.run(argv, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  except OSError:
    # curl is not installed / not executable: treat as a failed download
    return False
  return looks_like_pdf(out)

# Fetch every source (reusing valid cached copies) and record one manifest entry each
manifest = []
for src in SOURCES:
  out = WORKDIR / src["name"]

  # Decide the outcome first: cached copy, fresh download, or failure
  if looks_like_pdf(out):
    status, used = "OK (cached)", "(cached)"
  else:
    status, used = "FAILED", " | ".join(src["urls"])
    for url in src["urls"]:
      if dl_requests(url, out) or dl_curl(url, out):
        status, used = "OK", url
        break
    else:
      # Every candidate failed: drop whatever a download attempt left behind
      if out.exists():
        out.unlink()

  succeeded = status != "FAILED"
  manifest.append({
    "name": src["name"],
    "status": status,
    "size_mb": round(out.stat().st_size/1024/1024, 2) if succeeded else 0,
    "sha1_12": sha1_12(out) if succeeded else "",
    "url": used,
  })

# Persist the manifest next to the PDFs, then print a one-line summary per source
manifest_path = WORKDIR / "sources_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print(f"Files saved in: {WORKDIR}")
for m in manifest:
  print(f"{m['status']:<11} - {m['name']:<45} | {m['size_mb']} MB | {m['sha1_12']} | {m['url']}")


Files saved in: /content/research_agent
FAILED      - who_school_based_violence_prevention_handbook.pdf | 0 MB |  | https://apps.who.int/iris/bitstream/handle/10665/324930/9789241515542-eng.pdf?sequence=1&isAllowed=y
FAILED      - unicef_unesco_behind_the_numbers_school_violence_bullying.pdf | 0 MB |  | https://www.unicef.org/media/66496/file/Behind-the-Numbers.pdf | https://www.unicef.org/media/66496/file/Behind-the-Numbers.pdf?download=1
FAILED      - unicef_cyberbullying_leaflet_en.pdf           | 0 MB |  | https://www.unicef.org/egypt/media/806/file/Cyberbullying-Leaflet-EN.pdf | https://www.unicef.org/egypt/media/806/file/Cyberbullying-Leaflet-EN.pdf?download=1
OK          - kidpower_bullying_qa.pdf                      | 0.1 MB | d07d1d4cc4c8 | https://kidpowercs.org/wp-content/uploads/2017/09/Bullying-QA.pdf
OK          - cci_anger_coping_strategies.pdf               | 0.14 MB | 7aebb13378f5 | https://www.cci.health.wa.gov.au/-/media/CCI/Mental-Health-Professionals/Interpersonal

In [None]:
# Set work directory
import os, re, hashlib, subprocess, textwrap, requests
# Workspace folder (plain string in this cell; joined with os.path by try_download)
WORKDIR = "/content/research_agent"
os.makedirs(WORKDIR, exist_ok=True)

# Define browser-like headers
# UA is reused by curl_fetch; HDR is sent with every requests.get call in this cell
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
HDR = {
  "User-Agent": UA,
  "Accept": "application/pdf,application/octet-stream;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  "Referer": "https://www.unicef.org/"
}

# Define helpers
def sha1_12(p):
  """Return the first 12 hex characters of the SHA-1 digest of the file at path *p*."""
  digest = hashlib.sha1()
  with open(p, "rb") as stream:
    while True:
      # 1 MiB reads keep memory flat for large files
      block = stream.read(1024*1024)
      if not block:
        break
      digest.update(block)
  return digest.hexdigest()[:12]

def looks_like_pdf(path, head_bytes=b""):
  """Report whether data begins with the ``%PDF`` magic bytes.

  Args:
    path: File to inspect; only read when *head_bytes* is empty.
    head_bytes: Optional already-downloaded leading bytes. When non-empty the
      decision is made from these bytes alone and *path* is not opened.

  Returns:
    True if the inspected bytes start with ``%PDF``. A missing, unreadable or
    invalid *path* yields False rather than raising.
  """
  if head_bytes:
    return head_bytes.startswith(b"%PDF")
  try:
    with open(path, "rb") as f:
      return f.read(4) == b"%PDF"
  except (OSError, TypeError, ValueError):
    # Only errors open()/read() raise for a bad path are treated as "not a PDF";
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    return False

def curl_fetch(url, out_path, referer="https://www.unicef.org/"):
  """Fetch *url* into *out_path* with the ``curl`` binary.

  curl runs from an argument list (no shell), so special characters in the
  URL, referer or path are passed literally instead of being parsed by bash.

  Args:
    url: Address to download (redirects are followed).
    out_path: Destination file path.
    referer: Value sent as the HTTP Referer header.

  Returns:
    True when curl exits with status 0 (``--fail`` makes HTTP errors non-zero);
    False otherwise, including when curl cannot be started.
  """
  argv = [
    "curl", "-L", "--fail",
    "-A", UA,
    "-e", referer,
    url,
    "-o", str(out_path),
  ]
  try:
    r = subprocess.run(argv, capture_output=True, text=True)
  except OSError:
    # curl missing / not executable
    return False
  return r.returncode == 0

def extract_pdf_urls_from_html(url):
  """Fetch *url* and return the absolute ``.pdf`` links found in its body.

  Links are matched textually (``http(s)://….pdf`` with an optional query
  string). A match may not contain quotes, whitespace or angle brackets, so it
  cannot run across HTML tags or ordinary sentences.

  Returns:
    Unique URLs in first-seen order; an empty list when the request fails or
    the response status is not OK.
  """
  try:
    r = requests.get(url, headers=HDR, timeout=30)
    html = r.text if r.ok else ""
  except requests.RequestException:
    return []
  found = re.findall(r'https?://[^\s"\'<>]+?\.pdf(?:\?[^\s"\'<>]*)?', html, flags=re.I)
  # dict.fromkeys de-duplicates while preserving order
  return list(dict.fromkeys(found))

def _save_pdf_stream(resp, out_path):
  """Stream *resp* to *out_path* if its body is a PDF; return True on success."""
  body = resp.iter_content(chunk_size=1024 * 64)

  # Collect enough leading bytes to sniff the PDF header
  prefix = b""
  for chunk in body:
    prefix += chunk
    if len(prefix) >= 4096:
      break
  if b"%PDF" not in prefix[:4096]:
    return False

  tmp = out_path + ".part"
  try:
    with open(tmp, "wb") as f:
      # Write the sniffed bytes first, then the remainder of the same stream
      f.write(prefix)
      for chunk in body:
        if chunk:
          f.write(chunk)
    if not looks_like_pdf(tmp):
      return False
    os.replace(tmp, out_path)
    return True
  finally:
    # No-op after a successful replace; removes leftovers otherwise
    if os.path.exists(tmp):
      os.remove(tmp)


def try_download(name, candidates):
  """Try each candidate URL until one yields a PDF saved as WORKDIR/name.

  For every URL the file is first requested with ``requests``. A 200 response
  is saved only when its first bytes contain ``%PDF``; every byte received is
  written (the bytes read for sniffing are kept), and the body is read through
  ``iter_content`` so transfer encodings are decoded. If the server answers
  401/403/429/5xx, or the request itself fails, ``curl_fetch`` is tried for
  the same URL.

  Args:
    name: Destination file name inside ``WORKDIR``.
    candidates: Iterable of URLs, tried in order.

  Returns:
    ``("OK", url, path)`` for the first URL that produced a PDF, otherwise
    ``("FAILED", "", path)`` after removing any leftover file at *path*.
  """
  out_path = os.path.join(WORKDIR, name)
  for u in candidates:
    use_curl = False
    try:
      with requests.get(u, headers=HDR, stream=True, allow_redirects=True, timeout=45) as r:
        if r.status_code == 200:
          if _save_pdf_stream(r, out_path):
            return ("OK", u, out_path)
        elif r.status_code in (401, 403, 429) or r.status_code >= 500:
          # Blocked, throttled or server error: a different client may succeed
          use_curl = True
    except (requests.RequestException, OSError):
      use_curl = True
    if use_curl and curl_fetch(u, out_path) and looks_like_pdf(out_path):
      return ("OK", u, out_path)

  # Nothing worked: do not leave a non-PDF file behind
  if os.path.exists(out_path):
    try:
      os.remove(out_path)
    except OSError:
      pass
  return ("FAILED", "", out_path)

# Define failed targets with fallback URLs (use reputable alternates if a host blocks automated download)
# Each entry is (file name inside WORKDIR, candidate URLs tried in order by try_download).
# The discovery loop below may replace an entry with an extended candidate list.
targets = [
  ("who_school_based_violence_prevention_handbook.pdf", [
    "https://iris.who.int/bitstream/handle/10665/324930/9789241515542-eng.pdf?sequence=1",
    "https://apps.who.int/iris/bitstream/handle/10665/324930/9789241515542-eng.pdf?sequence=1&isAllowed=y",
    "https://www.unicef.org/media/58081/file/UNICEF-WHO-UNESCO-handbook-school-based-violence.pdf",
    "https://resourcecentre.savethechildren.net/pdf/who_school_based_violence_prevention_1.pdf",
    "https://dylbw5db8047o.cloudfront.net/uploads/who_school_based_violence_prevention_1.pdf"
  ]),
  ("unicef_unesco_behind_the_numbers_school_violence_bullying.pdf", [
    "https://www.unicef.org/media/66496/file/Behind-the-Numbers.pdf?download=1",
    "https://www.unicef.org/media/66496/file/Behind-the-Numbers.pdf",
    "https://unesdoc.unesco.org/ark:/48223/pf0000366483/PDF/366483eng.pdf.multi",
    "https://unesdoc.unesco.org/ark:/48223/pf0000366483/PDF/366483eng.pdf"
  ]),
  ("unicef_cyberbullying_leaflet_en.pdf", [
    "https://www.unicef.org/egypt/media/806/file/Cyberbullying-Leaflet-EN.pdf?download=1",
    "https://www.unicef.org/egypt/media/806/file/Cyberbullying-Leaflet-EN.pdf",
    "https://www.sjcs.co.uk/sites/default/files/2024-04/Childnet%20Cyberbullying%20Leaflet.pdf",
    "https://anti-bullyingalliance.org.uk/sites/default/files/uploads/attachments/hirescyberbullyingnobleed%281%29.pdf"
  ])
]

# Add discovered PDF links from wrapper pages (if any)
# Save the Children ".pdf" URLs may be HTML wrapper pages: scan them for real PDF
# links and put any discoveries in front of the existing candidates.
for i, entry in enumerate(targets):
  name, cands = entry
  extra = []
  for u in cands:
    is_wrapper = u.endswith(".pdf") and "resourcecentre.savethechildren.net/pdf/" in u
    if not is_wrapper:
      continue
    extra.extend(extract_pdf_urls_from_html(u))
  if not extra:
    continue
  # dict.fromkeys removes duplicates while keeping first-seen order
  targets[i] = (name, list(dict.fromkeys(extra + cands)))

# Download and print manifest
# Attempt every target, printing one status line per file as it completes
manifest = []
for name, cands in targets:
  status, used_url, path = try_download(name, cands)

  # Size and checksum are only meaningful for successful downloads
  size_mb, s12 = 0.0, ""
  if status == "OK":
    if os.path.exists(path):
      size_mb = os.path.getsize(path) / (1024*1024)
    s12 = sha1_12(path)

  print(f"{status:<6} - {name:<45} | {size_mb:>5.2f} MB | {s12:<12} | {used_url}")
  manifest.append((name, status, f"{size_mb:.2f} MB", s12, used_url))

print("\nFiles saved in:", WORKDIR)


FAILED - who_school_based_violence_prevention_handbook.pdf |  0.00 MB |              | 
FAILED - unicef_unesco_behind_the_numbers_school_violence_bullying.pdf |  0.00 MB |              | 
OK     - unicef_cyberbullying_leaflet_en.pdf           |  0.49 MB | 32fee49ee3c7 | https://www.sjcs.co.uk/sites/default/files/2024-04/Childnet%20Cyberbullying%20Leaflet.pdf

Files saved in: /content/research_agent


In [None]:
# Install HTML extraction utilities
!pip -q install -U trafilatura beautifulsoup4 lxml

# Define work paths
import os, json, time, re, hashlib
from urllib.parse import urlparse

import requests
import trafilatura
from bs4 import BeautifulSoup

# Workspace folder and the JSONL file that receives one JSON object per extracted page
WORKDIR = "/content/research_agent"
os.makedirs(WORKDIR, exist_ok=True)
OUT_JSONL = os.path.join(WORKDIR, "sources_web.jsonl")

# Define request headers
# Sent by fetch_html with every page request
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
HEADERS = {"User-Agent": UA, "Accept-Language": "en-US,en;q=0.9"}

# Define curated sources (bullying + anger/emotion regulation)
# Pages are fetched in this order by the ingest loop below
URLS = [
  "https://www.stopbullying.gov/bullying/what-is-bullying",
  "https://www.stopbullying.gov/prevention/how-to-prevent-bullying",
  "https://www.stopbullying.gov/kids/what-you-can-do",
  "https://www.stopbullying.gov/resources/what-you-can-do",
  "https://www.stopbullying.gov/resources/research-resources/mtss-prevention-approaches-and-effective-intervention",
  "https://www.stopbullying.gov/resources/get-help-now",
  "https://www.cdc.gov/youth-violence/about/about-bullying.html",
  "https://stacks.cdc.gov/view/cdc/21596",
  "https://www.apa.org/topics/bullying",
  "https://www.apa.org/topics/bullying/prevent",
  "https://www.apa.org/ed/schools/primer/bullying",
  "https://www.nhs.uk/mental-health/feelings-symptoms-behaviours/feelings-and-symptoms/anger/",
  "https://www.nhsinform.scot/illnesses-and-conditions/mental-health/mental-health-self-help-guides/problems-with-anger-self-help-guide/",
  "https://selfhelp.cntw.nhs.uk/self-help-guides/managing-anger/print/714",
]

# Define text cleaning
def clean_text(t: str) -> str:
  """Normalise whitespace in *t* (``None`` is treated as an empty string).

  Carriage returns become newlines, runs of three or more newlines collapse
  to a blank line, runs of two or more spaces/tabs collapse to one space, and
  the result is stripped.
  """
  text = (t or "").replace("\r", "\n")
  for pattern, repl in ((r"\n{3,}", "\n\n"), (r"[ \t]{2,}", " ")):
    text = re.sub(pattern, repl, text)
  return text.strip()

# Define HTML download
def fetch_html(url: str) -> str:
  """Return the decoded body of *url*, following redirects.

  Raises:
    requests.HTTPError: for a 4xx/5xx response (via ``raise_for_status``).
  """
  response = requests.get(
    url,
    headers=HEADERS,
    timeout=45,
    allow_redirects=True,
  )
  response.raise_for_status()
  return response.text

# Define main-text extraction with fallback
def extract_main_text(url: str, html: str) -> tuple[str, str]:
  """Extract ``(title, main_text)`` from an HTML document.

  trafilatura is tried first; its result is used when it is longer than 400
  characters. Otherwise the text of ``<main>``, ``<article>`` or ``<body>``
  (in that order of preference) is taken with BeautifulSoup after removing
  script/style/noscript tags, keeping only lines of three or more characters.

  Args:
    url: Page address (accepted for interface compatibility; not used here).
    html: Raw HTML of the page.

  Returns:
    Title and body text, both passed through ``clean_text``.
  """
  downloaded = trafilatura.extract(html, include_comments=False, include_tables=True, favor_precision=True)
  if downloaded and len(downloaded.strip()) > 400:
    # Metadata extraction can come back empty; fall back to an empty title
    # instead of raising AttributeError on ``None.title``.
    md = trafilatura.metadata.extract_metadata(html)
    title = (md.title or "") if md else ""
    return clean_text(title), clean_text(downloaded)

  soup = BeautifulSoup(html, "lxml")
  for tag in soup(["script", "style", "noscript"]):
    tag.decompose()
  main = soup.find("main") or soup.find("article") or soup.body
  title = (soup.title.get_text(" ", strip=True) if soup.title else "")
  text = main.get_text("\n", strip=True) if main else soup.get_text("\n", strip=True)
  # Drop very short lines (one or two characters)
  text = "\n".join([line for line in (l.strip() for l in text.splitlines()) if len(line) >= 3])
  return clean_text(title), clean_text(text)

# Write extracted sources as JSONL
# Fetch every curated URL, append one JSON line per page, and keep a per-URL status
results = []
with open(OUT_JSONL, "w", encoding="utf-8") as out:
  for i, url in enumerate(URLS, 1):
    try:
      title, text = extract_main_text(url, fetch_html(url))
      domain = urlparse(url).netloc
      item = dict(
        id=hashlib.sha1(url.encode("utf-8")).hexdigest()[:12],
        url=url,
        domain=domain,
        title=title,
        text=text,
        retrieved_at_unix=int(time.time()),
        char_count=len(text),
        word_count=len(text.split()),
        tags=["bullying", "emotion_regulation", "conflict_skills"],
      )
      out.write(json.dumps(item, ensure_ascii=False) + "\n")
      results.append((url, "OK", item["word_count"]))
      print(f"OK  {i:02d}/{len(URLS)}  {domain}  words={item['word_count']}")
      # Short pause between successful requests
      time.sleep(0.4)
    except Exception as e:
      results.append((url, "FAILED", 0))
      print(f"ERR {i:02d}/{len(URLS)}  {url}  -> {e}")

print("\nSaved:", OUT_JSONL)
n_ok = sum(1 for entry in results if entry[1] == "OK")
n_failed = sum(1 for entry in results if entry[1] == "FAILED")
print("OK:", n_ok, "| FAILED:", n_failed)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.9/837.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hERR 01/14  https://www.stopbullying.gov/bullying/what-is-bullying  -> 403 Client Error: Forbidden for url: https://www.stopbullying.gov/bullying/what-is-bullying
ERR 02/14  https://www.stopbullying.gov/prevention/how-to-prevent-bullying  -> 403 Client Error: Forbidden for url: 



OK  09/14  www.apa.org  words=0




OK  10/14  www.apa.org  words=0




OK  11/14  www.apa.org  words=0
OK  12/14  www.nhs.uk  words=928
OK  13/14  www.nhsinform.scot  words=6754
OK  14/14  selfhelp.cntw.nhs.uk  words=6326

Saved: /content/research_agent/sources_web.jsonl
OK: 7 | FAILED: 7


In [None]:
# Install system libraries required by headless Chromium
!playwright install-deps chromium || true
!apt-get -qq update || true
!apt-get -qq install -y libatk1.0-0 libatk-bridge2.0-0 libgtk-3-0 libcups2 libxkbcommon0 \
  libxcomposite1 libxdamage1 libxrandr2 libgbm1 libpangocairo-1.0-0 libpango-1.0-0 \
  libnss3 libnspr4 libdrm2 libxshmfence1 libasound2 || true

# Import utilities
import os, json, time, re, hashlib, asyncio
from urllib.parse import urlparse
import trafilatura
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Workspace folder and the JSONL file that this cell reads, updates and rewrites
WORKDIR = "/content/research_agent"
OUT_JSONL = os.path.join(WORKDIR, "sources_web.jsonl")
os.makedirs(WORKDIR, exist_ok=True)

# User agent handed to the Playwright browser context
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

# Define URLs to retry (previously blocked or extracted empty)
RETRY_URLS = [
  "https://www.stopbullying.gov/bullying/what-is-bullying",
  "https://www.stopbullying.gov/prevention/how-to-prevent-bullying",
  "https://www.stopbullying.gov/kids/what-you-can-do",
  "https://www.stopbullying.gov/resources/get-help-now",
  "https://stacks.cdc.gov/view/cdc/21596",
  "https://www.apa.org/topics/bullying",
  "https://www.apa.org/topics/bullying/prevent",
  "https://www.apa.org/ed/schools/primer/bullying",
]

# Clean extracted text
def clean_text(t: str) -> str:
  """Tidy extracted text: unify line endings, squeeze blank lines and spaces, strip.

  ``None`` or an empty value yields ``""``.
  """
  unified = (t or "").replace("\r", "\n")
  fewer_blank_lines = re.sub(r"\n{3,}", "\n\n", unified)
  single_spaced = re.sub(r"[ \t]{2,}", " ", fewer_blank_lines)
  return single_spaced.strip()

# Extract main text from HTML
def extract_main_text(html: str):
  """Return ``(title, text)`` for an HTML page, both cleaned with ``clean_text``.

  trafilatura's extraction is used when it yields more than 400 characters;
  otherwise the text of ``<main>``, ``<article>`` or ``<body>`` is collected
  with BeautifulSoup, ignoring script/style/noscript content and lines
  shorter than three characters.
  """
  extracted = trafilatura.extract(
    html,
    include_comments=False,
    include_tables=True,
    favor_precision=True,
  )
  if extracted and len(extracted.strip()) > 400:
    md = trafilatura.metadata.extract_metadata(html)
    title = ""
    if md:
      title = md.title or ""
    return clean_text(title), clean_text(extracted)

  # Fallback: plain DOM text
  soup = BeautifulSoup(html, "lxml")
  for junk in soup(["script", "style", "noscript"]):
    junk.decompose()

  title = ""
  if soup.title:
    title = soup.title.get_text(" ", strip=True)

  container = soup.find("main") or soup.find("article") or soup.body
  if not container:
    container = soup
  raw = container.get_text("\n", strip=True)

  kept = []
  for l in raw.splitlines():
    stripped = l.strip()
    if len(stripped) >= 3:
      kept.append(stripped)
  return clean_text(title), clean_text("\n".join(kept))

# Load existing JSONL to update entries and avoid duplicates
# Index previously saved records by URL so re-fetched pages replace, not duplicate, them
existing = {}
if os.path.exists(OUT_JSONL):
  with open(OUT_JSONL, "r", encoding="utf-8") as f:
    records = [json.loads(row) for row in f if row.strip()]
  existing.update((rec["url"], rec) for rec in records)

async def run_render_ingest():
  """Re-fetch every URL in RETRY_URLS with headless Chromium and refresh OUT_JSONL.

  Each page is loaded through Playwright and its rendered HTML is passed to
  ``extract_main_text``. Pages yielding at least 250 words are stored in the
  module-level ``existing`` dict keyed by URL, replacing any earlier record
  for that URL. After the browser is closed, every record in ``existing`` is
  written back to OUT_JSONL, one JSON object per line. Errors for a single
  URL are printed and do not stop the run.
  """
  async with async_playwright() as p:
    browser = await p.chromium.launch(
      headless=True,
      args=["--no-sandbox", "--disable-dev-shm-usage"]
    )
    context = await browser.new_context(
      user_agent=UA,
      locale="en-US",
      viewport={"width": 1280, "height": 720},
      extra_http_headers={"Accept-Language": "en-US,en;q=0.9"},
    )
    # A single page object is reused for all URLs
    page = await context.new_page()

    ok_count = 0
    for i, url in enumerate(RETRY_URLS, 1):
      try:
        resp = await page.goto(url, wait_until="domcontentloaded", timeout=60_000)
        status = resp.status if resp else None
        # Fixed 1.2 s pause after DOMContentLoaded before reading the page content
        await page.wait_for_timeout(1200)
        html = await page.content()

        title, text = extract_main_text(html)
        wc = len(text.split())
        domain = urlparse(url).netloc
        sid = hashlib.sha1(url.encode("utf-8")).hexdigest()[:12]

        # Too little text: keep whatever record already exists for this URL
        if wc < 250:
          print(f"SKIP {i:02d}/{len(RETRY_URLS)}  {domain}  status={status}  words={wc}")
          continue

        existing[url] = {
          "id": sid,
          "url": url,
          "domain": domain,
          "title": title,
          "text": text,
          "retrieved_at_unix": int(time.time()),
          "char_count": len(text),
          "word_count": wc,
          "tags": ["bullying", "emotion_regulation", "conflict_skills"],
          "fetch_mode": "playwright_async_render",
          "http_status": status,
        }
        ok_count += 1
        print(f"OK   {i:02d}/{len(RETRY_URLS)}  {domain}  status={status}  words={wc}")
      except Exception as e:
        print(f"ERR  {i:02d}/{len(RETRY_URLS)}  {url}  -> {e}")

    await context.close()
    await browser.close()

  # Rewrite the whole file from the merged dict (old records plus refreshed ones)
  with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for obj in existing.values():
      f.write(json.dumps(obj, ensure_ascii=False) + "\n")

  print("\nUpdated:", OUT_JSONL, "| total_sources =", len(existing), "| newly_ok =", ok_count)

# Top-level await: relies on the notebook's running event loop
await run_render_ingest()


Installing dependencies...
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://cli.github.com/packages stable InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,550 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,633 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [6,205 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,851 



SKIP 06/8  www.apa.org  status=200  words=6
OK   07/8  www.apa.org  status=200  words=482
OK   08/8  www.apa.org  status=200  words=963

Updated: /content/research_agent/sources_web.jsonl | total_sources = 8 | newly_ok = 3


In [None]:
# Create bulk source pack (CBT + assertiveness + emotional regulation + anti-bullying) and estimate total pages
import os, re, json, time, hashlib, zipfile
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

# Folder layout for the bulk pack: ZIP archives, PDFs (including unzipped content) and manifests
WORKDIR = "/content/research_agent"
OUTDIR  = os.path.join(WORKDIR, "bulk_sources")
ZDIR    = os.path.join(OUTDIR, "zips")
PDIR    = os.path.join(OUTDIR, "pdfs")
MDIR    = os.path.join(OUTDIR, "manifests")
os.makedirs(ZDIR, exist_ok=True); os.makedirs(PDIR, exist_ok=True); os.makedirs(MDIR, exist_ok=True)

# Request headers and the timeout (seconds) shared by download() and harvest_cci_assets()
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
HDRS = {"User-Agent": UA, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9"}
TIMEOUT = 60

def sha1_file(path, chunk=1024*1024):
  """Return the full SHA-1 hex digest of the file at *path*, read *chunk* bytes at a time."""
  digest = hashlib.sha1()
  with open(path, "rb") as stream:
    # iter() with a sentinel stops at the first empty read (end of file)
    for piece in iter(lambda: stream.read(chunk), b""):
      digest.update(piece)
  return digest.hexdigest()

def safe_name(s):
  """Turn *s* into a file-system friendly name of at most 180 characters.

  Surrounding whitespace is stripped and every run of characters other than
  ASCII letters, digits, ``.``, ``_`` and ``-`` becomes a single underscore.
  """
  sanitized = re.sub(r"[^a-zA-Z0-9._-]+", "_", s.strip())
  return sanitized[:180]

def download(url, dest_path, tries=3):
  """Download *url* to *dest_path* with retries; return a result dict.

  An existing destination larger than 1024 bytes is reused without any
  network access. Otherwise the body is streamed to ``dest_path + ".part"``
  and renamed on success. The response is always closed, a failed attempt
  removes its partial file, and the back-off sleep is skipped after the last
  attempt.

  Args:
    url: Address to fetch (sent with ``HDRS``; redirects followed).
    dest_path: Final file path.
    tries: Maximum number of attempts.

  Returns:
    ``{"ok": True, "skipped": bool, "path": ..., "bytes": ...}`` on success or
    ``{"ok": False, "error": <last error text>, "path": ...}`` after all
    attempts failed.
  """
  if os.path.exists(dest_path) and os.path.getsize(dest_path) > 1024:
    return {"ok": True, "skipped": True, "path": dest_path, "bytes": os.path.getsize(dest_path)}
  tmp = dest_path + ".part"
  last_err = None
  for k in range(tries):
    try:
      # Context manager releases the streamed connection even on error
      with requests.get(url, headers=HDRS, stream=True, timeout=TIMEOUT, allow_redirects=True) as r:
        r.raise_for_status()
        with open(tmp, "wb") as f:
          for chunk in r.iter_content(chunk_size=1024*256):
            if chunk: f.write(chunk)
      os.replace(tmp, dest_path)
      return {"ok": True, "skipped": False, "path": dest_path, "bytes": os.path.getsize(dest_path)}
    except Exception as e:
      # Best effort with retries: remember the error for the result dict
      last_err = str(e)
      if os.path.exists(tmp):
        try:
          os.remove(tmp)
        except OSError:
          pass
      # Linear back-off, but no pointless wait after the final attempt
      if k + 1 < tries:
        time.sleep(1.2 * (k + 1))
  return {"ok": False, "error": last_err, "path": dest_path}

def harvest_cci_assets(label, page_url):
  """Collect the ``.zip`` / ``.pdf`` links on a CCI index page.

  Only links whose host contains ``cci.health.wa.gov.au`` are kept; relative
  links are resolved against *page_url*.

  Returns:
    ``{"label", "page", "assets", "ok"}`` where ``assets`` is a sorted list of
    unique URLs. On any error ``assets`` is empty, ``ok`` is False and an
    ``error`` message is included.
  """
  try:
    resp = requests.get(page_url, headers=HDRS, timeout=TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    assets = set()
    for anchor in soup.select("a[href]"):
      href = anchor.get("href", "").strip()
      if not href:
        continue
      full = urljoin(page_url, href)
      on_cci = "cci.health.wa.gov.au" in urlparse(full).netloc
      if on_cci and full.lower().endswith((".zip", ".pdf")):
        assets.add(full)

    return {"label": label, "page": page_url, "assets": sorted(assets), "ok": True}
  except Exception as e:
    return {"label": label, "page": page_url, "assets": [], "ok": False, "error": str(e)}

# Choose “index pages” that usually expose “Download the entire workbook” ZIPs + module PDFs
# (label, index page URL) pairs; the label prefixes every file harvested from that page
CCI_PAGES = [
  ("cci_assertiveness",        "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/assertiveness"),
  ("cci_worry_rumination",     "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/worry-and-rumination"),
  ("cci_tolerating_distress",  "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/tolerating-distress"),
  ("cci_social_anxiety",       "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/social-anxiety"),
  ("cci_self_esteem",          "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/self-esteem"),
  ("cci_perfectionism",        "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/perfectionism"),
  ("cci_self_compassion",      "https://www.cci.health.wa.gov.au/resources/looking-after-yourself/self-compassion"),
  ("cci_interpersonal_clin",   "https://www.cci.health.wa.gov.au/Resources/For-Clinicians/Interpersonal-Problems"),
]

# Add extra reputable PDFs focused on bullying prevention (CDC)
# (target file name inside PDIR, download URL) pairs
DIRECT_PDFS = [
  ("cdc_understanding_bullying_2016.pdf", "https://stacks.cdc.gov/view/cdc/41572/cdc_41572_DS1.pdf"),
  ("cdc_understanding_school_violence_2016.pdf", "https://stacks.cdc.gov/view/cdc/43376/cdc_43376_DS1.pdf"),
  ("cdc_bullying_prevention_for_educators.pdf", "https://www2c.cdc.gov/podcasts/media/pdf/BullyingPrevention_Educators.pdf"),
  ("cdc_school_based_anti_bullying_interventions.pdf", "https://stacks.cdc.gov/view/cdc/168632/cdc_168632_DS4.pdf"),
  ("cdc_youth_violence_technical_package_spanish.pdf", "https://www.cdc.gov/violenceprevention/pdf/yv-technicalpackage-spanish.pdf"),
]

# Manifest skeleton: one section per pipeline stage plus a final summary
manifest = dict(harvest=[], downloads=[], unzips=[], summary={})

# Harvest assets from CCI pages, remembering which label each asset came from
all_assets = []
for label, page in CCI_PAGES:
  info = harvest_cci_assets(label, page)
  manifest["harvest"].append(info)
  for u in info.get("assets", []):
    all_assets.append((label, u))

# Download assets (ZIPs and PDFs)
def _fetch_and_record(label, url, dest):
  """Download one file, append its record to manifest["downloads"], report success."""
  res = download(url, dest)
  rec = {"label": label, "url": url, "file": dest}
  # Copy only the result fields that belong in the manifest
  rec.update((k, res[k]) for k in res if k in ("ok", "skipped", "bytes", "error"))
  succeeded = bool(res.get("ok"))
  if succeeded:
    rec["sha1"] = sha1_file(dest)[:12]
  manifest["downloads"].append(rec)
  return succeeded

ok = fail = 0

# Harvested CCI assets: ZIPs go to ZDIR, everything else to PDIR
for label, url in all_assets:
  fname = safe_name(os.path.basename(urlparse(url).path) or (label + ".bin"))
  target_dir = PDIR
  if fname.lower().endswith(".zip"):
    target_dir = ZDIR
  dest = os.path.join(target_dir, f"{label}__{fname}")
  if _fetch_and_record(label, url, dest):
    ok += 1
  else:
    fail += 1

# Download direct CDC PDFs
for fname, url in DIRECT_PDFS:
  dest = os.path.join(PDIR, fname)
  if _fetch_and_record("direct_pdf", url, dest):
    ok += 1
  else:
    fail += 1

# Unzip any ZIPs into structured folders
# Select the successfully downloaded ZIP archives, then unpack each into its own PDIR subfolder
zip_records = [
  d for d in manifest["downloads"]
  if d.get("ok") and str(d.get("file", "")).lower().endswith(".zip")
]
for d in zip_records:
  zip_path = d["file"]
  stem = os.path.splitext(os.path.basename(zip_path))[0]
  out_sub = os.path.join(PDIR, safe_name(stem))
  os.makedirs(out_sub, exist_ok=True)

  outcome = {"zip": zip_path, "to": out_sub, "ok": True}
  try:
    with zipfile.ZipFile(zip_path, "r") as z:
      z.extractall(out_sub)
  except Exception as e:
    # A broken archive is recorded, not fatal
    outcome["ok"] = False
    outcome["error"] = str(e)
  manifest["unzips"].append(outcome)

# Estimate total pages across all PDFs gathered so far
# Count pages of every readable PDF under WORKDIR; unreadable files are skipped
try:
  from pypdf import PdfReader

  pdf_paths = [
    os.path.join(root, f)
    for root, _, files in os.walk(WORKDIR)
    for f in files
    if f.lower().endswith(".pdf")
  ]

  total_pages = counted = 0
  for p in pdf_paths:
    try:
      n_pages = len(PdfReader(p).pages)
    except Exception:
      continue
    total_pages += n_pages
    counted += 1

  manifest["summary"] = dict(
    workdir=WORKDIR,
    bulk_outdir=OUTDIR,
    pdf_files_counted=counted,
    total_estimated_pages=total_pages,
    downloads_ok=ok,
    downloads_failed=fail,
  )
except Exception as e:
  manifest["summary"] = {"error": str(e), "downloads_ok": ok, "downloads_failed": fail}

# Save manifest
# Write a timestamped manifest, then echo the headline numbers
out_json = os.path.join(MDIR, f"bulk_manifest_{int(time.time())}.json")
with open(out_json, "w", encoding="utf-8") as f:
  f.write(json.dumps(manifest, indent=2, ensure_ascii=False))

summary = manifest["summary"]
print("DONE")
print("OUTDIR =", OUTDIR)
print("MANIFEST =", out_json)
print("OK =", summary.get("downloads_ok"), "| FAILED =", summary.get("downloads_failed"))
print("PDFs counted =", summary.get("pdf_files_counted"), "| Est. pages =", summary.get("total_estimated_pages"))




DONE
OUTDIR = /content/research_agent/bulk_sources
MANIFEST = /content/research_agent/bulk_sources/manifests/bulk_manifest_1765995170.json
OK = 164 | FAILED = 3
PDFs counted = 285 | Est. pages = 1890


In [None]:
# Build RAG store from PDFs + sources_web.jsonl, then save FAISS index + chunks metadata

import os, re, json, time, shutil
from pathlib import Path

import numpy as np
import faiss
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer

# Input locations: the workspace (searched for PDFs) and the web-page JSONL
WORKDIR = Path("/content/research_agent")
SOURCES_WEB = WORKDIR / "sources_web.jsonl"

# Output locations inside the RAG store folder
RAGDIR = WORKDIR / "rag_store"
INDEX_PATH = RAGDIR / "faiss.index"
CHUNKS_PATH = RAGDIR / "chunks.jsonl"
MANIFEST_PATH = RAGDIR / "manifest.json"

# When True, an existing RAGDIR is deleted before the store is rebuilt
RESET = True
# Model name handed to SentenceTransformer
EMBED_MODEL = "BAAI/bge-small-en-v1.5"

# Chunking parameters in characters (presumably passed to chunk_text; the call is not in this part of the file)
CHUNK_CHARS = 1400
OVERLAP_CHARS = 220

# NOTE(review): BATCH_SIZE is presumably the embedding batch size; its use is not visible here
BATCH_SIZE = 64
MAX_PDFS = None          # Set to an integer to limit
MAX_PAGES_PER_PDF = None # Set to an integer to limit

def clean_text(t: str) -> str:
    """Normalise extracted text; ``None`` or empty input gives ``""``.

    NUL characters become spaces, every run of spaces/tabs collapses to one
    space, runs of three or more newlines collapse to a blank line, and the
    result is stripped.
    """
    text = t if t else ""
    text = text.replace("\x00", " ")
    for pattern, repl in ((r"[ \t]+", " "), (r"\n{3,}", "\n\n")):
        text = re.sub(pattern, repl, text)
    return text.strip()

def chunk_text(t: str, chunk_chars: int, overlap_chars: int):
    t = clean_text(t)
    if len(t) < 120:
        return []
    chunks = []
    start = 0
    n = len(t)
    while start < n:
        end = min(n, start + chunk_chars)
        if end < n:
            cut = t.rfind("\n\n", start, end)
            if cut != -1 and cut > start + int(chunk_chars * 0.6):
                end = cut
        chunk = t[start:end].strip()
        if len(chunk) >= 200:
            chunks.append(chunk)
        if end >= n:
            break
        start = max(0, end - overlap_chars)
    return chunks

def iter_pdfs(root: Path):
    """Yield every ``*.pdf`` path under *root* (recursive), in sorted order.

    When the module-level ``MAX_PDFS`` is not ``None``, only the first
    ``MAX_PDFS`` entries of the sorted listing are yielded.
    """
    found = sorted(root.rglob("*.pdf"))
    selected = found if MAX_PDFS is None else found[:MAX_PDFS]
    yield from selected

def extract_pdf_pages(pdf_path: Path):
    """Yield ``(page_number, text)`` for each text-bearing page of a PDF.

    Pages are read with PyMuPDF in plain-text mode and passed through
    ``clean_text``. Pages whose cleaned text is shorter than 120 characters
    are skipped. Page numbers are 1-based. When the module-level
    ``MAX_PAGES_PER_PDF`` is not ``None``, only that many leading pages are
    read.

    The document is closed in a ``finally`` block so the handle is released
    even if a page read raises or the caller stops iterating early.
    """
    doc = fitz.open(str(pdf_path))
    try:
        page_count = doc.page_count
        if MAX_PAGES_PER_PDF is not None:
            page_count = min(page_count, MAX_PAGES_PER_PDF)
        for i in range(page_count):
            page = doc.load_page(i)
            txt = clean_text(page.get_text("text"))
            if len(txt) < 120:
                continue
            yield (i + 1, txt)
    finally:
        doc.close()

def iter_web_sources(jsonl_path: Path):
    """Yield usable web-source records from a JSONL file.

    Each non-blank line is parsed as JSON. A record is yielded (unchanged)
    only when it is a JSON object whose ``"text"`` field is a string of at
    least 200 characters after ``clean_text``. Everything else is skipped:
    blank lines, malformed JSON, non-object values, and records with a
    missing, non-string, or too-short ``"text"``.

    Yields nothing when *jsonl_path* does not exist.
    """
    if not jsonl_path.exists():
        return
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line: best-effort ingestion, keep going
                continue
            # Valid JSON that is not an object (list, string, number) has no
            # fields to read.
            if not isinstance(obj, dict):
                continue
            raw = obj.get("text", "")
            if not isinstance(raw, str):
                continue
            if len(clean_text(raw)) < 200:
                continue
            yield obj

# Start from a clean store when RESET is set, then (re)create the directory
if RESET and RAGDIR.exists():
    shutil.rmtree(RAGDIR)
RAGDIR.mkdir(parents=True, exist_ok=True)

# Load the embedder and size an inner-product index to its embedding
# dimension; the vectors added below are encoded with normalize_embeddings=True.
model = SentenceTransformer(EMBED_MODEL)
dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatIP(dim)

# Run counters reported in the manifest and the final summary
total_chunks = 0
pdf_files = 0
web_sources = 0
t0 = time.time()

def write_chunk(fp, meta: dict, text: str):
    """Append one chunk to *fp* as a single JSON line.

    The line has the shape ``{"meta": ..., "text": ...}`` and is written
    with non-ASCII characters left unescaped.
    """
    line = json.dumps({"meta": meta, "text": text}, ensure_ascii=False)
    fp.write(line + "\n")

def _flush_buffer(fp, texts: list, metas: list) -> int:
    """Embed the buffered chunks, index them, and append them to *fp*.

    Embeddings are added to the FAISS index in the same order the records
    are written, so index row ``i`` corresponds to line ``i`` of the chunks
    file. Both buffers are emptied in place.

    Returns the number of chunks flushed (0 when the buffer is empty).
    """
    if not texts:
        return 0
    emb = model.encode(texts, batch_size=BATCH_SIZE, normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    index.add(emb)
    for m, tx in zip(metas, texts):
        write_chunk(fp, m, tx)
    flushed = len(texts)
    texts.clear()
    metas.clear()
    return flushed

# Stream every source through chunking -> embedding -> index, writing the
# chunk records to CHUNKS_PATH as batches are flushed.
with open(CHUNKS_PATH, "w", encoding="utf-8") as out:
    buffer_texts = []
    buffer_metas = []

    # Ingest web sources
    for obj in iter_web_sources(SOURCES_WEB):
        web_sources += 1
        url = obj.get("url", "")
        title = obj.get("title", "")
        domain = obj.get("domain", "")
        chunks = chunk_text(obj.get("text", ""), CHUNK_CHARS, OVERLAP_CHARS)
        for ci, ch in enumerate(chunks, 1):
            meta = {
                "source_type": "web",
                "url": url,
                "domain": domain,
                "title": title,
                "chunk_index": ci,
            }
            buffer_texts.append(ch)
            buffer_metas.append(meta)

            if len(buffer_texts) >= BATCH_SIZE:
                total_chunks += _flush_buffer(out, buffer_texts, buffer_metas)

    # Ingest PDFs
    for pdf_path in iter_pdfs(WORKDIR):
        pdf_files += 1
        rel = str(pdf_path.relative_to(WORKDIR))
        try:
            for page_num, page_text in extract_pdf_pages(pdf_path):
                chunks = chunk_text(page_text, CHUNK_CHARS, OVERLAP_CHARS)
                for ci, ch in enumerate(chunks, 1):
                    meta = {
                        "source_type": "pdf",
                        "file": rel,
                        "page": page_num,
                        "chunk_index": ci,
                    }
                    buffer_texts.append(ch)
                    buffer_metas.append(meta)

                    if len(buffer_texts) >= BATCH_SIZE:
                        total_chunks += _flush_buffer(out, buffer_texts, buffer_metas)
        except Exception as e:
            # Best-effort: an unreadable PDF is reported and skipped; chunks
            # already buffered from its earlier pages are kept.
            print("SKIP PDF (read error):", rel, "->", e)

    # Flush remaining
    total_chunks += _flush_buffer(out, buffer_texts, buffer_metas)

# Persist the index; its rows were added in the same order as the lines
# written to chunks.jsonl.
faiss.write_index(index, str(INDEX_PATH))

# Record build parameters and run statistics alongside the store
manifest = {
    "created_at_unix": int(time.time()),
    "workdir": str(WORKDIR),
    "ragdir": str(RAGDIR),
    "embed_model": EMBED_MODEL,
    "faiss_index": str(INDEX_PATH),
    "chunks_jsonl": str(CHUNKS_PATH),
    "embedding_dim": dim,
    "chunk_chars": CHUNK_CHARS,
    "overlap_chars": OVERLAP_CHARS,
    "batch_size": BATCH_SIZE,
    "pdf_files_seen": pdf_files,
    "web_sources_seen": web_sources,
    "total_chunks": total_chunks,
    "elapsed_sec": round(time.time() - t0, 2),
}
with open(MANIFEST_PATH, "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

# Print a short run summary and the saved artifact paths
print("DONE")
print("PDF files seen =", pdf_files, "| Web sources seen =", web_sources)
print("Total chunks =", total_chunks, "| Dim =", dim)
print("Saved:", INDEX_PATH)
print("Saved:", CHUNKS_PATH)
print("Saved:", MANIFEST_PATH)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

DONE
PDF files seen = 286 | Web sources seen = 7
Total chunks = 4018 | Dim = 384
Saved: /content/research_agent/rag_store/faiss.index
Saved: /content/research_agent/rag_store/chunks.jsonl
Saved: /content/research_agent/rag_store/manifest.json


In [None]:
# Load RAG store paths
import os, json, re, textwrap
import numpy as np
import torch
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Locations of the artifacts written by the RAG build cell (plain strings here)
WORKDIR = "/content/research_agent"
RAGDIR = f"{WORKDIR}/rag_store"
INDEX_PATH = f"{RAGDIR}/faiss.index"
CHUNKS_PATH = f"{RAGDIR}/chunks.jsonl"

# Select runtime device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
# Inference only: disable autograd globally
torch.set_grad_enabled(False)

# Load embedder used for FAISS
# Must be the same model the index was built with, so query and chunk
# vectors live in the same space.
EMBED_MODEL = "BAAI/bge-small-en-v1.5"
embedder = SentenceTransformer(EMBED_MODEL)

# Load FAISS index and chunks
index = faiss.read_index(INDEX_PATH)

# chunks[i] is the i-th non-blank line of chunks.jsonl; retrieve() uses the
# FAISS row id as an index into this list.
chunks = []
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
  for line in f:
    if line.strip():
      chunks.append(json.loads(line))

# Load a local instruction model
# NOTE: `model` now names the LLM; the RAG build cell used `model` for the embedder.
# NOTE(review): the captured cell output logs that `torch_dtype` is deprecated
# in favor of `dtype` for the installed transformers version.
LLM_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
  LLM_NAME,
  torch_dtype=DTYPE,
  device_map="auto",
  low_cpu_mem_usage=True
)
model.eval()

# Define citation formatting
def format_citation(meta: dict) -> str:
  """Render a chunk's metadata as an inline citation tag.

  PDF chunks become ``[pdf:<file>#p<page>]`` and web chunks become
  ``[web:<domain>]``; missing fields fall back to ``?`` (PDF) or ``web``
  (domain). Any other source type yields ``[source]``.
  """
  kind = meta.get("source_type")
  if kind == "pdf":
    file_name = meta.get("file", "?")
    page = meta.get("page", "?")
    return f"[pdf:{file_name}#p{page}]"
  if kind == "web":
    return f"[web:{meta.get('domain', 'web')}]"
  return "[source]"

# Define retrieval
def retrieve(query: str, k: int = 6):
  """Return up to *k* nearest chunks for *query*.

  The query is embedded with the module-level ``embedder`` (normalized) and
  searched against ``index``. Each hit is a dict with ``score``, ``meta`` and
  ``text``. Row ids outside ``chunks`` (including the negative ids FAISS can
  return) are dropped. A blank or ``None`` query returns ``[]`` without
  touching the embedder.
  """
  text = (query or "").strip()
  if text == "":
    return []
  query_vec = embedder.encode([text], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
  scores, row_ids = index.search(query_vec, k)
  total = len(chunks)
  return [
    {"score": float(score), "meta": chunks[row]["meta"], "text": chunks[row]["text"]}
    for score, row in zip(scores[0].tolist(), row_ids[0].tolist())
    if 0 <= row < total
  ]

# Define scope gate
# Lowercase substrings that is_in_scope() looks for in the lowercased question.
# Stems such as "negotiat" and "intimidat" match several word forms.
SCOPE_KEYWORDS = [
  "bully", "bullying", "harass", "harassment", "tease", "teasing", "intimidat", "threat",
  "anger", "anxiety", "worry", "stress", "emotion", "regulation", "cbt", "thought", "rumination",
  "conflict", "resolution", "negotiat", "assert", "assertive", "boundary", "boundaries", "de-escalat"
]

# Substring triggers for the crisis gate (matched against the lowercased question)
_CRISIS_PHRASES = (
  "suicid", "kill myself", "self-harm", "self harm", "cut myself", "end my life",
  "domestic violence", "partner hits", "abusive partner", "sexual assault",
)
# Whole-word match: as a bare substring "rape" also fires on "therapeutic",
# "grape", "scrape", which would block ordinary in-scope questions.
_CRISIS_WORD_RE = re.compile(r"\brap(?:e[ds]?|ing|ists?)\b")

def is_sensitive_crisis(q: str) -> bool:
  """Return True when the question mentions a crisis topic.

  Matching is case-insensitive. Multi-word phrases and stems are matched as
  substrings; the rape term is matched as a whole word (rape, raped, rapes,
  raping, rapist, rapists). ``None`` or empty input returns False.
  """
  ql = (q or "").lower()
  if any(phrase in ql for phrase in _CRISIS_PHRASES):
    return True
  return _CRISIS_WORD_RE.search(ql) is not None

def is_in_scope(q: str, hits: list) -> bool:
  """Decide whether a question should be answered.

  A question is in scope when the best retrieval score is at least 0.27, or
  when it contains any ``SCOPE_KEYWORDS`` substring and the best score is at
  least 0.18. With no hits the best score is taken as 0.0.
  """
  lowered = (q or "").lower()
  has_keyword = any(word in lowered for word in SCOPE_KEYWORDS)
  top_score = max((hit["score"] for hit in hits), default=0.0)
  if top_score >= 0.27:
    return True
  return has_keyword and top_score >= 0.18

# Define prompt builder
def build_prompt(question: str, hits: list) -> str:
  """Assemble the grounded prompt sent to the LLM.

  Each hit is rendered as its citation tag and score on one line, followed
  by the chunk text flattened to a single line and shortened to 900
  characters. The rendered hits are placed under ``Sources:`` in a fixed
  instruction template together with the question.
  """
  def _render(hit: dict) -> str:
    tag = format_citation(hit["meta"])
    flat = hit["text"].replace("\n", " ")
    body = textwrap.shorten(flat, width=900, placeholder="…")
    return f"{tag} score={hit['score']:.3f}\n{body}"

  sources_block = "\n\n".join([_render(hit) for hit in hits])

  prompt = f"""
You are a scope-limited assistant.
Allowed topics: bullying prevention/response skills, CBT-based emotion regulation (anger/anxiety/stress), conflict resolution, negotiation, assertiveness, boundaries, de-escalation.
Rule: answer ONLY using the provided sources. If sources are insufficient, say you do not have enough information and ask for a source.
Rule: if the question is out of scope (e.g., cooking), refuse politely and say it is out of scope.
Rule: keep advice practical, non-graphic, and appropriate for teens; encourage trusted adults/professionals when needed.

Question:
{question}

Sources:
{sources_block}

Write the answer with:
- Clear steps or bullet points
- Citations inline after claims, using the provided [pdf:...#p..] or [web:domain] tags
"""
  return prompt.strip()

# Define text generation
def generate_text(prompt: str, max_new_tokens: int = 450) -> str:
  """Generate the model's reply to *prompt* and return only the new text.

  The prompt is wrapped in a system + user chat and tokenized with the
  tokenizer's chat template; if that fails, the raw prompt is tokenized
  instead. Sampling uses temperature 0.3, top-p 0.9 and a 1.08 repetition
  penalty.

  Only the tokens produced after the prompt are decoded, so the returned
  string does not echo the instructions or sources. An explicit attention
  mask and pad token id are passed to ``generate`` so it does not have to
  infer them (the captured notebook output shows the warning raised when
  they are omitted).
  """
  messages = [
    {"role": "system", "content": "Follow the rules exactly."},
    {"role": "user", "content": prompt},
  ]
  try:
    inp = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
  except Exception:
    # Fall back to plain tokenization when the chat template cannot be applied
    inp = tokenizer(prompt, return_tensors="pt").input_ids
  inp = inp.to(model.device)

  out = model.generate(
    input_ids=inp,
    # Single unpadded sequence: every position is a real token
    attention_mask=torch.ones_like(inp),
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.08,
    pad_token_id=tokenizer.eos_token_id,
  )
  # Drop the prompt tokens; keep only the completion
  completion = out[0][inp.shape[-1]:]
  return tokenizer.decode(completion, skip_special_tokens=True).strip()

# Define single entrypoint
def ask(question: str, k: int = 6) -> str:
  """Answer one question end to end.

  Order of checks: blank input, crisis gate (fixed safety message), retrieval
  of *k* chunks, scope gate (fixed refusal), then prompt building and
  generation.
  """
  q = (question or "").strip()
  if q == "":
    return "Ask a question."

  if is_sensitive_crisis(q):
    crisis_reply = (
      "If you or someone else is in immediate danger or might self-harm, contact local emergency services "
      "or a trusted adult right now. I can still help with general, non-urgent coping and communication steps, "
      "but I can’t replace professional or emergency support."
    )
    return crisis_reply

  hits = retrieve(q, k=k)
  if is_in_scope(q, hits):
    return generate_text(build_prompt(q, hits))
  return "Out of scope for this agent. Ask about bullying, CBT emotion regulation, conflict resolution, or negotiation."

print(f"READY | device={DEVICE} | chunks={len(chunks)} | llm={LLM_NAME}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

READY | device=cpu | chunks=4018 | llm=Qwen/Qwen2.5-1.5B-Instruct


In [None]:
# Define 10 persona-specific questions, run the agent, cap answer to 10 lines, print top 3 sources used

import re

# Each entry is (question id, persona description, question text); the driver
# loop below joins persona and question into a single prompt.
QUESTIONS = [
  ("Q1",  "Adult woman, 34, school counselor",
   "Define bullying vs a one-time conflict, and list 4 signs that a situation is bullying (not just disagreement)."),
  ("Q2",  "Teen boy, 14, student",
   "I’m being bullied at school. Give me 5 concrete things I can do this week to reduce risk and get support."),
  ("Q3",  "Teen girl, 16, student",
   "How do I ask a teacher or school staff for help in a way that is clear and likely to be taken seriously? Give a short script."),
  ("Q4",  "Teen boy, 15, student",
   "Give me a 2–3 sentence response to teasing that sets a boundary without escalating, plus what to do right after."),
  ("Q5",  "Adult man, 28, workplace employee",
   "A coworker repeatedly makes 'jokes' at my expense. How do I set a boundary assertively and document it without escalating?"),
  ("Q6",  "Teen girl, 17, student",
   "When I get angry I explode. Give me a CBT-based 3-step routine I can do in under 5 minutes to calm down and respond better."),
  ("Q7",  "Adult woman, 41, parent",
   "My child (10) gets anxious before school. Give 6 practical steps I can coach them to use in the morning."),
  ("Q8",  "Adult man, 40",
   "I ruminate for hours after conflicts. Give a CBT-style plan to interrupt rumination and refocus in the moment."),
  ("Q9",  "Teen boy, 13, student",
   "During an argument, things escalate fast. Give me 5 de-escalation techniques I can use safely and respectfully."),
  ("Q10", "Adult woman, 52, manager",
   "I need a 'win-win' negotiation approach to resolve a repeated conflict with a colleague. Provide a step-by-step structure and a script opener.")
]

def _unique_top_sources(hits, n=3):
  seen, out = set(), []
  for h in hits:
    c = format_citation(h["meta"])
    if c not in seen:
      out.append(c); seen.add(c)
    if len(out) >= n:
      break
  return out

def _cap_lines(txt, max_lines=10):
  lines = [l.rstrip() for l in (txt or "").splitlines() if l.strip() != ""]
  lines = lines[:max_lines]
  return "\n".join(lines).strip()

def ask_with_sources(question, k=7, max_new_tokens=260):
  """Answer a question and report the top sources behind the answer.

  Returns a ``(answer, sources)`` tuple: the answer capped to 10 non-blank
  lines, and up to 3 distinct citation tags from the retrieved chunks.
  Blank input returns ``("", [])``. Crisis and out-of-scope questions return
  a fixed message with an empty source list.

  The crisis gate mirrors ``ask()`` so this entry point cannot produce a
  generated answer for a crisis question. ``generate`` receives an explicit
  attention mask and pad token id, which avoids the "attention mask is not
  set" warning seen in the captured output.
  """
  q = (question or "").strip()
  if not q:
    return ("", [])
  if is_sensitive_crisis(q):
    return ("If you or someone else is in immediate danger or might self-harm, contact local emergency services "
            "or a trusted adult right now. I can still help with general, non-urgent coping and communication steps, "
            "but I can’t replace professional or emergency support.", [])
  hits = retrieve(q, k=k)
  if not is_in_scope(q, hits):
    return ("Out of scope for this agent. Ask about bullying, CBT emotion regulation, conflict resolution, or negotiation.", [])
  prompt = build_prompt(
    question=q + "\n\nConstraints: Answer in at most 10 lines. Be practical. Keep it appropriate for teens.",
    hits=hits
  )
  # Generate only the completion (avoid echoing prompt)
  messages = [{"role":"system","content":"Follow the rules exactly."},{"role":"user","content":prompt}]
  try:
    inp = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
  except Exception:
    # Fall back to plain tokenization when the chat template cannot be applied
    inp = tokenizer(prompt, return_tensors="pt").input_ids
  inp = inp.to(model.device)
  out = model.generate(
    input_ids=inp,
    # Single unpadded sequence: every position is a real token
    attention_mask=torch.ones_like(inp),
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.08,
    pad_token_id=tokenizer.eos_token_id,
  )
  gen = out[0][inp.shape[-1]:]
  txt = tokenizer.decode(gen, skip_special_tokens=True).strip()
  return (_cap_lines(txt, 10), _unique_top_sources(hits, 3))

# Run every persona question through the agent and print the capped answer
# followed by the distinct sources that were retrieved for it.
for qid, persona, q in QUESTIONS:
  # Persona is prepended so it influences both retrieval and generation
  full_q = f"Persona: {persona}\nQuestion: {q}"
  ans, srcs = ask_with_sources(full_q, k=7, max_new_tokens=260)
  print(f"\n===== {qid} =====")
  print(full_q)
  print("\nAnswer (max 10 lines):")
  print(ans if ans else "(no answer)")
  print("\nTop 3 sources used:")
  if srcs:
    for s in srcs: print("-", s)
  else:
    print("- (none)")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



===== Q1 =====
Persona: Adult woman, 34, school counselor
Question: Define bullying vs a one-time conflict, and list 4 signs that a situation is bullying (not just disagreement).

Answer (max 10 lines):
**Definition of Bullying vs. One-Time Conflict**
Bullying, including cyberbullying, involves repeated actions intended to cause harm in relationships where there is a real or perceived power imbalance. It can be verbal (e.g., purposeful humiliation, teasing, threatening), physical (e.g., hitting, kicking, etc.), relational (e.g., spreading rumors, excluding from group activities), or through technology (e.g., sending mean messages, posting hurtful comments online).
**Signs of Bullying**
1. **Repetitive Behavior**: The bullying occurs repeatedly over time, often daily or almost daily.
2. **Power Imbalance**: There is a significant age, gender, or status difference between the bully and the victim.
3. **Harmful Intent**: The actions are meant to cause physical, psychological, social, or 

In [None]:
# Install packages
!pip -q install -U gradio sentence-transformers faiss-cpu pymupdf transformers accelerate python-pptx

# Import
import os, json, time, re, textwrap
from pathlib import Path

import numpy as np
import torch
import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from pptx import Presentation
from pptx.util import Inches, Pt

# Define paths
# Locations of the artifacts written by the RAG build cell
WORKDIR = Path("/content/research_agent")
RAGDIR  = WORKDIR / "rag_store"
INDEX_PATH  = RAGDIR / "faiss.index"
CHUNKS_PATH = RAGDIR / "chunks.jsonl"

# Destination for the generated Markdown report and slide deck
OUTDIR = WORKDIR / "exports"
OUTDIR.mkdir(parents=True, exist_ok=True)

# Load embedder + FAISS + chunks
# The embedder must be the same model the index was built with.
EMBED_MODEL = "BAAI/bge-small-en-v1.5"
embedder = SentenceTransformer(EMBED_MODEL)
index = faiss.read_index(str(INDEX_PATH))

# chunks[i] is the i-th non-blank line of chunks.jsonl; retrieve() uses the
# FAISS row id as an index into this list.
chunks = []
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
  for line in f:
    line = line.strip()
    if line:
      chunks.append(json.loads(line))

# Load local instruction model
LLM_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU
DTYPE  = torch.float16 if DEVICE == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(LLM_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(LLM_NAME, device_map="auto", torch_dtype=DTYPE, low_cpu_mem_usage=True)
model.eval()
# Inference only: disable autograd globally
torch.set_grad_enabled(False)

# Define citation formatting
def format_citation(meta: dict) -> str:
  """Turn chunk metadata into a bracketed citation tag.

  ``pdf`` sources render as ``[pdf:<file>#p<page>]`` (``?`` for missing
  fields), ``web`` sources as ``[web:<domain>]`` (``web`` when the domain is
  missing), and anything else as ``[source]``.
  """
  source_type = meta.get("source_type")
  if source_type == "web":
    domain = meta.get("domain", "web")
    return f"[web:{domain}]"
  if source_type != "pdf":
    return "[source]"
  return f"[pdf:{meta.get('file', '?')}#p{meta.get('page', '?')}]"

# Define retrieval
def retrieve(query: str, k: int = 7):
  """Return up to *k* nearest chunks for *query*.

  The query is embedded with the module-level ``embedder`` (normalized) and
  searched against ``index``. Each hit is a dict with ``score``, ``meta`` and
  ``text``; row ids that fall outside ``chunks`` are dropped. A blank or
  ``None`` query returns ``[]``.

  *k* is coerced to an int of at least 1: this function is wired to a UI
  slider that may deliver a float, and the FAISS search needs a positive
  integer.
  """
  q = (query or "").strip()
  if not q:
    return []
  k = max(1, int(k))
  qv = embedder.encode([q], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
  D, I = index.search(qv, k)
  hits = []
  for score, idx in zip(D[0].tolist(), I[0].tolist()):
    if idx < 0 or idx >= len(chunks):
      continue
    hits.append({"score": float(score), "meta": chunks[idx]["meta"], "text": chunks[idx]["text"]})
  return hits

# Define scope gate
# Lowercase substrings searched for in the lowercased question; stems such as
# "negotiat" match several word forms.
SCOPE_KEYWORDS = [
  "bully", "bullying", "harass", "harassment", "tease", "teasing", "intimidat", "threat",
  "anger", "anxiety", "worry", "stress", "emotion", "regulation", "cbt", "thought", "rumination",
  "conflict", "resolution", "negotiat", "assert", "assertive", "boundary", "boundaries", "de-escalat",
]

def is_in_scope(q: str, hits: list) -> bool:
  """Decide whether a question should be answered.

  In scope when the best retrieval score is at least 0.27, or when the
  question contains a scope keyword and the best score is at least 0.18.
  With no hits the best score is taken as 0.0.
  """
  text = (q or "").lower()
  top_score = max((hit["score"] for hit in hits), default=0.0)
  keyword_found = False
  for keyword in SCOPE_KEYWORDS:
    if keyword in text:
      keyword_found = True
      break
  if keyword_found and top_score >= 0.18:
    return True
  return top_score >= 0.27

# Define sensitive safety gate (keep brief)
# Substring triggers for the crisis gate (matched against the lowercased
# question). Kept in line with the fuller list used by the earlier agent cell.
_CRISIS_PHRASES = (
  "suicid", "self-harm", "self harm", "kill myself", "cut myself", "end my life",
  "domestic violence", "partner hits", "abusive partner", "sexual assault",
)
# Whole-word match so words like "therapeutic" or "grape" do not trip the gate
_CRISIS_WORD_RE = re.compile(r"\brap(?:e[ds]?|ing|ists?)\b")

def is_sensitive_crisis(q: str) -> bool:
  """Return True when the question mentions a crisis topic.

  Matching is case-insensitive. Phrases and stems are matched as substrings;
  the rape term is matched as a whole word (rape, raped, rapes, raping,
  rapist, rapists). ``None`` or empty input returns False.
  """
  ql = (q or "").lower()
  if any(phrase in ql for phrase in _CRISIS_PHRASES):
    return True
  return _CRISIS_WORD_RE.search(ql) is not None

# Define prompt builder
def build_prompt(question: str, hits: list) -> str:
  """Assemble the grounded prompt sent to the LLM.

  Every hit contributes a block made of its citation tag and score, then its
  text flattened to one line and shortened to 900 characters. The blocks are
  inserted under ``Sources:`` in a fixed instruction template together with
  the question.
  """
  blocks = []
  for hit in hits:
    tag = format_citation(hit["meta"])
    one_line = hit["text"].replace("\n", " ")
    excerpt = textwrap.shorten(one_line, width=900, placeholder="…")
    blocks.append(f"{tag} score={hit['score']:.3f}\n{excerpt}")
  sources_block = "\n\n".join(blocks)

  template = f"""
You are a scope-limited assistant.

Allowed topics:
- Bullying prevention/response skills (including cyberbullying)
- CBT-based emotion regulation (anger/anxiety/stress/worry/rumination)
- Conflict resolution, negotiation, assertiveness, boundaries, de-escalation

Rules:
- Answer ONLY using the provided sources
- If sources are insufficient, say you do not have enough information and ask for a source to add
- If question is out of scope, refuse politely and say it is out of scope
- Keep advice practical, non-graphic, and appropriate for teens
- Write at most 10 lines

Question:
{question}

Sources:
{sources_block}

Write the answer as bullets or numbered steps, with inline citations after key claims using the provided [pdf:...#p..] or [web:domain] tags.
"""
  return template.strip()

# Define generation (pass attention_mask to avoid warning)
def generate_from_prompt(prompt: str, max_new_tokens: int = 260) -> str:
  """Generate the model's reply to *prompt* and return only the new text.

  The prompt is sent as a system + user chat through the tokenizer's chat
  template, falling back to plain tokenization if that raises. An all-ones
  attention mask and the EOS id as pad id are passed explicitly. Tokens
  belonging to the prompt are sliced off before decoding.
  """
  chat = [
    {"role": "system", "content": "Follow the rules exactly."},
    {"role": "user", "content": prompt},
  ]
  try:
    prompt_ids = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True)
  except Exception:
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
  prompt_ids = prompt_ids.to(model.device)

  # Single unpadded sequence, so every position is attended to
  mask = torch.ones_like(prompt_ids)
  sampling = dict(
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.08,
    pad_token_id=tokenizer.eos_token_id,
  )
  sequences = model.generate(input_ids=prompt_ids, attention_mask=mask, **sampling)

  prompt_len = prompt_ids.shape[-1]
  new_tokens = sequences[0][prompt_len:]
  return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Define helpers
def top_sources(hits, n=3):
  """Return the first *n* distinct citation tags, preserving hit order.

  The length check happens after each hit, so a non-empty *hits* always
  yields at least one tag.
  """
  tags = []
  for hit in hits:
    tag = format_citation(hit["meta"])
    if tag not in tags:
      tags.append(tag)
    if len(tags) >= n:
      break
  return tags

def cap_lines(txt, max_lines=10):
  """Strip each line, drop blank ones, and keep the first *max_lines*.

  ``None`` is treated as empty text.
  """
  kept = []
  for raw in (txt or "").splitlines():
    stripped = raw.strip()
    if stripped:
      kept.append(stripped)
  return "\n".join(kept[:max_lines])

# Define ask()
def ask(question: str, k: int = 7):
  """Answer one question and return ``(answer, sources)`` as two strings.

  Order of checks: blank input, crisis gate (fixed safety message), retrieval
  of *k* chunks, scope gate (fixed refusal), then generation. The answer is
  capped to 10 lines; sources are up to 3 distinct citation tags joined by
  newlines. Every early exit returns an empty sources string.
  """
  q = (question or "").strip()
  if q == "":
    return ("Ask a question.", "")

  if is_sensitive_crisis(q):
    crisis_reply = (
      "If someone might be in immediate danger, contact local emergency services or a trusted adult/professional now. "
      "I can help with general coping and communication steps within this agent’s scope."
    )
    return (crisis_reply, "")

  hits = retrieve(q, k=k)
  if is_in_scope(q, hits):
    raw_answer = generate_from_prompt(build_prompt(q, hits))
    answer = cap_lines(raw_answer, 10)
    sources = "\n".join(top_sources(hits, 3))
    return (answer, sources)

  return ("Out of scope for this agent. Ask about bullying, CBT emotion regulation, conflict resolution, negotiation, assertiveness, or boundaries.", "")

# Define 10 showcase questions
# Each entry is (question id, full prompt); the prompt already embeds the
# persona line followed by the question line.
SHOWCASE_QS = [
  ("Q1",  "Persona: Adult woman, 34, school counselor\nQuestion: Define bullying vs a one-time conflict, and list 4 signs that a situation is bullying (not just disagreement)."),
  ("Q2",  "Persona: Teen boy, 14, student\nQuestion: I’m being bullied at school. Give 5 concrete things I can do this week to reduce risk and get support."),
  ("Q3",  "Persona: Teen girl, 16, student\nQuestion: How do I ask a teacher or school staff for help in a way that is clear and likely to be taken seriously? Give a short script."),
  ("Q4",  "Persona: Teen boy, 15, student\nQuestion: Give me a 2–3 sentence response to teasing that sets a boundary without escalating, plus what to do right after."),
  ("Q5",  "Persona: Adult man, 28, workplace employee\nQuestion: A coworker repeatedly makes 'jokes' at my expense. How do I set a boundary assertively and document it without escalating?"),
  ("Q6",  "Persona: Teen girl, 17, student\nQuestion: When I get angry I explode. Give me a CBT-based 3-step routine I can do in under 5 minutes to calm down and respond better."),
  ("Q7",  "Persona: Adult woman, 41, parent\nQuestion: My child (10) gets anxious before school. Give 6 practical steps I can coach them to use in the morning."),
  ("Q8",  "Persona: Adult man, 40\nQuestion: I ruminate for hours after conflicts. Give a CBT-style plan to interrupt rumination and refocus in the moment."),
  ("Q9",  "Persona: Teen boy, 13, student\nQuestion: During an argument, things escalate fast. Give 5 de-escalation techniques I can use safely and respectfully."),
  ("Q10", "Persona: Adult woman, 52, manager\nQuestion: I need a 'win-win' negotiation approach to resolve a repeated conflict with a colleague. Provide a step-by-step structure and a script opener."),
]

# Define export report
def export_showcase():
  """Run every showcase question and export the results.

  Writes a Markdown report (``showcase_report.md``) and a slide deck
  (``showcase_slides.pptx``) into ``OUTDIR``. The deck has one title slide
  plus one slide per question, holding up to 10 answer lines and a final
  sources line.

  Returns the two output paths as strings: ``(markdown_path, pptx_path)``.
  """
  md_path = OUTDIR / "showcase_report.md"
  pptx_path = OUTDIR / "showcase_slides.pptx"

  report = [
    "# Bullying + CBT + Conflict Skills Agent — Showcase Pack",
    f"- Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}",
    f"- Model: {LLM_NAME}",
    "",
  ]

  deck = Presentation()
  cover = deck.slides.add_slide(deck.slide_layouts[0])
  cover.shapes.title.text = "Bullying + CBT + Conflict Skills Agent"
  cover.placeholders[1].text = "Showcase Pack (10 Q&A)\nCited answers from indexed sources"

  for qid, question in SHOWCASE_QS:
    answer, sources = ask(question, k=7)

    # Markdown section for this question
    report.extend([
      f"## {qid}",
      question,
      "",
      "**Answer (≤10 lines)**",
      answer,
      "",
      "**Top 3 sources**",
      sources if sources else "(none)",
      "\n---\n",
    ])

    # Matching slide: title = question id, body = answer lines
    slide = deck.slides.add_slide(deck.slide_layouts[1])
    slide.shapes.title.text = qid
    frame = slide.shapes.placeholders[1].text_frame
    frame.clear()
    bullets = answer.splitlines()[:10] if answer else ["(no answer)"]
    for bullet in bullets:
      # While the frame holds no text yet, fill its existing first paragraph
      # instead of appending a new one.
      para = frame.add_paragraph() if frame.text else frame.paragraphs[0]
      para.text = bullet
      para.level = 0
      para.font.size = Pt(16)
    if sources:
      footer = frame.add_paragraph()
      footer.text = "Sources: " + ", ".join(sources.splitlines())
      footer.level = 0
      footer.font.size = Pt(12)

  md_path.write_text("\n".join(report), encoding="utf-8")
  deck.save(str(pptx_path))
  return (str(md_path), str(pptx_path))

# Build the export pack first; this runs the LLM on every showcase question
# before the UI starts.
md_file, pptx_file = export_showcase()
print("READY")
print("Exports:")
print("-", md_file)
print("-", pptx_file)

# Launch UI
with gr.Blocks() as app:
  gr.Markdown("## Bullying + CBT + Conflict Skills Agent\nAsk a question (in scope). The answer is capped to 10 lines and shows top sources.")
  q_in = gr.Textbox(lines=4, label="Question (include persona: age + role)")
  k_in = gr.Slider(3, 12, value=7, step=1, label="Top-K retrieval")
  ans_out = gr.Textbox(lines=10, label="Answer (≤10 lines)")
  src_out = gr.Textbox(lines=4, label="Top 3 sources")
  run_btn = gr.Button("Ask")
  # ask() returns (answer, sources), mapped onto the two output boxes in order
  run_btn.click(fn=ask, inputs=[q_in, k_in], outputs=[ans_out, src_out])

  gr.Markdown("### Exports")
  # The export paths are shown as text only; no download component is created here
  gr.Markdown(f"- Markdown report: `{md_file}`\n- Slides: `{pptx_file}`")

# share=True requests a public share URL (the captured output shows a gradio.live link)
app.launch(share=True)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.0/23.0 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h



READY
Exports:
- /content/research_agent/exports/showcase_report.md
- /content/research_agent/exports/showcase_slides.pptx
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://703883778e1d6a04d9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


