In [0]:
%sql
-- NOTE(review): nothing in this cell creates the `sinesp` catalog; it
-- presumably has to exist before these statements run -- confirm.

-- 1) Medallion-architecture schemas
CREATE SCHEMA IF NOT EXISTS sinesp.bronze;
CREATE SCHEMA IF NOT EXISTS sinesp.silver;
CREATE SCHEMA IF NOT EXISTS sinesp.gold;
CREATE SCHEMA IF NOT EXISTS sinesp.source;  -- holds volumes / source artifacts

-- 2) Volume for "landing/raw" (files downloaded from the web, binaries)
CREATE VOLUME IF NOT EXISTS sinesp.source.landing
COMMENT 'Landing/RAW files do SINESP (arquivos originais da web)';

-- 3) (optional) Auxiliary volumes
CREATE VOLUME IF NOT EXISTS sinesp.source.tmp
COMMENT 'Área temporária para staging/conversões';

In [0]:
import csv
import datetime
import hashlib
import io
import os
import time
from urllib.parse import urlsplit

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ===== CONFIG =====
# Root folder for the downloaded files (adjust if needed).
BASE_DIR = "/Volumes/sinesp/source/landing/sinesp"
# CKAN dataset slug.
DATASET_NAME = "sistema-nacional-de-estatisticas-de-seguranca-publica"
# CKAN "package_show" endpoint queried with the slug above.
CKAN_BASE = "https://dados.mj.gov.br/api/3/action/package_show"
# Acquisition manifest, stored directly under BASE_DIR.
MANIFEST_CSV = os.path.join(BASE_DIR, "manifest.csv")

# (optional) Fallback download URLs, used when the API does not yield a match.
FALLBACK = dict(
    municipios="https://dados.mj.gov.br/dataset/210b9ae2-21fc-4986-89c6-2006eb4db247/resource/03af7ce2-174e-4ebd-b085-384503cfb40f/download/indicadoressegurancapublicamunic.xlsx",
    uf="https://dados.mj.gov.br/dataset/210b9ae2-21fc-4986-89c6-2006eb4db247/resource/feeae05e-faba-406c-8a4a-512aec91a9d1/download/indicadoressegurancapublicauf.xlsx",
)

# ===== Helpers =====
def session_with_retry(total=5, backoff=0.5):
    """Build a ``requests.Session`` that retries requests on both schemes.

    Args:
        total: Forwarded to ``Retry(total=...)``.
        backoff: Forwarded to ``Retry(backoff_factor=...)``.

    Returns:
        A new session with an ``HTTPAdapter`` mounted for ``https://`` and
        ``http://``; both adapters share one ``Retry`` policy whose
        ``status_forcelist`` is 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=total,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def sha256_bytes(b: bytes) -> str:
    """Return the SHA-256 digest of ``b`` as a hexadecimal string."""
    return hashlib.sha256(b).hexdigest()

def ensure_dir(p: str) -> None:
    """Create directory ``p`` and any missing parents; do nothing if it exists.

    Args:
        p: Directory path. An empty string is accepted and ignored, because
            ``os.path.dirname`` returns ``""`` for a bare file name and
            ``os.makedirs("")`` would raise ``FileNotFoundError``.

    Raises:
        OSError: If the directory cannot be created (e.g. a path component is
            an existing regular file, or permissions are missing).
    """
    if not p:
        # "" means "the current directory", which already exists.
        return
    os.makedirs(p, exist_ok=True)

def manifest_write_row(row: dict):
    """Append one acquisition record to the CSV manifest at ``MANIFEST_CSV``.

    The parent directory is created if needed. The header row is written when
    the manifest does not exist yet or exists but is empty.

    Args:
        row: Mapping keyed by the manifest column names listed below. Missing
            keys are written as empty cells.

    Raises:
        ValueError: If ``row`` contains a key that is not a manifest column
            (``csv.DictWriter`` default behaviour).
        OSError: If the manifest cannot be opened or written.
    """
    fieldnames = [
        "acquired_date", "scope", "file_name", "url", "sha256",
        "size_bytes", "last_modified", "etag", "landing_path",
    ]
    ensure_dir(os.path.dirname(MANIFEST_CSV))

    # An existing zero-byte manifest (e.g. left behind by an interrupted run)
    # still needs its header, otherwise the CSV is unreadable by column name.
    needs_header = (
        not os.path.exists(MANIFEST_CSV) or os.path.getsize(MANIFEST_CSV) == 0
    )

    # NOTE(review): MANIFEST_CSV lives under /Volumes; Databricks documents
    # limitations on direct-append writes to volumes -- confirm that mode "a"
    # works on the target runtime.
    with open(MANIFEST_CSV, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            w.writeheader()
        w.writerow(row)

def _match_xlsx_resources(resources):
    """Pick the municipality and UF download URLs out of CKAN resource dicts.

    Only entries whose ``format`` is "xlsx" (case-insensitive) and that carry
    a non-empty ``url`` are considered. When several entries match the same
    scope, the last one wins.

    Returns:
        ``{"municipios": url_or_None, "uf": url_or_None}``.
    """
    urls = {"municipios": None, "uf": None}
    for res in resources:
        if not isinstance(res, dict):
            continue
        # A key that is present with a None value makes .get(key, "") return
        # None, so coerce with `or` before lower-casing.
        if (res.get("format") or "").lower() != "xlsx":
            continue
        url = res.get("url")
        if not url:
            continue
        name = (res.get("name") or res.get("description") or "").lower()
        # "munic" is a prefix of both "município" and "municipios".
        if "munic" in name:
            urls["municipios"] = url
        # elif: a municipality resource whose name also contains "uf" must
        # not overwrite the UF URL.
        # NOTE(review): "uf" is still a plain substring test and can match
        # unrelated resource names -- confirm against the real CKAN names.
        elif "uf" in name or "unidade da federação" in name:
            urls["uf"] = url
    return urls


def discover_resources():
    """Locate the UF and municipality XLSX resources through the CKAN API.

    Queries ``CKAN_BASE`` for ``DATASET_NAME``. Any failure of that lookup is
    reported on stdout and treated as "no resources found". Every scope left
    without a URL is then filled from ``FALLBACK``.

    Returns:
        Dict with the keys "municipios" and "uf" mapped to download URLs.
    """
    resources = []
    try:
        # Context manager closes the session's connection pool when done.
        with session_with_retry() as s:
            resp = s.get(CKAN_BASE, params={"id": DATASET_NAME}, timeout=30)
            resp.raise_for_status()
            resources = resp.json()["result"].get("resources") or []
    except Exception as exc:
        # Best effort: keep going with the fallbacks, but say why.
        print(f"[aviso] falha ao consultar a API CKAN, usando fallback: {exc}")
        resources = []

    urls = _match_xlsx_resources(resources)

    # Fallbacks where the API did not provide a match.
    for k, v in FALLBACK.items():
        if not urls.get(k):
            urls[k] = v
    return urls

def download_and_stage(scope: str, url: str):
    """Download ``url`` and stage it in the landing area, then log it.

    The file is stored at
    ``BASE_DIR/<scope>/acquired_date=YYYY-MM-DD/<sha256>/<file name>`` and one
    row describing the acquisition is appended to the manifest.

    Args:
        scope: Logical dataset name; used as a folder name and as the default
            file stem when the URL path has no file name.
        url: Address to download.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status.
        OSError: If the file or the manifest cannot be written.
    """
    with session_with_retry() as s:
        r = s.get(url, timeout=120)
        r.raise_for_status()
        content = r.content
        # Read the validators from the response that delivered the bytes.
        # A separate HEAD request does not follow redirects by default, so
        # its headers may describe a redirect rather than the file.
        etag = r.headers.get("ETag")
        last_modified = r.headers.get("Last-Modified")

    digest = sha256_bytes(content)
    size = len(content)

    today = datetime.date.today().isoformat()
    # Use only the URL *path*: a query string or fragment containing "/"
    # must not leak into the file name.
    file_name = os.path.basename(urlsplit(url).path) or f"{scope}.xlsx"
    rel_dir = os.path.join(BASE_DIR, scope, f"acquired_date={today}", digest)
    ensure_dir(rel_dir)
    dest_path = os.path.join(rel_dir, file_name)

    # Skip the write only when a complete copy is already there; a file with
    # a different size is a leftover from an interrupted write.
    if os.path.exists(dest_path) and os.path.getsize(dest_path) == size:
        print(f"[skip] já existe: {dest_path}")
    else:
        with open(dest_path, "wb") as f:
            f.write(content)
        print(f"[ok] salvo: {dest_path} ({size/1024/1024:.2f} MB)")

    # NOTE(review): a row is appended even on the [skip] path, so re-running
    # on the same day duplicates manifest rows -- confirm this is intended.
    manifest_write_row({
        "acquired_date": today,
        "scope": scope,
        "file_name": file_name,
        "url": url,
        "sha256": digest,
        "size_bytes": size,
        "last_modified": last_modified or "",
        "etag": etag or "",
        "landing_path": dest_path,
    })

# ===== Run =====
def main():
    """Discover the resource URLs and stage each scope in the landing area.

    Each scope is attempted independently: a failure is printed and recorded,
    and the remaining scopes are still processed.

    Returns:
        Dict mapping each failed scope to the exception it raised; empty when
        every scope succeeded.
    """
    ensure_dir(BASE_DIR)
    urls = discover_resources()
    failures = {}
    for scope in ("municipios", "uf"):
        try:
            download_and_stage(scope, urls[scope])
        except Exception as e:
            print(f"[erro] {scope}: {e}")
            failures[scope] = e
    return failures

if __name__ == "__main__":
    failed = main()
    if failed:
        # Do not report plain success when at least one scope failed.
        print(f"\nProcesso concluído com falhas: {', '.join(failed)}")
    else:
        print("\nProcesso concluído.")
