In [6]:
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
import pandas as pd
from collections import defaultdict

BASE = Path("data/green_monkey")

# --- Helpers ---

def parse_json_filename(p: Path) -> Optional[Tuple[str, str, str]]:
    """
    Split an aligned-structure filename into its three naming components.

    Accepts names shaped like 'chr1_12hrs_untr_aligned.json'; the misspelled
    suffix '_alinged' is accepted too, and matching ignores letter case.

    Returns the lower-cased (chr, hours, cond) triple, or None when the name
    does not follow that shape.
    """
    # The condition field may not contain an underscore, which keeps names
    # such as "..._gene_aligned.json" from matching.
    found = re.match(
        r"^(chr\d+)_([0-9]+hrs)_([^_]+)_(?:aligned|alinged)\.json$",
        p.name,
        re.IGNORECASE,
    )
    if found is None:
        return None
    chr_name, hours, cond = (part.lower() for part in found.groups())
    return chr_name, hours, cond

def csv_path_for(chr_name: str, hours: str, cond_lower: str) -> Path:
    """
    Build the expected location of the gene-info CSV for one dataset.

    The path is BASE/all_structure_files/<chr>/<hours>/<cond>/ followed by
    'structure_<hours>_<cond>_gene_info.csv'. Nothing is checked on disk.
    """
    file_name = f"structure_{hours}_{cond_lower}_gene_info.csv"
    return BASE.joinpath("all_structure_files", chr_name, hours, cond_lower, file_name)

def try_load_aligned_json(p: Path) -> Optional[Tuple[List[List[float]], List[int]]]:
    """
    Load an aligned-structure JSON file and extract its coordinate data.

    The file must hold a JSON object with a non-empty list under 'position'
    (or 'positions') whose first entry is a sequence of at least three
    values, plus a list under 'clusters'.

    Returns (positions, clusters) when that schema is met. Returns None when
    the schema is not met, and also when the file content is not valid JSON
    or cannot be decoded as UTF-8, so that a single corrupt file is treated
    like any other non-matching candidate instead of raising. I/O errors
    such as a missing file still propagate as OSError.
    """
    try:
        with open(p, "r", encoding="utf-8") as f:
            j = json.load(f)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; either means the file is not usable aligned JSON.
        return None
    if not isinstance(j, dict):
        return None
    # Either spelling of the key is accepted; an empty/missing 'position'
    # falls through to 'positions'.
    positions = j.get("position") or j.get("positions")
    clusters = j.get("clusters")
    if not (isinstance(positions, list) and isinstance(clusters, list)):
        return None
    # Only the first entry is inspected as a cheap shape check.
    if not positions or not isinstance(positions[0], (list, tuple)) or len(positions[0]) < 3:
        return None
    return positions, clusters

def merge_to_objects(df: pd.DataFrame, positions: List[List[float]], clusters: List[int]) -> List[Dict[str, Any]]:
    """
    Pair each CSV row with the position and cluster at the same index.

    Any existing 'pos_x' / 'pos_y' / 'pos_z' columns are discarded first.
    Rows are matched purely by order; when the three inputs differ in
    length, only the first min(len) entries of each are used.

    Returns one dict per kept row: the row's columns plus 'aligned_pos'
    (the first three coordinates as floats) and 'cluster' (as int). An
    empty list is returned when any of the inputs is empty.
    """
    # Stale coordinate columns are dropped; absent ones are simply ignored.
    frame = df.drop(columns=["pos_x", "pos_y", "pos_z"], errors="ignore").reset_index(drop=True)
    keep = min(len(frame), len(positions), len(clusters))
    if keep == 0:
        return []
    rows = frame.iloc[:keep].to_dict(orient="records")
    # `rows` has exactly `keep` entries, so zip stops at the shared length.
    return [
        {
            **row,
            "aligned_pos": [float(pos[0]), float(pos[1]), float(pos[2])],
            "cluster": int(label),
        }
        for row, pos, label in zip(rows, positions, clusters)
    ]

def write_output_json(out_path: Path, items: List[Dict[str, Any]]) -> None:
    """
    Serialise `items` as JSON to `out_path`, creating parent folders as needed.

    Output is indented by two spaces and uses ':' with no following space
    between keys and values. An existing file at `out_path` is overwritten.
    """
    # NOTE(review): json.dump keeps its default allow_nan=True here, so NaN
    # values in `items` would be written as bare NaN tokens.
    folder = out_path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as handle:
        json.dump(items, handle, indent=2, separators=(",", ":"))

def _rel(p: Path) -> str:
    """
    Render `p` for log messages, relative to BASE where possible.

    If `p` cannot be expressed relative to BASE, it is rendered unchanged.
    """
    try:
        shown = p.relative_to(BASE)
    except Exception:
        shown = p
    return str(shown)

# --- Discovery ---

# Folder that is searched first for aligned-structure JSON files.
aligned_root = BASE.joinpath("structure_genes_aligned")

def _scan(root: Path) -> List[Path]:
    """
    Recursively collect aligned-structure JSON files beneath `root`.

    Files ending in '_aligned.json' or '_alinged.json' are returned, except
    those whose lower-cased name contains '_gene_aligned.json' or
    '_gene_alinged.json' (the outputs this script writes). A `root` that
    does not exist yields an empty list.
    """
    if not root.exists():
        return []
    own_outputs = ("_gene_aligned.json", "_gene_alinged.json")
    return [
        hit
        for glob in ("*_aligned.json", "*_alinged.json")
        for hit in root.rglob(glob)
        # skip files this script itself produced on an earlier run
        if not any(tag in hit.name.lower() for tag in own_outputs)
    ]

# Search aligned_root first; only when that yields nothing is all of BASE scanned.
cands = _scan(aligned_root) or _scan(BASE)

# Discard files whose names do not parse into (chr, hours, cond).
cands = [path for path in cands if parse_json_filename(path) is not None]

# Bucket the de-duplicated, sorted paths by their (chr, hours, cond) key so
# that every logical dataset is processed a single time.
groups: Dict[Tuple[str, str, str], List[Path]] = defaultdict(list)
for path in sorted(set(cands)):
    parsed = parse_json_filename(path)
    if parsed is not None:
        groups[parsed].append(path)

# --- Process each group exactly once ---

# For every (chr, hours, cond) dataset: load the first usable aligned JSON,
# locate the matching gene-info CSV, merge them row-by-row and write
# '<chr>_<hours>_<cond>_gene_aligned.json' next to the JSON that was used.
for (chr_name, hours, cond_lower), paths in groups.items():
    # pick the first path that loads with the expected schema
    picked_path: Optional[Path] = None
    loaded: Optional[Tuple[List[List[float]], List[int]]] = None
    for jp in paths:
        try:
            loaded = try_load_aligned_json(jp)
        except (OSError, ValueError):
            # An unreadable or malformed file must not abort the whole run;
            # treat it as a non-matching candidate and try the next one.
            loaded = None
        if loaded:
            picked_path = jp
            break

    if not loaded or not picked_path:
        # one consolidated skip per dataset; avoids "error then ok"
        bad_list = ", ".join(_rel(p) for p in paths)
        print(f"[SKIP] {chr_name}_{hours}_{cond_lower}: none of {{ {bad_list} }} have 'position(s)' + 'clusters'")
        continue

    positions, clusters = loaded

    # find CSV
    csvp = csv_path_for(chr_name, hours, cond_lower)
    if not csvp.exists():
        # The CSV file name carries no chromosome, so a bare recursive search
        # can return another chromosome's file. Only accept hits that have
        # this chromosome as one of their path components, and sort them so
        # the choice is deterministic.
        hits = sorted(
            h
            for h in BASE.rglob(f"structure_{hours}_{cond_lower}_gene_info.csv")
            if chr_name in (part.lower() for part in h.parts)
        )
        if hits:
            csvp = hits[0]
    if not csvp.exists():
        print(f"[ERROR] {_rel(picked_path)}: CSV not found for {chr_name}/{hours}/{cond_lower}")
        continue

    try:
        df = pd.read_csv(csvp)
        items = merge_to_objects(df, positions, clusters)
        if not (len(df) == len(positions) == len(clusters)):
            # The merge is positional and truncates to the shortest input;
            # make that visible instead of silently dropping entries.
            print(
                f"[WARN] {_rel(picked_path)}: length mismatch "
                f"(csv rows={len(df)}, positions={len(positions)}, clusters={len(clusters)}); "
                f"kept first {len(items)}"
            )
        outp = picked_path.with_name(f"{chr_name}_{hours}_{cond_lower}_gene_aligned.json")
        write_output_json(outp, items)
        print(f"[OK] {_rel(picked_path)} → {_rel(outp)} ({len(items)} genes)")
    except Exception as e:
        # Top-level batch boundary: report the failure and move on to the
        # next dataset rather than stopping the run.
        print(f"[ERROR] {_rel(picked_path)}: {e}")


[OK] structure_genes_aligned/chr1/chr1_12hrs_untr_aligned.json → structure_genes_aligned/chr1/chr1_12hrs_untr_gene_aligned.json (1931 genes)
[OK] structure_genes_aligned/chr1/chr1_12hrs_vacv_aligned.json → structure_genes_aligned/chr1/chr1_12hrs_vacv_gene_aligned.json (1931 genes)
[OK] structure_genes_aligned/chr1/chr1_18hrs_untr_aligned.json → structure_genes_aligned/chr1/chr1_18hrs_untr_gene_aligned.json (1931 genes)
[OK] structure_genes_aligned/chr1/chr1_18hrs_vacv_aligned.json → structure_genes_aligned/chr1/chr1_18hrs_vacv_gene_aligned.json (1931 genes)
[OK] structure_genes_aligned/chr1/chr1_24hrs_untr_aligned.json → structure_genes_aligned/chr1/chr1_24hrs_untr_gene_aligned.json (1931 genes)
[OK] structure_genes_aligned/chr1/chr1_24hrs_vacv_aligned.json → structure_genes_aligned/chr1/chr1_24hrs_vacv_gene_aligned.json (1931 genes)
[OK] structure_genes_aligned/chr10/chr10_12hrs_untr_aligned.json → structure_genes_aligned/chr10/chr10_12hrs_untr_gene_aligned.json (1136 genes)
[OK] stru