In [7]:
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
import pandas as pd

BASE = Path("data/green_monkey")

# --- Helpers ---

def parse_json_filename(p: Path) -> Optional[Tuple[str, str, str]]:
    """
    Split an aligned-structure filename into its three components.

    The expected shape is '<chr>_<N>hrs_<COND>_aligned.json', for example
    'chr1_12hrs_UNTR_aligned.json'. The misspelt suffix '_alinged' is
    accepted as well, and matching ignores case. Only the final path
    component (p.name) is inspected.

    Returns (chr, hours, COND) with chr and hours lower-cased and COND
    upper-cased, or None when the name does not fit the pattern.
    """
    match = re.match(
        r"^(chr\d+)_([0-9]+hrs)_([\w\-]+)_(?:aligned|alinged)\.json$",
        p.name,
        re.IGNORECASE,
    )
    if match is None:
        return None
    # The suffix alternative is non-capturing, so exactly three groups come back.
    chromosome, timepoint, condition = match.groups()
    return chromosome.lower(), timepoint.lower(), condition.upper()

def csv_path_for(chr_name: str, hours: str, cond_upper: str) -> Path:
    """
    Build the expected location of the gene-info CSV for one structure.

    The layout is
    BASE/all_structure_files/<chr_name>/<hours>/<cond>/structure_<hours>_<cond>_gene_info.csv
    where <cond> is cond_upper converted to lower case. The path is only
    constructed here; its existence is not checked.
    """
    condition_dir = cond_upper.lower()
    csv_name = f"structure_{hours}_{condition_dir}_gene_info.csv"
    return BASE.joinpath(
        "all_structure_files", chr_name, hours, condition_dir, csv_name
    )

def load_aligned_json(p: Path) -> Tuple[List[List[float]], List[int]]:
    """
    Read an aligned-structure JSON file and return its positions and clusters.

    The file must contain a JSON object with a 'position' (or, as a
    fallback, 'positions') list and a 'clusters' list. Only the first
    position entry is shape-checked: it must be a list with at least
    three components.

    Returns:
        (positions, clusters) exactly as stored in the file.

    Raises:
        ValueError: if the top-level value is not an object, if either
            list is missing or not a list, or if positions is empty or
            its first entry is not a sequence of length >= 3.
        OSError / json.JSONDecodeError: propagated from reading/parsing.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's
    # locale-dependent default encoding.
    with open(p, "r", encoding="utf-8") as f:
        j = json.load(f)
    # A top-level list/scalar has no .get(); report it as a format error
    # instead of letting an AttributeError escape.
    if not isinstance(j, dict):
        raise ValueError(
            f"Expected a JSON object at top level, got {type(j).__name__}."
        )
    positions = j.get("position") or j.get("positions")
    clusters = j.get("clusters")
    if not isinstance(positions, list) or not isinstance(clusters, list):
        raise ValueError("Missing 'position(s)' or 'clusters'.")
    if not positions or not isinstance(positions[0], (list, tuple)) or len(positions[0]) < 3:
        raise ValueError("'position' must be a list of [x,y,z].")
    return positions, clusters

def merge_to_objects(df: pd.DataFrame, positions: List[List[float]], clusters: List[int]) -> List[Dict[str, Any]]:
    """
    Combine gene rows with aligned positions and cluster labels, row by row.

    Rows of df, entries of positions and entries of clusters are paired
    by index. When the three inputs differ in length, all are truncated
    to the shortest one. Any existing 'pos_x' / 'pos_y' / 'pos_z' columns
    are dropped from the rows first.

    Each returned dict holds the remaining columns of one row plus:
        'aligned_pos': [x, y, z] as floats (first three components),
        'cluster':     the cluster label as an int.

    Missing cell values (float NaN) are replaced by None so that the
    records serialise to valid JSON `null` instead of the non-standard
    `NaN` token that json.dump would otherwise emit.

    Returns an empty list when any input is empty. The caller's df is
    not modified.
    """
    # remove any old position columns if present
    stale_columns = [c for c in ("pos_x", "pos_y", "pos_z") if c in df.columns]
    df = df.drop(columns=stale_columns).reset_index(drop=True)

    n = min(len(df), len(positions), len(clusters))
    if n == 0:
        return []

    records = df.iloc[:n].to_dict(orient="records")
    out: List[Dict[str, Any]] = []
    # zip stops at the shortest input, which is `records` (length n).
    for record, pos, cluster in zip(records, positions, clusters):
        item: Dict[str, Any] = {
            key: (None if isinstance(value, float) and pd.isna(value) else value)
            for key, value in record.items()
        }
        item["aligned_pos"] = [float(pos[0]), float(pos[1]), float(pos[2])]
        item["cluster"] = int(cluster)
        out.append(item)
    return out

def write_output_json(out_path: Path, items: List[Dict[str, Any]]) -> None:
    """
    Write items to out_path as JSON, creating parent directories as needed.

    The output is indented by two spaces and uses compact separators
    (no space after ',' or ':'). An existing file is overwritten.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as handle:
        json.dump(items, handle, separators=(",", ":"), indent=2)

# --- Discovery ---

# Collect the aligned-structure JSON inputs: everything under
# BASE/structure_genes_aligned (recursively) plus loose files directly in
# BASE. Both the correct '_aligned' and the misspelt '_alinged' suffixes
# are accepted.
aligned_root = BASE / "structure_genes_aligned"
suffix_patterns = ("*_aligned.json", "*_alinged.json")
json_candidates = []
if aligned_root.exists():
    json_candidates += [p for pat in suffix_patterns for p in aligned_root.rglob(pat)]
# also allow loose files at BASE
json_candidates += [p for pat in suffix_patterns for p in BASE.glob(pat)]

# The merged outputs of this cell are named '*_gene_aligned.json' and are
# written next to their inputs, so the globs above pick them up on a
# re-run. They are not inputs (top-level list, no positions/clusters);
# drop them so a second run does not report a spurious error per output.
json_candidates = sorted(
    {p for p in json_candidates if not p.name.endswith("_gene_aligned.json")}
)

# fallback to a known example if none found
# (BASE.glob above normally finds this file already; kept as a safety net)
if not json_candidates:
    sample = BASE / "chr1_12hrs_UNTR_aligned.json"
    if sample.exists():
        json_candidates = [sample]

# --- Process ---

# For every discovered aligned JSON: locate the matching gene-info CSV,
# merge rows with positions/clusters, and write
# '<chr>_<hours>_<COND>_gene_aligned.json' next to the input. Failures are
# reported per file and do not stop the remaining files.
for jp in json_candidates:
    triplet = parse_json_filename(jp)
    if not triplet:
        # Name does not follow '<chr>_<N>hrs_<COND>_aligned.json'; skip it.
        continue
    chr_name, hours, cond_upper = triplet
    csvp = csv_path_for(chr_name, hours, cond_upper)
    if not csvp.exists():
        # try best-effort fallback search
        # The CSV filename carries no chromosome, so a plain filename match
        # can return another chromosome's table. Sort for a deterministic
        # choice and prefer hits that sit under a '<chr_name>' directory.
        hits = sorted(BASE.rglob(csvp.name))
        same_chr_hits = [
            h for h in hits if chr_name in (part.lower() for part in h.parts)
        ]
        if same_chr_hits:
            csvp = same_chr_hits[0]
        elif hits:
            # Keep the original best-effort behaviour, but make it visible
            # that the chosen CSV could not be tied to this chromosome.
            print(
                f"[WARN] {jp.name}: no {csvp.name} found under a "
                f"'{chr_name}' directory; using {hits[0]}"
            )
            csvp = hits[0]
    try:
        positions, clusters = load_aligned_json(jp)
        df = pd.read_csv(csvp)
        items = merge_to_objects(df, positions, clusters)
        outp = jp.with_name(f"{chr_name}_{hours}_{cond_upper}_gene_aligned.json")
        write_output_json(outp, items)
        print(f"[OK] {jp.name} → {outp.name} ({len(items)} genes)")
    except Exception as e:
        # Top-level boundary of the batch: report and move on to the next file.
        print(f"[ERROR] {jp.name}: {e}")


[OK] chr1_12hrs_UNTR_aligned.json → chr1_12hrs_UNTR_gene_aligned.json (1931 genes)
[OK] chr1_12hrs_VACV_aligned.json → chr1_12hrs_VACV_gene_aligned.json (1931 genes)
[OK] chr1_18hrs_UNTR_aligned.json → chr1_18hrs_UNTR_gene_aligned.json (1931 genes)
[OK] chr1_18hrs_VACV_aligned.json → chr1_18hrs_VACV_gene_aligned.json (1931 genes)
[OK] chr1_24hrs_UNTR_aligned.json → chr1_24hrs_UNTR_gene_aligned.json (1931 genes)
[OK] chr1_24hrs_VACV_aligned.json → chr1_24hrs_VACV_gene_aligned.json (1931 genes)
[OK] chr10_12hrs_UNTR_aligned.json → chr10_12hrs_UNTR_gene_aligned.json (1136 genes)
[OK] chr10_12hrs_VACV_aligned.json → chr10_12hrs_VACV_gene_aligned.json (1136 genes)
[OK] chr10_18hrs_UNTR_aligned.json → chr10_18hrs_UNTR_gene_aligned.json (1136 genes)
[OK] chr10_18hrs_VACV_aligned.json → chr10_18hrs_VACV_gene_aligned.json (1136 genes)
[OK] chr10_24hrs_UNTR_aligned.json → chr10_24hrs_UNTR_gene_aligned.json (1136 genes)
[OK] chr10_24hrs_VACV_aligned.json → chr10_24hrs_VACV_gene_aligned.json (1136