In [21]:
import os, glob, json
import pandas as pd
from pathlib import Path
import json

In [13]:
# Read the saved jobs.json from the run directory and parse it into `jobs`.
jobs = json.loads(
    Path(
        "/data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/july_2025_mmseq/jobs.json"
    ).read_text()
)

In [11]:
# === Config ===
# Root of the run outputs. Model directories are looked up under
# <base_path>/<base_model_nm>/<length_nm>/<subset>/<lr_dir>/checkpoint-*, and
# results are written under <base_path>/RESULT/.
base_path = "/data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/july_2025_mmseq"
# First-level directory names to scan under base_path (and under data_path).
base_model_nms = ["TSp_vs_nonProm", "TSp_vs_genNullseqs"]
# Directory name inserted after the base model name in both model and data paths.
length_nm = "3k"
# Name prefixes: any directory under <base_path>/<base_model_nm>/<length_nm>
# whose name starts with one of these is collected as a subset.
# Previously used prefix list, kept for reference:
# subsets = ["tspAll_", "tspliver_", "tsptestis_", "tspbrain_"]
subsets = ["tspmuscle_"]

# Root of the input data; each job reads <data_path>/<base_model_nm>/<length_nm>/<subset>.
data_path = "/data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/data/jul_2025/split"
# Not referenced by any cell visible here; presumably a model-hub identifier
# for the tokenizer -- TODO confirm where it is consumed.
tokenizer_name = 'jaandoui/DNABERT2-AttentionExtracted'

In [13]:
# Build `jobs`: one dict per (base model, subset, lr dir) describing the latest
# checkpoint to use, the matching data directory, and the result directory.
# Reads the config names base_path, base_model_nms, length_nm, subsets, data_path.


def _list_subdirs(parent, prefixes=("",)):
    """Return the sorted names of directories directly under `parent`.

    Only entries that are directories AND whose name starts with one of
    `prefixes` are returned (an empty `prefixes` therefore matches nothing).
    os.listdir() yields entries in arbitrary order, so the result is sorted to
    keep the collected job order reproducible across runs and filesystems.
    """
    prefixes = tuple(prefixes)
    return sorted(
        name
        for name in os.listdir(parent)
        if name.startswith(prefixes) and os.path.isdir(os.path.join(parent, name))
    )


def _latest_checkpoint(model_dir):
    """Return the name of the highest-numbered `checkpoint-<N>` directory.

    Entries whose part after the first "-" is not an integer (for example
    "checkpoint-best") are ignored instead of raising ValueError. Returns None
    when `model_dir` contains no numbered checkpoint directory.
    """
    best_step, best_name = -1, None
    for name in _list_subdirs(model_dir, ("checkpoint-",)):
        try:
            step = int(name.split("-")[1])
        except ValueError:
            continue
        if step >= best_step:
            best_step, best_name = step, name
    return best_name


jobs = []

for base_model_nm in base_model_nms:
    length_dir = os.path.join(base_path, base_model_nm, length_nm)
    if not os.path.isdir(length_dir):
        print(f"‚ö†Ô∏è Missing length dir: {length_dir}")
        continue

    # Subset dirs: directories under length_dir matched by name prefix
    matched_subdirs = _list_subdirs(length_dir, subsets)
    if not matched_subdirs:
        print(f"‚ö†Ô∏è No matching subset dirs under {length_dir}")
        continue

    for subset in matched_subdirs:
        search_dir = os.path.join(length_dir, subset)

        # lr dirs: accept any directory starting with "lr3e-5"
        lr_dirs = _list_subdirs(search_dir, ("lr3e-5",))
        if not lr_dirs:
            print(f"‚ö†Ô∏è No lr3e-5 dirs under {search_dir}")
            continue

        for lr_dir in lr_dirs:
            model_dir = os.path.join(search_dir, lr_dir)

            # Pick latest checkpoint (highest step number)
            latest_ckpt = _latest_checkpoint(model_dir)
            if latest_ckpt is None:
                print(f"‚ö†Ô∏è No checkpoints in {model_dir}")
                continue
            model_path = os.path.join(model_dir, latest_ckpt)

            # Data dir mirrors actual subset name
            data_dir = os.path.join(data_path, base_model_nm, length_nm, subset)

            # Result dir mirrors lr-dir; created here so later cells can write into it
            res_pdir = f"{base_path}/RESULT/{lr_dir}/{base_model_nm}_{length_nm}_{subset}"
            os.makedirs(res_pdir, exist_ok=True)

            jobs.append({
                "base_model_nm": base_model_nm,
                "subset": subset,
                "lr_dir": lr_dir,
                "model_path": model_path,
                "data_dir": data_dir,
                "res_pdir": res_pdir,
            })

print(f"-> Collected {len(jobs)} jobs")
for j in jobs:
    print(f"** {j['base_model_nm']} | {j['subset']} | {j['lr_dir']} | {os.path.basename(j['model_path'])}")


-> Collected 2 jobs
** TSp_vs_nonProm | tspmuscle_nonPromHu | lr3e-5_ep10 | checkpoint-600
** TSp_vs_genNullseqs | tspmuscle_genNullseqs | lr3e-5_ep10 | checkpoint-300


In [22]:
# Persist the collected `jobs` list as 4-space-indented JSON
# (original note: "write for spleen and muscle jobs.json").
out_path = Path("/data/private/psurana/TSProm/src/files/jobs1.json")

# Make sure the destination folder exists before writing.
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(jobs, indent=4))

print(f"[ok] Saved {len(jobs)} jobs to {out_path}")


[ok] Saved 2 jobs to /data/private/psurana/TSProm/src/files/jobs1.json


In [14]:
# Display the second entry of `jobs` (index 1) to spot-check its resolved paths.
jobs[1]

{'base_model_nm': 'TSp_vs_genNullseqs',
 'subset': 'tspmuscle_genNullseqs',
 'lr_dir': 'lr3e-5_ep10',
 'model_path': '/data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/july_2025_mmseq/TSp_vs_genNullseqs/3k/tspmuscle_genNullseqs/lr3e-5_ep10/checkpoint-300',
 'data_dir': '/data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/data/jul_2025/split/TSp_vs_genNullseqs/3k/tspmuscle_genNullseqs',
 'res_pdir': '/data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/july_2025_mmseq/RESULT/lr3e-5_ep10/TSp_vs_genNullseqs_3k_tspmuscle_genNullseqs'}

In [15]:
def pick_pred_file(split, folder):
    """Pick the prediction CSV for one split inside `folder`.

    Preference order:
      1. ``{split}_all_predictions.csv``
      2. ``{split}_correct.csv``
      3. the alphabetically first regular file matching ``{split}*.csv``

    `split` and `folder` are treated as literal text: glob metacharacters
    (``*``, ``?``, ``[``) inside them are escaped, so a folder such as
    ``run[1]`` is searched correctly instead of silently matching nothing.
    Only regular files are returned; a directory that happens to carry one of
    the expected names is skipped.

    Args:
        split: Split name used as the file-name prefix (e.g. "train").
        folder: Directory to search.

    Returns:
        The path of the chosen file, or None when nothing matches.
    """
    for suffix in ("_all_predictions.csv", "_correct.csv"):
        candidate = os.path.join(folder, f"{split}{suffix}")
        if os.path.isfile(candidate):
            return candidate

    # Escape the literal parts so only the trailing "*" acts as a wildcard.
    pattern = os.path.join(glob.escape(folder), f"{glob.escape(split)}*.csv")
    matches = sorted(p for p in glob.glob(pattern) if os.path.isfile(p))
    return matches[0] if matches else None

def safe_colbind(df_left, df_right):
    """Column-bind two frames row-for-row, de-duplicating the `sequence` column.

    Neither input is modified. Both frames are re-indexed 0..n-1 first so that
    binding is positional; without this, pd.concat(axis=1) aligns on index
    labels and pads mismatching labels with NaN rows. String column names are
    stripped of surrounding whitespace; non-string labels are left as they are.

    Behaviour:
      * Both frames have a `sequence` column with identical values in the same
        order: the right-hand `sequence` column is dropped and the frames are
        concatenated side by side.
      * Both have `sequence` but the values differ: the right frame is
        left-joined onto the left frame by `sequence`. Only the first right-hand
        row per sequence is used, so the result always has exactly one row per
        left-hand row, in the left frame's order. Clashing right-hand column
        names get a `_pred` suffix.
      * Otherwise: plain positional side-by-side concatenation.

    Args:
        df_left: Frame whose rows and order define the result.
        df_right: Frame whose columns are appended.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    left = df_left.reset_index(drop=True)
    right = df_right.reset_index(drop=True)

    # Normalize headers
    left.columns = [c.strip() if isinstance(c, str) else c for c in left.columns]
    right.columns = [c.strip() if isinstance(c, str) else c for c in right.columns]

    if "sequence" in left.columns and "sequence" in right.columns:
        same_rows = len(left) == len(right) and left["sequence"].astype(str).equals(
            right["sequence"].astype(str)
        )
        if same_rows:
            return pd.concat([left, right.drop(columns=["sequence"])], axis=1)

        # A left join multiplies rows when the right side repeats a key; keep a
        # single row per sequence so the output stays aligned with `left`.
        right_unique = right.drop_duplicates(subset="sequence", keep="first")
        left["_order_idx"] = range(len(left))
        merged = pd.merge(left, right_unique, on="sequence", how="left", suffixes=("", "_pred"))
        merged = merged.sort_values("_order_idx", kind="stable").drop(columns=["_order_idx"])
        return merged.reset_index(drop=True)

    return pd.concat([left, right], axis=1)

In [17]:
# For every job: join each split's data CSV with its prediction CSV, stack the
# splits, optionally left-merge the job's *_balanced.csv, and write the result
# to <res_pdir>/preds/2_combined.csv.
for job in jobs:
    preds_dir = os.path.join(job["res_pdir"], "preds")
    data_dir  = job["data_dir"]
    os.makedirs(preds_dir, exist_ok=True)

    all_splits = []
    for split in ["train", "dev", "test"]:
        data_csv = os.path.join(data_dir, f"{split}.csv")
        if not os.path.exists(data_csv):
            print(f"‚ö†Ô∏è Missing data CSV for {split} in {data_dir}. Skipping.")
            continue
        df_data = pd.read_csv(data_csv)

        pred_csv = pick_pred_file(split, preds_dir)
        if pred_csv is None:
            print(f"‚ö†Ô∏è No prediction file found for {split} in {preds_dir}. Skipping.")
            continue
        df_pred = pd.read_csv(pred_csv)

        combined = safe_colbind(df_data, df_pred)
        # Tag every row with the split it came from, as the first column.
        combined.insert(0, "split", split)
        all_splits.append(combined)

    if not all_splits:
        print(f"‚ùå No splits combined for {job['subset']} ({job['res_pdir']})")
        continue

    final_df = pd.concat(all_splits, ignore_index=True)

    # Merge with *_balanced.csv if one exists. glob returns matches in arbitrary
    # order, so sort to make the choice reproducible, and escape data_dir so
    # glob metacharacters in the path are matched literally.
    bal_files = sorted(glob.glob(os.path.join(glob.escape(data_dir), "*_balanced.csv")))
    if bal_files:
        if len(bal_files) > 1:
            print(f"[warn] {len(bal_files)} *_balanced.csv files in {data_dir}; using {bal_files[0]}")
        df_bal = pd.read_csv(bal_files[0])
        rows_before = len(final_df)
        # NOTE(review): the keys here are capitalised ("Sequence", "Label") while
        # safe_colbind looks for a lowercase "sequence" column -- confirm the
        # CSV headers. A missing key still raises KeyError on purpose.
        final_df = pd.merge(final_df, df_bal, on=["Sequence", "Label"], how="left")
        if len(final_df) != rows_before:
            # Repeated (Sequence, Label) pairs in the balanced file multiply rows.
            print(f"[warn] Balanced merge changed row count {rows_before} -> {len(final_df)} for {job['subset']}")

    out_path = os.path.join(preds_dir, "2_combined.csv")
    final_df.to_csv(out_path, index=False)
    print(f"‚úÖ Combined CSV written for {job['subset']} ‚Üí {out_path}")

‚úÖ Combined CSV written for tspmuscle_nonPromHu ‚Üí /data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/july_2025_mmseq/RESULT/lr3e-5_ep10/TSp_vs_nonProm_3k_tspmuscle_nonPromHu/preds/2_combined.csv
‚úÖ Combined CSV written for tspmuscle_genNullseqs ‚Üí /data/projects/dna/pallavi/DNABERT_runs/DATA_RUN/dnabert2_FineTune_Zhihan_attention_extracted/july_2025_mmseq/RESULT/lr3e-5_ep10/TSp_vs_genNullseqs_3k_tspmuscle_genNullseqs/preds/2_combined.csv
