In [None]:
from gh20_design.paths import make_run_dirs, load_run_config

# Identifier of the pipeline run this notebook operates on.
run_id = "gh20__2026-01-31__nr90__bs100__I2.0__t5000"

# D is indexed by stage name ("run", "acquire", "clean", ...) and yields the
# directory used for that stage of this run.
D = make_run_dirs("../outputs", run_id)
run_dir = D["run"]

# p carries the run parameters read back from the run directory; later cells
# read p.nr, p.bitscore, p.inflation and p.max_targets from it.
p = load_run_config(run_dir)

print("RUN_DIR =", run_dir)
print("PARAMS  =", p)

In [None]:
from gh20_design.acquire import ProteinIdRecord, append_record_from_sources

# Metadata for the single protein to add to this run's curated set.
r = ProteinIdRecord(
    family="GH20", kingdom="Bacteria",
    organism="Chitinibacter sp. GC72",
    protein_id="WP_157314022.1", source="ncbi",
)

# Both output files live in this run's "acquire" directory.
# NOTE(review): the following cell passes the same curated.fasta path as
# `out_fasta` to resolve_sequences; if that call rewrites the file rather than
# appending, the record added here would be lost -- confirm the intended order.
acquire_dir = D["acquire"]
curated_path = acquire_dir / "curated.fasta"
missing_path = acquire_dir / "missing.tsv"

res = append_record_from_sources(
    r,
    db_fasta="../data/CAZyDB.fa",
    curated_fasta=curated_path,
    missing_tsv=missing_path,
    header_style="rich",
    try_ncbi=True,
)

print(res)



In [None]:
from gh20_design.acquire import ResolveConfig, resolve_sequences

# Outputs go to this run's "acquire" directory; ResolveConfig is given plain
# strings here, so the paths are converted explicitly.
acquire_dir = D["acquire"]
curated_out = str(acquire_dir / "curated.fasta")
missing_out = str(acquire_dir / "missing.tsv")

# Resolve the entries listed in GH20.txt against the local CAZy FASTA.
# Presumably unmatched IDs are then requested from NCBI using the batch size
# and sleep interval below -- verify against ResolveConfig's documentation.
cfg = ResolveConfig(
    txt_path="../data/GH20.txt",
    db_fasta="../data/CAZyDB.fa",
    out_fasta=curated_out,
    out_missing_tsv=missing_out,
    header_style="rich",
    fetch_missing_from_ncbi=True,
    ncbi_batch_size=200,
    ncbi_sleep_s=0.34,
)

report = resolve_sequences(cfg)

# Counters reported by the resolver: total, matched, missing.
print(report.total, report.matched, report.missing)


In [None]:
from gh20_design.database import LengthFilterConfig, CDHitConfig, length_filter_fasta, run_cdhit

# Input comes from the "acquire" stage; both outputs go to the "clean" stage.
clean_dir = D["clean"]
curated  = D["acquire"] / "curated.fasta"
filtered = clean_dir / "filtered_min200.fasta"

# The redundancy threshold is encoded in the file name (p.nr = 0.9 -> "nr90").
# NOTE(review): int() truncates, so a threshold whose product with 100 lands
# just below a whole number (e.g. 0.58 * 100) yields a tag one lower than
# intended; the same expression is used by later cells, so change them together.
nr_fasta = clean_dir / f"nr{int(p.nr*100)}.fasta"

# Step 1: length filter (minimum 200, no maximum, ambiguous entries kept).
lf_cfg = LengthFilterConfig(
    min_len=200,
    max_len=None,
    drop_ambiguous=False,
)
lf_summary = length_filter_fasta(curated, filtered, lf_cfg)
print("Length filter:", lf_summary)

# Step 2: CD-HIT on the filtered set at identity p.nr.
cd_cfg = CDHitConfig(
    identity=p.nr,
    threads=8,
    memory_mb=16000,
    description_len=0,
)
cd_summary = run_cdhit(filtered, nr_fasta, cd_cfg)
print("CD-HIT:", cd_summary)

print("nr_fasta =", nr_fasta)


In [None]:
from gh20_design.ssn import DiamondConfig, AllVsAllConfig, EdgeFilterConfig
from gh20_design.ssn import make_diamond_db, run_all_vs_all, filter_edges

# Input: the non-redundant FASTA named after the p.nr threshold.
fasta_nr = D["clean"] / f"nr{int(p.nr*100)}.fasta"

# All artefacts of this stage are kept in the "ssn" directory; the edge file
# name carries the bitscore cutoff.
ssn_dir = D["ssn"]
db_prefix = ssn_dir / "diamond_db"
hits_tsv  = ssn_dir / "hits.tsv"
edges_tsv = ssn_dir / f"edges_bs{int(p.bitscore)}.tsv"

# The same DIAMOND settings are shared by the DB build and the search.
diamond_cfg = DiamondConfig(threads=8, tmpdir=str(ssn_dir))

# 1) Build the DIAMOND database from the non-redundant sequences.
db_summary = make_diamond_db(fasta_nr, db_prefix, diamond_cfg)
print("DIAMOND DB:", db_summary["db_file"])

# 2) Search the sequences against their own database (all-vs-all).
av_cfg = AllVsAllConfig(
    evalue=1e-5,
    max_target_seqs=p.max_targets,
)
hits_summary = run_all_vs_all(fasta_nr, db_prefix, hits_tsv, diamond_cfg, av_cfg)
print("Hits:", hits_summary["out_tsv"])

# 3) Reduce the raw hits to network edges using the bitscore/e-value cutoffs.
ef_cfg = EdgeFilterConfig(
    bitscore_min=p.bitscore,
    evalue_max=1e-5,
    drop_self_hits=True,
)
edge_summary = filter_edges(hits_tsv, edges_tsv, ef_cfg)

# Only a subset of the summary is shown.
shown_keys = ("total_hits", "kept_edges", "edges_tsv")
print("Edges:", dict((key, edge_summary[key]) for key in shown_keys))


In [None]:
from gh20_design.clustering import (
    EdgeToMCLConfig, MCLConfig,
    edges_tsv_to_abc_stream, run_mcl, clusters_raw_to_tsv
)

# Tags embedded in the file names: bitscore cutoff and MCL inflation.
bs_tag = int(p.bitscore)
cluster_dir = D["cluster"]

# Input edge list produced by the SSN stage.
edges_tsv = D["ssn"] / f"edges_bs{bs_tag}.tsv"

# Outputs of this stage: ABC edge file, raw MCL output, cluster table.
abc_path = cluster_dir / f"edges_bs{bs_tag}.abc"
raw_path = cluster_dir / f"mcl_I{p.inflation}.raw"
clu_tsv  = cluster_dir / f"clusters_I{p.inflation}.tsv"

# Convert the edge table to MCL's ABC input using the "log10" transform.
abc_cfg = EdgeToMCLConfig(transform="log10")
abc_summary = edges_tsv_to_abc_stream(edges_tsv, abc_path, abc_cfg)
print("ABC:", abc_summary)

# Run MCL at the configured inflation.
mcl_cfg = MCLConfig(inflation=p.inflation, threads=8)
mcl_summary = run_mcl(abc_path, raw_path, mcl_cfg)
print("MCL:", mcl_summary)

# Tabulate the raw clusters, passing a minimum cluster size of 2.
clu_summary = clusters_raw_to_tsv(raw_path, clu_tsv, min_size=2)
print("Clusters:", clu_summary)

print("clusters_tsv =", clu_tsv)


In [None]:
from gh20_design.consensus import split_fasta_by_cluster, MafftConfig, run_mafft, consensus_from_alignment
from pathlib import Path

# Inputs: the non-redundant FASTA and the cluster table from earlier stages.
nr_fa = D["clean"] / f"nr{int(p.nr*100)}.fasta"
clusters_tsv = D["cluster"] / f"clusters_I{p.inflation}.tsv"

# Outputs, all under the "consensus" stage directory.
cluster_fastas = D["consensus"] / "cluster_fastas"
aln_dir = D["consensus"] / "alignments"
cons_dir = D["consensus"] / "consensus_fastas"
final_cons = D["consensus"] / "consensus_all.fasta"

# Write one FASTA per cluster (min_cluster_size=2 is passed to the splitter).
split_summary = split_fasta_by_cluster(nr_fa, clusters_tsv, cluster_fastas, min_cluster_size=2)
print("Split:", split_summary)

mafft_cfg = MafftConfig(threads=8, mode="auto")

# Create every directory written to below. Previously only the parent of the
# combined file was created, so the loop relied on run_mafft /
# consensus_from_alignment creating their own output directories.
for out_dir in (aln_dir, cons_dir, final_cons.parent):
    out_dir.mkdir(parents=True, exist_ok=True)

n_written = 0
with open(final_cons, "w", encoding="utf-8") as out_all:
    # sorted() keeps the record order in the combined file deterministic.
    for fp in sorted(Path(cluster_fastas).glob("C*.fasta")):
        cid = fp.stem
        aln = aln_dir / f"{cid}.aln.fasta"
        cons = cons_dir / f"{cid}.cons.fasta"

        # Align the cluster members, then derive one consensus from the alignment.
        run_mafft(fp, aln, mafft_cfg)
        consensus_from_alignment(aln, cons, consensus_id=f"{cid}|consensus", min_fraction=0.5)

        # Guarantee a line break after each record; without it the next ">"
        # header would be glued onto the end of the previous sequence line.
        record = cons.read_text(encoding="utf-8")
        if record and not record.endswith("\n"):
            record += "\n"
        out_all.write(record)
        n_written += 1

# An empty glob would otherwise leave an empty combined file without any hint.
if n_written == 0:
    print("WARNING: no cluster FASTA files matched 'C*.fasta' in", cluster_fastas)

print("Final consensus:", final_cons)


In [None]:
from gh20_design.explorer import consensus_for_protein

# Files produced by the clustering and consensus stages.
clusters_tsv = D["cluster"] / f"clusters_I{p.inflation}.tsv"
cons_all = D["consensus"] / "consensus_all.fasta"

# Look up the consensus associated with this protein ID.
res = consensus_for_protein("WP_157314022.1", clusters_tsv, cons_all)

if not res["found"]:
    # Nothing matched: show the whole result for diagnosis.
    print(res)
else:
    # Only a prefix of the sequence is printed to keep the output short.
    print("Cluster:", res["cluster_id"])
    print("Consensus (first 80):", res["consensus_seq"][:80])

