In [None]:
#!/usr/bin/env python3
# extract_unique_sequences.py -------------------------------------------
# Extracts all unique, valid sequences from the DLKcat, EITLEM and KM sources
# and writes them to a text file (one sequence per line)

import json
from pathlib import Path

# --------------------- input paths ------------------------------------
# Project root: two directory levels above this file's resolved location.
_this_file   = Path(__file__).resolve()
KINFORM_ROOT = _this_file.parent.parent

# Raw JSON inputs, all addressed relative to the project root.
RAW_DLKCAT  = KINFORM_ROOT.joinpath("data/dlkcat_raw.json")
EITLEM_JSON = KINFORM_ROOT.joinpath("data/EITLEM_data/KCAT/kcat_data.json")
KM_JSON     = KINFORM_ROOT.joinpath("data/KM_data_raw.json")

# Destination text file; this is an absolute path, independent of KINFORM_ROOT.
OUT_PATH = Path("/home/msp/saleh/pretrain_function/data/sequences.txt")

# --------------------- load DLKcat sequences --------------------------
# Decode explicitly as UTF-8 (the encoding JSON interchange is specified to
# use) instead of relying on the platform's locale default.
with RAW_DLKCAT.open(encoding="utf-8") as f:
    dlkcat_raw = json.load(f)

# Keep the "Sequence" of every record that passes all three filters:
#   * the sequence is at most 1499 characters long,
#   * "Value" converts to a strictly positive float,
#   * "Smiles" contains no "." character
#     (presumably to drop multi-component SMILES -- TODO confirm).
# Duplicates are not removed at this stage.
dlkcat_sequences = [
    d["Sequence"]
    for d in dlkcat_raw
    if len(d["Sequence"]) <= 1499 and float(d["Value"]) > 0 and "." not in d["Smiles"]
]

# --------------------- load EITLEM sequences --------------------------
# Decode explicitly as UTF-8 (the encoding JSON interchange is specified to
# use) instead of relying on the platform's locale default.
with EITLEM_JSON.open(encoding="utf-8") as f:
    eitlem_raw = json.load(f)

# Keep the "sequence" of every record that passes both filters:
#   * the sequence is at most 1499 characters long,
#   * "value" converts to a strictly positive float.
# Duplicates are not removed at this stage.
eitlem_sequences = [
    r["sequence"]
    for r in eitlem_raw
    if len(r["sequence"]) <= 1499 and float(r["value"]) > 0
]

# --------------------- load KM sequences ------------------------------
# Decode explicitly as UTF-8 (the encoding JSON interchange is specified to
# use) instead of relying on the platform's locale default.
with KM_JSON.open(encoding="utf-8") as f:
    km_raw = json.load(f)

# Keep the "Sequence" of every record that passes both filters:
#   * the sequence is at most 1499 characters long,
#   * "smiles" contains no "." character
#     (presumably to drop multi-component SMILES -- TODO confirm).
# NOTE(review): the key is lowercase "smiles" here, whereas the DLKcat filter
# reads "Smiles"; presumably the two files use different schemas -- confirm.
# Duplicates are not removed at this stage.
km_sequences = [
    d["Sequence"]
    for d in km_raw
    if len(d["Sequence"]) <= 1499 and "." not in d["smiles"]
]

# --------------------- combine and deduplicate ------------------------
# Deduplicate each source exactly once and reuse the resulting sets for both
# the union and the per-source counts.
dlkcat_unique = set(dlkcat_sequences)
eitlem_unique = set(eitlem_sequences)
km_unique     = set(km_sequences)

all_sequences = dlkcat_unique | eitlem_unique | km_unique

# Per-source counts are unique *within* each source; a sequence that occurs in
# several sources is counted once per source, so they may sum to more than
# the combined total.
print(f"✔ Extracted {len(all_sequences):,} unique sequences")
print(f"✔ DLKcat: {len(dlkcat_unique):,} sequences")
print(f"✔ EITLEM: {len(eitlem_unique):,} sequences")
print(f"✔ KM: {len(km_unique):,} sequences")

# --------------------- write to text file -----------------------------
# Create the destination directory if it is missing so the write below does
# not fail with FileNotFoundError.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# One sequence per line, in sorted order so the output is deterministic
# (set iteration order is not). The encoding is stated explicitly rather
# than taken from the locale default.
with OUT_PATH.open("w", encoding="utf-8") as f:
    f.writelines(seq + "\n" for seq in sorted(all_sequences))

print(f"✔ Written to {OUT_PATH}")

✔ Extracted 19,517 unique sequences
✔ DLKcat: 7,797 sequences
✔ EITLEM: 15,035 sequences
✔ KM: 6,956 sequences
✔ Written to /home/msp/saleh/pretrain_function/data/sequences.txt
