In [1]:
from pathlib import Path
import json
# Project root plus the input files the later cells read: three raw JSON
# datasets and one pickled lookup file.
DATA_DIR = Path("/home/saleh/KinForm-1")
RAW_DLKCAT = DATA_DIR / "data" / "dlkcat_raw.json"
RAW_KM = DATA_DIR / "data" / "KM_data_raw.json"
RAW_EITLEM = DATA_DIR / "data" / "EITLEM_data" / "KCAT" / "kcat_data.json"
SEQ_LOOKUP = DATA_DIR / "results" / "sequence_id_to_sequence.pkl"

In [8]:
import pickle
# Read the pickled lookup from disk.
with SEQ_LOOKUP.open("rb") as lookup_file:
    seq_id_to_sequence = pickle.load(lookup_file)
# Reverse lookup (value -> key). When several keys share one value, the key
# seen last during iteration wins. The misspelled name is kept on purpose:
# other cells look sequences up through it.
sequnece_to_seq_id = dict(
    (sequence, seq_id) for seq_id, sequence in seq_id_to_sequence.items()
)

In [10]:
# DLKcat: parse the raw JSON, then keep only records whose sequence has at
# most 1499 characters, whose value parses to a strictly positive float, and
# whose SMILES string contains no ".".
raw = json.loads(RAW_DLKCAT.read_text())
raw = [
    record
    for record in raw
    if len(record["Sequence"]) <= 1499
    and float(record["Value"]) > 0
    and "." not in record["Smiles"]
]
# One sequence per surviving record, then its ID from the reverse lookup
# (a sequence absent from the lookup raises KeyError).
sequences_dlkcat = [record["Sequence"] for record in raw]
seq_ids_dlkcat = [sequnece_to_seq_id[sequence] for sequence in sequences_dlkcat]

In [11]:
# KM data: parse the raw JSON and keep only records whose sequence has at
# most 1499 characters and whose SMILES string contains no ".".
raw = [
    entry
    for entry in json.loads(RAW_KM.read_text())
    if len(entry["Sequence"]) <= 1499 and "." not in entry["smiles"]
]

# One sequence per surviving record, then its ID from the reverse lookup
# (a sequence absent from the lookup raises KeyError).
sequences_km = [entry["Sequence"] for entry in raw]
seq_ids_km = [sequnece_to_seq_id[sequence] for sequence in sequences_km]

In [13]:
# EITLEM data: parse the raw JSON.
raw = json.loads(RAW_EITLEM.read_text())
# Keep (position in raw, record) pairs for records whose sequence has at most
# 1499 characters and whose value parses to a strictly positive float.
valid = [
    (position, record)
    for position, record in enumerate(raw)
    if len(record["sequence"]) <= 1499 and float(record["value"]) > 0
]
# Split the pairs: positions in the unfiltered list, and the sequences.
orig_idx = [pair[0] for pair in valid]
seqs = [pair[1]["sequence"] for pair in valid]
# ID of each kept sequence via the reverse lookup (KeyError if absent).
seq_ids_eitlem = [sequnece_to_seq_id[sequence] for sequence in seqs]

In [18]:
# Pool the IDs of the three datasets into one set, which drops every repeated
# ID. union() accepts the lists directly, so no concatenated list is built.
unique_seq_ids = set(seq_ids_dlkcat).union(seq_ids_km, seq_ids_eitlem)
# The per-dataset figures are list lengths, so repeated IDs are counted;
# only the last figure is de-duplicated.
print("DLKCat:", len(seq_ids_dlkcat))
print("KM:", len(seq_ids_km))
print("EITLEM:", len(seq_ids_eitlem))
print("Total unique sequences:", len(unique_seq_ids))
# Save unique sequence IDs to a file, one per line, under DATA_DIR rather than
# a second hard-coded copy of the project root.
# Set iteration order is not stable between interpreter runs for string
# elements (hash randomization), so write in sorted order to keep the file
# reproducible. key=str keeps the sort well-defined whatever type the IDs are.
with open(DATA_DIR / "data/unique_seq_ids.txt", "w") as f:
    for seq_id in sorted(unique_seq_ids, key=str):
        f.write(f"{seq_id}\n")

DLKCat: 16775
KM: 11669
EITLEM: 35001
Total unique sequences: 19517
