# Third protein mapping implementation

Main idea:
- get domain_ids + cath_domain_coordinates from CATH
- get protein_seq from RCSB
- search RCSB for rcsb_domain_coordinates, trying in order:
  - exact match of domain_id
  - cath_domain_coordinates
  - length
  - seq alignment
- cut domain_seq from protein_seq using rcsb_domain_coordinates

Problem:
- some domain_ids still cannot be mapped

TODO:
- check seq alignment
- ~~automate remaining exceptions~~ research whether automating them even makes sense

In [4]:
import pandas as pd

# Load the subset and isolate every row that has at least one missing value.
df = pd.read_csv("../data/subset.csv")
has_missing = df.isnull().any(axis=1)
incomplete_rows_df = df[has_missing]
total_rows = len(df)
incomplete_rows = len(incomplete_rows_df)
print(f"{incomplete_rows} rows with missing values out of {total_rows} total rows")

# Domain ids of the incomplete rows (rows whose id itself is missing are ignored).
domain_column = 'domain_id'
incomplete_ids = set(incomplete_rows_df[domain_column].dropna().tolist())

fasta_path = "../data/cath-domain-seqs.fa"
results = {}

# Scan only the FASTA header lines. The last '|'-separated field is split on
# '/' into a domain id and its region string; the region is kept for every
# domain id that belongs to an incomplete row.
with open(fasta_path, "r") as file:
    for line in file:
        if not line.startswith(">"):
            continue
        header = line.strip()
        last_part = header.split('|')[-1]
        if '/' not in last_part:
            continue
        domain_id, region = last_part.split('/')
        if domain_id in incomplete_ids:
            results[domain_id] = region

for domain_id, region in results.items():
    print(f"{domain_id}: {region}")

1510 rows with missing values out of 11774 total rows
1a7cA02: 171-280_326-349
1a7sA01: 1-12_108-207
1a9xA06: 664-686_757-936
1afwB01: 28-160_223-302
1agrH01: 60-83_164-175
1b37A01: 5-87_187-292_412-463
1b37A02: 88-186_293-411
1b7yB01: 1-37_154-186
1bgwA02: 420-562_606-633
1c0pA02: 1077-1136_1186-1288
1c1dA01: 3-64_75-136_339-349
1c8iA01: 9-172_273-301
1cg2A01: 26-213_324-414
1ckmA01: 11-59_84-189
1d2oA01: 535-628_677-689
1d7kB01: 35-45_280-409
1dk8A01: 117-147_233-249
1dnpA02: 131-168_204-288
1dq3A01: 1-137_415-454
1dt9A02: 143-174_198-277
1e3mB04: 317-428_527-566
1e5xB01: 17-161_264-434
1edzA02: 13-120_297-319
1epwA02: 444-475_532-830
1eu1A01: 54-145_378-481_507-529
1eu1A02: 147-366_579-604
1f0kA01: 7-162_339-357
1f0xB02: 1009-1105_1520-1566
1f20A02: 963-987_1038-1171
1f4sP00: -1-63
1fc6A01: 78-160_400-414
1fqiA01: 288-320_400-414
1fs0G02: 19-58_200-248
1fw8A01: 1-128_319-415
1g29103: 245-292_356-372
1g8kA02: 120-201_424-531_559-621
1g8kA03: 202-423_658-682
1g8mB02: 375-476_520-583
1

New problem:
- more than 1.5k domains have multiple sites (discontinuous segments such as `171-280_326-349`)

TODO:
- do some research
- find algorithm to work with them

Current implementation:
- throw away all 'problematic' cases

In [9]:
import numpy as np
# Treat the literal '<null>' marker as a missing value, drop every row that
# has any missing value, and overwrite the subset file with the cleaned table.
subset_path = "../data/subset.csv"
df = pd.read_csv(subset_path).replace('<null>', np.nan).dropna()
df.to_csv(subset_path, index=False)

In [12]:
import os
import glob
import pandas as pd
import requests
from tqdm import tqdm
from Bio import SeqIO
from Bio.Align import PairwiseAligner

checkpoint_dir = "../data/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)


def _checkpoint_row(path):
    """Return the row number encoded in ``mapped_checkpoint_<row>.csv``.

    Only the file's base name is inspected, so underscores or dots in the
    directory part of ``path`` cannot confuse the parse. Returns ``None``
    when the suffix after the last underscore is not a plain number.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem.rsplit("_", 1)[-1]
    return int(suffix) if suffix.isdigit() else None


checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "mapped_checkpoint_*.csv"))

# Map every usable checkpoint to the row it was written at.
checkpoint_rows = {}
for checkpoint_file in checkpoint_files:
    row_number = _checkpoint_row(checkpoint_file)
    if row_number is not None:
        checkpoint_rows[checkpoint_file] = row_number

if checkpoint_rows:
    # Resume from the checkpoint that covers the most rows. The row number in
    # the file name is the source of truth: file timestamps change when a file
    # is copied or touched, and would then disagree with the start index.
    latest_checkpoint = max(checkpoint_rows, key=checkpoint_rows.get)
    df = pd.read_csv(latest_checkpoint)
    start_index = checkpoint_rows[latest_checkpoint]
    print(f"[Resume] Loaded checkpoint at row {start_index}")
else:
    # Fresh run: load the cleaned subset and add the empty output columns.
    df = pd.read_csv("../data/subset.csv")
    df["protein_sequence"] = ""
    df["domain_start"] = None
    df["domain_end"] = None
    df["domain_sequence"] = ""
    start_index = 0
    print("[Start] No checkpoint found. Starting from scratch.")

# Index the CATH domain FASTA records by domain id. The id is taken from the
# last '|'-separated field of the record id, up to the first '/'. When two
# records share a domain id, the later one replaces the earlier one.
cath_fasta = {}
for record in SeqIO.parse("../data/cath-domain-seqs.fa", "fasta"):
    domain_key = record.id.split("|")[-1].split("/")[0]
    cath_fasta[domain_key] = record

# PDB ids (first four characters of a domain id, lower-cased) that are looked
# up under a different PDB entry and chain.
# NOTE(review): presumably these are entries superseded in the PDB - confirm.
pdb_overrides = {
    source_pdb: {"pdb": target_pdb, "chain": target_chain}
    for source_pdb, target_pdb, target_chain in (
        ("1vw4", "3j6b", "8"),
        ("4gns", "4yg8", "A"),
        ("1vs9", "4v4i", "S"),
        ("3p9d", "4v81", "A"),
        ("4d8q", "4v94", "F"),
        ("4a17", "4v8p", "BE"),
    )
}

def _iter_fasta_records(fasta_text):
    """Yield ``(header, sequence)`` pairs; headers come without the ``>``."""
    header = None
    chunks = []
    for raw_line in fasta_text.splitlines():
        line = raw_line.strip()
        if line.startswith(">"):
            # Only a ">" at the start of a line opens a record, so a ">"
            # inside a description cannot split the record in two.
            if header is not None:
                yield header, "".join(chunks)
            header = line[1:]
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        yield header, "".join(chunks)


def get_fasta_sequence_with_label(pdb_id, auth_chain_id, *, timeout=30):
    """Fetch the FASTA of a PDB entry and return the sequence of one chain.

    The chain is looked up in the second '|'-separated header field
    ("Chain X" / "Chains X, Y[auth Z], ..."). An item written as
    ``LABEL[auth AUTH]`` matches when ``AUTH`` equals ``auth_chain_id``;
    a plain item matches when it equals ``auth_chain_id`` itself.

    Args:
        pdb_id: PDB entry id placed into the RCSB FASTA URL.
        auth_chain_id: Author chain id to look for.
        timeout: Seconds to wait for the HTTP request, so that one stalled
            connection cannot block the whole batch run.

    Returns:
        ``(sequence, label_chain_id)`` for the first matching chain, where
        ``label_chain_id`` is the part before ``[auth ...]`` (or
        ``auth_chain_id`` when the header carries no auth annotation).
        ``("", None)`` when the request fails, the status is not 200, or no
        chain matches.
    """
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"[Error] Failed to fetch FASTA for PDB ID {pdb_id}, chain {auth_chain_id}: {e}")
        return "", None

    if response.status_code != 200:
        return "", None

    for header, sequence in _iter_fasta_records(response.text):
        if "|Chain " not in header and "|Chains " not in header:
            continue

        chain_field = header.split("|")[1]
        chain_items = chain_field.replace("Chains ", "").replace("Chain ", "").split(",")
        for item in chain_items:
            item = item.strip()
            if "[auth " in item:
                label_part, _, auth_part = item.partition("[auth")
                if auth_part.replace("]", "").strip() == auth_chain_id:
                    return sequence, label_part.strip()
            elif item == auth_chain_id:
                return sequence, auth_chain_id

    return "", None

def get_cath_coordinates(
    pdb_id,
    chain_id,
    domain_id,
    expected_length,
    domain_start=None,
    domain_end=None,
    reference_seq=None,
    protein_seq=None,
    *,
    timeout=30,
):
    """Locate a CATH domain on a chain as ``(beg_seq_id, end_seq_id)`` ranges.

    The polymer-entity-instance record is fetched from the RCSB data API and
    its features of type "CATH" are searched. The first strategy that yields
    at least one range wins:

    1. features whose "CATH_DOMAIN_ID" property lists ``domain_id``
       (compared case-insensitively);
    2. ranges whose begin and end are both within 2 of ``domain_start`` /
       ``domain_end`` (skipped when either value is falsy);
    3. ranges whose length is within 2 of ``expected_length``;
    4. the ``expected_length``-wide window of ``protein_seq`` that scores best
       against the first ``expected_length`` residues of ``reference_seq``
       with a default-configured ``PairwiseAligner`` (first best window wins).

    Args:
        pdb_id: PDB entry id used in the request URL.
        chain_id: Chain id used in the request URL.
        domain_id: CATH domain id to look for.
        expected_length: Expected domain length in residues.
        domain_start: Optional expected start position for strategy 2.
        domain_end: Optional expected end position for strategy 2.
        reference_seq: Optional reference domain sequence for strategy 4.
        protein_seq: Optional full chain sequence for strategy 4.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A list of ``(begin, end)`` tuples. Strategy 4 returns a single
        1-based, inclusive range. An empty list means nothing matched or an
        error occurred (the error is printed, not raised).
    """
    url = f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain_id}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # `or []` also covers a feature list that is present but null.
        features = data.get("rcsb_polymer_instance_feature") or []
        cath_features = [f for f in features if f.get("type") == "CATH"]

        # Step 1: match by domain_id
        wanted_id = domain_id.strip().lower()
        matched_coords = []
        for feature in cath_features:
            for prop in feature.get("additional_properties", []):
                if prop.get("name") != "CATH_DOMAIN_ID":
                    continue
                ids = [v.strip().lower() for v in prop.get("values", [])]
                if wanted_id in ids:
                    matched_coords.extend(
                        (c["beg_seq_id"], c["end_seq_id"])
                        for c in feature.get("feature_positions", [])
                    )
        if matched_coords:
            return matched_coords

        # Every CATH range on this chain, shared by steps 2 and 3.
        cath_ranges = []
        for feature in cath_features:
            for c in feature.get("feature_positions", []):
                cath_ranges.append((c["beg_seq_id"], c["end_seq_id"]))

        # Step 2: match by domain start/end
        if domain_start and domain_end:
            matched_coords = [
                (beg, end)
                for beg, end in cath_ranges
                if abs(beg - domain_start) <= 2 and abs(end - domain_end) <= 2
            ]
            if matched_coords:
                return matched_coords

        # Step 3: match by domain length
        matched_coords = [
            (beg, end)
            for beg, end in cath_ranges
            if expected_length - 2 <= end - beg + 1 <= expected_length + 2
        ]
        if matched_coords:
            return matched_coords

        # Step 4: sliding-window sequence alignment.
        # A non-positive length would produce the meaningless range (1, 0),
        # so it is treated as "no match" instead.
        if reference_seq and protein_seq and expected_length > 0:
            aligner = PairwiseAligner()
            reference_window = reference_seq[:expected_length]
            best_score = -1
            best_start = None
            for i in range(len(protein_seq) - expected_length + 1):
                score = aligner.score(reference_window, protein_seq[i:i + expected_length])
                if score > best_score:
                    best_score = score
                    best_start = i
            if best_start is not None:
                return [(best_start + 1, best_start + expected_length)]

        print(f"[No Match] Could not map {domain_id}")
        return []

    except requests.RequestException as e:
        print(f"[Error] Failed to fetch CATH data for {pdb_id} {chain_id}: {e}")
        return []
    except Exception as e:
        # Best-effort batch job: an unexpected payload shape skips this
        # domain instead of aborting the whole run.
        print(f"[Error] Failed to process CATH data for {pdb_id} {chain_id}: {e}")
        return []

def _map_domain_row(row):
    """Return the output-column values for one dataframe row.

    The result is a dict keyed by column name. It is empty when the row has
    no usable domain id or no chain sequence could be fetched. It holds only
    "protein_sequence" when the chain was found but the domain could not be
    located on it. Otherwise it also holds "domain_start", "domain_end" and
    "domain_sequence".
    """
    domain_id = row.get("domain_id", "")
    if not isinstance(domain_id, str) or len(domain_id) < 6:
        return {}

    # First four characters are the PDB id, the fifth is the author chain id,
    # unless the entry is listed in pdb_overrides.
    base_id = domain_id[:4].lower()
    if base_id in pdb_overrides:
        pdb_id = pdb_overrides[base_id]["pdb"]
        auth_chain_id = pdb_overrides[base_id]["chain"]
    else:
        pdb_id = base_id
        auth_chain_id = domain_id[4]

    domain_start = row.get("cath_domain_start")
    domain_end = row.get("cath_domain_end")
    expected_length = int(row.get("length", 0))

    ref_seq_record = cath_fasta.get(domain_id)
    reference_seq = str(ref_seq_record.seq) if ref_seq_record else None

    protein_seq, label_asym_id = get_fasta_sequence_with_label(pdb_id, auth_chain_id)
    if not protein_seq or not label_asym_id:
        return {}

    updates = {"protein_sequence": protein_seq}

    coordinates = get_cath_coordinates(
        pdb_id, label_asym_id, domain_id, expected_length,
        domain_start, domain_end, reference_seq, protein_seq
    )
    if not coordinates:
        return updates

    # Ranges are 1-based and inclusive. Segments are concatenated in the
    # order returned, and start/end record the outer bounds over all of them.
    updates["domain_start"] = min(beg for beg, _ in coordinates)
    updates["domain_end"] = max(end for _, end in coordinates)
    updates["domain_sequence"] = "".join(
        protein_seq[beg - 1:end] for beg, end in coordinates
    )
    return updates


for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Rows before the resume point were filled in by an earlier run.
    if idx < start_index:
        continue

    for column, value in _map_domain_row(row).items():
        df.at[idx, column] = value

    # The checkpoint check runs for every processed row, including rows that
    # could not be mapped, so a skipped row on a 1000-row boundary no longer
    # loses that checkpoint.
    if (idx + 1) % 1000 == 0:
        df.to_csv("../data/subset_protein_mapped.csv", index=False)
        checkpoint_path = os.path.join(checkpoint_dir, f"mapped_checkpoint_{idx + 1}.csv")
        df.to_csv(checkpoint_path, index=False)
        print(f"[Checkpoint] Saved progress at row {idx + 1} → {checkpoint_path}")

df.to_csv("../data/subset_protein_mapped.csv", index=False)
print("[Final] Saved full dataframe")

[Resume] Loaded checkpoint at row 4000


 49%|████▊     | 5000/10264 [14:12<1:25:36,  1.02it/s] 

[Checkpoint] Saved progress at row 5000 → ../data/checkpoints/mapped_checkpoint_5000.csv


 49%|████▉     | 5079/10264 [15:20<1:10:41,  1.22it/s]

[No Match] Could not map 8fm5A01


 58%|█████▊    | 6000/10264 [29:19<1:22:24,  1.16s/it]

[Checkpoint] Saved progress at row 6000 → ../data/checkpoints/mapped_checkpoint_6000.csv


 68%|██████▊   | 7000/10264 [45:16<57:00,  1.05s/it]  

[Checkpoint] Saved progress at row 7000 → ../data/checkpoints/mapped_checkpoint_7000.csv


 78%|███████▊  | 8000/10264 [1:01:33<40:36,  1.08s/it]

[Checkpoint] Saved progress at row 8000 → ../data/checkpoints/mapped_checkpoint_8000.csv


 88%|████████▊ | 9000/10264 [1:17:05<21:48,  1.04s/it]

[Checkpoint] Saved progress at row 9000 → ../data/checkpoints/mapped_checkpoint_9000.csv


 97%|█████████▋| 10000/10264 [1:31:18<04:37,  1.05s/it]

[Checkpoint] Saved progress at row 10000 → ../data/checkpoints/mapped_checkpoint_10000.csv


100%|██████████| 10264/10264 [1:35:04<00:00,  1.80it/s]


[Final] Saved full dataframe


In [13]:
import pandas as pd
import numpy as np

# Reload the mapped table, treating the literal '<null>' marker as missing.
df = pd.read_csv("../data/subset_protein_mapped.csv").replace('<null>', np.nan)

# Flag every row that still has at least one missing value.
missing_mask = df.isnull().any(axis=1)
incomplete_rows = df[missing_mask]
total_rows = len(df)
incomplete_count = len(incomplete_rows)

print("Domain IDs with missing values:")
print(incomplete_rows['domain_id'].tolist())
print(f"\n{incomplete_count} rows with missing values out of {total_rows} total rows")

# Keep only the complete rows and overwrite the mapped file with them.
df = df[~missing_mask]
df.to_csv("../data/subset_protein_mapped.csv", index=False)

Domain IDs with missing values:
['1vx4K02', '4hubI02', '4u2vA02', '7cosB02', '1mslA02', '3s6n201', '2j01501', '3u5cE04', '2v49501', '8h9f601', '2a01C01', '4kbqD00', '7zo9A01', '1v1hB00', '3rjrB02', '3o58g00', '3o58d00', '1vwxA02', '3so1E00', '8fm5A01', '1vzrA01', '4i4m600', '4a18X01', '4p6vF01', '2m25A00', '1vx4407', '3kitJ00', '1vx7200', '1vwxr00', '4a18O00', '1vx2P00', '3o30I00', '3u5eJ00', '4k0mC02', '3o58A02', '7yyqA01', '8edjA01', '3o58D00', '1vw3D00', '1vwxP00', '1vx7W00', '3o58J00', '3o58Y02', '3kypE01']

44 rows with missing values out of 10264 total rows
