# Forth protein mapping implementation

Main idea:
- get from CATH domain_ids + cath_domain_coordinates
- get from RSCB protein_seq
- search in RSCB for  rscb_domain_coordinates
  - exact match of domain_id
  - cath_domain_coordinates
  - length
  - seq alignment
- cut domain_seq from protein using rscb_domain_coordinates

Problem:
- still some of domain_ids could not be mapped

TODO:
- сheck seq alignment
- ~~automate remaining exceptions~~ make research if it even makes sense

In [9]:
import pandas as pd

df = pd.read_csv("../data/subset.csv")
required_columns = ["domain_parts", "length", "cath_domain_start1", "cath_domain_end1"]
total_rows = len(df)

incomplete_rows_df = df[df[required_columns].isnull().any(axis=1)]
incomplete_rows = len(incomplete_rows_df)

print(f"{incomplete_rows} rows with missing required fields out of {total_rows} total rows")

domain_column = 'domain_id'
incomplete_ids = set(incomplete_rows_df[domain_column].dropna().tolist())

fasta_path = "../data/cath-domain-seqs.fa"
results = {}

with open(fasta_path, "r") as file:
    for line in file:
        if line.startswith(">"):
            header = line.strip()
            last_part = header.split('|')[-1]
            if '/' in last_part:
                domain_id, region = last_part.split('/')
                if domain_id in incomplete_ids:
                    results[domain_id] = region


for domain_id, region in results.items():
    print(f"{domain_id}: {region}")

0 rows with missing required fields out of 11774 total rows


New problem:
- more than 1k domains have multiple sites
- now they all are parsed

TODO:
- make some research
- find algorithm to work with them

In [3]:
import os
import glob
import pandas as pd
import requests
from tqdm import tqdm
from Bio import SeqIO
from Bio.Align import PairwiseAligner

checkpoint_dir = "../data/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "mapped_checkpoint_*.csv"))
if checkpoint_files:
    latest_checkpoint = max(checkpoint_files, key=os.path.getctime)
    df = pd.read_csv(latest_checkpoint)
    print(f"[Resume] Loaded checkpoint: {latest_checkpoint}")
else:
    df = pd.read_csv("../data/subset.csv")
    df["protein_sequence"] = ""
    max_parts = df["domain_parts"].max()
    for i in range(1, max_parts + 1):
        df[f"domain_start{i}"] = None
        df[f"domain_end{i}"] = None
        df[f"domain_sequence{i}"] = ""
    print("[Start] No checkpoint found. Starting from scratch.")

unprocessed_mask = df["protein_sequence"].isnull() | (df["protein_sequence"].str.strip() == "")
unprocessed_indices = df[unprocessed_mask].index

cath_fasta = {
    record.id.split("|")[-1].split("/")[0]: record
    for record in SeqIO.parse("../data/cath-domain-seqs.fa", "fasta")
}

pdb_overrides = {
    "1vx4K02": {"pdb": "3j7p", "chain": "K"},
    "4hubI02": {"pdb": "4v9f", "chain": "I"},
    "7cosB02": {"pdb": "8igw", "chain": "B"},
    "1mslA02": {"pdb": "2oar", "chain": "A"},
    "3s6n201": {"pdb": "5xjl", "chain": "2"},
    "2j01501": {"pdb": "4v51", "chain": "B5"},
    "3u5cE04": {"pdb": "4v88", "chain": "CE"},
    "2v49501": {"pdb": "4v5a", "chain": "B5"},
    "3rjrB02": {"pdb": "5vqf", "chain": "B"},
    "3o58g00": {"pdb": "4v7r", "chain": "Bg"},
    "3o58d00": {"pdb": "4v7r", "chain": "Dg"},
    "1vwxA02": {"pdb": "3j7o", "chain": "A"},
    "4i4m600": {"pdb": "4v9f", "chain": "6"},
    "4a18X01": {"pdb": "4v8p", "chain": "AX"},
    "4p6vF01": {"pdb": "8acy", "chain": "F"},
    "2m25A00": {"pdb": "2n17", "chain": "A"},
    "1vx4407": {"pdb": "3j7p", "chain": "4"},
    "3kitJ00": {"pdb": "4v7j", "chain": "AJ"},
    "1vx7200": {"pdb": "3j79", "chain": "2"},
    "1vwxr00": {"pdb": "3j7o", "chain": "r"},
    "4a18O00": {"pdb": "4v8p", "chain": "AO"},
    "1vx2P00": {"pdb": "3j7r", "chain": "SP"},
    "3o30I00": {"pdb": "4v7r", "chain": "AI"},
    "3u5eJ00": {"pdb": "4v88", "chain": "BJ"},
    "4k0mC02": {"pdb": "4v9i", "chain": "BC"},
    "3o58A02": {"pdb": "4v7r", "chain": "BA"},
    "7yyqA01": {"pdb": "8ql2", "chain": "A"},
    "3o58D00": {"pdb": "4v7r", "chain": "Dd"},
    "1vw3D00": {"pdb": "3j6b", "chain": "D"},
    "1vwxP00": {"pdb": "3j7o", "chain": "P"},
    "1vx7W00": {"pdb": "3j79", "chain": "W"},
    "3o58J00": {"pdb": "4v7r", "chain": "BJ"},
    "3kypE01": {"pdb": "5x7v", "chain": "E"},
    "1vw4801": {"pdb": "3j6b", "chain": "8"},
    "1vw4F02": {"pdb": "3j6b", "chain": "F"},
    "1vw4F01": {"pdb": "3j6b", "chain": "F"},
    "4gnsA01": {"pdb": "4yg8", "chain": "A"},
    "1vs9S01": {"pdb": "4v4i", "chain": "S"},
    "3p9dA03": {"pdb": "4v81", "chain": "A"},
    "4d8qF03": {"pdb": "4v94", "chain": "F"},
    "4a17E01": {"pdb": "4v8p", "chain": "BE"},
}

def get_fasta_sequence_with_label(pdb_id, auth_chain_id):
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return "", None

        fasta_blocks = response.text.strip().split(">")
        for block in fasta_blocks:
            lines = block.strip().splitlines()
            if not lines:
                continue
            header = lines[0]
            sequence = "".join(lines[1:])

            if "|Chain " in header or "|Chains " in header:
                chain_field = header.split("|")[1]
                parts = chain_field.replace("Chains ", "").replace("Chain ", "").split(",")
                for part in parts:
                    part = part.strip()
                    if "[auth " in part:
                        model_id = part.split("[auth")[0].strip()
                        auth_id = part.split("[auth")[1].replace("]", "").strip()
                        if auth_chain_id == auth_id:
                            return sequence, model_id
                    else:
                        if auth_chain_id == part:
                            return sequence, auth_chain_id
        return "", None
    except Exception as e:
        print(f"[Error] Failed to fetch FASTA for PDB ID {pdb_id}, chain {auth_chain_id}: {e}")
        return "", None

def get_cath_coordinates(pdb_id, chain_id, domain_id, expected_length, domain_start=None, domain_end=None, reference_seq=None, protein_seq=None):
    url = f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        features = data.get("rcsb_polymer_instance_feature", [])
        matched_coords = []

        # Step 1: match by domain_id
        for feature in features:
            if feature.get("type") != "CATH":
                continue
            for prop in feature.get("additional_properties", []):
                if prop.get("name") == "CATH_DOMAIN_ID":
                    ids = [v.strip().lower() for v in prop.get("values", [])]
                    if domain_id.strip().lower() in ids:
                        coords = feature.get("feature_positions", [])
                        matched_coords.extend([(c["beg_seq_id"], c["end_seq_id"]) for c in coords])

        if matched_coords:
            return matched_coords

        # Step 2: match by domain start/end
        if domain_start and domain_end:
            for feature in features:
                if feature.get("type") != "CATH":
                    continue
                coords = feature.get("feature_positions", [])
                for c in coords:
                    if abs(c["beg_seq_id"] - domain_start) <= 2 and abs(c["end_seq_id"] - domain_end) <= 2:
                        matched_coords.append((c["beg_seq_id"], c["end_seq_id"]))
            if matched_coords:
                return matched_coords

        # Step 3: match by domain length
        for feature in features:
            if feature.get("type") != "CATH":
                continue
            coords = feature.get("feature_positions", [])
            for c in coords:
                length = c["end_seq_id"] - c["beg_seq_id"] + 1
                if expected_length - 2 <= length <= expected_length + 2:
                    matched_coords.append((c["beg_seq_id"], c["end_seq_id"]))
        if matched_coords:
            return matched_coords

        # Step 4: global sequence alignment
        if reference_seq and protein_seq:
            aligner = PairwiseAligner()
            best_score = -1
            best_start = None
            for i in range(0, len(protein_seq) - expected_length + 1):
                window = protein_seq[i:i+expected_length]
                score = aligner.score(reference_seq[:expected_length], window)
                if score > best_score:
                    best_score = score
                    best_start = i
            if best_start is not None:
                return [(best_start + 1, best_start + expected_length)]

        print(f"[No Match] Could not map {domain_id}")
        return []

    except Exception as e:
        print(f"[Error] Failed to fetch CATH data for {pdb_id} {chain_id}: {e}")
        return []

aligner = PairwiseAligner()

for idx in tqdm(unprocessed_indices, total=len(unprocessed_indices)):
    row = df.loc[idx]

    domain_id = row.get("domain_id", "")
    if not isinstance(domain_id, str) or len(domain_id) < 6:
        continue

    if domain_id in pdb_overrides:
        pdb_id = pdb_overrides[domain_id]["pdb"]
        auth_chain_id = pdb_overrides[domain_id]["chain"]
    else:
        pdb_id = domain_id[:4].lower()
        auth_chain_id = domain_id[4]

    ref_seq_record = cath_fasta.get(domain_id)
    reference_seq = str(ref_seq_record.seq) if ref_seq_record else None

    protein_seq, label_asym_id = get_fasta_sequence_with_label(pdb_id, auth_chain_id)
    if not protein_seq or not label_asym_id:
        continue

    df.at[idx, "protein_sequence"] = protein_seq

    n_parts = int(row.get("domain_parts", 1))
    segment_lengths = []

    for i in range(1, n_parts + 1):
        start = row.get(f"cath_domain_start{i}")
        end = row.get(f"cath_domain_end{i}")
        if pd.notna(start) and pd.notna(end):
            start, end = int(start), int(end)
            segment_lengths.append(end - start + 1)

    expected_length = sum(segment_lengths)

    initial_coords = get_cath_coordinates(
        pdb_id, label_asym_id, domain_id, expected_length,
        domain_start=None, domain_end=None,
        reference_seq=reference_seq, protein_seq=protein_seq
    )

    coordinates = initial_coords

    if not coordinates:
        coordinates = []
        search_start = 0
        for seg_len in segment_lengths:
            best_score = -1
            best_start = None
            for j in range(search_start, len(protein_seq) - seg_len + 1):
                window = protein_seq[j:j + seg_len]
                score = aligner.score(reference_seq[:seg_len], window)
                if score > best_score:
                    best_score = score
                    best_start = j
            if best_start is None:
                coordinates = []
                break
            coordinates.append((best_start + 1, best_start + seg_len))
            search_start = best_start + seg_len

    if not coordinates or len(coordinates) != n_parts:
        print(f"[Skip] {domain_id} → could not match all parts.")
        continue

    full_seq = ""
    for i, (start, end) in enumerate(coordinates, 1):
        part_seq = protein_seq[start - 1:end]
        df.at[idx, f"domain_start{i}"] = start
        df.at[idx, f"domain_end{i}"] = end
        df.at[idx, f"domain_sequence{i}"] = part_seq
        full_seq += part_seq

    df.at[idx, "domain_sequence1"] = full_seq

    if (idx + 1) % 10 == 0:
        df.to_csv("../data/subset_protein_mapped.csv", index=False)
        checkpoint_path = os.path.join(checkpoint_dir, f"mapped_checkpoint_{idx + 1}.csv")
        df.to_csv(checkpoint_path, index=False)
        print(f"[Checkpoint] Saved progress at row {idx + 1}")

df.to_csv("../data/subset_protein_mapped.csv", index=False)
print("[Final] Saved full dataframe")

[Resume] Loaded checkpoint: ../data/checkpoints/mapped_checkpoint_10190.csv


  2%|▏         | 26/1600 [00:16<28:31,  1.09s/it]

[Checkpoint] Saved progress at row 10200


  2%|▏         | 36/1600 [00:26<29:15,  1.12s/it]

[Checkpoint] Saved progress at row 10210


  3%|▎         | 46/1600 [00:36<27:22,  1.06s/it]

[Checkpoint] Saved progress at row 10220


  4%|▎         | 56/1600 [00:46<27:44,  1.08s/it]

[Checkpoint] Saved progress at row 10230


  4%|▍         | 66/1600 [00:55<28:02,  1.10s/it]

[Checkpoint] Saved progress at row 10240


  5%|▍         | 76/1600 [01:06<29:18,  1.15s/it]

[Checkpoint] Saved progress at row 10250


  5%|▌         | 86/1600 [01:15<27:24,  1.09s/it]

[Checkpoint] Saved progress at row 10260


  6%|▌         | 96/1600 [01:26<30:22,  1.21s/it]

[Checkpoint] Saved progress at row 10270


  7%|▋         | 106/1600 [01:36<29:46,  1.20s/it]

[Checkpoint] Saved progress at row 10280


  7%|▋         | 116/1600 [01:46<27:06,  1.10s/it]

[Checkpoint] Saved progress at row 10290


  8%|▊         | 126/1600 [01:57<26:00,  1.06s/it]

[Checkpoint] Saved progress at row 10300


  8%|▊         | 136/1600 [02:06<25:35,  1.05s/it]

[Checkpoint] Saved progress at row 10310


  9%|▉         | 146/1600 [02:17<27:36,  1.14s/it]

[Checkpoint] Saved progress at row 10320


 10%|▉         | 156/1600 [02:27<27:16,  1.13s/it]

[Checkpoint] Saved progress at row 10330


 10%|█         | 163/1600 [02:34<23:27,  1.02it/s]

[Skip] 5yuyA02 → could not match all parts.


 10%|█         | 166/1600 [02:37<25:39,  1.07s/it]

[Checkpoint] Saved progress at row 10340


 11%|█         | 176/1600 [02:47<25:35,  1.08s/it]

[Checkpoint] Saved progress at row 10350


 12%|█▏        | 186/1600 [02:58<26:58,  1.14s/it]

[Checkpoint] Saved progress at row 10360


 12%|█▏        | 196/1600 [03:07<24:24,  1.04s/it]

[Checkpoint] Saved progress at row 10370


 13%|█▎        | 206/1600 [03:17<23:32,  1.01s/it]

[Checkpoint] Saved progress at row 10380


 14%|█▎        | 216/1600 [03:26<23:17,  1.01s/it]

[Checkpoint] Saved progress at row 10390


 14%|█▍        | 226/1600 [03:35<24:31,  1.07s/it]

[Checkpoint] Saved progress at row 10400


 15%|█▍        | 236/1600 [03:45<23:15,  1.02s/it]

[Checkpoint] Saved progress at row 10410


 15%|█▌        | 246/1600 [03:55<24:49,  1.10s/it]

[Checkpoint] Saved progress at row 10420


 16%|█▌        | 256/1600 [04:04<23:29,  1.05s/it]

[Checkpoint] Saved progress at row 10430


 17%|█▋        | 266/1600 [04:14<24:16,  1.09s/it]

[Checkpoint] Saved progress at row 10440


 17%|█▋        | 270/1600 [04:17<20:34,  1.08it/s]

[No Match] Could not map 3o58D00
[Skip] 3o58D00 → could not match all parts.


 17%|█▋        | 276/1600 [04:23<23:39,  1.07s/it]

[Checkpoint] Saved progress at row 10450


 18%|█▊        | 286/1600 [04:33<24:07,  1.10s/it]

[Checkpoint] Saved progress at row 10460


 18%|█▊        | 296/1600 [04:43<24:55,  1.15s/it]

[Checkpoint] Saved progress at row 10470


 19%|█▊        | 298/1600 [04:45<22:59,  1.06s/it]

[Skip] 4kctA01 → could not match all parts.


 19%|█▉        | 306/1600 [04:54<24:44,  1.15s/it]

[Checkpoint] Saved progress at row 10480


 20%|█▉        | 316/1600 [05:03<22:41,  1.06s/it]

[Checkpoint] Saved progress at row 10490


 20%|██        | 326/1600 [05:13<22:18,  1.05s/it]

[Checkpoint] Saved progress at row 10500


 21%|██        | 336/1600 [05:22<22:26,  1.07s/it]

[Checkpoint] Saved progress at row 10510


 22%|██▏       | 346/1600 [05:32<22:50,  1.09s/it]

[Checkpoint] Saved progress at row 10520


 22%|██▏       | 356/1600 [05:42<23:15,  1.12s/it]

[Checkpoint] Saved progress at row 10530


 23%|██▎       | 366/1600 [05:51<22:26,  1.09s/it]

[Checkpoint] Saved progress at row 10540


 24%|██▎       | 376/1600 [06:01<21:46,  1.07s/it]

[Checkpoint] Saved progress at row 10550


 24%|██▍       | 386/1600 [06:11<23:59,  1.19s/it]

[Checkpoint] Saved progress at row 10560


 25%|██▍       | 396/1600 [06:22<23:44,  1.18s/it]

[Checkpoint] Saved progress at row 10570


 25%|██▌       | 406/1600 [06:32<23:56,  1.20s/it]

[Checkpoint] Saved progress at row 10580


 26%|██▌       | 416/1600 [06:43<22:14,  1.13s/it]

[Checkpoint] Saved progress at row 10590


 27%|██▋       | 426/1600 [06:53<22:51,  1.17s/it]

[Checkpoint] Saved progress at row 10600


 27%|██▋       | 436/1600 [07:04<22:56,  1.18s/it]

[Checkpoint] Saved progress at row 10610


 28%|██▊       | 446/1600 [07:14<21:36,  1.12s/it]

[Checkpoint] Saved progress at row 10620


 28%|██▊       | 456/1600 [07:24<20:19,  1.07s/it]

[Checkpoint] Saved progress at row 10630


 29%|██▉       | 466/1600 [07:34<19:53,  1.05s/it]

[Checkpoint] Saved progress at row 10640


 30%|██▉       | 476/1600 [07:44<19:48,  1.06s/it]

[Checkpoint] Saved progress at row 10650


 30%|███       | 486/1600 [07:53<20:07,  1.08s/it]

[Checkpoint] Saved progress at row 10660


 31%|███       | 496/1600 [08:03<19:35,  1.07s/it]

[Checkpoint] Saved progress at row 10670


 32%|███▏      | 506/1600 [08:13<20:28,  1.12s/it]

[Checkpoint] Saved progress at row 10680


 32%|███▏      | 516/1600 [08:23<19:49,  1.10s/it]

[Checkpoint] Saved progress at row 10690


 33%|███▎      | 526/1600 [08:33<19:48,  1.11s/it]

[Checkpoint] Saved progress at row 10700


 34%|███▎      | 536/1600 [08:42<19:09,  1.08s/it]

[Checkpoint] Saved progress at row 10710


 34%|███▍      | 546/1600 [08:52<19:16,  1.10s/it]

[Checkpoint] Saved progress at row 10720


 35%|███▍      | 556/1600 [09:02<19:17,  1.11s/it]

[Checkpoint] Saved progress at row 10730


 35%|███▌      | 566/1600 [09:11<19:00,  1.10s/it]

[Checkpoint] Saved progress at row 10740


 36%|███▌      | 576/1600 [09:21<18:52,  1.11s/it]

[Checkpoint] Saved progress at row 10750


 36%|███▋      | 581/1600 [09:26<15:54,  1.07it/s]

[Skip] 2x1dA01 → could not match all parts.


 37%|███▋      | 586/1600 [09:30<16:21,  1.03it/s]

[Skip] 5u81A01 → could not match all parts.


 37%|███▋      | 596/1600 [09:40<17:38,  1.05s/it]

[Checkpoint] Saved progress at row 10770


 38%|███▊      | 606/1600 [09:49<17:48,  1.07s/it]

[Checkpoint] Saved progress at row 10780


 38%|███▊      | 616/1600 [09:59<17:27,  1.06s/it]

[Checkpoint] Saved progress at row 10790


 39%|███▉      | 626/1600 [10:09<18:33,  1.14s/it]

[Checkpoint] Saved progress at row 10800


 40%|███▉      | 636/1600 [10:19<16:57,  1.06s/it]

[Checkpoint] Saved progress at row 10810


 40%|████      | 646/1600 [10:28<16:40,  1.05s/it]

[Checkpoint] Saved progress at row 10820


 41%|████      | 650/1600 [10:32<15:32,  1.02it/s]

[Skip] 2aafB01 → could not match all parts.


 41%|████      | 656/1600 [10:38<16:36,  1.06s/it]

[Checkpoint] Saved progress at row 10830


 41%|████▏     | 662/1600 [10:44<14:37,  1.07it/s]

[Skip] 1s9rA01 → could not match all parts.


 42%|████▏     | 666/1600 [10:48<16:35,  1.07s/it]

[Checkpoint] Saved progress at row 10840


 42%|████▏     | 676/1600 [10:58<17:19,  1.12s/it]

[Checkpoint] Saved progress at row 10850


 43%|████▎     | 686/1600 [11:08<16:20,  1.07s/it]

[Checkpoint] Saved progress at row 10860


 44%|████▎     | 696/1600 [11:18<16:29,  1.09s/it]

[Checkpoint] Saved progress at row 10870


 44%|████▍     | 706/1600 [11:28<15:57,  1.07s/it]

[Checkpoint] Saved progress at row 10880


 45%|████▍     | 716/1600 [11:37<15:15,  1.04s/it]

[Checkpoint] Saved progress at row 10890


 45%|████▌     | 726/1600 [11:47<14:44,  1.01s/it]

[Checkpoint] Saved progress at row 10900


 46%|████▌     | 736/1600 [11:57<15:25,  1.07s/it]

[Checkpoint] Saved progress at row 10910


 47%|████▋     | 746/1600 [12:06<15:07,  1.06s/it]

[Checkpoint] Saved progress at row 10920


 47%|████▋     | 756/1600 [12:15<12:29,  1.13it/s]

[Skip] 7kz9A01 → could not match all parts.


 48%|████▊     | 764/1600 [12:23<13:27,  1.04it/s]

[Skip] 7kz9B01 → could not match all parts.


 48%|████▊     | 766/1600 [12:24<13:20,  1.04it/s]

[Skip] 2o7iA02 → could not match all parts.


 48%|████▊     | 776/1600 [12:34<14:53,  1.08s/it]

[Checkpoint] Saved progress at row 10950


 49%|████▉     | 786/1600 [12:44<15:03,  1.11s/it]

[Checkpoint] Saved progress at row 10960


 50%|████▉     | 796/1600 [12:54<14:27,  1.08s/it]

[Checkpoint] Saved progress at row 10970


 50%|█████     | 806/1600 [13:04<14:30,  1.10s/it]

[Checkpoint] Saved progress at row 10980


 51%|█████     | 816/1600 [13:14<14:56,  1.14s/it]

[Checkpoint] Saved progress at row 10990


 51%|█████     | 819/1600 [13:17<13:29,  1.04s/it]

[Skip] 3ubtA02 → could not match all parts.


 52%|█████▏    | 826/1600 [13:25<15:04,  1.17s/it]

[Checkpoint] Saved progress at row 11000


 52%|█████▏    | 836/1600 [13:34<12:35,  1.01it/s]

[Skip] 4dv8A01 → could not match all parts.


 53%|█████▎    | 846/1600 [13:44<13:24,  1.07s/it]

[Checkpoint] Saved progress at row 11020


 54%|█████▎    | 856/1600 [13:54<14:26,  1.16s/it]

[Checkpoint] Saved progress at row 11030


 54%|█████▍    | 862/1600 [14:00<12:11,  1.01it/s]

[Skip] 6qdiA02 → could not match all parts.


 54%|█████▍    | 864/1600 [14:02<11:47,  1.04it/s]

[Skip] 6qdiA01 → could not match all parts.


 54%|█████▍    | 866/1600 [14:05<14:34,  1.19s/it]

[Checkpoint] Saved progress at row 11040


 55%|█████▍    | 876/1600 [14:14<12:53,  1.07s/it]

[Checkpoint] Saved progress at row 11050


 55%|█████▌    | 886/1600 [14:24<12:39,  1.06s/it]

[Checkpoint] Saved progress at row 11060


 56%|█████▌    | 896/1600 [14:35<13:37,  1.16s/it]

[Checkpoint] Saved progress at row 11070


 57%|█████▋    | 906/1600 [14:46<13:24,  1.16s/it]

[Checkpoint] Saved progress at row 11080


 57%|█████▋    | 916/1600 [14:56<12:44,  1.12s/it]

[Checkpoint] Saved progress at row 11090


 58%|█████▊    | 926/1600 [15:06<11:49,  1.05s/it]

[Checkpoint] Saved progress at row 11100


 58%|█████▊    | 936/1600 [15:16<11:50,  1.07s/it]

[Checkpoint] Saved progress at row 11110


 59%|█████▉    | 946/1600 [15:25<11:49,  1.08s/it]

[Checkpoint] Saved progress at row 11120


 59%|█████▉    | 947/1600 [15:26<11:55,  1.10s/it]

[Skip] 3dorA03 → could not match all parts.


 59%|█████▉    | 949/1600 [15:28<11:35,  1.07s/it]

[Skip] 3bf0C03 → could not match all parts.


 59%|█████▉    | 950/1600 [15:30<11:49,  1.09s/it]

[Skip] 1uyvB02 → could not match all parts.


 60%|█████▉    | 956/1600 [15:35<11:17,  1.05s/it]

[Checkpoint] Saved progress at row 11130


 60%|██████    | 966/1600 [15:46<11:56,  1.13s/it]

[Checkpoint] Saved progress at row 11140


 61%|██████    | 976/1600 [15:56<11:38,  1.12s/it]

[Checkpoint] Saved progress at row 11150


 62%|██████▏   | 986/1600 [16:06<11:47,  1.15s/it]

[Checkpoint] Saved progress at row 11160


 62%|██████▏   | 996/1600 [16:16<11:00,  1.09s/it]

[Checkpoint] Saved progress at row 11170


 63%|██████▎   | 1006/1600 [16:25<10:22,  1.05s/it]

[Checkpoint] Saved progress at row 11180


 64%|██████▎   | 1016/1600 [16:35<10:25,  1.07s/it]

[Checkpoint] Saved progress at row 11190


 64%|██████▍   | 1020/1600 [16:39<09:59,  1.03s/it]

[Skip] 2ckfC01 → could not match all parts.


 64%|██████▍   | 1021/1600 [16:40<10:05,  1.05s/it]

[Skip] 2bmoA01 → could not match all parts.


 64%|██████▍   | 1024/1600 [16:43<09:48,  1.02s/it]

[Skip] 2de6A01 → could not match all parts.


 64%|██████▍   | 1025/1600 [16:44<09:49,  1.03s/it]

[Skip] 3vcaA01 → could not match all parts.


 64%|██████▍   | 1026/1600 [16:46<11:14,  1.17s/it]

[Checkpoint] Saved progress at row 11200


 64%|██████▍   | 1027/1600 [16:47<11:05,  1.16s/it]

[Skip] 3gzxA01 → could not match all parts.


 64%|██████▍   | 1028/1600 [16:48<10:57,  1.15s/it]

[Skip] 2b1xA01 → could not match all parts.


 64%|██████▍   | 1029/1600 [16:49<10:28,  1.10s/it]

[Skip] 1z01A01 → could not match all parts.


 65%|██████▍   | 1033/1600 [16:53<09:36,  1.02s/it]

[Skip] 3k8kA03 → could not match all parts.


 65%|██████▍   | 1034/1600 [16:54<09:27,  1.00s/it]

[Skip] 4aeeA04 → could not match all parts.


 65%|██████▍   | 1036/1600 [16:56<09:41,  1.03s/it]

[Skip] 1g5aA03 → could not match all parts.


 65%|██████▌   | 1046/1600 [17:07<11:10,  1.21s/it]

[Checkpoint] Saved progress at row 11220


 66%|██████▌   | 1056/1600 [17:17<09:56,  1.10s/it]

[Checkpoint] Saved progress at row 11230


 67%|██████▋   | 1066/1600 [17:27<09:27,  1.06s/it]

[Checkpoint] Saved progress at row 11240


 67%|██████▋   | 1070/1600 [17:30<08:40,  1.02it/s]

[Skip] 7phoB01 → could not match all parts.


 67%|██████▋   | 1076/1600 [17:36<09:42,  1.11s/it]

[Checkpoint] Saved progress at row 11250


 68%|██████▊   | 1086/1600 [17:46<09:25,  1.10s/it]

[Checkpoint] Saved progress at row 11260


 68%|██████▊   | 1096/1600 [17:56<09:25,  1.12s/it]

[Checkpoint] Saved progress at row 11270


 69%|██████▉   | 1106/1600 [18:07<09:07,  1.11s/it]

[Checkpoint] Saved progress at row 11280


 70%|██████▉   | 1116/1600 [18:17<08:53,  1.10s/it]

[Checkpoint] Saved progress at row 11290


 70%|███████   | 1126/1600 [18:27<08:57,  1.13s/it]

[Checkpoint] Saved progress at row 11300


 71%|███████   | 1136/1600 [18:38<08:57,  1.16s/it]

[Checkpoint] Saved progress at row 11310


 72%|███████▏  | 1146/1600 [18:48<08:22,  1.11s/it]

[Checkpoint] Saved progress at row 11320


 72%|███████▏  | 1156/1600 [18:58<07:44,  1.05s/it]

[Checkpoint] Saved progress at row 11330


 73%|███████▎  | 1166/1600 [19:07<07:36,  1.05s/it]

[Checkpoint] Saved progress at row 11340


 74%|███████▎  | 1176/1600 [19:18<08:00,  1.13s/it]

[Checkpoint] Saved progress at row 11350


 74%|███████▍  | 1182/1600 [19:24<06:56,  1.00it/s]

[Skip] 3afhA02 → could not match all parts.


 74%|███████▍  | 1186/1600 [19:28<07:48,  1.13s/it]

[Checkpoint] Saved progress at row 11360


 75%|███████▍  | 1196/1600 [19:38<07:15,  1.08s/it]

[Checkpoint] Saved progress at row 11370


 75%|███████▌  | 1206/1600 [19:48<06:53,  1.05s/it]

[Checkpoint] Saved progress at row 11380


 76%|███████▌  | 1216/1600 [19:57<06:53,  1.08s/it]

[Checkpoint] Saved progress at row 11390


 77%|███████▋  | 1226/1600 [20:07<06:39,  1.07s/it]

[Checkpoint] Saved progress at row 11400


 77%|███████▋  | 1236/1600 [20:17<06:24,  1.06s/it]

[Checkpoint] Saved progress at row 11410


 78%|███████▊  | 1246/1600 [20:26<06:13,  1.05s/it]

[Checkpoint] Saved progress at row 11420


 78%|███████▊  | 1256/1600 [20:36<06:08,  1.07s/it]

[Checkpoint] Saved progress at row 11430


 79%|███████▉  | 1266/1600 [20:45<05:45,  1.03s/it]

[Checkpoint] Saved progress at row 11440


 80%|███████▉  | 1276/1600 [20:55<05:45,  1.07s/it]

[Checkpoint] Saved progress at row 11450


 80%|████████  | 1286/1600 [21:05<06:04,  1.16s/it]

[Checkpoint] Saved progress at row 11460


 81%|████████  | 1296/1600 [21:15<05:27,  1.08s/it]

[Checkpoint] Saved progress at row 11470


 82%|████████▏ | 1306/1600 [21:25<05:34,  1.14s/it]

[Checkpoint] Saved progress at row 11480


 82%|████████▏ | 1316/1600 [21:36<05:30,  1.16s/it]

[Checkpoint] Saved progress at row 11490


 83%|████████▎ | 1326/1600 [21:45<04:47,  1.05s/it]

[Checkpoint] Saved progress at row 11500


 84%|████████▎ | 1336/1600 [21:55<04:49,  1.10s/it]

[Checkpoint] Saved progress at row 11510


 84%|████████▍ | 1346/1600 [22:05<04:14,  1.00s/it]

[Checkpoint] Saved progress at row 11520


 85%|████████▍ | 1356/1600 [22:15<04:24,  1.09s/it]

[Checkpoint] Saved progress at row 11530


 85%|████████▌ | 1366/1600 [22:25<04:23,  1.12s/it]

[Checkpoint] Saved progress at row 11540


 86%|████████▌ | 1376/1600 [22:35<03:59,  1.07s/it]

[Checkpoint] Saved progress at row 11550


 87%|████████▋ | 1386/1600 [22:45<04:05,  1.15s/it]

[Checkpoint] Saved progress at row 11560


 87%|████████▋ | 1396/1600 [22:55<03:55,  1.15s/it]

[Checkpoint] Saved progress at row 11570


 88%|████████▊ | 1406/1600 [23:05<03:47,  1.17s/it]

[Checkpoint] Saved progress at row 11580


 88%|████████▊ | 1416/1600 [23:16<03:35,  1.17s/it]

[Checkpoint] Saved progress at row 11590


 89%|████████▉ | 1426/1600 [23:26<03:08,  1.09s/it]

[Checkpoint] Saved progress at row 11600


 90%|████████▉ | 1436/1600 [23:36<02:52,  1.05s/it]

[Checkpoint] Saved progress at row 11610


 90%|█████████ | 1446/1600 [23:46<02:44,  1.07s/it]

[Checkpoint] Saved progress at row 11620


 92%|█████████▏| 1466/1600 [24:04<02:37,  1.18s/it]

[Checkpoint] Saved progress at row 11640


 92%|█████████▏| 1476/1600 [24:14<02:20,  1.13s/it]

[Checkpoint] Saved progress at row 11650


 93%|█████████▎| 1486/1600 [24:24<02:07,  1.12s/it]

[Checkpoint] Saved progress at row 11660


 94%|█████████▎| 1496/1600 [24:33<01:50,  1.06s/it]

[Checkpoint] Saved progress at row 11670


 94%|█████████▍| 1506/1600 [24:43<01:41,  1.08s/it]

[Checkpoint] Saved progress at row 11680


 95%|█████████▍| 1516/1600 [24:53<01:32,  1.11s/it]

[Checkpoint] Saved progress at row 11690


 95%|█████████▌| 1526/1600 [25:03<01:23,  1.13s/it]

[Checkpoint] Saved progress at row 11700


 96%|█████████▌| 1536/1600 [25:13<01:09,  1.09s/it]

[Checkpoint] Saved progress at row 11710


 97%|█████████▋| 1546/1600 [25:23<00:58,  1.09s/it]

[Checkpoint] Saved progress at row 11720


 97%|█████████▋| 1556/1600 [25:32<00:49,  1.13s/it]

[Checkpoint] Saved progress at row 11730


 98%|█████████▊| 1566/1600 [25:42<00:37,  1.11s/it]

[Checkpoint] Saved progress at row 11740


 98%|█████████▊| 1576/1600 [25:52<00:26,  1.11s/it]

[Checkpoint] Saved progress at row 11750


 99%|█████████▉| 1586/1600 [26:02<00:15,  1.11s/it]

[Checkpoint] Saved progress at row 11760


100%|█████████▉| 1596/1600 [26:12<00:04,  1.14s/it]

[Checkpoint] Saved progress at row 11770


100%|██████████| 1600/1600 [26:16<00:00,  1.02it/s]


[Final] Saved full dataframe


In [5]:
import pandas as pd

df = pd.read_csv("../data/subset_protein_mapped.csv")

incomplete_mask = df["protein_sequence"].isnull() | df["domain_sequence1"].isnull()
incomplete_rows = df[incomplete_mask]
problem_ids = incomplete_rows["domain_id"].tolist()
print("Domain IDs with missing protein_sequence or domain_sequence1:")
for d in problem_ids:
    print(d)

print(f"\nTotal incomplete rows: {len(problem_ids)} out of {len(df)} total rows.")

#df = df[~incomplete_mask]
#df.to_csv("../data/subset_protein_mapped.csv", index=False)

Domain IDs with missing protein_sequence or domain_sequence1:
4u2vA02
3hm6X01
1werA02
1nf1A01
3ziuB01
6spoA03
3fnrA01
3ziuA01
1gaxA02
3bnjA01
1oahA01
3t6pA01
1vw4900
1vw4501
2a01C01
1khdA01
4eadA01
5noeA01
2qtlA01
1f20A02
2bpoA02
4kbqD00
7zo9A01
7bi4A01
8amzQ01
1v1hB00
6e56D01
1n8zC04
2b39A06
2d1sA01
1rfeA01
3b5mA01
2ausA01
1rcqA01
5x7nB01
2nvaA01
2vd8A01
2yxxA01
5gjoB01
4eclA01
4beuA01
1knwA01
1d7kB01
5zl6A01
5fagA01
3n29B01
4aibA01
2pljA01
4lusB01
2j66A01
7odcA01
3n2bD01
3n2oC01
1pfzA02
5bnzA04
2hz7A04
4en2A02
1u0tA02
4k35A02
4pvaA00
3pijA01
5hp6A01
3ugfA01
8fm5A01
1vzrA01
3au4A02
2grvC03
6l4lA01
4k8lA01
4dunA01
2gkeA01
4jbdA01
2otnB01
6hjfB01
3ednA01
5k87A01
2gkeA02
6lpnA01
5yuyA01
3u27A01
4hubG01
1vx4q01
7uwrA02
6l3mB01
7s6dB02
1vw4502
4wweA01
3o58A01
3lltA01
2fstX01
7si1A01
6ks6B02
6ks6e02
8bm1K01
7d5mA01
3btvA02
3db2B02
2ixaA02
7xr9E01
3dtyB02
1lc0A02
1vw4700
1l0wA02
1v4pC01
1tkeA02
2ztgA04
7qh7701
4bt2A01
1xv2C01
4pabA03
2gagC01
1pj5A03
1v5vA01
3girA01
4dx5A04
7whfC01
1ti2A02
3u

To be fixed:

* 4u2vA02
* 3hm6X01
* 1werA02
* 1nf1A01
* 3ziuB01
* 6spoA03
* 3fnrA01
* 3ziuA01
* 1gaxA02
* 3bnjA01
* 1oahA01
* 3t6pA01
* 1vw4900
* 1vw4501
* 2a01C01
* 1khdA01
* 4eadA01
* 5noeA01
* 2qtlA01
* 1f20A02
* 2bpoA02
* 4kbqD00
* 7zo9A01
* 7bi4A01
* 8amzQ01
* 1v1hB00
* 6e56D01
* 1n8zC04
* 2b39A06
* 2d1sA01
* 1rfeA01
* 3b5mA01
* 2ausA01
* 1rcqA01
* 5x7nB01
* 2nvaA01
* 2vd8A01
* 2yxxA01
* 5gjoB01
* 4eclA01
* 4beuA01
* 1knwA01
* 1d7kB01
* 5zl6A01
* 5fagA01
* 3n29B01
* 4aibA01
* 2pljA01
* 4lusB01
* 2j66A01
* 7odcA01
* 3n2bD01
* 3n2oC01
* 1pfzA02
* 5bnzA04
* 2hz7A04
* 4en2A02
* 1u0tA02
* 4k35A02
* 4pvaA00
* 3pijA01
* 5hp6A01
* 3ugfA01
* 8fm5A01
* 1vzrA01
* 3au4A02
* 2grvC03
* 6l4lA01
* 4k8lA01
* 4dunA01
* 2gkeA01
* 4jbdA01
* 2otnB01
* 6hjfB01
* 3ednA01
* 5k87A01
* 2gkeA02
* 6lpnA01
* 5yuyA01
* 3u27A01
* 4hubG01
* 1vx4q01
* 7uwrA02
* 6l3mB01
* 7s6dB02
* 1vw4502
* 4wweA01
* 3o58A01
* 3lltA01
* 2fstX01
* 7si1A01
* 6ks6B02
* 6ks6e02
* 8bm1K01
* 7d5mA01
* 3btvA02
* 3db2B02
* 2ixaA02
* 7xr9E01
* 3dtyB02
* 1lc0A02
* 1vw4700
* 1l0wA02
* 1v4pC01
* 1tkeA02
* 2ztgA04
* 7qh7701
* 4bt2A01
* 1xv2C01
* 4pabA03
* 2gagC01
* 1pj5A03
* 1v5vA01
* 3girA01
* 4dx5A04
* 7whfC01
* 1ti2A02
* 3uykA01
* 2qzsA01
* 8inpA01
* 2iuyA01
* 4x7rA01
* 1f0kA01
* 3ia7A01
* 3t7dA01
* 2iw1A01
* 3s5jA02
* 1u0tB01
* 3pfnD01
* 1z0sA01
* 1y3iA01
* 3s40A01
* 1yt5A01
* 2i2cA01
* 2bonA01
* 3afoB01
* 1u0tA01
* 7r4lA01
* 2an1D01
* 4werA01
* 6cngA01
* 3jr7A01
* 1pzxB01
* 1oi2A01
* 3pl5A01
* 6spcb01
* 2e5lB01
* 7o0aD01
* 2amlB01
* 4bb9A02
* 7ntgA01
* 2yimA01
* 1x74A01
* 1q7eA01
* 1xk7A01
* 4ed9A01
* 8edjA01
* 1ti2A03
* 5kf6B02
* 2uvaJ10
* 6fkuA01
* 4c3sA01
* 3v4cB01
* 6jqlA01
* 7w5kA01
* 3r64A01
* 3pqaB01
* 3ju8A01
* 1o04A01
* 8dr9A01
* 4jbeB01
* 4o6rA01
* 3szaA01
* 4ogdA01
* 3k2wD01
* 5ekcF01
* 5qr1A01
* 5yuyA02
* 3o58D00
* 4kctA01
* 2x1dA01
* 5u81A01
* 2aafB01
* 1s9rA01
* 7kz9A01
* 7kz9B01
* 2o7iA02
* 3ubtA02
* 4dv8A01
* 6qdiA02
* 6qdiA01
* 3dorA03
* 3bf0C03
* 1uyvB02
* 1vw4102
* 2ckfC01
* 2bmoA01
* 2de6A01
* 3vcaA01
* 3gzxA01
* 2b1xA01
* 1z01A01
* 3k8kA03
* 4aeeA04
* 1g5aA03
* 7phoB01
* 3afhA02
* 1vs9F02
* 1vw4K00
* 1vw4H00
* 3o58Y02
* 4a17N00