### Setup: helper methods


In [6]:
import requests
from Bio import SeqIO
from io import StringIO
import polars as pl
from tcr_format_parsers.common.MHCCodeConverter import (
    HLASequenceDBConverter,
    H2SequenceDictConverter,
)
from tcr_format_parsers.common.TCRUtils import standardize_tcr
import warnings
from pathlib import Path


def _species_label(column):
    """Expression rewriting an organism column to "human"/"mouse" (else null)."""
    raw = pl.col(column)
    return (
        pl.when(raw == "homo sapiens")
        .then(pl.lit("human"))
        .when(raw == "mus musculus")
        .then(pl.lit("mouse"))
        .otherwise(None)
        .alias(column)
    )


def format_pdb_df(df):
    """Reduce a raw structure summary table to one normalized row per PDB.

    Steps, in order:
      1. Derive ``mhc_class`` ("I"/"II") from ``mhc_type`` ("MH1"/"MH2") and
         drop rows with any other ``mhc_type``.
      2. Drop rows missing either ``mhc_chain1`` or ``mhc_chain2``.
      3. Collapse to one row per ``pdb``, keeping the first value of every
         retained column and renaming the organism columns to ``*_species``.
      4. Map species to "human"/"mouse"; rows with any other species in any
         of the four species columns are dropped.
      5. Add the constant chain-role columns and ``cognate=True``.
      6. Keep only the first ``|``-separated entry of ``antigen_chain``,
         stripped of surrounding whitespace.

    Returns the resulting DataFrame.
    """
    # 1. MHC class from mhc_type; unrecognized types become null and are removed.
    mhc_type = pl.col("mhc_type")
    mhc_class = (
        pl.when(mhc_type == "MH1")
        .then(pl.lit("I"))
        .when(mhc_type == "MH2")
        .then(pl.lit("II"))
        .otherwise(None)
        .alias("mhc_class")
    )
    classified = df.with_columns(mhc_class).filter(
        pl.col("mhc_class").is_not_null()
    )

    # 2. Both MHC chains must be annotated.
    has_both_mhc_chains = (
        pl.col("mhc_chain1").is_not_null()
        & pl.col("mhc_chain2").is_not_null()
    )
    classified = classified.filter(has_both_mhc_chains)

    # 3. One row per PDB entry (first occurrence wins within each group).
    carried = [
        "Bchain",
        "Achain",
        "mhc_chain1",
        "mhc_chain2",
        "antigen_chain",
        "mhc_class",
    ]
    organism_to_species = {
        "mhc_chain1_organism": "mhc_1_species",
        "mhc_chain2_organism": "mhc_2_species",
        "alpha_organism": "tcr_1_species",
        "beta_organism": "tcr_2_species",
    }
    per_pdb = classified.group_by("pdb").agg(
        *[pl.col(name).first() for name in carried],
        *[
            pl.col(source).first().alias(target)
            for source, target in organism_to_species.items()
        ],
    )

    # 4. Normalize species names and require all four to be human or mouse.
    species_cols = list(organism_to_species.values())
    all_species_known = pl.col(species_cols[0]).is_not_null()
    for name in species_cols[1:]:
        all_species_known = all_species_known & pl.col(name).is_not_null()
    resolved = per_pdb.with_columns(
        *[_species_label(name) for name in species_cols]
    ).filter(all_species_known)

    # 5. Chain roles: alpha/beta for class II, heavy/light otherwise.
    is_class_ii = pl.col("mhc_class") == "II"
    labelled = resolved.with_columns(
        pl.when(is_class_ii)
        .then(pl.lit("alpha"))
        .otherwise(pl.lit("heavy"))
        .alias("mhc_1_chain"),
        pl.when(is_class_ii)
        .then(pl.lit("beta"))
        .otherwise(pl.lit("light"))
        .alias("mhc_2_chain"),
        pl.lit(True).alias("cognate"),
        pl.lit("alpha").alias("tcr_1_chain"),
        pl.lit("beta").alias("tcr_2_chain"),
    )

    # 6. antigen_chain may list several chains separated by "|"; keep the first.
    first_antigen_chain = (
        pl.col("antigen_chain").str.split("|").list.first().str.strip_chars()
    )
    return labelled.with_columns(first_antigen_chain.alias("antigen_chain"))


# Struct dtype (all string fields) used as the return_dtype of the per-row
# sequence lookup in format_seqs; field order matches get_fasta_seq's dict.
SEQ_STRUCT = pl.Struct(
    {
        field: pl.String
        for field in (
            "peptide",
            "mhc_1_seq",
            "mhc_2_seq",
            "tcr_1_seq",
            "tcr_2_seq",
        )
    }
)


def parse_chain(chain):
    """Return the chain ID to use for one chain token of a FASTA header.

    A token is either a bare ID (``"A"``, possibly padded with spaces) or an
    ID followed by an author-assigned ID in brackets (``"A[auth H]"``). When
    the bracketed form is present the author ID is returned in full, so
    multi-character IDs such as ``"A[auth AA]"`` yield ``"AA"``.

    Raises:
        IndexError: if the token contains ``"["`` but not ``"[auth "``.
    """
    if "[" in chain:
        # Everything between "[auth " and the closing "]" is the author ID.
        return chain.split("[auth ", 1)[1].split("]", 1)[0].strip()
    return chain.replace(" ", "")


def parse_fasta_description(description):
    """Extract the list of chain IDs from a ``|``-separated FASTA description.

    The second ``|`` field is expected to look like ``"Chain A"`` or
    ``"Chains A, B[auth D]"``. Each listed chain is normalized with
    ``parse_chain`` and returned as one list element (a multi-character ID
    stays a single element).

    Raises:
        IndexError: if the description has no second field, or the field
            starts with neither ``"Chain "`` nor contains ``"Chains "``.
    """
    chain_token = description.split("|")[1]

    if chain_token.startswith("Chain "):
        # Single chain: wrap in a list rather than list(str), which would
        # break a multi-character ID into individual characters.
        return [parse_chain(chain_token.split("Chain ", 1)[1])]

    chains = chain_token.split("Chains ", 1)[1].split(",")
    return [parse_chain(chain) for chain in chains]


def get_fasta_seq(
    pdb_id,
    antigen_chain_id,
    mhc_chain1_id,
    mhc_chain2_id,
    Achain_id,
    Bchain_id,
    timeout=60,
):
    """Fetch an entry's FASTA from RCSB and pick out the five chain sequences.

    Args:
        pdb_id: PDB entry ID appended to the RCSB FASTA URL.
        antigen_chain_id: chain ID of the peptide.
        mhc_chain1_id: chain ID of the first MHC chain.
        mhc_chain2_id: chain ID of the second MHC chain.
        Achain_id: chain ID stored under ``tcr_1_seq``.
        Bchain_id: chain ID stored under ``tcr_2_seq``.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            fails instead of blocking forever.

    Returns:
        dict with keys ``peptide``, ``mhc_1_seq``, ``mhc_2_seq``,
        ``tcr_1_seq`` and ``tcr_2_seq`` mapping to sequence strings.

    Raises:
        requests.HTTPError: if RCSB answers with an error status.
        KeyError: if any requested chain ID is absent from the FASTA.
    """
    r = requests.get(
        "https://www.rcsb.org/fasta/entry/" + pdb_id, timeout=timeout
    )
    r.raise_for_status()

    # One FASTA record can cover several chains; index the sequence under each.
    seq_dict = {}
    for fasta in SeqIO.parse(StringIO(r.text), "fasta"):
        sequence = str(fasta.seq)
        for chain in parse_fasta_description(fasta.description):
            seq_dict[chain] = sequence

    requested = {
        "peptide": antigen_chain_id,
        "mhc_1_seq": mhc_chain1_id,
        "mhc_2_seq": mhc_chain2_id,
        "tcr_1_seq": Achain_id,
        "tcr_2_seq": Bchain_id,
    }
    missing = {
        field: chain_id
        for field, chain_id in requested.items()
        if chain_id not in seq_dict
    }
    if missing:
        # Same exception type as a plain dict lookup, but says which entry
        # and which chains were not found.
        raise KeyError(
            f"PDB {pdb_id}: chain(s) {missing} not found in FASTA; "
            f"available chains: {sorted(seq_dict)}"
        )

    return {field: seq_dict[chain_id] for field, chain_id in requested.items()}


def format_seqs(df):
    """Append the five sequence columns of ``SEQ_STRUCT`` to ``df``.

    For every row, ``get_fasta_seq`` is called with the row's PDB ID and
    chain IDs; the returned struct is unnested into top-level columns.
    """

    def lookup_sequences(record):
        # `record` is the per-row struct built below, exposed as a dict.
        return get_fasta_seq(
            record["pdb"],
            record["antigen_chain"],
            record["mhc_chain1"],
            record["mhc_chain2"],
            record["Achain"],
            record["Bchain"],
        )

    id_columns = (
        "pdb",
        "Bchain",
        "Achain",
        "antigen_chain",
        "mhc_chain1",
        "mhc_chain2",
    )
    row_ids = pl.struct(*(pl.col(name) for name in id_columns))
    chain_seqs = row_ids.map_elements(
        lookup_sequences,
        return_dtype=SEQ_STRUCT,
    ).alias("chain_seqs")

    return df.with_columns(chain_seqs).unnest("chain_seqs")


def remove_peptide_from_chains(row):
    """Trim any chain sequence that contains the peptide as a substring.

    For each of the MHC and TCR sequence fields, if the peptide occurs in the
    sequence a warning is emitted and the field is replaced by the part of
    the sequence after the first occurrence of the peptide (the peptide and
    everything before it are discarded).

    Args:
        row: dict-like row with ``peptide``, ``pdb`` and the four ``*_seq``
            fields.

    Returns:
        A ``pl.DataFrame`` built from the (possibly trimmed) copy of ``row``.
    """
    peptide = row["peptide"]
    trimmed = row.copy()

    chain_fields = (
        ("mhc_1_seq", "MHC 1"),
        ("mhc_2_seq", "MHC 2"),
        ("tcr_1_seq", "TCR 1"),
        ("tcr_2_seq", "TCR 2"),
    )
    for field, label in chain_fields:
        sequence = row[field]
        if peptide not in sequence:
            continue
        warnings.warn(f"Peptide found in {label} sequence for PDB {row['pdb']}")
        # Keep only what follows the first occurrence of the peptide.
        cut_at = sequence.index(peptide) + len(peptide)
        trimmed[field] = trimmed[field][cut_at:]

    return pl.DataFrame(trimmed)


def infer_correct_mhc(row, human_conv, mouse_conv):
    """Annotate a row with the best-matching allele for each MHC chain.

    Each MHC sequence is looked up with ``get_mhc_allele(..., top_only=True)``
    on ``human_conv`` when the chain's species is "human", otherwise on
    ``mouse_conv``. The lookup result is read as a mapping with the keys
    ``seq``, ``name``, ``match_size``, ``sequence_status`` and
    ``max_resolution_name``.

    Args:
        row: dict-like row with ``mhc_{1,2}_seq``, ``mhc_{1,2}_species`` and
            ``mhc_{1,2}_chain``.
        human_conv: converter used for human chains.
        mouse_conv: converter used for every non-human chain.

    Returns:
        A ``pl.DataFrame`` built from a copy of ``row`` extended with the
        ``mhc_1_*`` / ``mhc_2_*`` match columns. Match proportions are
        ``match_size / len(sequence)``, or ``None`` when the match size is
        ``None`` or the sequence is empty.
    """

    def lookup(seq, species, chain):
        conv = human_conv if species == "human" else mouse_conv
        return conv.get_mhc_allele(seq, chain=chain, top_only=True)

    def match_proportion(match_size, seq):
        # Guard against a missing match and against dividing by zero when
        # the sequence is empty.
        if match_size is None or not seq:
            return None
        return match_size / len(seq)

    mhc1 = row["mhc_1_seq"]
    mhc2 = row["mhc_2_seq"]

    mhc_1_inf = lookup(mhc1, row["mhc_1_species"], row["mhc_1_chain"])
    mhc_2_inf = lookup(mhc2, row["mhc_2_species"], row["mhc_2_chain"])

    new_row = row.copy()

    new_row["mhc_1_match_seq"] = mhc_1_inf["seq"]
    new_row["mhc_1_name"] = mhc_1_inf["name"]
    new_row["mhc_1_match_size"] = mhc_1_inf["match_size"]
    new_row["mhc_1_match_proportion"] = match_proportion(
        mhc_1_inf["match_size"], mhc1
    )
    # Status of chain 1 must come from the chain-1 lookup (it was previously
    # copied from the chain-2 result).
    new_row["mhc_1_status"] = mhc_1_inf["sequence_status"]
    # NOTE(review): this column duplicates "mhc_2_maxres" below and looks like
    # it was meant to hold the chain-1 value; kept unchanged so existing
    # consumers of the column name keep working -- confirm intent.
    new_row["mhc_2_name_maxres"] = mhc_2_inf["max_resolution_name"]

    new_row["mhc_2_match_seq"] = mhc_2_inf["seq"]
    new_row["mhc_2_name"] = mhc_2_inf["name"]
    new_row["mhc_2_match_size"] = mhc_2_inf["match_size"]
    new_row["mhc_2_match_proportion"] = match_proportion(
        mhc_2_inf["match_size"], mhc2
    )
    new_row["mhc_2_status"] = mhc_2_inf["sequence_status"]
    new_row["mhc_2_maxres"] = mhc_2_inf["max_resolution_name"]
    return pl.DataFrame(new_row)


def download_pdb(row, path, timeout=60):
    """Download the structure file for ``row["pdb"]`` into ``path``.

    The ``.pdb`` file is requested first; if that request returns an HTTP
    error status, the ``.cif`` file is requested instead. The content is
    written to ``path / (<pdb id> + <suffix>)``.

    Args:
        row: dict-like row containing the ``pdb`` ID.
        path: directory (a ``pathlib.Path``) to write into; created if it
            does not exist yet.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            fails instead of blocking forever.

    Returns:
        ``pl.DataFrame(row)``, unchanged, so the function can be used with a
        row-wise apply helper.

    Raises:
        requests.HTTPError: if both the ``.pdb`` and ``.cif`` downloads fail.
    """
    pdb_id = row["pdb"]
    base_url = "https://files.rcsb.org/download/"

    suffix = ".pdb"
    r = requests.get(f"{base_url}{pdb_id}{suffix}", timeout=timeout)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        # No .pdb file is served for this entry; fall back to .cif.
        suffix = ".cif"
        r = requests.get(f"{base_url}{pdb_id}{suffix}", timeout=timeout)
        r.raise_for_status()

    path.mkdir(parents=True, exist_ok=True)
    with open(path / (pdb_id + suffix), "wb") as f:
        f.write(r.content)
    return pl.DataFrame(row)

### 1. Import triads from STCRpred


In [67]:
import polars as pl
from pathlib import Path

# Location of the IMGT/HLA data on the cluster filesystem.
IMGT_HLA_PATH = Path("/tgen_labs/altin/alphafold3/IMGTHLA")

# Force these two columns to be read as strings, and treat both spellings
# below as nulls, for every table.
schema_overrides = dict.fromkeys(("Gchain", "Dchain"), pl.String)
null_values = ["NA", "unknown"]


def _load_triad_table(tsv_path):
    """Read one tab-separated summary table and normalize it with format_pdb_df."""
    raw = pl.read_csv(
        tsv_path,
        schema_overrides=schema_overrides,
        null_values=null_values,
        separator="\t",
    )
    return format_pdb_df(raw)


pdb_human_I = _load_triad_table("raw/humanI.tsv")
pdb_human_II = _load_triad_table("raw/humanII.tsv")
pdb_mouse_I = _load_triad_table("raw/mouseI.tsv")
pdb_mouse_II = _load_triad_table("raw/mouseII.tsv")

### 2. Process and clean triads


In [69]:
# Attach peptide/MHC/TCR sequences to each table. format_seqs performs one
# HTTP request per row, so this cell needs network access; each table is
# rebound as soon as its own lookup finishes.
pdb_human_I = format_seqs(pdb_human_I)
pdb_human_II = format_seqs(pdb_human_II)
pdb_mouse_I = format_seqs(pdb_mouse_I)
pdb_mouse_II = format_seqs(pdb_mouse_II)

In [85]:
from tcr_format_parsers.common.MHCCodeConverter import (
    HLASequenceDBConverter,
    H2SequenceDictConverter,
)
from mdaf3.FeatureExtraction import split_apply_combine, serial_apply

human_conv = HLASequenceDBConverter(IMGT_HLA_PATH)
mouse_conv = H2SequenceDictConverter()


def _strip_peptide_and_type_mhc(table):
    """Run the two row-wise passes: peptide trimming, then MHC allele inference."""
    without_peptide = serial_apply(table, remove_peptide_from_chains)
    return serial_apply(
        without_peptide, infer_correct_mhc, human_conv, mouse_conv
    )


# Mouse tables are processed first, then the human ones.
pdb_mouse_I = _strip_peptide_and_type_mhc(pdb_mouse_I)
pdb_mouse_II = _strip_peptide_and_type_mhc(pdb_mouse_II)
pdb_human_I = _strip_peptide_and_type_mhc(pdb_human_I)
pdb_human_II = _strip_peptide_and_type_mhc(pdb_human_II)

Processing rows: 100%|██████████| 28/28 [00:00<00:00, 5972.36it/s]
Processing rows: 100%|██████████| 28/28 [02:34<00:00,  5.51s/it]
Processing rows: 100%|██████████| 18/18 [00:00<00:00, 5742.56it/s]
Processing rows: 100%|██████████| 18/18 [00:25<00:00,  1.44s/it]
Processing rows: 100%|██████████| 167/167 [00:00<00:00, 6336.04it/s]
Processing rows: 100%|██████████| 167/167 [56:05<00:00, 20.15s/it]
Processing rows: 100%|██████████| 33/33 [00:00<00:00, 5923.90it/s]
Processing rows: 100%|██████████| 33/33 [02:32<00:00,  4.61s/it]


In [86]:
# Display the human class I table for inspection.
pdb_human_I

pdb,Bchain,Achain,mhc_chain1,mhc_chain2,antigen_chain,mhc_class,mhc_1_species,mhc_2_species,tcr_1_species,tcr_2_species,mhc_1_chain,mhc_2_chain,cognate,tcr_1_chain,tcr_2_chain,peptide,mhc_1_seq,mhc_2_seq,tcr_1_seq,tcr_2_seq,mhc_1_match_seq,mhc_1_name,mhc_1_match_size,mhc_1_match_proportion,mhc_1_status,mhc_2_name_maxres,mhc_2_match_seq,mhc_2_name,mhc_2_match_size,mhc_2_match_proportion,mhc_2_status,mhc_2_maxres
str,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,null,null,str,str
"""8cx4""","""F""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""LRVMMLAPF""","""MGSHSMRYFHTSVSRPGRGEPRFITVGYVD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""KQEVTQIPAALSVPEGENLVLNCSFTDSAI…","""DSGVTQTPKHLITATGQRVTLRCSPRSGDL…","""GSHSMRYFHTSVSRPGRGEPRFITVGYVDD…","""B*27:03""",211,0.756272,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""3vxs""","""E""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""RYPLTLGWCF""","""MGSHSMRYFSTSVSRPGRGEPRFIAVGYVD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""MKQEVTQIPAALSVPEGENLVLNCSFTDSA…","""MDTGVSQNPRHKITKRGQNVTFRCDPISEH…","""GSHSMRYFSTSVSRPGRGEPRFIAVGYVDD…","""A*24:02""",274,0.996364,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""5e9d""","""J""","""I""","""F""","""G""","""H""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""ELAGIGILTV""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""QKEVEQNSGPLSVPEGAIASLNCTYSDRGS…","""MGSSHHHHHHSSGLVPRGSNAGVTQTPKFQ…","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""A*02:01""",275,1.0,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""3kpr""","""E""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""EEYLKAWTF""","""GSHSMRYFYTAMSRPGRGEPRFITVGYVDD…","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""KTTQPNSMESNEEEPVHLPCNHSTISGTDY…","""GVSQSPRYKVAKRGQDVALRCDPISGHVSL…","""GSHSMRYFYTAMSRPGRGEPRFITVGYVDD…","""B*44:05""",276,1.0,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""1bd2""","""E""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""LLFGYPVYV""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""QQVKQNSPSLSVQEGRISILNCDYTNSMFD…","""NAGVTQTPKFQVLKTGQSMTLQCAQDMNHE…","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""A*02:01""",275,1.0,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""6tmo""","""E""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""EAAGIGILTV""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""KQEVEQNSGPLSVPEGAIASLNCTYSFLGS…","""SQTIHQWPATLVQPVGSPLSLECTVEGTSN…","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""A*02:01""",276,1.0,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""4mji""","""J""","""I""","""F""","""G""","""H""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""TAFTIPSI""","""GSHSMRYFYTAMSRPGRGEPRFIAVGYVDD…","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""GEEDPQALSIQEGENATMNCSYKTSINNLQ…","""AGVSQTPSNKVTEKGKYVELRCDPISGHTA…","""GSHSMRYFYTAMSRPGRGEPRFIAVGYVDD…","""B*51:01""",276,1.0,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""7n2r""","""F""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""TRLALIAPK""","""GSHSMRYFHTSVSRPGRGEPRFITVGYVDD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""KQEVTQIPAALSVPEGENLVLNCSFTDSAI…","""GVTQTPKHLITATGQRVTLRCSPRSGDLSV…","""GSHSMRYFHTSVSRPGRGEPRFITVGYVDD…","""B*27:03""",211,0.758993,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""
"""4jfe""","""E""","""D""","""A""","""B""","""C""","""I""","""human""","""human""","""human""","""human""","""heavy""","""light""",true,"""alpha""","""beta""","""ELAGIGALTV""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""QKEVEQNSGPLSVPEGAIASLNCTYSFLGS…","""MSQTIHQWPATLVQPVGSPLSLECTVEGTS…","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""A*02:01""",276,1.0,"""Full""","""B2M""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…","""B2M""",,,"""Full""","""B2M"""


In [89]:
from tcr_format_parsers.common.TriadUtils import FORMAT_COLS, generate_job_name

# Stack the four tables; "vertical_relaxed" lets columns whose dtypes differ
# between tables be cast to a common supertype.
triad_tables = [pdb_human_I, pdb_human_II, pdb_mouse_I, pdb_mouse_II]
all_pdb_triads = pl.concat(triad_tables, how="vertical_relaxed")

# Columns kept in addition to FORMAT_COLS: PDB ID, MHC match details and the
# original chain identifiers.
extra_cols = [
    "pdb",
    "mhc_1_match_seq",
    "mhc_1_match_size",
    "mhc_1_match_proportion",
    "mhc_1_status",
    "mhc_2_match_seq",
    "mhc_2_match_size",
    "mhc_2_match_proportion",
    "mhc_2_status",
    "mhc_2_maxres",
    "antigen_chain",
    "mhc_chain1",
    "mhc_chain2",
    "Achain",
    "Bchain",
]
all_pdb_triads = generate_job_name(all_pdb_triads).select(
    FORMAT_COLS + extra_cols
)

# CSV: de-duplicated FORMAT_COLS only. Parquet: every selected column.
unique_triads = all_pdb_triads.select(FORMAT_COLS).unique()
unique_triads.write_csv("pdb_triads.csv")
all_pdb_triads.write_parquet("pdb_triads.parquet")

In [8]:
from mdaf3.FeatureExtraction import split_apply_combine, serial_apply

# Reload the saved triads and fetch one structure file per row into ./struct.
all_pdb_triads = pl.read_parquet("pdb_triads.parquet")

struct_dir = Path("struct")
serial_apply(all_pdb_triads, download_pdb, struct_dir)

Processing rows: 100%|██████████| 246/246 [03:32<00:00,  1.15it/s]


job_name,cognate,peptide,mhc_class,mhc_1_chain,mhc_1_species,mhc_1_name,mhc_1_seq,mhc_2_chain,mhc_2_species,mhc_2_name,mhc_2_seq,tcr_1_chain,tcr_1_species,tcr_1_seq,tcr_2_chain,tcr_2_species,tcr_2_seq,pdb,mhc_1_match_seq,mhc_1_match_size,mhc_1_match_proportion,mhc_1_status,mhc_2_match_seq,mhc_2_match_size,mhc_2_match_proportion,mhc_2_status,mhc_2_maxres,antigen_chain,mhc_chain1,mhc_chain2,Achain,Bchain
str,bool,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,str,str,str,str,str,str
"""d6d24a93fb90e1c196c1c3c87ce648…",true,"""LRVMMLAPF""","""I""","""heavy""","""human""","""B*27:03""","""MGSHSMRYFHTSVSRPGRGEPRFITVGYVD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""KQEVTQIPAALSVPEGENLVLNCSFTDSAI…","""beta""","""human""","""DSGVTQTPKHLITATGQRVTLRCSPRSGDL…","""8cx4""","""GSHSMRYFHTSVSRPGRGEPRFITVGYVDD…",211,0.756272,"""Full""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…",,,"""Full""","""B2M""","""C""","""A""","""B""","""D""","""F"""
"""81926c1d318e57bbee02e0aade2dca…",true,"""RYPLTLGWCF""","""I""","""heavy""","""human""","""A*24:02""","""MGSHSMRYFSTSVSRPGRGEPRFIAVGYVD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""MKQEVTQIPAALSVPEGENLVLNCSFTDSA…","""beta""","""human""","""MDTGVSQNPRHKITKRGQNVTFRCDPISEH…","""3vxs""","""GSHSMRYFSTSVSRPGRGEPRFIAVGYVDD…",274,0.996364,"""Full""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…",,,"""Full""","""B2M""","""C""","""A""","""B""","""D""","""E"""
"""c96440ae3c6acd523d7a82b04b3674…",true,"""ELAGIGILTV""","""I""","""heavy""","""human""","""A*02:01""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""QKEVEQNSGPLSVPEGAIASLNCTYSDRGS…","""beta""","""human""","""MGSSHHHHHHSSGLVPRGSNAGVTQTPKFQ…","""5e9d""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…",275,1.0,"""Full""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…",,,"""Full""","""B2M""","""H""","""F""","""G""","""I""","""J"""
"""c05c1a21eb12afd64036096571077d…",true,"""EEYLKAWTF""","""I""","""heavy""","""human""","""B*44:05""","""GSHSMRYFYTAMSRPGRGEPRFITVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""human""","""KTTQPNSMESNEEEPVHLPCNHSTISGTDY…","""beta""","""human""","""GVSQSPRYKVAKRGQDVALRCDPISGHVSL…","""3kpr""","""GSHSMRYFYTAMSRPGRGEPRFITVGYVDD…",276,1.0,"""Full""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…",,,"""Full""","""B2M""","""C""","""A""","""B""","""D""","""E"""
"""d56673afb5c30801f4b7623ffa3dec…",true,"""LLFGYPVYV""","""I""","""heavy""","""human""","""A*02:01""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""QQVKQNSPSLSVQEGRISILNCDYTNSMFD…","""beta""","""human""","""NAGVTQTPKFQVLKTGQSMTLQCAQDMNHE…","""1bd2""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…",275,1.0,"""Full""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…",,,"""Full""","""B2M""","""C""","""A""","""B""","""D""","""E"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""f9a7c27b38f6d26fc651083c4e31c2…",true,"""GGYRARPAKAAAT""","""II""","""alpha""","""human""","""DRA*01:01""","""IKEEHVIIQAEFYLNPDQSGEFMFDFDGDE…","""beta""","""human""","""DRB1*04:01""","""GDTRPRFLEQVKHECHFFNGTERVRFLDRY…","""alpha""","""mouse""","""GDSVTQTEGQVTVSESKSLIINCTYSATSI…","""beta""","""mouse""","""AVFQTPNYHVTQVGNEVSFNCKQTLGHDTM…","""6v1a""","""KEEHVIIQAEFYLNPDQSGEFMFDFDGDEI…",181,0.957672,"""Full""","""GDTRPRFLEQVKHECHFFNGTERVRFLDRY…","""190""","""0.9595959595959596""","""Full""","""DRB1*04:01:01:01""","""C""","""A""","""B""","""D""","""E"""
"""ac54c2405afa03c65178a7fed984ea…",true,"""GGYRARPAKAAAT""","""II""","""alpha""","""human""","""DRA*01:01""","""IKEEHVIIQAEFYLNPDQSGEFMFDFDGDE…","""beta""","""human""","""DRB1*04:01""","""GDTRPRFLEQVKHECHFFNGTERVRFLDRY…","""alpha""","""mouse""","""GDSVTQTEGQVTVSESKSLIINCTYSATSI…","""beta""","""mouse""","""AVFQTPNYHVTQVGNEVSFNCKQTLGHDTM…","""6v18""","""KEEHVIIQAEFYLNPDQSGEFMFDFDGDEI…",181,0.957672,"""Full""","""GDTRPRFLEQVKHECHFFNGTERVRFLDRY…","""190""","""0.9595959595959596""","""Full""","""DRB1*04:01:01:01""","""C""","""A""","""B""","""D""","""E"""
"""427d74361d2c1d7b8d2556249ee3ab…",true,"""ADPADPLAFFSSAIKGGGGSLV""","""II""","""alpha""","""mouse""","""H2-IAg7""","""ADPIKEEHTIIQAEFYLLPDKRGEFMFDFD…","""beta""","""mouse""","""H2-IAb""","""PRGSGGGGSRPWFLEYCKSECHFYNGTQRV…","""alpha""","""mouse""","""ADPGRGDQVEQSPSALSLHEGTGSALRCNF…","""beta""","""mouse""","""ADPKVIQTPRYLVKGQGQKAKMRCIPEKGH…","""4p2o""","""EDDIEADHVGFYGTTVYQSPGDIGQYTHEF…",7,0.034314,"""Full""","""GNSERHFVVQFKGECYYTNGTQRIRLVTRY…","""12""","""0.056074766355140186""","""Full""","""H2-IAb""","""P""","""A""","""B""","""C""","""D"""
"""5e5f317e7980570a197d5cd1f22b29…",true,"""ADGLAYFRSSFKGG""","""II""","""alpha""","""mouse""","""H2-IAg7""","""ADPIKEEHTIIQAEFYLLPDKRGEFMFDFD…","""beta""","""mouse""","""H2-IAb""","""GSGGGGSRPWFLEYCKSECHFYNGTQRVRL…","""alpha""","""mouse""","""MRGDQVEQSPSALSLHEGTGSALRCNFTTT…","""beta""","""mouse""","""GGGSGGSGGKVIQTPRYLVKGQGQKAKMRC…","""4p2q""","""EDDIEADHVGFYGTTVYQSPGDIGQYTHEF…",7,0.034314,"""Full""","""GNSERHFVVQFKGECYYTNGTQRIRLVTRY…","""12""","""0.05660377358490566""","""Full""","""H2-IAb""","""H""","""F""","""G""","""I""","""J"""


In [9]:
# Inspect the row(s) belonging to a single PDB entry, 8shi.
all_pdb_triads.filter(pl.col("pdb") == "8shi")

job_name,cognate,peptide,mhc_class,mhc_1_chain,mhc_1_species,mhc_1_name,mhc_1_seq,mhc_2_chain,mhc_2_species,mhc_2_name,mhc_2_seq,tcr_1_chain,tcr_1_species,tcr_1_seq,tcr_2_chain,tcr_2_species,tcr_2_seq,pdb,mhc_1_match_seq,mhc_1_match_size,mhc_1_match_proportion,mhc_1_status,mhc_2_match_seq,mhc_2_match_size,mhc_2_match_proportion,mhc_2_status,mhc_2_maxres,antigen_chain,mhc_chain1,mhc_chain2,Achain,Bchain
str,bool,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,str,str,str,str,str,str,str,str,str,str,str
"""a825a9509fbd53a6838631ea8d3abf…",True,"""VRSRRALRL""","""I""","""heavy""","""human""","""C*06:02""","""MSSHSMRYFDTAVSRPGRGEPRFISVGYVD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""MSQQGEEDPQALSIQEGENATMNCSYKTSI…","""beta""","""human""","""MGVTQTPKFQVLKTGQSMTLQCAQDMNHEY…","""8shi""","""CSHSMRYFDTAVSRPGRGEPRFISVGYVDD…",275,0.99278,"""Full""","""MSRSVALAVLALLSLSGLEAIQRTPKIQVY…",,,"""Full""","""B2M""","""F""","""D""","""E""","""I""","""J"""
