https://github.com/phbradley/TCRdock/blob/main/datasets_from_the_paper/table_S1_structure_benchmark_complexes.csv


### 0. Setup: helper methods


In [32]:
import requests
from Bio import SeqIO
from io import StringIO
import polars as pl
from tcr_format_parsers.common.MHCCodeConverter import (
    HLASequenceDBConverter,
    H2SequenceDictConverter,
)
from tcr_format_parsers.common.TCRUtils import standardize_tcr
import warnings
from pathlib import Path
import MDAnalysis as mda

def format_pdb_df(df):
    """Collapse a per-chain structure summary table to one normalized row per PDB.

    Steps, in order:
      1. derive ``mhc_class`` ("I"/"II") from ``mhc_type`` ("MH1"/"MH2");
      2. group by ``pdb`` and keep the first non-null value of each chain-ID,
         class and organism column (organism columns are renamed to
         ``*_species``);
      3. map "homo sapiens"/"mus musculus" to "human"/"mouse" (anything else
         becomes null);
      4. add chain-role labels, a constant ``cognate=True`` flag and the TCR
         chain labels;
      5. keep only the first ``|``-separated antigen chain ID, stripped.

    Args:
        df: polars DataFrame with the columns referenced below.

    Returns:
        polars DataFrame with one row per ``pdb``. ``group_by`` is called
        without ``maintain_order``, so row order is not controlled here.
    """

    def _first_non_null(column):
        # First non-null value of `column` within each group.
        return pl.col(column).drop_nulls().first()

    def _species_label(column):
        # Latin species name -> short label; unrecognised values become null.
        return (
            pl.when(pl.col(column) == "homo sapiens")
            .then(pl.lit("human"))
            .when(pl.col(column) == "mus musculus")
            .then(pl.lit("mouse"))
            .otherwise(None)
            .alias(column)
        )

    with_class = df.with_columns(
        pl.when(pl.col("mhc_type") == "MH1")
        .then(pl.lit("I"))
        .when(pl.col("mhc_type") == "MH2")
        .then(pl.lit("II"))
        .otherwise(None)
        .alias("mhc_class"),
    )

    # A filter requiring both mhc_chain1 and mhc_chain2 to be non-null was
    # deliberately disabled here in the original notebook.

    per_entry = with_class.group_by("pdb").agg(
        _first_non_null("Bchain"),
        _first_non_null("Achain"),
        _first_non_null("mhc_chain1"),
        _first_non_null("mhc_chain2"),
        _first_non_null("antigen_chain"),
        _first_non_null("mhc_class"),
        _first_non_null("mhc_chain1_organism").alias("mhc_1_species"),
        _first_non_null("mhc_chain2_organism").alias("mhc_2_species"),
        _first_non_null("alpha_organism").alias("tcr_1_species"),
        _first_non_null("beta_organism").alias("tcr_2_species"),
    )

    normalized = per_entry.with_columns(
        _species_label("mhc_1_species"),
        _species_label("mhc_2_species"),
        _species_label("tcr_1_species"),
        _species_label("tcr_2_species"),
    )

    # Class II chains are labelled alpha/beta; everything else heavy/light.
    is_class_two = pl.col("mhc_class") == "II"
    labelled = normalized.with_columns(
        pl.when(is_class_two)
        .then(pl.lit("alpha"))
        .otherwise(pl.lit("heavy"))
        .alias("mhc_1_chain"),
        pl.when(is_class_two)
        .then(pl.lit("beta"))
        .otherwise(pl.lit("light"))
        .alias("mhc_2_chain"),
        pl.lit(True).alias("cognate"),
        pl.lit("alpha").alias("tcr_1_chain"),
        pl.lit("beta").alias("tcr_2_chain"),
    )

    # antigen_chain may list several chains separated by "|"; keep the first.
    first_antigen_chain = (
        pl.col("antigen_chain").str.split("|").list.first().str.strip_chars()
    )
    return labelled.with_columns(first_antigen_chain.alias("antigen_chain"))


def get_pdb_date(row):
    """Look up the initial release date of ``row["pdb"]`` from the RCSB data API.

    Args:
        row: dict-like with a ``"pdb"`` entry; it is copied, not mutated.

    Returns:
        One-row polars DataFrame built from the copied row plus a ``pdb_date``
        column parsed to a datetime.

    Raises:
        requests.HTTPError: if the RCSB request returns an error status.
    """
    response = requests.get(
        "https://data.rcsb.org/rest/v1/core/entry/" + row["pdb"]
    )
    response.raise_for_status()

    stamped = row.copy()
    accession_info = response.json()["rcsb_accession_info"]
    stamped["pdb_date"] = accession_info["initial_release_date"]

    parsed_date = pl.col("pdb_date").str.to_datetime().alias("pdb_date")
    return pl.DataFrame(stamped).with_columns(parsed_date)


def parse_chain(chain):
    """Extract a chain ID from one chain token of an RCSB FASTA header.

    Tokens without a bracket (e.g. ``" B"``) are returned with all spaces
    removed. Tokens of the form ``"A[auth X...]"`` yield the FIRST character
    following ``"[auth "``.

    Args:
        chain: a single chain token.

    Returns:
        The chain ID string.
    """
    if "[" not in chain:
        return chain.replace(" ", "")

    # Only one character is kept. To support multi-letter author chain IDs,
    # use `chain.split("[auth ")[1].split("]")[0]` instead.
    author_label = chain.split("[auth ")[1]
    return author_label[0]


def parse_fasta_description(description):
    """Return the chain IDs named in an RCSB FASTA record description.

    The second ``|``-separated field is expected to read either
    ``"Chain X"`` or ``"Chains X, Y, ..."``; each chain token is passed
    through ``parse_chain``.

    Args:
        description: full FASTA description line of one record.

    Returns:
        List of chain ID strings.
    """
    chain_field = description.split("|")[1]

    if not chain_field.startswith("Chain "):
        # Plural form: comma-separated tokens after "Chains ".
        tokens = chain_field.split("Chains ")[1].split(",")
        return [parse_chain(token) for token in tokens]

    # Singular form: list() splits the parsed ID into its characters.
    return list(parse_chain(chain_field.split("Chain ")[1]))


def get_fasta_seq(
    pdb_id,
    antigen_chain_id,
    mhc_chain1_id,
    mhc_chain2_id,
    Achain_id,
    Bchain_id,
):
    """Download the RCSB FASTA for ``pdb_id`` and pick out five chain sequences.

    Args:
        pdb_id: PDB entry ID appended to the RCSB FASTA URL.
        antigen_chain_id, mhc_chain1_id, mhc_chain2_id, Achain_id, Bchain_id:
            chain IDs to look up, or None for "no such chain".

    Returns:
        Dict with keys ``peptide_seq``, ``mhc_1_seq``, ``mhc_2_seq``,
        ``tcr_1_seq`` and ``tcr_2_seq``; a None chain ID maps to ``""``.

    Raises:
        requests.HTTPError: if the download returns an error status.
        KeyError: if a non-None chain ID is absent from the FASTA file.
    """
    response = requests.get("https://www.rcsb.org/fasta/entry/" + pdb_id)
    response.raise_for_status()

    # One FASTA record may cover several chains; map every chain to its
    # sequence. Later records overwrite earlier ones for a repeated ID.
    seq_by_chain = {
        chain: str(record.seq)
        for record in SeqIO.parse(StringIO(response.text), "fasta")
        for chain in parse_fasta_description(record.description)
    }

    def _sequence_for(chain_id):
        # Empty string stands in for a missing chain.
        if chain_id is None:
            return ""
        return seq_by_chain[chain_id]

    return {
        "peptide_seq": _sequence_for(antigen_chain_id),
        "mhc_1_seq": _sequence_for(mhc_chain1_id),
        "mhc_2_seq": _sequence_for(mhc_chain2_id),
        "tcr_1_seq": _sequence_for(Achain_id),
        "tcr_2_seq": _sequence_for(Bchain_id),
    }


# Struct dtype with one String field per sequence key returned by
# get_fasta_seq; used as the map_elements return dtype in format_seqs.
SEQ_STRUCT = pl.Struct(
    {
        field_name: pl.String
        for field_name in (
            "peptide_seq",
            "mhc_1_seq",
            "mhc_2_seq",
            "tcr_1_seq",
            "tcr_2_seq",
        )
    }
)

def format_seqs(df, skip_peptide=False):
    """Attach per-chain sequences fetched from RCSB to every row of ``df``.

    For each row, ``get_fasta_seq`` is called with the row's PDB ID and chain
    IDs; the resulting struct is unnested into the columns ``peptide_seq``,
    ``mhc_1_seq``, ``mhc_2_seq``, ``tcr_1_seq`` and ``tcr_2_seq``, and empty
    strings in those columns are turned into nulls.

    Args:
        df: polars DataFrame with ``pdb``, ``Bchain``, ``Achain``,
            ``antigen_chain``, ``mhc_chain1`` and ``mhc_chain2`` columns.
        skip_peptide: accepted but not used by this function.

    Returns:
        ``df`` with the five sequence columns added.
    """
    sequence_columns = (
        "peptide_seq",
        "mhc_1_seq",
        "mhc_2_seq",
        "tcr_1_seq",
        "tcr_2_seq",
    )

    def _fetch(ids):
        return get_fasta_seq(
            ids["pdb"],
            ids["antigen_chain"],
            ids["mhc_chain1"],
            ids["mhc_chain2"],
            ids["Achain"],
            ids["Bchain"],
        )

    chain_ids = pl.struct(
        pl.col("pdb"),
        pl.col("Bchain"),
        pl.col("Achain"),
        pl.col("antigen_chain"),
        pl.col("mhc_chain1"),
        pl.col("mhc_chain2"),
    )

    # skip_nulls=False so the callback also runs when chain IDs are null.
    fetched = df.with_columns(
        chain_ids.map_elements(
            _fetch,
            return_dtype=SEQ_STRUCT,
            skip_nulls=False,
        ).alias("chain_seqs"),
    ).unnest("chain_seqs")

    # get_fasta_seq uses "" for a missing chain; store that as null instead.
    blank_to_null = [
        pl.when(pl.col(name) == "")
        .then(pl.lit(None))
        .otherwise(pl.col(name))
        .alias(name)
        for name in sequence_columns
    ]

    return fetched.with_columns(*blank_to_null)


def remove_peptide_from_chains(row):
    """Trim a peptide that is embedded in an MHC or TCR chain sequence.

    For each of ``mhc_1_seq``, ``mhc_2_seq``, ``tcr_1_seq`` and ``tcr_2_seq``
    that is not None and contains ``row["peptide"]``, a warning is emitted and
    the sequence is replaced by the part that follows the first occurrence of
    the peptide (the peptide and everything before it are dropped).

    Args:
        row: dict-like row; it is copied, not mutated.

    Returns:
        One-row polars DataFrame built from the (possibly trimmed) copy.
    """
    trimmed = row.copy()

    chain_labels = (
        ("mhc_1_seq", "MHC 1"),
        ("mhc_2_seq", "MHC 2"),
        ("tcr_1_seq", "TCR 1"),
        ("tcr_2_seq", "TCR 2"),
    )

    for seq_key, label in chain_labels:
        sequence = row[seq_key]
        if sequence is None:
            continue

        peptide = row["peptide"]
        if peptide not in sequence:
            continue

        start = sequence.index(peptide)
        warnings.warn(
            f"Peptide found in {label} sequence for PDB {row['pdb']} at position {start}"
        )
        trimmed[seq_key] = trimmed[seq_key][start + len(peptide) :]

    return pl.DataFrame(trimmed)


def infer_correct_mhc(row, human_conv, mouse_conv):
    """Annotate a row with the MHC alleles inferred from its two MHC sequences.

    The converter is chosen by ``row["organism"]`` (``"human"`` uses
    ``human_conv``, anything else ``mouse_conv``) and queried via
    ``get_mhc_allele(seq, chain=..., top_only=True)``. The result is read
    through the keys ``seq``, ``name``, ``match_size``, ``sequence_status``
    and ``max_resolution_name``.

    A human class I row without an ``mhc_2_seq`` is not sent to the converter
    for chain 2; all of its chain-2 annotations are None.

    Args:
        row: dict-like row with ``mhc_1_seq``, ``mhc_2_seq``, ``organism``,
            ``mhc_class``, ``mhc_1_chain`` and ``mhc_2_chain``; copied, not
            mutated.
        human_conv: converter used for human rows.
        mouse_conv: converter used for all other rows (may be None when no
            such row is processed).

    Returns:
        One-row polars DataFrame: the row plus ``mhc_1_match_seq``,
        ``mhc_1_name``, ``mhc_1_match_size``, ``mhc_1_match_proportion``,
        ``mhc_1_status``, ``mhc_1_name_maxres`` and ``mhc_2_match_seq``,
        ``mhc_2_name``, ``mhc_2_match_size``, ``mhc_2_match_proportion``,
        ``mhc_2_status``, ``mhc_2_maxres``.
    """
    mhc1 = row["mhc_1_seq"]
    mhc2 = row["mhc_2_seq"]

    is_human = row["organism"] == "human"
    conv = human_conv if is_human else mouse_conv

    mhc_1_inf = conv.get_mhc_allele(
        mhc1, chain=row["mhc_1_chain"], top_only=True
    )

    if is_human and row["mhc_class"] == "I" and mhc2 is None:
        # No light-chain sequence to match. The placeholder must expose the
        # same keys as a converter result because they are read below.
        mhc_2_inf = {
            "seq": None,
            "name": None,
            "match_size": None,
            "sequence_status": None,
            "max_resolution_name": None,
        }
    else:
        mhc_2_inf = conv.get_mhc_allele(
            mhc2, chain=row["mhc_2_chain"], top_only=True
        )

    def _match_proportion(inference, query_seq):
        # Fraction of the query covered by the match; None without a match.
        if inference["match_size"] is None:
            return None
        return inference["match_size"] / len(query_seq)

    new_row = row.copy()

    new_row["mhc_1_match_seq"] = mhc_1_inf["seq"]
    new_row["mhc_1_name"] = mhc_1_inf["name"]
    new_row["mhc_1_match_size"] = mhc_1_inf["match_size"]
    new_row["mhc_1_match_proportion"] = _match_proportion(mhc_1_inf, mhc1)
    new_row["mhc_1_status"] = mhc_1_inf["sequence_status"]
    new_row["mhc_1_name_maxres"] = mhc_1_inf["max_resolution_name"]

    new_row["mhc_2_match_seq"] = mhc_2_inf["seq"]
    new_row["mhc_2_name"] = mhc_2_inf["name"]
    new_row["mhc_2_match_size"] = mhc_2_inf["match_size"]
    new_row["mhc_2_match_proportion"] = _match_proportion(mhc_2_inf, mhc2)
    new_row["mhc_2_status"] = mhc_2_inf["sequence_status"]
    # NOTE(review): column is named "mhc_2_maxres" while chain 1 uses
    # "mhc_1_name_maxres"; kept as-is so existing consumers keep working.
    new_row["mhc_2_maxres"] = mhc_2_inf["max_resolution_name"]
    return pl.DataFrame(new_row)


def download_pdb(row, path):
    """Download the structure file for ``row["pdb"]`` from RCSB into ``path``.

    The ``.pdb`` file is requested first; if that request returns an HTTP
    error status, the ``.cif`` file is requested instead. The file is written
    as ``<pdb><suffix>`` inside ``path``.

    Args:
        row: dict-like row with a ``"pdb"`` entry.
        path: directory (``pathlib.Path``) to write into.

    Returns:
        One-row polars DataFrame built from ``row`` (unchanged).

    Raises:
        requests.HTTPError: if the ``.cif`` fallback also fails.
    """
    pdb_id = row["pdb"]

    suffix = ".pdb"
    r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb")
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # No PDB-format file served for this entry; fall back to mmCIF.
        suffix = ".cif"
        r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.cif")
        r.raise_for_status()

    with open(path / (pdb_id + suffix), "wb") as f:
        f.write(r.content)
    return pl.DataFrame(row)

def get_true_mda_universe(pdb_id, root_path):
    """Load the downloaded structure for ``pdb_id`` as an MDAnalysis Universe.

    Args:
        pdb_id: PDB entry ID (file stem).
        root_path: directory (``pathlib.Path``) holding the structure files.

    Returns:
        ``mda.Universe`` read from ``<pdb_id>.pdb`` if that file exists,
        otherwise from ``<pdb_id>.cif``.
    """
    # Favor PDB since it doesn't have multiple residue with same ID issue
    has_pdb = (root_path / (pdb_id + ".pdb")).exists()
    suffix = ".pdb" if has_pdb else ".cif"

    return mda.Universe((root_path / (pdb_id + suffix)).as_posix())

def update_df_from_k_v(
    df,
    primary_key_colname,
    primary_key,
    k,
    v,
):
    """Set column ``k`` to the literal ``v`` on rows whose key equals ``primary_key``.

    The frame is split into matching and non-matching rows, the matching part
    gets ``k`` overwritten, and the two parts are concatenated again with
    ``how="vertical_relaxed"``. Matching rows therefore come first in the
    result.

    Args:
        df: polars DataFrame to patch.
        primary_key_colname: name of the key column.
        primary_key: key value selecting the rows to patch.
        k: name of the column to set.
        v: literal value to write (may be None).

    Returns:
        The patched polars DataFrame.
    """
    key = pl.col(primary_key_colname)

    patched_rows = df.filter(key == primary_key).with_columns(
        pl.lit(v).alias(k)
    )
    # NOTE(review): rows with a null key satisfy neither filter - confirm the
    # key column never contains nulls.
    other_rows = df.filter(key != primary_key)

    return pl.concat([patched_rows, other_rows], how="vertical_relaxed")

### 1. Import data, query IEDB locally


In [None]:
import polars as pl
from tcr_format_parsers.common.TriadUtils import FORMAT_ANTIGEN_COLS
from tcr_format_parsers.common.TCRUtils import hash_tcr_sequence
import polars as pl
from pathlib import Path
from datetime import datetime, timezone
from mdaf3.FeatureExtraction import serial_apply

# Directory handed to HLASequenceDBConverter later in the notebook
# (presumably a local IMGT/HLA checkout - site-specific path).
IMGT_HLA_PATH = Path("/tgen_labs/altin/alphafold3/IMGTHLA")



# Force these two chain-ID columns to be read as strings.
schema_overrides = {
    "Gchain": pl.String,
    "Dchain": pl.String,
}
# Tokens in the summary file that are read as null.
null_values = ["NA", "unknown", "NOT"]

# Tab-separated structure summary -> one normalized row per PDB entry.
all_pdb = format_pdb_df(
    pl.read_csv(
        "raw/db_summary.dat",
        schema_overrides=schema_overrides,
        null_values=null_values,
        separator="\t",
    )
)

# Adds a `pdb_date` column; get_pdb_date issues one RCSB request per row.
all_pdb = serial_apply(
    all_pdb,
    get_pdb_date,
)

# Benchmark table from the TCRdock paper: rename the ID column to `pdb` and
# recode the numeric MHC class (1 -> "I", anything else -> "II").
phil_pdb = (
    pl.read_csv(
        "raw/table_S1_structure_benchmark_complexes.csv",
    )
    .rename({"pdbid": "pdb"})
    .with_columns(
        pl.when(pl.col("mhc_class") == 1)
        .then(pl.lit("I"))
        .otherwise(pl.lit("II"))
        .alias("mhc_class"),
    )
)

Processing rows: 100%|██████████| 634/634 [00:33<00:00, 18.88it/s]


### 2. Recreating Phil Bradley's eLife dataset


#### Rows STCRDab found


In [None]:
# Benchmark entries that are also present in the structure summary table.
stcr_dat_phil = phil_pdb.join(all_pdb, on="pdb", how="inner")

# PDB IDs of class I entries without a B2M chain (going by the variable name).
# NOTE(review): this list is not referenced anywhere else in the visible
# notebook.
mhc_1_missing_b2m = [
    "4n0c",
    "4mvb",
    "3tpu",
    "3tf7",
    "4n5e",
    "4mxq",
    "3tfk",
    "2oi9",
]

# Per-PDB column overrides applied BEFORE sequences are fetched, i.e. they
# change which chain IDs format_seqs looks up.
pre_fasta_corrections = {
    "3tf7": {"mhc_chain1": "E", "Achain": None, "Bchain": None},
    # antigen chain located on MHC 2 / TCR 2 chain
    # when we later look for the peptide we can find it by aligning to this chain
    "6bga": {"antigen_chain": "B"},
    "3pl6": {"antigen_chain": "D"},
    "3o6f": {"antigen_chain": "B"},
    "6dfw": {"antigen_chain": "D"},
    "3c5z": {"antigen_chain": "D"},
    "3rdt": {"antigen_chain": "D"},
    "6dfx": {"antigen_chain": "E"},
    "3c60": {"antigen_chain": "D"},
    "4grl": {"antigen_chain": "D"},
    "6mnn": {"antigen_chain": "D"},
    "6dfs": {"antigen_chain": "D"},
    "4p4k": {"antigen_chain": "B"},
    "4may": {"antigen_chain": "D"},
}

# Per-PDB column overrides applied AFTER sequences are fetched, i.e. they
# replace fetched values directly.
post_fasta_corrections = {
    # 3tf7 has 1 tcrab pair bound together with linker
    "3tf7": {
        "tcr_1_seq": "MGAQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQMLLKYYSGDPVVQGVNGFEAEFSKSDSSFHLRKASVHRSDSAVYFCAVSAKGTGSKLSFGKGAKLTVSP",
        "tcr_2_seq": "SEAAVTQSPRNKVTVTGENVTLSCRQTNSHNYMYWYRQDTGHELRLIYYSYGAGNLQIGDVPDGYKATRTTQEDFFLTLESASPSQTSLYFCASSDAPGQLYFGEGSKLTVLELEHHHHHH",
    }
}


# Apply the pre-fetch overrides one (pdb, column, value) at a time.
for pdb_id, correction in pre_fasta_corrections.items():

    for k, v in correction.items():
        stcr_dat_phil = update_df_from_k_v(
            stcr_dat_phil,
            "pdb",
            pdb_id,
            k,
            v,
        )

# Fetch chain sequences from RCSB (one request per row).
stcr_dat_phil = format_seqs(stcr_dat_phil)

# Apply the post-fetch overrides.
for pdb_id, correction in post_fasta_corrections.items():
    for k, v in correction.items():
        stcr_dat_phil = update_df_from_k_v(
            stcr_dat_phil,
            "pdb",
            pdb_id,
            k,
            v,
        )

#### Row STCRDab could not find


In [None]:
# Release date of 6l9l, fetched directly from RCSB.
exclusion_pdb_date = get_pdb_date({"pdb": "6l9l"}).select("pdb_date").item()

# Benchmark rows with no match in `all_pdb` (anti join). Their chain IDs and
# metadata are filled in by hand, then sequences are fetched as for the others.
# NOTE(review): the hard-coded values and the single 6l9l date assume the anti
# join yields exactly that one entry - confirm.
exclusion = format_seqs(
    phil_pdb.join(all_pdb, on="pdb", how="anti").with_columns(
        Bchain=pl.lit("D"),
        Achain=pl.lit("C"),
        mhc_chain1=pl.lit("A"),
        mhc_chain2=pl.lit(None, dtype=pl.String),
        antigen_chain=pl.lit("B"),
        mhc_1_species=pl.lit("mouse"),
        mhc_2_species=pl.lit(None, dtype=pl.String),
        tcr_1_species=pl.lit(None, dtype=pl.String),
        tcr_2_species=pl.lit(None, dtype=pl.String),
        mhc_1_chain=pl.lit("heavy"),
        mhc_2_chain=pl.lit(None, dtype=pl.String),
        cognate=pl.lit(True),
        tcr_1_chain=pl.lit("alpha"),
        tcr_2_chain=pl.lit("beta"),
        pdb_date=exclusion_pdb_date,
    )
)

# Recombine the hand-filled rows with the joined rows. `mhc_class_right` is the
# duplicate class column produced by the earlier join and is dropped here.
phil_pdb = pl.concat(
    [
        exclusion,
        stcr_dat_phil.select(pl.exclude("mhc_class_right")),
    ],
    how="vertical_relaxed",
)

In [67]:
# Class II rows: split the comma-separated `mhc` string into two allele names.
# With exactly two parts they become mhc_1_name / mhc_2_name; otherwise the
# first part is used as mhc_2_name and mhc_1_name is left null.
phil_pdb_II = (
    phil_pdb.filter(pl.col("mhc_class") == "II")
    .with_columns(pl.col("mhc").str.split(",").alias("split_parts"))
    .with_columns(
        pl.when(pl.col("split_parts").list.len() == 2)
        .then(
            pl.struct(
                pl.col("split_parts")
                .list.get(0, null_on_oob=True)
                .alias("mhc_1_name"),
                pl.col("split_parts")
                .list.get(1, null_on_oob=True)
                .alias("mhc_2_name"),
            )
        )
        .otherwise(
            pl.struct(
                pl.lit(None).alias("mhc_1_name"),
                pl.col("split_parts").list.get(0).alias("mhc_2_name"),
            )
        )
        .alias("mhc_struct")
    )
    .unnest("mhc_struct")
)

# Class I rows: same split, but with a single part the first part is the
# mhc_1_name and mhc_2_name is set to the literal "B2M".
# NOTE(review): the fallback struct lists its fields in the opposite order
# (mhc_2_name first) to the two-part struct - confirm polars reconciles the
# two branches by field name.
phil_pdb_I = (
    phil_pdb.filter(pl.col("mhc_class") == "I")
    .with_columns(pl.col("mhc").str.split(",").alias("split_parts"))
    .with_columns(
        pl.when(pl.col("split_parts").list.len() == 2)
        .then(
            pl.struct(
                pl.col("split_parts")
                .list.get(0, null_on_oob=True)
                .alias("mhc_1_name"),
                pl.col("split_parts")
                .list.get(1, null_on_oob=True)
                .alias("mhc_2_name"),
            )
        )
        .otherwise(
            pl.struct(
                pl.lit("B2M").alias("mhc_2_name"),
                pl.col("split_parts").list.get(0).alias("mhc_1_name"),
            )
        )
        .alias("mhc_struct")
    )
    .unnest("mhc_struct")
)

In [68]:
# Stack the class I and class II rows back into a single table.
phil_pdb = pl.concat([phil_pdb_I, phil_pdb_II])

In [69]:
from tcr_format_parsers.common.TriadUtils import FORMAT_COLS, generate_job_name


# generate_job_name comes from tcr_format_parsers; presumably it adds the
# `job_name` column seen in the tables printed further down - confirm.
phil_pdb = generate_job_name(phil_pdb)

# Keep the shared FORMAT_COLS plus the benchmark-specific columns.
phil_pdb = phil_pdb.select(
    FORMAT_COLS
    + [
        "pdb",
        "pdb_date",
        "va",
        "ja",
        "cdr3a",
        "vb",
        "jb",
        "cdr3b",
        "antigen_chain",
        "mhc_chain1",
        "mhc_chain2",
        "Achain",
        "Bchain",
        "organism",
        "cdr_rmsd",
        "cdr_rmsd_af2_full",
        "cdr_rmsd_af2_trim",
    ]
)

# Trim chains in which the peptide sequence is embedded (emits warnings).
phil_pdb = serial_apply(phil_pdb, remove_peptide_from_chains)

Processing rows: 100%|██████████| 130/130 [00:00<00:00, 3213.67it/s]


In [73]:
# Persist the assembled benchmark table.
phil_pdb.write_csv(
    "elife_replicate/phil_pdb.csv",
)

In [4]:
# Reload the benchmark table from disk (dtypes are re-inferred from the CSV).
phil_pdb = pl.read_csv(
    "elife_replicate/phil_pdb.csv",
)

In [None]:
# Write a three-column table (pdbid, mhc_class, organism) with one row per
# (PDB, replicate 1-5): mhc_class is recoded to 1/2 and pdbid becomes
# "<pdb>_<replicate>".
(
    phil_pdb.with_columns(
        [
            pl.when(pl.col("mhc_class") == "I")
            .then(pl.lit(1))
            .otherwise(pl.lit(2))
            .alias("mhc_class"),
            pl.lit([1, 2, 3, 4, 5]).alias("replicate"),
        ]
    )
    .explode("replicate")
    .with_columns(
        (pl.col("pdb") + "_" + pl.col("replicate").cast(pl.Utf8)).alias(
            "pdbid"
        )
    )
    .select("pdbid", "mhc_class", "organism")
    .write_csv("elife_replicate/phil_pdb_tcrdock_by_seed.csv")
)

### Convert replication inference CIFs into TCRDock format


In [7]:
import MDAnalysis as mda
from mdaf3.FeatureExtraction import *
from mdaf3.AF3OutputParser import *
import Bio.Align


def tcrdock_format_cif(row, inference_path, output_path, seed=None):
    """Rewrite one AF3 prediction as a PDB file with reordered, renamed chains.

    The prediction for ``row["job_name"]`` is loaded, selected chains are
    copied in a fixed order under new segment/chain IDs, and the result is
    written to ``<output_path>/<pdb>[_<seed>].pdb``.

    Args:
        row: dict-like row with ``job_name``, ``mhc_class`` and ``pdb``.
        inference_path: directory holding one AF3 output folder per job.
        output_path: directory the PDB file is written to.
        seed: forwarded to ``get_mda_universe``; when not None it is also
            appended to the output file name.

    Returns:
        One-row polars DataFrame built from ``row`` (unchanged), so the
        function can be used with ``serial_apply``.
    """
    pred_u = AF3Output(inference_path / row["job_name"]).get_mda_universe(
        seed=seed
    )

    # Pairs of (segid in the prediction, segid in the output file).
    if row["mhc_class"] == "II":
        # mhc1, mhc2, pep, tcr1, tcr2
        segid_pairs = [
            ("B", "A"),
            ("C", "B"),
            ("A", "C"),
            ("D", "D"),
            ("E", "E"),
        ]
    else:
        # Predicted segid C is not copied (TCRdock will remove B2M anyways);
        # output IDs follow the format already in the tcrdock repo.
        segid_pairs = [
            ("B", "A"),
            ("A", "B"),
            ("D", "C"),
            ("E", "D"),
        ]

    renamed_chains = []
    for source_segid, target_segid in segid_pairs:
        source_atoms = pred_u.select_atoms(f"segid {source_segid}").atoms

        # Copy the chain into its own universe so it can be relabelled.
        single_chain = mda.Merge(source_atoms)
        single_chain.segments.segids = target_segid
        single_chain.atoms.chainIDs = [target_segid] * len(single_chain.atoms)
        renamed_chains.append(single_chain.atoms)

    combined = mda.Merge(*renamed_chains)

    suffix = "" if seed is None else f"_{seed}"

    with mda.Writer(output_path / (row["pdb"] + suffix + ".pdb")) as W:
        W.write(combined.atoms)

    # noop
    return pl.DataFrame(row)


# Where the converted PDB files go / where the AF3 job folders live.
output_path = Path("elife_replicate/tcrdock_inference_pdb")
inference_path = Path("elife_replicate/inference")

# seed=None: tcrdock_format_cif writes "<pdb>.pdb" from the universe that
# get_mda_universe returns when no seed is given.
serial_apply(
    phil_pdb, tcrdock_format_cif, inference_path, output_path, seed=None
)

# One additional file per seed, named "<pdb>_<seed>.pdb".
for seed in [1, 2, 3, 4, 5]:
    serial_apply(
        phil_pdb, tcrdock_format_cif, inference_path, output_path, seed=seed
    )

Processing rows: 100%|██████████| 130/130 [00:27<00:00,  4.78it/s]
Processing rows: 100%|██████████| 130/130 [00:30<00:00,  4.32it/s]
Processing rows: 100%|██████████| 130/130 [00:30<00:00,  4.21it/s]
Processing rows: 100%|██████████| 130/130 [00:31<00:00,  4.11it/s]
Processing rows: 100%|██████████| 130/130 [00:31<00:00,  4.10it/s]
Processing rows: 100%|██████████| 130/130 [00:31<00:00,  4.15it/s]


### 3. New triads contributed after AF3 cutoff


In [36]:
# Keep only entries whose `pdb_date` is later than `cutoff`.
# NOTE(review): `cutoff` is assigned in a later cell of this notebook export
# (datetime(2023, 1, 12, tzinfo=timezone.utc)); that cell has to run first.
post_cutoff_pdb = all_pdb.filter(pl.col("pdb_date") > cutoff)

In [None]:
# Hand-picked PDB IDs to keep from the post-cutoff set.
accept = [
    "8gom",
    "8vd0",
    "8trr",
    "8wte",
    "8vcy",
    "8es9",
    "8gon",
    "8i5d",
    "8i5c",
    "8vcx",
    "7q99",
    "8eo8",
    "8dnt",
    "8enh",
    "8ye4",
    "8wul",
    "8f5a",
    "8vd2",
    "7q9b",
    "8en8",
    "8pjg",
    "7q9a",
]
# Every accepted entry is labelled "human".
organism = ["human"] * len(accept)

# The inner join both restricts to the accepted IDs and adds `organism`.
tmp_metadat = pl.DataFrame({"pdb": accept, "organism": organism})
post_cutoff_pdb = post_cutoff_pdb.join(tmp_metadat, on="pdb", how="inner")

# Overrides applied before sequences are fetched (changes the chain looked up).
pre_fasta_corrections = {
    "8vd0": {"antigen_chain": "C"},
}

# Overrides applied after sequences are fetched. For 7q9b the chain-ID columns
# are replaced by three-letter IDs only at this point, i.e. after the FASTA
# lookup was done with the original IDs.
post_fasta_corrections = {
    "8vd0": {"peptide": "GQVELGGGNAVEVCKG"},
    "7q9b": {
        "mhc_chain1": "FFF",
        "mhc_chain2": "GGG",
        "Achain": "III",
        "Bchain": "JJJ",
        "antigen_chain": "HHH",
    },
}

for pdb_id, correction in pre_fasta_corrections.items():

    for k, v in correction.items():
        post_cutoff_pdb = update_df_from_k_v(
            post_cutoff_pdb,
            "pdb",
            pdb_id,
            k,
            v,
        )


# Fetch sequences; the fetched peptide column is renamed to `peptide`.
post_cutoff_pdb = format_seqs(post_cutoff_pdb).rename(
    {"peptide_seq": "peptide"}
)

for pdb_id, correction in post_fasta_corrections.items():
    for k, v in correction.items():
        post_cutoff_pdb = update_df_from_k_v(
            post_cutoff_pdb,
            "pdb",
            pdb_id,
            k,
            v,
        )

#### Infer the HLA alleles (not used downstream, so this does not have to be perfect)


In [None]:
from tcr_format_parsers.common.MHCCodeConverter import HLASequenceDBConverter

# Converter built from the IMGT_HLA_PATH directory.
conv = HLASequenceDBConverter(IMGT_HLA_PATH)

# None is passed as the mouse converter: every row was labelled "human" above,
# so infer_correct_mhc never reaches its non-human branch here.
post_cutoff_pdb = serial_apply(post_cutoff_pdb, infer_correct_mhc, conv, None)

Processing rows: 100%|██████████| 22/22 [05:46<00:00, 15.74s/it]


In [None]:
from tcr_format_parsers.common.TriadUtils import generate_job_name, FORMAT_COLS

# generate_job_name comes from tcr_format_parsers; presumably it adds the
# `job_name` column used to locate AF3 outputs later - confirm.
post_cutoff_pdb = generate_job_name(post_cutoff_pdb)

In [None]:
# Keep the shared FORMAT_COLS plus structure bookkeeping columns; the extra
# columns added by infer_correct_mhc are dropped unless FORMAT_COLS names them.
post_cutoff_pdb = post_cutoff_pdb.select(
    FORMAT_COLS
    + [
        "pdb",
        "pdb_date",
        "antigen_chain",
        "mhc_chain1",
        "mhc_chain2",
        "Achain",
        "Bchain",
        "organism",
    ]
)

# Trim chains in which the peptide sequence is embedded (emits warnings).
post_cutoff_pdb = serial_apply(post_cutoff_pdb, remove_peptide_from_chains)

Processing rows: 100%|██████████| 22/22 [00:00<00:00, 1924.72it/s]


In [None]:
# Download each entry's structure file (.pdb, falling back to .cif) into
# post_training/struct; download_pdb returns the row unchanged.
post_cutoff_pdb = serial_apply(
    post_cutoff_pdb, download_pdb, Path("post_training/struct")
)

Processing rows: 100%|██████████| 22/22 [00:22<00:00,  1.02s/it]


In [45]:
# Persist the full post-cutoff table.
post_cutoff_pdb.write_csv(
    "post_training/post_cutoff_pdb.csv",
)

# also write into TCRDock CSV format
# this was provided for us in the original paper, but here we have to write it

# Three columns only: pdbid (copy of pdb), mhc_class recoded to 1/2, organism.
post_cutoff_pdb.with_columns(
    pl.when(pl.col("mhc_class") == "I")
    .then(pl.lit(1))
    .otherwise(pl.lit(2))
    .alias("mhc_class"),
    pl.col("pdb").alias("pdbid"),
).select("pdbid", "mhc_class", "organism").write_csv(
    "post_training/post_cutoff_pdb_tcrdock.csv",
)

In [1]:
import polars as pl

# Reload the post-cutoff table from disk (dtypes are re-inferred from the CSV).
post_cutoff_pdb = pl.read_csv(
    "post_training/post_cutoff_pdb.csv",
)

In [2]:
# Write a three-column table (pdbid, mhc_class, organism) with one row per
# (PDB, replicate 1-5): mhc_class is recoded to 1/2 and pdbid becomes
# "<pdb>_<replicate>".
(
    post_cutoff_pdb.with_columns(
        [
            pl.when(pl.col("mhc_class") == "I")
            .then(pl.lit(1))
            .otherwise(pl.lit(2))
            .alias("mhc_class"),
            pl.lit([1, 2, 3, 4, 5]).alias("replicate"),
        ]
    )
    .explode("replicate")
    .with_columns(
        (pl.col("pdb") + "_" + pl.col("replicate").cast(pl.Utf8)).alias(
            "pdbid"
        )
    )
    .select("pdbid", "mhc_class", "organism")
    .write_csv("post_training/post_cutoff_pdb_tcrdock_by_seed.csv")
)

### Convert post-AF3 training date inference CIFs into TCRDock format


In [9]:
import MDAnalysis as mda
from mdaf3.FeatureExtraction import *
from mdaf3.AF3OutputParser import *
import Bio.Align


# def tcrdock_format_cif(row, inference_path, output_path, seed=None):
#     af3_output = AF3Output(inference_path / row["job_name"], seed=seed)

#     pred_u = af3_output.get_mda_universe()

#     # maybe mhc 2 seq is not included since it's implied B2m, so check
#     if row["mhc_class"] == "II":
#         # mhc1, mhc2, pep, tcr1, tcr2
#         pred_segids = ["B", "C", "A", "D", "E"]
#         rename_segids = ["A", "B", "C", "D", "E"]
#     # TCRdock will remove B2M anyways
#     else:
#         pred_segids = ["B", "A", "D", "E"]
#         # for consistency with format already in tcrdock repo
#         rename_segids = ["A", "B", "C", "D"]

#     chain_us = []

#     for pred_segsel, rename_segid in zip(pred_segids, rename_segids):

#         pred_sel = pred_u.select_atoms(f"segid {pred_segsel}").atoms

#         chain_u = mda.Merge(pred_sel)
#         chain_u.segments.segids = rename_segid
#         chain_u.atoms.chainIDs = [rename_segid] * len(chain_u.atoms)
#         chain_us.append(chain_u.atoms)

#     new_u = mda.Merge(*chain_us)

#     with mda.Writer(output_path / (row["pdb"] + ".pdb")) as W:

#         # u_new = mda.Universe.empty(
#         #     n_atoms, n_segments=n_segments, n_residues=n_residues
#         # )

#         # ordered_chains = sum(chain_sels)

#         # # for attr in ["name", "type", "resname"]:
#         # #     u_new.add_TopologyAttr("name", ordered_chains.residues.names)

#         # # choose first altloc if mutliple present
#         # W.write(ordered_chains)

#         W.write(new_u.atoms)

#     # noop
#     return pl.DataFrame(row)


# Where the converted PDB files go / where the AF3 job folders live.
output_path = Path("post_training/tcrdock_inference_pdb")
inference_path = Path("post_training/inference")


# Uses the tcrdock_format_cif defined in the eLife-replication cell (the copy
# in this cell is commented out). seed=None writes "<pdb>.pdb".
serial_apply(
    post_cutoff_pdb,
    tcrdock_format_cif,
    inference_path,
    output_path,
    seed=None,
)

# One additional file per seed, named "<pdb>_<seed>.pdb".
for seed in [1, 2, 3, 4, 5]:
    serial_apply(
        post_cutoff_pdb,
        tcrdock_format_cif,
        inference_path,
        output_path,
        seed=seed,
    )

Processing rows: 100%|██████████| 22/22 [00:04<00:00,  4.44it/s]
Processing rows: 100%|██████████| 22/22 [00:05<00:00,  4.02it/s]
Processing rows: 100%|██████████| 22/22 [00:05<00:00,  4.14it/s]
Processing rows: 100%|██████████| 22/22 [00:05<00:00,  4.08it/s]
Processing rows: 100%|██████████| 22/22 [00:05<00:00,  3.93it/s]
Processing rows: 100%|██████████| 22/22 [00:06<00:00,  3.54it/s]


### Convert post-AF3 training date raw PDBs into TCRDock format


In [None]:
import MDAnalysis as mda
from mdaf3.FeatureExtraction import *
from mdaf3.AF3OutputParser import *
import Bio.Align


def align_residue_groups(true_residues, pred_residues, strict=False):
    """Pair up residues of two residue groups via a global sequence alignment.

    The one-letter sequences of both groups are globally aligned with
    Biopython's ``PairwiseAligner`` (default scoring) and only alignment
    columns without a gap on either side are kept.

    Args:
        true_residues: residue group providing ``sequence(format="string")``,
            ``resindices`` and boolean-mask indexing (used here with
            MDAnalysis ResidueGroups).
        pred_residues: second residue group with the same interface.
        strict: if True, raise when an aligned (non-gap) column holds two
            different residue letters.

    Returns:
        Tuple ``(true_subset, pred_subset)`` of equal length; the i-th
        residues of the two subsets are aligned to each other.

    Raises:
        ValueError: if ``strict`` is True and a mismatch is found.
    """
    # Imported locally so the function does not depend on `np` leaking in
    # through a star import elsewhere in the notebook.
    import numpy as np

    aligner = Bio.Align.PairwiseAligner(mode="global")

    # Alignment[0] is best alignment
    best = aligner.align(
        true_residues.sequence(format="string"),
        pred_residues.sequence(format="string"),
    )[0]

    # Gapped strings of equal length, one per input sequence.
    true_aln = best[0]
    pred_aln = best[1]

    # Builtin `bool` instead of `np.bool`: the latter does not exist in
    # NumPy 1.24-1.26.
    true_res_boolmask = np.zeros(true_residues.resindices.shape, dtype=bool)
    pred_res_boolmask = np.zeros(pred_residues.resindices.shape, dtype=bool)

    # Pointers into the ungapped residue groups.
    true_ptr = 0
    pred_ptr = 0
    for true_sym, pred_sym in zip(true_aln, pred_aln):
        if true_sym == "-":
            # Predicted residue without an experimental partner.
            pred_ptr += 1
        elif pred_sym == "-":
            # Experimental residue without a predicted partner.
            true_ptr += 1
        elif strict and pred_sym != true_sym:
            raise ValueError("Sequences cannot align without mutations")
        else:
            true_res_boolmask[true_ptr] = True
            pred_res_boolmask[pred_ptr] = True
            true_ptr += 1
            pred_ptr += 1

    return true_residues[true_res_boolmask], pred_residues[pred_res_boolmask]


def get_true_mda_universe(pdb_id, root_path):
    """Open the downloaded structure of ``pdb_id`` as an MDAnalysis Universe.

    Args:
        pdb_id: PDB entry ID (file stem).
        root_path: directory (``pathlib.Path``) holding the structure files.

    Returns:
        ``mda.Universe`` read from ``<pdb_id>.pdb`` when that file exists,
        else from ``<pdb_id>.cif``.
    """
    # Favor PDB since it doesn't have multiple residue with same ID issue
    structure_file = root_path / (pdb_id + ".pdb")
    if not structure_file.exists():
        structure_file = root_path / (pdb_id + ".cif")

    return mda.Universe(structure_file.as_posix())


def tcrdock_format_pdb(row, struct_path, inference_path, output_path):
    """Write the experimental structure as a PDB restricted to predicted residues.

    For every chain, the experimental and predicted residues (those with a CA
    ATOM record) are sequence-aligned; only experimental residues that pair
    with a predicted residue are kept. The kept experimental atoms are
    relabelled with new chain IDs and written to ``<output_path>/<pdb>.pdb``.
    The prediction is used for the alignment only - no predicted atoms are
    written.

    Args:
        row: dict-like row with ``job_name``, ``pdb``, ``mhc_class`` and the
            chain-ID columns ``mhc_chain1``, ``mhc_chain2`` (class II only),
            ``antigen_chain``, ``Achain``, ``Bchain``.
        struct_path: directory with the downloaded experimental structures.
        inference_path: directory holding one AF3 output folder per job.
        output_path: directory the PDB file is written to.

    Returns:
        One-row polars DataFrame built from ``row`` (unchanged), so the
        function can be used with ``serial_apply``.
    """
    pred_u = AF3Output(inference_path / row["job_name"]).get_mda_universe()
    true_u = get_true_mda_universe(row["pdb"], struct_path)

    # Triples of (experimental segid, predicted segid, output segid).
    if row["mhc_class"] == "II":
        # mhc1, mhc2, pep, tcr1, tcr2
        chain_plan = [
            (row["mhc_chain1"], "B", "A"),
            (row["mhc_chain2"], "C", "B"),
            (row["antigen_chain"], "A", "C"),
            (row["Achain"], "D", "D"),
            (row["Bchain"], "E", "E"),
        ]
    else:
        # The second MHC chain is left out: TCRdock will remove B2M anyways.
        chain_plan = [
            (row["mhc_chain1"], "B", "A"),
            (row["antigen_chain"], "A", "B"),
            (row["Achain"], "D", "C"),
            (row["Bchain"], "E", "D"),
        ]

    relabelled_chains = []
    for true_segid, pred_segid, out_segid in chain_plan:
        true_ca_residues = true_u.select_atoms(
            f"segid {true_segid} and name CA and record_type ATOM"
        ).residues
        pred_ca_residues = pred_u.select_atoms(
            f"segid {pred_segid} and name CA and record_type ATOM"
        ).residues

        # Only the experimental side of the pairing is used below.
        matched_true, _ = align_residue_groups(
            true_ca_residues,
            pred_ca_residues,
        )

        # ATOM records only, restricted by the altloc clause of the selection.
        kept_atoms = matched_true.atoms.select_atoms(
            "record_type ATOM and (altloc A or not altloc [!?])"
        )

        chain_u = mda.Merge(kept_atoms)
        chain_u.segments.segids = out_segid
        chain_u.atoms.chainIDs = [out_segid] * len(chain_u.atoms)
        relabelled_chains.append(chain_u.atoms)

    combined = mda.Merge(*relabelled_chains)

    with mda.Writer(output_path / (row["pdb"] + ".pdb")) as W:
        W.write(combined.atoms)

    # noop
    return pl.DataFrame(row)


# Output directory for the trimmed experimental structures, the directory of
# downloaded structures, and the AF3 job folders used for the alignment.
output_path = Path("post_training/tcrdock_struct_pdb")
struct_path = Path("post_training/struct")
inference_path = Path("post_training/inference")

# Writes one "<pdb>.pdb" per row; the returned frame is the input unchanged.
serial_apply(
    post_cutoff_pdb,
    tcrdock_format_pdb,
    struct_path,
    inference_path,
    output_path,
)

Processing rows: 100%|██████████| 22/22 [00:09<00:00,  2.30it/s]


job_name,cognate,peptide,mhc_class,mhc_1_chain,mhc_1_species,mhc_1_name,mhc_1_seq,mhc_2_chain,mhc_2_species,mhc_2_name,mhc_2_seq,tcr_1_chain,tcr_1_species,tcr_1_seq,tcr_2_chain,tcr_2_species,tcr_2_seq,pdb,pdb_date,antigen_chain,mhc_chain1,mhc_chain2,Achain,Bchain,organism
str,bool,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""b4a6153b3bd2df0309a16f643d9be8…",true,"""EAAGIGILTV""","""I""","""heavy""",,"""A*02:01""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""light""",,"""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""",,"""QKEVEQNSGPLSVPEGAIASLNCTYSDRGS…","""beta""",,"""NAGVTQTPKFQVLKTGQSMTLQCAQDMNHE…","""7q9b""","""2023-02-22T00:00:00.000000+000…","""HHH""","""FFF""","""GGG""","""III""","""JJJ""","""human"""
"""39bf4e6e65f6b042d9340128ecd790…",true,"""GQVELGGGNAVEVCKG""","""II""","""alpha""","""human""","""DQA1*03:01""","""EDIVADHVASYGVNLYQSYGPSGQYSHEFD…","""beta""","""human""","""DQB1*03:02""","""GSGGSRDSPEDFVYQFKGMCYFTNGTERVR…","""alpha""","""human""","""MKTTQPPSMDCAEGRAANLPCNHSTISGNE…","""beta""","""human""","""GVTQTPRYLIKTRGQQVTLSCSPISGHRSV…","""8vd0""","""2024-08-07T00:00:00.000000+000…","""C""","""A""","""C""","""D""","""E""","""human"""
"""62f1b197f4b0555c27ac38c5496ffd…",true,"""NLSALGIFST""","""I""","""heavy""","""human""","""A*02:01""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""EVEQNSGPLSVPEGAIASLNCTYSDRGSQS…","""beta""","""human""","""SQTIHQWPATLVQPVGSPLSLECTVEGTSN…","""7q99""","""2023-02-22T00:00:00.000000+000…","""C""","""A""","""B""","""D""","""E""","""human"""
"""b093517b1cc89f77734e54d40b30ca…",true,"""LPFEKSTIM""","""I""","""heavy""","""human""","""B*35:01""","""GSHSMRYFYTAMSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""SAGENVEQHPSTLSVQEGDSAVIKCTYSDS…","""beta""","""human""","""AVVSQHPSRVICKSGTSVKIECRSLDFQAT…","""8enh""","""2024-03-27T00:00:00.000000+000…","""H""","""F""","""G""","""I""","""J""","""human"""
"""3dad0fd6a6e0ce25e4106fda13a57a…",true,"""VVGAVGVGK""","""I""","""heavy""","""human""","""A*11:100""","""GSHSMRYFYTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""mouse""","""QQKVQQSPESLIVPEGGMASLNCTSSDRNV…","""beta""","""mouse""","""EAAVTQSPRNKVAVTGGKVTLSCNQTNNHN…","""8i5c""","""2023-08-23T00:00:00.000000+000…","""M""","""K""","""L""","""N""","""O""","""human"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""58484be6ef5feee9f02e558e069f54…",true,"""LPFDKATIM""","""I""","""heavy""","""human""","""B*35:01""","""GSHSMRYFYTAMSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""SAGENVEQHPSTLSVQEGDSAVIKCTYSDS…","""beta""","""human""","""AVVSQHPSRVICKSGTSVKIECRSLDFQAT…","""8eo8""","""2024-03-27T00:00:00.000000+000…","""C""","""A""","""B""","""D""","""E""","""human"""
"""59acb5477505053567be60b57e5f3b…",true,"""LLLDRLNQL""","""I""","""heavy""","""human""","""A*02:01""","""MGSHSMRYFFTSVSRPGRGEPRFIAVGYVD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""KEVEQNSGPLSVPEGAIASLNCTYSDRGSQ…","""beta""","""human""","""GAGVSQSPSNKVTEKGKDVELRCDPISGHT…","""8dnt""","""2023-07-19T00:00:00.000000+000…","""D""","""E""","""F""","""A""","""B""","""human"""
"""13f7a9410810562e44920fa0413159…",true,"""VVGAVGVGK""","""I""","""heavy""","""human""","""A*11:01""","""GSHSMRYFYTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""mouse""","""QQKVQQSPESLIVPEGGMASLNCTSSDRNV…","""beta""","""mouse""","""KIIQKPKYLVAVTGSEKILICEQYLGHNAM…","""8wte""","""2024-05-01T00:00:00.000000+000…","""J""","""H""","""I""","""A""","""B""","""human"""
"""e6da04b0f02efa07a5e1795130328a…",true,"""GQVELGGGSSPETCI""","""II""","""alpha""","""human""","""DQA1*03:01""","""EDIVADHVASYGVNLYQSYGPSGQYSHEFD…","""beta""","""human""","""DQB1*03:02""","""RDSPEDFVYQFKGMCYFTNGTERVRLVTRY…","""alpha""","""human""","""MKTTQPPSMDCAEGRAANLPCNHSTISGNE…","""beta""","""human""","""GVTQTPRYLIKTRGQQVTLSCSPISGHRSV…","""8vcy""","""2024-08-07T00:00:00.000000+000…","""C""","""A""","""B""","""D""","""E""","""human"""


In [30]:
# Reload the benchmark table from disk (dtypes are re-inferred from the CSV).
phil_pdb = pl.read_csv(
    "elife_replicate/phil_pdb.csv",
)

In [None]:
from datetime import datetime, timezone

# Timezone-aware (UTC) threshold of 2023-01-12, wrapped as a polars literal.
# It is only defined here; nothing in this cell consumes it.
cutoff = pl.lit(datetime(year=2023, month=1, day=12, tzinfo=timezone.utc))

# Pass phil_pdb through get_pdb_date via serial_apply and rebind the result.
# NOTE(review): presumably applied one row at a time and adding a `pdb_date`
# column -- confirm against the serial_apply / get_pdb_date definitions.
phil_pdb = serial_apply(phil_pdb, get_pdb_date)

Processing rows: 100%|██████████| 130/130 [00:06<00:00, 19.01it/s]


In [None]:
# Timezone-aware (UTC) threshold of 2018-06-01, wrapped as a polars literal.
# NOTE(review): the name suggests an AlphaFold2 training-data cutoff -- confirm.
af2_cutoff = pl.lit(datetime(year=2018, month=6, day=1, tzinfo=timezone.utc))

# Rows whose pdb_date is strictly later than the threshold. Left as the final
# bare expression so the notebook renders the filtered frame; phil_pdb itself
# is not reassigned.
phil_pdb.filter(pl.col("pdb_date").gt(af2_cutoff))

job_name,cognate,peptide,mhc_class,mhc_1_chain,mhc_1_species,mhc_1_name,mhc_1_seq,mhc_2_chain,mhc_2_species,mhc_2_name,mhc_2_seq,tcr_1_chain,tcr_1_species,tcr_1_seq,tcr_2_chain,tcr_2_species,tcr_2_seq,pdb,pdb_date,va,ja,cdr3a,vb,jb,cdr3b,antigen_chain,mhc_chain1,mhc_chain2,Achain,Bchain,organism,cdr_rmsd,cdr_rmsd_af2_full,cdr_rmsd_af2_trim
str,bool,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]",str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64
"""00d671d629eff4094fee1be3c7f4da…",true,"""SPSYAYHQF""","""I""","""heavy""","""mouse""","""H2Ld""","""GPHSMRYYETATSRRGLGEPRYTSVGYVDD…",,,"""B2M""",,"""alpha""",,"""AKTTQPDSMESTEGETVHLPCSHATISGNE…","""beta""",,"""AVTQSPRNKVTVTGGNVTLSCRQTNSHNYM…","""6l9l""",2019-11-10 00:00:00 UTC,"""TRAV21/DV12*01""","""TRAJ58*01""","""CILQGTGSKLSF""","""TRBV13-1*02""","""TRBJ2-7*01""","""CASSDGDYEQYF""","""B""","""A""",,"""C""","""D""","""mouse""",4.2,11.69,26.58
"""66588fb6f41c42796a59e21f801dcc…",true,"""RLQSLQTYV""","""I""","""heavy""","""human""","""A*02:01""","""GSHSMRYFFTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""QRVTQPEKLLSVFKGAPVELKCNYSYSGSP…","""beta""","""human""","""GVAQSPRYKIIEKRQSVAFWCNPISGHATL…","""7n1e""",2021-05-27 00:00:00 UTC,"""TRAV16*01""","""TRAJ39*01""","""CALSGFNNAGNMLTF""","""TRBV11-2*01""","""TRBJ2-3*01""","""CASSLGGAGGADTQYF""","""C""","""A""","""B""","""D""","""E""","""human""",8.07,22.91,26.8
"""c874eda106179b27d61348b7557431…",true,"""SLLMWITQV""","""I""","""heavy""","""human""","""A*02:01""","""MGSHSMRYFFTSVSRPGRGEPRFIAVGYVD…","""light""","""human""","""B2M""","""MIQRTPKIQVYSRHPAENGKSNFLNCYVSG…","""alpha""","""human""","""MQKEVEQNSGPLSVPEGAIASLNCTYSDRG…","""beta""","""human""","""MGAGVSQSPRYKVTKRGQDVALRCDPISGH…","""6rp9""",2019-05-14 00:00:00 UTC,"""TRAV12-2*01""","""TRAJ49*01""","""CALTRGPGNQFYF""","""TRBV7-6*01""","""TRBJ1-1*01""","""CASSSPGGVSTEAFF""","""H""","""F""","""G""","""I""","""J""","""human""",5.67,23.64,20.27
"""1e2f70e91c8b2d204ab7d420da955a…",true,"""FEDLRVLSF""","""I""","""heavy""","""human""","""B*37:01""","""GSHSMRYFHTSVSRPGRGEPRFISVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""human""","""QPVQSPQAVILREGEDAVINCSSSKALYSV…","""beta""","""human""","""GITQSPKYLFRKEGQNVTLSCEQNLNHDAM…","""6mtm""",2018-10-19 00:00:00 UTC,"""TRAV30*01""","""TRAJ13*01""","""CGTERSGGYQKVTF""","""TRBV19*01""","""TRBJ1-1*01""","""CASSMSAMGTEAFF""","""C""","""A""","""B""","""D""","""E""","""human""",2.49,5.28,3.14
"""7f4b0f1dd8f99a68601d93e79d9738…",true,"""ASNENMETM""","""I""","""heavy""","""mouse""","""H2Db""","""MGAMAPRTLLLLLAAALAPTQTRAGPHSMR…","""light""","""mouse""","""B2M""","""MARSVTLVFLVLVSLTGLYAIQKTPQIQVY…","""alpha""","""mouse""","""GDQVEQSPSALSLHEGTDSALRCNFTTTMR…","""beta""","""mouse""","""DTTVKQNPRYKLARVGKPVNLICSQTMNHD…","""7jwj""",2020-08-25 00:00:00 UTC,"""TRAV4-4/DV10*01""","""TRAJ37*01""","""CAAVTGNTGKLIF""","""TRBV17*01""","""TRBJ1-1*01""","""CASSRGTIHSNTEVFF""","""C""","""A""","""B""","""D""","""E""","""mouse""",8.6,25.94,16.93
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""cce720e42d01d1e8820318af338c7c…",true,"""SYYGPKTSPVQ""","""II""","""alpha""","""mouse""","""H2ABa""","""IEADHVGTYGISVYQSPGDIGQYTFEFDGD…","""beta""","""mouse""","""H2ABb""","""GGGGSLVPRGSGGGGSERHFVYQFMGECYF…","""alpha""","""mouse""","""MQQVRQSPQSLTVWEGETAILNCSYENSAF…","""beta""","""mouse""","""AVTQSPRNKVAVTGGKVTLSCNQTNNHNNM…","""6mnn""",2018-10-02 00:00:00 UTC,"""TRAV14D-3/DV8*02""","""TRAJ52*01""","""CAASVTGANTGKLTF""","""TRBV13-2*01""","""TRBJ2-4*01""","""CASGDFWGDTLYF""","""D""","""C""","""D""","""A""","""B""","""mouse""",3.83,3.89,4.58
"""da51f41cc36cf890da26880c6f14fe…",true,"""APFSEQEQPVL""","""II""","""alpha""","""human""","""DQA1*02:01""","""MILNKALMLGALALTTVMSPCGGEDIVADH…","""beta""","""human""","""DQB1*02:01""","""MSWKKALRIPGGLRAATVTLMLSMLSTPVA…","""alpha""","""human""","""MKQEVTQIPAALSVPEGENLVLNCSFTDSA…","""beta""","""human""","""MGVSQTPSNKVTEKGKYVELRCDPISGHTA…","""6px6""",2019-07-24 00:00:00 UTC,"""TRAV21*01""","""TRAJ31*01""","""CAVHTGARLMF""","""TRBV7-3*01""","""TRBJ2-3*01""","""CASSHGASTDTQYF""","""C""","""A""","""B""","""D""","""E""","""human""",3.9,2.81,3.33
"""b05a70ac15dcec9c3125e3649535e8…",true,"""APMPMPELPYP""","""II""","""alpha""","""human""","""DQA1*05:01""","""EDIVADHVASYGVNLYQSYGPSGQYTHEFD…","""beta""","""human""","""DQB1*02:01""","""GGSGASRDSPEDFVYQFKGMCYFTNGTERV…","""alpha""","""human""","""QSVTQPDIHITVSEGASLELRCNYSYGATP…","""beta""","""human""","""HMGVTQSPTHLIKTRGQQVTLRCSPISGHK…","""6u3n""",2019-08-22 00:00:00 UTC,"""TRAV8-3*01""","""TRAJ33*01""","""CAVGAGSNYQLIW""","""TRBV5-5*01""","""TRBJ2-1*01""","""CASSLEGQGASEQFF""","""C""","""A""","""B""","""D""","""E""","""human""",3.2,4.52,4.53
"""08a74d65f9e5f41a5ec67c0accfde9…",true,"""VVQSELPYPEG""","""II""","""alpha""","""human""","""DQA1*05:01""","""EDIVADHVASYGVNLYQSYGPSGQYTHEFD…","""beta""","""human""","""DQB1*02:01""","""GGSGASRDSPEDFVYQFKGMCYFTNGTERV…","""alpha""","""human""","""MKTTQPPSMDCAEGRAANLPCNHSTISGNE…","""beta""","""human""","""MGVSQSPSNKVTEKGKDVELRCDPISGHTA…","""6u3o""",2019-08-22 00:00:00 UTC,"""TRAV26-1*01""","""TRAJ54*01""","""CIAFQGAQKLVF""","""TRBV7-2*01""","""TRBJ2-3*01""","""CASSFRALAADTQYF""","""I""","""E""","""F""","""G""","""H""","""human""",3.03,20.29,5.43


In [34]:
# Write the current phil_pdb frame to elife_replicate/phil_pdb.csv, the same
# path this notebook reads phil_pdb from.
phil_pdb.write_csv("elife_replicate/phil_pdb.csv")

In [None]:
# Pass post_cutoff_pdb through get_pdb_date via serial_apply, keeping the
# result in a scratch variable so post_cutoff_pdb itself is not rebound.
# NOTE(review): presumably yields the same rows with a `pdb_date` column -- confirm.
tmp = serial_apply(post_cutoff_pdb, get_pdb_date)

Processing rows: 100%|██████████| 22/22 [00:01<00:00, 18.76it/s]


job_name,cognate,peptide,mhc_class,mhc_1_chain,mhc_1_species,mhc_1_name,mhc_1_seq,mhc_2_chain,mhc_2_species,mhc_2_name,mhc_2_seq,tcr_1_chain,tcr_1_species,tcr_1_seq,tcr_2_chain,tcr_2_species,tcr_2_seq,pdb,pdb_date,antigen_chain,mhc_chain1,mhc_chain2,Achain,Bchain,organism
str,bool,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]",str,str,str,str,str,str
"""39bf4e6e65f6b042d9340128ecd790…",true,"""GQVELGGGNAVEVCKG""","""II""","""alpha""","""human""","""DQA1*03:01""","""EDIVADHVASYGVNLYQSYGPSGQYSHEFD…","""beta""","""human""","""DQB1*03:02""","""GSGGSRDSPEDFVYQFKGMCYFTNGTERVR…","""alpha""","""human""","""MKTTQPPSMDCAEGRAANLPCNHSTISGNE…","""beta""","""human""","""GVTQTPRYLIKTRGQQVTLSCSPISGHRSV…","""8vd0""",2023-12-14 00:00:00 UTC,"""C""","""A""","""C""","""D""","""E""","""human"""
"""3dad0fd6a6e0ce25e4106fda13a57a…",true,"""VVGAVGVGK""","""I""","""heavy""","""human""","""A*11:100""","""GSHSMRYFYTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""mouse""","""QQKVQQSPESLIVPEGGMASLNCTSSDRNV…","""beta""","""mouse""","""EAAVTQSPRNKVAVTGGKVTLSCNQTNNHN…","""8i5c""",2023-01-24 00:00:00 UTC,"""M""","""K""","""L""","""N""","""O""","""human"""
"""55461d93e2317bf4eecc407caf3910…",true,"""VVGAVGVGK""","""I""","""heavy""","""human""","""A*11:100""","""GSHSMRYFYTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""mouse""","""SWALSVHEGESVTVNCSYKTSITALQWYRQ…","""beta""","""mouse""","""GVIQTPRHKVTGKGQEATLWCEPISGHSAV…","""8i5d""",2023-01-25 00:00:00 UTC,"""P""","""H""","""L""","""A""","""B""","""human"""
"""7ed224bf27775709b488a69d90cbe5…",true,"""PKYVKQNTLKLAR""","""II""","""alpha""","""human""","""DRA*01:01""","""MIKEEHVIIQAEFYLNPDQSGEFMFDFDGD…","""beta""","""human""","""DRB1*01:01""","""MGDTRPRFLWQLKFECHFFNGTERVRLLER…","""alpha""","""human""","""MAQSVTQLGSHVSVSEGALVLLRCNYSSSV…","""beta""","""human""","""MADVTQTPRNRITKTGKRIMLECSQTKGHD…","""8pjg""",2023-06-23 00:00:00 UTC,"""C""","""A""","""B""","""D""","""E""","""human"""
"""df790dd4eb9a5164bc3171d1ce5f70…",true,"""GQVELGGGTPIESCQ""","""II""","""alpha""","""human""","""DQA1*03:01""","""EDIVADHVASYGVNLYQSYGPSGQYSHEFD…","""beta""","""human""","""DQB1*03:02""","""RDSPEDFVYQFKGMCYFTNGTERVRLVTRY…","""alpha""","""human""","""MKTTQPPSMDCAEGRAANLPCNHSTISGNE…","""beta""","""human""","""GVTQTPRYLIKTRGQQVTLSCSPISGHRSV…","""8vd2""",2023-12-14 00:00:00 UTC,"""C""","""A""","""B""","""D""","""E""","""human"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""6899d34d074bdb896dbde93243ae73…",true,"""NYNYLYRLF""","""I""","""heavy""","""human""","""A*24:02""","""GSHSMRYFSTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""human""","""EVEQDPGPFNVPEGATVAFNCTYSNSASQS…","""beta""","""human""","""AGVTQTPKFQVLKTGQSMTLQCAQDMNHNS…","""8ye4""",2024-02-21 00:00:00 UTC,"""E""","""A""","""B""","""G""","""H""","""human"""
"""98daa70e8e2597cfecfacfff18353f…",true,"""GQVELGGGPGAESCQ""","""II""","""alpha""","""human""","""DQA1*03:01""","""EDIVADHVASYGVNLYQSYGPSGQYSHEFD…","""beta""","""human""","""DQB1*03:02""","""RDSPEDFVYQFKGMCYFTNGTERVRLVTRY…","""alpha""","""human""","""MKTTQPPSMDCAEGRAANLPCNHSTISGNE…","""beta""","""human""","""GVTQTPRYLIKTRGQQVTLSCSPISGHRSV…","""8vcx""",2023-12-14 00:00:00 UTC,"""C""","""A""","""B""","""D""","""E""","""human"""
"""404f58ae3d26a08ecc8bd707789d01…",true,"""VVGAVGVGK""","""I""","""heavy""","""human""","""A*11:01""","""GSHSMRYFYTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""mouse""","""QQKVQQSPESLIVPEGGMASLNCTSSDRNV…","""beta""","""mouse""","""KIIQKPKYLVAVTGSEKILICEQYLGHNAM…","""8wul""",2023-10-20 00:00:00 UTC,"""Q""","""O""","""P""","""E""","""F""","""human"""
"""13f7a9410810562e44920fa0413159…",true,"""VVGAVGVGK""","""I""","""heavy""","""human""","""A*11:01""","""GSHSMRYFYTSVSRPGRGEPRFIAVGYVDD…","""light""","""human""","""B2M""","""IQRTPKIQVYSRHPAENGKSNFLNCYVSGF…","""alpha""","""mouse""","""QQKVQQSPESLIVPEGGMASLNCTSSDRNV…","""beta""","""mouse""","""KIIQKPKYLVAVTGSEKILICEQYLGHNAM…","""8wte""",2023-10-18 00:00:00 UTC,"""J""","""H""","""I""","""A""","""B""","""human"""
