### 0. Query IEDB (converted to SQLite) for assay data


In [3]:
import polars as pl
import sqlite3

DB_PATH = "/tgen_labs/altin/alphafold3/IEDB/2025-04-15/iedb_public.db"

TRIAD_QUERY_STR = """
SELECT curated_receptor.distinct_receptor_id as receptor_id,
    assay_type.assay_type
FROM tcell
    JOIN tcell_receptor on tcell.tcell_id = tcell_receptor.tcell_id
    JOIN assay_type on assay_type.assay_type_id = tcell.as_type_id
    JOIN curated_receptor on tcell_receptor.curated_receptor_id = curated_receptor.curated_receptor_id;
"""

conn = sqlite3.connect(DB_PATH)
iedb_metadat = (
    pl.read_database(
        connection=conn,
        query=TRIAD_QUERY_STR,
        infer_schema_length=1000000,
    )
    .cast({"receptor_id": pl.String})
    .select("receptor_id", "assay_type")
    .unique()
)

### 1. Import triad data from IMMREP25 fork


In [4]:
import polars as pl

schema_overrides = {
    "references": pl.String,
    "receptor_id": pl.String,
}

iedb_human_I = pl.read_csv(
    "raw/human_I/immrep_IEDB.csv", schema_overrides=schema_overrides
)
iedb_human_II = pl.read_csv(
    "raw/human_II/immrep_IEDB.csv", schema_overrides=schema_overrides
)

### 2. Triad AF3 job CSVs for 1:1 matched dataset, parquet metadata


#### Class I positives


In [5]:
import polars as pl
from tcr_format_parsers.common.MHCCodeConverter import (
    B2M_HUMAN_SEQ,
    HLACodeWebConverter,
)
from tcr_format_parsers.common.TriadUtils import (
    generate_job_name,
    FORMAT_COLS,
    FORMAT_TCR_COLS,
    FORMAT_ANTIGEN_COLS,
    TCRDIST_COLS,
    generate_negatives_antigen_matched,
)

human_conv = HLACodeWebConverter()

addtl_cols = TCRDIST_COLS + [
    "receptor_id",
    "references",
]

iedb_human_I = (
    iedb_human_I.rename(
        {
            "Peptide": "peptide",
            "TCRb": "tcr_2_seq",
            "TCRa": "tcr_1_seq",
            "HLA": "mhc_1_name",
            "CDR1a": "tcr_1_cdr_1",
            "CDR2a": "tcr_1_cdr_2",
            "CDR3a": "tcr_1_cdr_3",
            "CDR1b": "tcr_2_cdr_1",
            "CDR2b": "tcr_2_cdr_2",
            "CDR3b": "tcr_2_cdr_3",
            "Va": "tcr_1_v_gene",
            "Ja": "tcr_1_j_gene",
            "Vb": "tcr_2_v_gene",
            "Jb": "tcr_2_j_gene",
        }
    )
    .with_columns(
        pl.lit("heavy").alias("mhc_1_chain"),
        pl.lit("light").alias("mhc_2_chain"),
        pl.lit("alpha").alias("tcr_1_chain"),
        pl.lit("beta").alias("tcr_2_chain"),
        pl.lit("human").alias("tcr_1_species"),
        pl.lit("human").alias("tcr_2_species"),
        pl.lit("human").alias("mhc_1_species"),
        pl.lit("human").alias("mhc_2_species"),
        pl.lit(B2M_HUMAN_SEQ).alias("mhc_2_seq"),
        pl.lit("B2M").alias("mhc_2_name"),
        pl.lit("I").alias("mhc_class"),
        pl.col("mhc_1_name").str.split_exact("HLA-", 1).alias("split_parts"),
        pl.lit(True).alias("cognate"),
        pl.col("receptor_id").str.split(",").alias("receptor_id"),
        pl.col("references").str.split(",").alias("references"),
    )
    .select(pl.exclude("mhc_1_name"))
    .unnest("split_parts")
    .rename(
        {
            "field_0": "tmp",
            "field_1": "mhc_1_name",
        }
    )
    .select(pl.exclude("tmp"))
    .with_columns(
        pl.col("mhc_1_name")
        .map_elements(
            lambda x: human_conv.get_sequence(x, top_only=True),
            return_dtype=pl.String,
        )
        .alias("mhc_1_seq")
    )
    .filter(
        pl.col("tcr_1_seq").is_not_null(), pl.col("tcr_2_seq").is_not_null()
    )
    .with_columns(
        pl.when(pl.col("references").is_not_null())
        .then(
            pl.col("references").list.eval(
                pl.element().str.split("/").list.get(-1)
            )
        )
        .otherwise(None)
        .alias("references")
    )
)

iedb_human_I = generate_job_name(iedb_human_I)

iedb_human_I = iedb_human_I.group_by(FORMAT_COLS + TCRDIST_COLS).agg(
    [pl.col("references").flatten(), pl.col("receptor_id").flatten()]
)


iedb_human_I = (
    iedb_human_I.explode("receptor_id")
    .join(iedb_metadat, on="receptor_id", how="left")
    .group_by(pl.exclude("receptor_id", "assay_type"))
    .agg(
        [
            pl.col("receptor_id"),
            pl.col("assay_type"),
        ]
    )
)

#### Class I negatives (for 1:1 matched dataset)


In [None]:
iedb_human_I_negs = generate_negatives_antigen_matched(
    iedb_human_I
).with_columns(
    pl.lit(None).alias("references"),
    pl.lit(None).alias("receptor_id"),
    pl.lit(None).alias("assay_type"),
)

iedb_human_I = pl.concat(
    [iedb_human_I, iedb_human_I_negs], how="vertical_relaxed"
)

#### Save class I 1:1 matched dataset to disk


In [9]:
iedb_human_I.select(FORMAT_COLS).write_csv("triad/human_I/human_I.csv")
iedb_human_I.write_parquet("triad/human_I/human_I.parquet")

#### Class II Positives


In [6]:
from process_utils import infer_hla_chain

iedb_human_II = (
    iedb_human_II.rename(
        {
            "Peptide": "peptide",
            "TCRb": "tcr_2_seq",
            "TCRa": "tcr_1_seq",
            "HLA": "mhc_1_name",
            "CDR1a": "tcr_1_cdr_1",
            "CDR2a": "tcr_1_cdr_2",
            "CDR3a": "tcr_1_cdr_3",
            "CDR1b": "tcr_2_cdr_1",
            "CDR2b": "tcr_2_cdr_2",
            "CDR3b": "tcr_2_cdr_3",
            "Va": "tcr_1_v_gene",
            "Ja": "tcr_1_j_gene",
            "Vb": "tcr_2_v_gene",
            "Jb": "tcr_2_j_gene",
        }
    )
    .with_columns(
        pl.lit("alpha").alias("mhc_1_chain"),
        pl.lit("beta").alias("mhc_2_chain"),
        pl.lit("alpha").alias("tcr_1_chain"),
        pl.lit("beta").alias("tcr_2_chain"),
        pl.lit("human").alias("tcr_1_species"),
        pl.lit("human").alias("tcr_2_species"),
        pl.lit("human").alias("mhc_1_species"),
        pl.lit("human").alias("mhc_2_species"),
        pl.lit("II").alias("mhc_class"),
        pl.lit(True).alias("cognate"),
        pl.col("receptor_id").str.split(",").alias("receptor_id"),
        pl.col("references").str.split(",").alias("references"),
    )
    .filter(
        pl.col("tcr_1_seq").is_not_null(), pl.col("tcr_2_seq").is_not_null()
    )
    .with_columns(
        pl.when(pl.col("references").is_not_null())
        .then(
            pl.col("references").list.eval(
                pl.element().str.split("/").list.get(-1)
            )
        )
        .otherwise(None)
        .alias("references")
    )
)

iedb_human_II = (
    iedb_human_II.with_columns(
        pl.col("mhc_1_name").str.split("/").alias("split_parts")
    )
    .with_columns(
        pl.when(pl.col("split_parts").list.len() == 2)
        .then(
            pl.struct(
                pl.col("split_parts")
                .list.get(0, null_on_oob=True)
                .str.slice(4)
                .alias("mhc_1_name"),
                pl.col("split_parts")
                .list.get(1, null_on_oob=True)
                .alias("mhc_2_name"),
            )
        )
        .otherwise(
            pl.struct(
                pl.lit(None).alias("mhc_1_name"),
                pl.col("split_parts")
                .list.get(0)
                .str.slice(4)
                .alias("mhc_2_name"),
            )
        )
        .alias("mhc_struct")
    )
    .select(pl.exclude("mhc_1_name"))
    .with_columns(
        pl.col("mhc_struct")
        .map_elements(
            lambda x: infer_hla_chain(x["mhc_1_name"], x["mhc_2_name"]),
            return_dtype=pl.Struct,
        )
        .alias("chains")
    )
    .unnest("chains")
    .filter(
        (pl.col("mhc_1_name").is_not_null())
        & (pl.col("mhc_2_name").is_not_null())
    )
    .with_columns(
        pl.col("mhc_1_name")
        .map_elements(
            lambda x: human_conv.get_sequence(x, top_only=True),
            return_dtype=pl.String,
        )
        .alias("mhc_1_seq"),
        pl.col("mhc_2_name")
        .map_elements(
            lambda x: human_conv.get_sequence(x, top_only=True),
            return_dtype=pl.String,
        )
        .alias("mhc_2_seq"),
    )
)


iedb_human_II = generate_job_name(iedb_human_II)

iedb_human_II = iedb_human_II.group_by(FORMAT_COLS + TCRDIST_COLS).agg(
    [pl.col("references").flatten(), pl.col("receptor_id").flatten()]
)

iedb_human_II = (
    iedb_human_II.explode("receptor_id")
    .join(iedb_metadat, on="receptor_id", how="left")
    .group_by(pl.exclude("receptor_id", "assay_type"))
    .agg(
        [
            pl.col("receptor_id"),
            pl.col("assay_type"),
        ]
    )
)



#### Class II negatives (for 1:1 matched dataset)


In [None]:
iedb_human_II_negs = generate_negatives_antigen_matched(
    iedb_human_II
).with_columns(
    pl.lit(None).alias("references"),
    pl.lit(None).alias("receptor_id"),
    pl.lit(None).alias("assay_type"),
)

iedb_human_II = pl.concat(
    [iedb_human_II, iedb_human_II_negs], how="vertical_relaxed"
)

#### Save class II 1:1 matched dataset to disk


In [16]:
iedb_human_II.select(FORMAT_COLS).write_csv("triad/human_II/human_II.csv")
iedb_human_II.write_parquet("triad/human_II/human_II.parquet")

### 3. pMHC AF3 jobs & parquet metadata


#### Per antigen count, TCR Diversity, job name for AF3


In [None]:
from tcr_format_parsers.common.TriadUtils import (
    per_antigen_diversity,
    FORMAT_ANTIGEN_COLS,
)
from tcr_format_parsers.common.TCRUtils import (
    hash_tcr_sequence,
)

iedb_human_II = pl.read_parquet("triad/human_II/human_II.parquet")
iedb_human_I = pl.read_parquet("triad/human_I/human_I.parquet")

iedb_human_I_antigen = per_antigen_diversity(
    iedb_human_I.filter(pl.col("cognate"))
)
iedb_human_II_antigen = per_antigen_diversity(
    iedb_human_II.filter(pl.col("cognate"))
)


def generate_job_name_pmhc(df):
    df = df.with_columns(
        pl.concat_str(
            pl.concat_str(
                [
                    pl.col("peptide"),
                    pl.col("mhc_1_seq"),
                    pl.col("mhc_2_seq"),
                ],
            )
            .map_elements(
                lambda x: hash_tcr_sequence(x, "md5"), return_dtype=pl.String
            )
            .alias("job_name"),
        )
    )
    return df


iedb_human_I_antigen = generate_job_name_pmhc(iedb_human_I_antigen).select(
    ["job_name"] + FORMAT_ANTIGEN_COLS + ["TCRdiv", "TCRdiv_samples"]
)

iedb_human_II_antigen = generate_job_name_pmhc(iedb_human_II_antigen).select(
    ["job_name"] + FORMAT_ANTIGEN_COLS + ["TCRdiv", "TCRdiv_samples"]
)

#### Save antigen data to disk


In [None]:
iedb_human_I_antigen.select(["job_name"] + FORMAT_ANTIGEN_COLS).write_csv(
    "pmhc/human_I/human_I_antigen.csv"
)
iedb_human_I_antigen.write_parquet("pmhc/human_I/human_I_antigen.parquet")


iedb_human_II_antigen.select(["job_name"] + FORMAT_ANTIGEN_COLS).write_csv(
    "pmhc/human_II/human_II_antigen.csv"
)

iedb_human_II_antigen.write_parquet("pmhc/human_II/human_II_antigen.parquet")

#### Per antigen pMHC PDB blast hit boolean / PDB ID


In [None]:
import polars as pl
from mdaf3.FeatureExtraction import serial_apply, split_apply_combine
from process_utils import find_matching

iedb_human_II_antigen = pl.read_parquet(
    "pmhc/human_II/human_II_antigen.parquet"
)
iedb_human_I_antigen = pl.read_parquet("pmhc/human_I/human_I_antigen.parquet")

# https://rnnh.github.io/bioinfo-notebook/docs/blast.html outfmt 10
schema = {
    "query_id": pl.String,
    "subject_id": pl.String,
    "per_identity": pl.Float64,
    "aln_length": pl.Int32,
    "mismatches": pl.Int32,
    "gap_opens": pl.Int32,
    "q_start": pl.Int32,
    "q_end": pl.Int32,
    "s_start": pl.Int32,
    "s_end": pl.Int32,
    "e_value": pl.Float64,
    "bit_score": pl.Float64,
}

blast_results = (
    pl.read_csv("blast_result.csv", schema=schema)
    .with_columns(
        pl.col("subject_id").str.split("_").list.get(0).alias("pdb"),
        pl.col("subject_id").str.split("_").list.get(1).alias("segid"),
    )
    .drop("subject_id")
)

iedb_human_II_antigen = serial_apply(
    iedb_human_II_antigen, find_matching, blast_results
)

iedb_human_I_antigen = serial_apply(
    iedb_human_I_antigen, find_matching, blast_results
)

#### Save antigen data to disk


In [32]:
iedb_human_II_antigen.write_parquet("pmhc/human_II/human_II_antigen.parquet")
iedb_human_I_antigen.write_parquet("pmhc/human_I/human_I_antigen.parquet")

### 4. Benchmark dataset construction


#### Re-import data


In [1]:
import polars as pl
from tcr_format_parsers.common.TriadUtils import (
    generate_job_name,
    FORMAT_COLS,
    FORMAT_TCR_COLS,
    FORMAT_ANTIGEN_COLS,
    TCRDIST_COLS,
    generate_negatives_antigen_matched,
    generate_all_possible_negs,
)


human_I_pos = pl.read_parquet("triad/human_I/human_I.parquet").filter(
    pl.col("cognate")
)
human_II_pos = pl.read_parquet("triad/human_II/human_II.parquet").filter(
    pl.col("cognate")
)

human_I_antigen = pl.read_parquet("pmhc/human_I/human_I_antigen.parquet")
human_II_antigen = pl.read_parquet("pmhc/human_II/human_II_antigen.parquet")

human_I_pos_thresh = human_I_pos.join(
    human_I_antigen.select(FORMAT_ANTIGEN_COLS + ["TCRdiv_samples"]),
    on=FORMAT_ANTIGEN_COLS,
).filter(pl.col("TCRdiv_samples") >= 3)
human_II_pos_thresh = human_II_pos.join(
    human_II_antigen.select(FORMAT_ANTIGEN_COLS + ["TCRdiv_samples"]),
    on=FORMAT_ANTIGEN_COLS,
).filter(pl.col("TCRdiv_samples") >= 3)

  from .autonotebook import tqdm as notebook_tqdm


#### Generate within and cross class negatives


In [None]:
cross_negs = generate_all_possible_negs(
    pl.concat([human_I_pos, human_II_pos]), cross_class=True
)

human_I_all_negs = generate_all_possible_negs(human_I_pos)
human_II_all_negs = generate_all_possible_negs(human_II_pos)

#### Sample down to 10 negs per pos


In [None]:
from process_utils import sample_to, sample_supplemental_negatives

human_I_negs, missing_neg_antigens = sample_to(
    human_I_antigen, human_I_all_negs, 10
).with_columns(
    pl.lit(None).alias("references"),
    pl.lit(None).alias("receptor_id"),
    pl.lit(None).alias("assay_type"),
)

human_I_supp_negs = sample_supplemental_negatives(
    missing_neg_antigens, cross_negs
).with_columns(
    pl.lit(None).alias("references"),
    pl.lit(None).alias("receptor_id"),
    pl.lit(None).alias("assay_type"),
)

human_I_bench_thresh_3 = pl.concat(
    [human_I_pos_thresh, human_I_negs, human_I_supp_negs]
)

human_II_negs, missing_neg_antigens = sample_to(
    human_II_antigen, human_II_all_negs, 10
).with_columns(
    pl.lit(None).alias("references"),
    pl.lit(None).alias("receptor_id"),
    pl.lit(None).alias("assay_type"),
)

human_II_supp_negs = sample_supplemental_negatives(
    missing_neg_antigens, cross_negs
).with_columns(
    pl.lit(None).alias("references"),
    pl.lit(None).alias("receptor_id"),
    pl.lit(None).alias("assay_type"),
)

human_II_bench_thresh_3 = pl.concat(
    [human_II_pos_thresh, human_II_negs, human_II_supp_negs]
)

Processing rows: 100%|██████████| 283/283 [07:47<00:00,  1.65s/it]


In [None]:
human_I_bench_thresh_3.select(FORMAT_COLS).write_csv(
    "triad/human_I/human_I_bench_thresh_3.csv"
)
human_I_bench_thresh_3.write_parquet(
    "triad/human_I/human_I_bench_thresh_3.parquet"
)


human_II_bench_thresh_3.select(FORMAT_COLS).write_csv(
    "triad/human_II/human_II_bench_thresh_3.csv"
)
human_II_bench_thresh_3.write_parquet(
    "triad/human_II/human_II_bench_thresh_3.parquet"
)

### 5. Write benchmark and 1:1 dataset into tcrdock format

After inference is complete


#### Re-import data


In [2]:
from pathlib import Path
import polars as pl

human_I = pl.read_parquet("triad/human_I/human_I.parquet")

human_I_bench_thresh_3 = pl.read_parquet(
    "triad/human_I/human_I_bench_thresh_3.parquet"
)

human_II = pl.read_parquet("triad/human_II/human_II.parquet")

human_II_bench_thresh_3 = pl.read_parquet(
    "triad/human_II/human_II_bench_thresh_3.parquet"
)

human_II_bench_inf = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_II/inference"
)
human_II_out_path = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_II/tcrdock_pdb"
)

human_I_bench_inf = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_I/inference"
)
human_I_out_path = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_I/tcrdock_pdb"
)

In [80]:
human_II_bench_thresh_3.with_columns(
    pl.when(pl.col("mhc_class") == "I")
    .then(pl.lit(1))
    .otherwise(pl.lit(2))
    .alias("mhc_class"),
    pl.col("job_name").alias("pdbid"),
    pl.lit("human").alias("organism"),
).select("pdbid", "mhc_class", "organism").write_csv(
    "triad/human_II/human_II_bench_thresh_3_tcrdock.csv",
)

human_II.with_columns(
    pl.when(pl.col("mhc_class") == "I")
    .then(pl.lit(1))
    .otherwise(pl.lit(2))
    .alias("mhc_class"),
    pl.col("job_name").alias("pdbid"),
    pl.lit("human").alias("organism"),
).select("pdbid", "mhc_class", "organism").write_csv(
    "triad/human_II/human_II_tcrdock.csv",
)

human_I_bench_thresh_3.with_columns(
    pl.when(pl.col("mhc_class") == "I")
    .then(pl.lit(1))
    .otherwise(pl.lit(2))
    .alias("mhc_class"),
    pl.col("job_name").alias("pdbid"),
    pl.lit("human").alias("organism"),
).select("pdbid", "mhc_class", "organism").write_csv(
    "triad/human_I/human_I_bench_thresh_3_tcrdock.csv",
)

human_I.with_columns(
    pl.when(pl.col("mhc_class") == "I")
    .then(pl.lit(1))
    .otherwise(pl.lit(2))
    .alias("mhc_class"),
    pl.col("job_name").alias("pdbid"),
    pl.lit("human").alias("organism"),
).select("pdbid", "mhc_class", "organism").write_csv(
    "triad/human_I/human_I_tcrdock.csv",
)

In [None]:
from process_utils import tcrdock_format_cif
from mdaf3.FeatureExtraction import split_apply_combine, serial_apply
import warnings

warnings.filterwarnings("ignore")

split_apply_combine(
    human_II_bench_thresh_3,
    tcrdock_format_cif,
    human_II_bench_inf,
    human_II_out_path,
)
split_apply_combine(
    human_II, tcrdock_format_cif, human_II_bench_inf, human_II_out_path
)

split_apply_combine(
    human_I, tcrdock_format_cif, human_I_bench_inf, human_I_out_path
)
split_apply_combine(
    human_I_bench_thresh_3,
    tcrdock_format_cif,
    human_I_bench_inf,
    human_I_out_path,
    chunksize=5,
)

### 6. Write Class I benchmark


In [None]:
import polars as pl
from pathlib import Path

human_I_bench_thresh_3 = pl.read_parquet(
    "triad/human_I/human_I_bench_thresh_3.parquet"
)

human_I_bench_inf = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_I/inference"
)

human_I_antigen = pl.read_parquet("pmhc/human_I/human_I_antigen.parquet")

In [None]:
from tcr_format_parsers.common.TriadUtils import (
    FORMAT_ANTIGEN_COLS,
    FORMAT_COLS,
)


human_I_20 = (
    human_I_antigen.filter(pl.col("TCRdiv_samples") >= 20)
    .with_row_index("group")
    .select(FORMAT_ANTIGEN_COLS + ["group"])
    .join(human_I_bench_thresh_3, on=FORMAT_ANTIGEN_COLS)
).select(
    FORMAT_COLS
    + TCRDIST_COLS
    + [
        "references",
        "receptor_id",
        "assay_type",
        "group",
    ]
)

In [24]:
human_I_20.write_parquet("triad/human_I/human_I_bench_thresh_20.parquet")

### 7. Rewrite class II benchmark dataset with 9mer windows


In [3]:
import polars as pl
from pathlib import Path

human_II_bench_thresh_3 = pl.read_parquet(
    "triad/human_II/human_II_bench_thresh_3.parquet"
)

human_II_bench_inf = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_II/inference"
)
human_II_out_path = Path(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_II/tcrdock_pdb"
)

human_II_antigen = pl.read_parquet("pmhc/human_II/human_II_antigen.parquet")

In [2]:
from tcr_format_parsers.common.TriadUtils import (
    FORMAT_ANTIGEN_COLS,
    FORMAT_COLS,
)

human_II_20 = (
    human_II_antigen.filter(pl.col("TCRdiv_samples") >= 20)
    .with_row_index("group")
    .select(FORMAT_ANTIGEN_COLS + ["group"])
    .join(human_II_bench_thresh_3, on=FORMAT_ANTIGEN_COLS)
).select(pl.exclude("job_name"))

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from tcr_format_parsers.common.TriadUtils import (
    generate_job_name,
    TCRDIST_COLS,
)

human_II_20_windowed = (
    human_II_20.with_columns(
        pl.col("peptide").map_elements(
            lambda x: [x[i : i + 9] for i in range(0, len(x) - 8)],
            return_dtype=pl.List(str),
        )
    )
    .with_columns(
        pl.int_ranges(pl.lit(0), pl.col("peptide").list.len()).alias(
            "window_id"
        )
    )
    .with_row_index("entity_id")
    .explode(["peptide", "window_id"])
)

human_II_20_windowed = generate_job_name(
    human_II_20_windowed, addtl_cols=["entity_id"]
).select(
    FORMAT_COLS
    + TCRDIST_COLS
    + [
        "references",
        "receptor_id",
        "assay_type",
        "group",
        "entity_id",
        "window_id",
    ]
)

In [None]:
human_II_20_windowed.select(FORMAT_COLS).write_csv(
    "triad/human_II/human_II_bench_thresh_20_windowed.csv"
)
human_II_20_windowed.write_parquet(
    "triad/human_II/human_II_bench_thresh_20_windowed.parquet"
)

In [None]:
import polars as pl

human_II_20_windowed = (
    pl.read_csv("triad/human_II/human_II_bench_thresh_20_windowed.csv")
    .with_columns(
        pl.when(pl.col("mhc_class") == "I")
        .then(pl.lit(1))
        .otherwise(pl.lit(2))
        .alias("mhc_class"),
        pl.col("job_name").alias("pdbid"),
        pl.lit("human").alias("organism"),
    )
    .select("pdbid", "mhc_class", "organism")
    .write_csv(
        "triad/human_II/human_II_windowed_tcrdock.csv",
    )
)

In [None]:
from process_utils import tcrdock_format_cif
from mdaf3.FeatureExtraction import split_apply_combine, serial_apply
import warnings
import polars as pl

warnings.filterwarnings("ignore")

serial_apply(
    pl.read_csv("triad/human_II/human_II_bench_thresh_20_windowed.csv"),
    tcrdock_format_cif,
    human_II_bench_inf,
    human_II_out_path,
)

Processing rows:   0%|          | 0/83182 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 83182/83182 [1:15:08<00:00, 18.45it/s]  


column_0
object
"shape: (1, 18) ┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐ │ job_name ┆ cognate ┆ peptide ┆ mhc_class ┆ … ┆ tcr_1_seq ┆ tcr_2_cha ┆ tcr_2_spe ┆ tcr_2_seq │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ in ┆ cies ┆ --- │ │ str ┆ bool ┆ str ┆ str ┆ ┆ str ┆ --- ┆ --- ┆ str │ │ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ │ ╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡ │ 2179934f76 ┆ true ┆ TFEYVSQPF ┆ II ┆ … ┆ GQQLNQSPQ ┆ beta ┆ human ┆ NAGVTQTPK │ │ 1479f4191e ┆ ┆ ┆ ┆ ┆ SMFIQEGED ┆ ┆ ┆ FQVLKTGQS │ │ 72017f0d3f ┆ ┆ ┆ ┆ ┆ VSMNCTSSS ┆ ┆ ┆ MTLQCAQDM │ │ … ┆ ┆ ┆ ┆ ┆ IFN… ┆ ┆ ┆ NHE… │ └────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
"shape: (1, 18) ┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐ │ job_name ┆ cognate ┆ peptide ┆ mhc_class ┆ … ┆ tcr_1_seq ┆ tcr_2_cha ┆ tcr_2_spe ┆ tcr_2_seq │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ in ┆ cies ┆ --- │ │ str ┆ bool ┆ str ┆ str ┆ ┆ str ┆ --- ┆ --- ┆ str │ │ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ │ ╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡ │ 20e0248684 ┆ true ┆ FEYVSQPFL ┆ II ┆ … ┆ GQQLNQSPQ ┆ beta ┆ human ┆ NAGVTQTPK │ │ c3f99cf4df ┆ ┆ ┆ ┆ ┆ SMFIQEGED ┆ ┆ ┆ FQVLKTGQS │ │ f21bae04a1 ┆ ┆ ┆ ┆ ┆ VSMNCTSSS ┆ ┆ ┆ MTLQCAQDM │ │ … ┆ ┆ ┆ ┆ ┆ IFN… ┆ ┆ ┆ NHE… │ └────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
"shape: (1, 18) ┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐ │ job_name ┆ cognate ┆ peptide ┆ mhc_class ┆ … ┆ tcr_1_seq ┆ tcr_2_cha ┆ tcr_2_spe ┆ tcr_2_seq │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ in ┆ cies ┆ --- │ │ str ┆ bool ┆ str ┆ str ┆ ┆ str ┆ --- ┆ --- ┆ str │ │ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ │ ╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡ │ 61aadd8236 ┆ true ┆ EYVSQPFLM ┆ II ┆ … ┆ GQQLNQSPQ ┆ beta ┆ human ┆ NAGVTQTPK │ │ cd2faad068 ┆ ┆ ┆ ┆ ┆ SMFIQEGED ┆ ┆ ┆ FQVLKTGQS │ │ 302d5ce1b3 ┆ ┆ ┆ ┆ ┆ VSMNCTSSS ┆ ┆ ┆ MTLQCAQDM │ │ … ┆ ┆ ┆ ┆ ┆ IFN… ┆ ┆ ┆ NHE… │ └────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
"shape: (1, 18) ┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐ │ job_name ┆ cognate ┆ peptide ┆ mhc_class ┆ … ┆ tcr_1_seq ┆ tcr_2_cha ┆ tcr_2_spe ┆ tcr_2_seq │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ in ┆ cies ┆ --- │ │ str ┆ bool ┆ str ┆ str ┆ ┆ str ┆ --- ┆ --- ┆ str │ │ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ │ ╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡ │ b24f52b796 ┆ true ┆ YVSQPFLMD ┆ II ┆ … ┆ GQQLNQSPQ ┆ beta ┆ human ┆ NAGVTQTPK │ │ 2d7a393e45 ┆ ┆ ┆ ┆ ┆ SMFIQEGED ┆ ┆ ┆ FQVLKTGQS │ │ 1a82af7ef1 ┆ ┆ ┆ ┆ ┆ VSMNCTSSS ┆ ┆ ┆ MTLQCAQDM │ │ … ┆ ┆ ┆ ┆ ┆ IFN… ┆ ┆ ┆ NHE… │ └────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
"shape: (1, 18) ┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐ │ job_name ┆ cognate ┆ peptide ┆ mhc_class ┆ … ┆ tcr_1_seq ┆ tcr_2_cha ┆ tcr_2_spe ┆ tcr_2_seq │ │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ in ┆ cies ┆ --- │ │ str ┆ bool ┆ str ┆ str ┆ ┆ str ┆ --- ┆ --- ┆ str │ │ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ │ ╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡ │ 91fac19665 ┆ true ┆ VSQPFLMDL ┆ II ┆ … ┆ GQQLNQSPQ ┆ beta ┆ human ┆ NAGVTQTPK │ │ ce79383faa ┆ ┆ ┆ ┆ ┆ SMFIQEGED ┆ ┆ ┆ FQVLKTGQS │ │ d8459001ce ┆ ┆ ┆ ┆ ┆ VSMNCTSSS ┆ ┆ ┆ MTLQCAQDM │ │ … ┆ ┆ ┆ ┆ ┆ IFN… ┆ ┆ ┆ NHE… │ └────────────┴─────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
…
"{'job_name': 'cd6205ce24acecfb939456fe50018612', 'cognate': False, 'peptide': 'ERNAGSGII', 'mhc_class': 'II', 'mhc_1_chain': 'alpha', 'mhc_1_species': 'human', 'mhc_1_name': 'DQA1*01:02', 'mhc_1_seq': 'EDIVADHVASCGVNLYQFYGPSGQYTHEFDGDEQFYVDLERKETAWRWPEFSKFGGFDPQGALRNMAVAKHNLNIMIKRYNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGQSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDQPLLKHWEPEIPAPMSELT', 'mhc_2_chain': 'beta', 'mhc_2_species': 'human', 'mhc_2_name': 'DQB1*06:02', 'mhc_2_seq': 'RDSPEDFVFQFKGMCYFTNGTERVRLVTRYIYNREEYARFDSDVGVYRAVTPQGRPDAEYWNSQKEVLEGTRAELDTVCRHNYEVAFRGILQRRVEPTVTISPSRTEALNHHNLLVCSVTDFYPGQIKVRWFRNDQEETAGVVSTPLIRNGDWTFQILVMLEMTPQRGDVYTCHVEHPSLQSPITVEWRAQSESAQSK', 'tcr_1_chain': 'alpha', 'tcr_1_species': 'human', 'tcr_1_seq': 'AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVSDKGDSSYKLIFGSGTRLLVRPD', 'tcr_2_chain': 'beta', 'tcr_2_species': 'human', 'tcr_2_seq': 'DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSLGTGYEQYFGPGTRLTVTE'}"
"{'job_name': '91978857905b0abb0ae3ca2653526d76', 'cognate': False, 'peptide': 'RNAGSGIII', 'mhc_class': 'II', 'mhc_1_chain': 'alpha', 'mhc_1_species': 'human', 'mhc_1_name': 'DQA1*01:02', 'mhc_1_seq': 'EDIVADHVASCGVNLYQFYGPSGQYTHEFDGDEQFYVDLERKETAWRWPEFSKFGGFDPQGALRNMAVAKHNLNIMIKRYNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGQSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDQPLLKHWEPEIPAPMSELT', 'mhc_2_chain': 'beta', 'mhc_2_species': 'human', 'mhc_2_name': 'DQB1*06:02', 'mhc_2_seq': 'RDSPEDFVFQFKGMCYFTNGTERVRLVTRYIYNREEYARFDSDVGVYRAVTPQGRPDAEYWNSQKEVLEGTRAELDTVCRHNYEVAFRGILQRRVEPTVTISPSRTEALNHHNLLVCSVTDFYPGQIKVRWFRNDQEETAGVVSTPLIRNGDWTFQILVMLEMTPQRGDVYTCHVEHPSLQSPITVEWRAQSESAQSK', 'tcr_1_chain': 'alpha', 'tcr_1_species': 'human', 'tcr_1_seq': 'AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVSDKGDSSYKLIFGSGTRLLVRPD', 'tcr_2_chain': 'beta', 'tcr_2_species': 'human', 'tcr_2_seq': 'DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSLGTGYEQYFGPGTRLTVTE'}"
"{'job_name': 'aee1277d23a42cdc5a1f57a2cc897c07', 'cognate': False, 'peptide': 'NAGSGIIIS', 'mhc_class': 'II', 'mhc_1_chain': 'alpha', 'mhc_1_species': 'human', 'mhc_1_name': 'DQA1*01:02', 'mhc_1_seq': 'EDIVADHVASCGVNLYQFYGPSGQYTHEFDGDEQFYVDLERKETAWRWPEFSKFGGFDPQGALRNMAVAKHNLNIMIKRYNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGQSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDQPLLKHWEPEIPAPMSELT', 'mhc_2_chain': 'beta', 'mhc_2_species': 'human', 'mhc_2_name': 'DQB1*06:02', 'mhc_2_seq': 'RDSPEDFVFQFKGMCYFTNGTERVRLVTRYIYNREEYARFDSDVGVYRAVTPQGRPDAEYWNSQKEVLEGTRAELDTVCRHNYEVAFRGILQRRVEPTVTISPSRTEALNHHNLLVCSVTDFYPGQIKVRWFRNDQEETAGVVSTPLIRNGDWTFQILVMLEMTPQRGDVYTCHVEHPSLQSPITVEWRAQSESAQSK', 'tcr_1_chain': 'alpha', 'tcr_1_species': 'human', 'tcr_1_seq': 'AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVSDKGDSSYKLIFGSGTRLLVRPD', 'tcr_2_chain': 'beta', 'tcr_2_species': 'human', 'tcr_2_seq': 'DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSLGTGYEQYFGPGTRLTVTE'}"
"{'job_name': '9433ae9d85a0b4e5211997133ccb8d4f', 'cognate': False, 'peptide': 'AGSGIIISD', 'mhc_class': 'II', 'mhc_1_chain': 'alpha', 'mhc_1_species': 'human', 'mhc_1_name': 'DQA1*01:02', 'mhc_1_seq': 'EDIVADHVASCGVNLYQFYGPSGQYTHEFDGDEQFYVDLERKETAWRWPEFSKFGGFDPQGALRNMAVAKHNLNIMIKRYNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGQSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDQPLLKHWEPEIPAPMSELT', 'mhc_2_chain': 'beta', 'mhc_2_species': 'human', 'mhc_2_name': 'DQB1*06:02', 'mhc_2_seq': 'RDSPEDFVFQFKGMCYFTNGTERVRLVTRYIYNREEYARFDSDVGVYRAVTPQGRPDAEYWNSQKEVLEGTRAELDTVCRHNYEVAFRGILQRRVEPTVTISPSRTEALNHHNLLVCSVTDFYPGQIKVRWFRNDQEETAGVVSTPLIRNGDWTFQILVMLEMTPQRGDVYTCHVEHPSLQSPITVEWRAQSESAQSK', 'tcr_1_chain': 'alpha', 'tcr_1_species': 'human', 'tcr_1_seq': 'AQSVTQLGSHVSVSEGALVLLRCNYSSSVPPYLFWYVQYPNQGLQLLLKYTSAATLVKGINGFEAEFKKSETSFHLTKPSAHMSDAAEYFCAVSDKGDSSYKLIFGSGTRLLVRPD', 'tcr_2_chain': 'beta', 'tcr_2_species': 'human', 'tcr_2_seq': 'DTGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLTYFQNEAQLEKSRLLSDRFSAERPKGSFSTLEIQRTEQGDSAMYLCASSLGTGYEQYFGPGTRLTVTE'}"


: 