In [10]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [1]:
import polars as pl
from tcr_format_parsers.common.TriadUtils import *


# Full CRESTA table; the next cell selects its unique FORMAT_ANTIGEN_COLS rows.
triad = pl.read_csv(
    "/tgen_labs/altin/alphafold3/runs/manucript_inp/CRESTA/output/cresta.csv"
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_job_name_pmhc(df):
    """Add a ``job_name`` column derived from each row's peptide and MHC sequences.

    The ``peptide``, ``mhc_1_seq`` and ``mhc_2_seq`` columns are concatenated
    (in that order, with no separator) and the resulting string is passed,
    one row at a time, to ``hash_tcr_sequence(<string>, "md5")``.

    Parameters
    ----------
    df
        A polars frame containing the ``peptide``, ``mhc_1_seq`` and
        ``mhc_2_seq`` columns.

    Returns
    -------
    The input frame with a String column named ``job_name`` added.
    """
    # NOTE(review): with polars' default null handling, a null in any of the
    # three columns makes the concatenated key null — confirm inputs are
    # non-null, otherwise those rows get no job name.
    pmhc_key = pl.concat_str(
        [
            pl.col("peptide"),
            pl.col("mhc_1_seq"),
            pl.col("mhc_2_seq"),
        ],
    )
    # The original wrapped this expression in a second, single-argument
    # pl.concat_str, which was a no-op; it is dropped here.
    return df.with_columns(
        pmhc_key.map_elements(
            lambda seq: hash_tcr_sequence(seq, "md5"), return_dtype=pl.String
        ).alias("job_name")
    )


# Unique antigen rows of the CRESTA table, each tagged with its job name;
# `job_name` is placed first in the column order.
cresta_pmhc = (
    triad.select(FORMAT_ANTIGEN_COLS)
    .unique()
    .pipe(generate_job_name_pmhc)
    .select(["job_name", *FORMAT_ANTIGEN_COLS])
)

In [3]:
# Save the antigen table (job_name + FORMAT_ANTIGEN_COLS) as CSV; the path is
# relative to the notebook's working directory.
cresta_pmhc.write_csv("pmhc/cresta_antigen.csv")

# 10x negatives


In [14]:
import polars as pl
from tcr_format_parsers.common.TriadUtils import FORMAT_COLS, TCRDIST_COLS
from tcr_format_parsers.common.TCRUtils import extract_tcrdist_cols
from mdaf3.FeatureExtraction import *

# Columns carried forward after the extract_tcrdist_cols pass. The two
# cdr 2.5 columns are kept because pw_tcrdist's 'use_provided_cdr' mode
# requires the cdr 2.5 region (see the note above `negs`).
_TCRDIST_KEEP_COLS = FORMAT_COLS + TCRDIST_COLS + ["tcr_1_cdr_2_5", "tcr_2_cdr_2_5"]


def _with_tcrdist_cols(triads):
    """Apply ``extract_tcrdist_cols`` to the FORMAT_COLS of ``triads``.

    Shared by the cognate and negative sets so both go through exactly the
    same steps: restrict to ``FORMAT_COLS``, run ``extract_tcrdist_cols``
    through ``serial_apply``, then keep only ``_TCRDIST_KEEP_COLS``.

    Parameters
    ----------
    triads
        A polars DataFrame containing at least ``FORMAT_COLS``.

    Returns
    -------
    A DataFrame with exactly the ``_TCRDIST_KEEP_COLS`` columns.
    """
    # NOTE(review): presumably extract_tcrdist_cols is what adds TCRDIST_COLS
    # and the cdr 2.5 columns — confirm in TCRUtils.
    return serial_apply(
        triads.select(FORMAT_COLS),
        extract_tcrdist_cols,
    ).select(_TCRDIST_KEEP_COLS)


# Rows of the CRESTA table flagged as cognate.
cognate = _with_tcrdist_cols(
    pl.read_csv(
        "/tgen_labs/altin/alphafold3/runs/manucript_inp/CRESTA/output/cresta.csv"
    ).filter(pl.col("cognate"))
)

# yes, this dataset already has cdr annotations
# but cresta doesn't, so we need to use
# pw_tcrdist 'use_provided_cdr' which requires the cdr 2.5 region
# Rows NOT flagged as cognate from the human_II triad parquet.
negs = _with_tcrdist_cols(
    pl.read_parquet(
        "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/iedb/triad/human_II/human_II.parquet"
    ).filter(~pl.col("cognate"))
)

Processing rows:   0%|          | 0/206 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 206/206 [00:19<00:00, 10.82it/s]
Processing rows:  59%|█████▉    | 993/1674 [01:21<01:00, 11.25it/s]

Limiting hmmer search to species ['human'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species
Limiting hmmer search to species ['human'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species


Processing rows: 100%|██████████| 1674/1674 [02:17<00:00, 12.17it/s]


In [26]:
from tcr_format_parsers.common.TriadUtils import (
    generate_all_possible_negs,
    SOURCE_RENAME_DICT,
    SOURCE_ANTIGEN_COLS,
    FORMAT_ANTIGEN_COLS,
)

# Pool the cognate and non-cognate rows, generate the candidate negatives from
# that pool, then anti-join away every candidate that matches a CRESTA antigen
# on SOURCE_ANTIGEN_COLS.
all_neg = (
    pl.concat([cognate, negs])
    .pipe(generate_all_possible_negs, use_provided_cdr=True)
    .join(
        # remove negatives that come from cresta
        cognate.select(FORMAT_ANTIGEN_COLS)
        .unique()
        .rename(SOURCE_RENAME_DICT),
        on=SOURCE_ANTIGEN_COLS,
        how="anti",
    )
)

In [None]:
from process_utils import sample_to

# Count cognate rows per antigen (column "TCRdiv_samples") and hand those
# counts to sample_to together with the candidate-negative pool.
# NOTE(review): presumably the trailing 10 is the negatives-per-cognate
# factor ("10x") — confirm against process_utils.sample_to.
cresta_10x_negs = (
    cognate.group_by(FORMAT_ANTIGEN_COLS)
    .agg(pl.len().alias("TCRdiv_samples"))
    .pipe(sample_to, all_neg, 10)
)

Processing rows: 100%|██████████| 8/8 [00:38<00:00,  4.78s/it]


In [None]:
# Stack the cognate rows and the sampled negatives on the same column set.
# ignore second element of tuple- only one antigen with missing TCRs, only short by 7
cognate_10x = pl.concat(
    [
        frame.select(FORMAT_COLS + TCRDIST_COLS)
        for frame in (cognate, cresta_10x_negs[0])
    ]
)

In [26]:
from tcr_format_parsers.common.TriadUtils import shorten_tcrs
from mdaf3.FeatureExtraction import serial_apply

# Run shorten_tcrs over the combined set via serial_apply; the result replaces
# `cognate_10x`.
# NOTE(review): re-running this cell feeds already-processed rows back in —
# confirm shorten_tcrs is safe to apply twice.
cognate_10x = cognate_10x.pipe(serial_apply, shorten_tcrs)

Processing rows:   0%|          | 0/2258 [00:00<?, ?it/s]

Processing rows:  34%|███▎      | 761/2258 [00:35<01:11, 21.02it/s]

Limiting hmmer search to species ['human'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species


Processing rows:  58%|█████▊    | 1305/2258 [01:01<00:46, 20.63it/s]

Limiting hmmer search to species ['human'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species


Processing rows: 100%|██████████| 2258/2258 [01:46<00:00, 21.28it/s]


In [30]:
from tcr_format_parsers.common.TriadUtils import FORMAT_TCR_COLS

# The CSV holds only the de-duplicated FORMAT_COLS; the parquet keeps every
# column of the de-duplicated frame. Both paths are relative to the working
# directory.
cognate_10x.select(FORMAT_COLS).unique().write_csv("triad/cresta_10x.csv")
cognate_10x.unique().write_parquet("triad/cresta_10x.parquet")

In [None]:
# Load the 10x set from its parquet copy on disk, replacing the in-memory frame.
# NOTE(review): presumably this is the file written by the previous cell
# (which uses a relative path) — confirm the working directory matches.
cognate_10x = pl.read_parquet(
    "/tgen_labs/altin/alphafold3/runs/tcrtrifold-experiments/data/cresta/triad/cresta_10x.parquet"
)

## Normal CRESTA set, with shortened TCR


In [34]:
import polars as pl
from tcr_format_parsers.common.TriadUtils import FORMAT_COLS, TCRDIST_COLS
from tcr_format_parsers.common.TCRUtils import extract_tcrdist_cols
from mdaf3.FeatureExtraction import *

# Unique FORMAT_COLS rows of the whole CRESTA table (no cognate filter), run
# through extract_tcrdist_cols and reduced to the format, TCRdist and cdr 2.5
# columns.
cresta_shortened = (
    pl.read_csv(
        "/tgen_labs/altin/alphafold3/runs/manucript_inp/CRESTA/output/cresta.csv"
    )
    .select(FORMAT_COLS)
    .unique()
    .pipe(serial_apply, extract_tcrdist_cols)
    .select([*FORMAT_COLS, *TCRDIST_COLS, "tcr_1_cdr_2_5", "tcr_2_cdr_2_5"])
)

# Second pass: apply shorten_tcrs. It is not imported in this cell; the name
# comes from the earlier `from ...TriadUtils import shorten_tcrs` cell.
cresta_shortened = cresta_shortened.pipe(serial_apply, shorten_tcrs)

Processing rows:   0%|          | 0/411 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 411/411 [00:42<00:00,  9.60it/s]
Processing rows: 100%|██████████| 411/411 [00:21<00:00, 19.40it/s]


In [36]:
from tcr_format_parsers.common.TriadUtils import FORMAT_TCR_COLS

# Same export layout as the 10x set: the CSV holds only the de-duplicated
# FORMAT_COLS, the parquet keeps every column of the de-duplicated frame.
cresta_shortened.select(FORMAT_COLS).unique().write_csv(
    "triad/cresta_shortened.csv"
)
cresta_shortened.unique().write_parquet("triad/cresta_shortened.parquet")