In [1]:
import numpy as np
import pandas as pd
from malid import config, etl, helpers, logger
from malid.datamodels import GeneLocus

# Load MIRA TCR Covid-19 known binder data, subset, and add our IgBlast parses

In [2]:
# Combine the three MIRA detail tables into one frame.
# Each table keeps its own row numbers in a "rownum" column and is tagged with a
# "source" label, so a row can be identified by the (source, rownum) pair.
mira_release_dir = (
    config.paths.external_raw_data
    / "immunecode_all/mira/ImmuneCODE-MIRA-Release002.1"
)
# source label -> detail file name (insertion order fixes the concat order)
detail_fname_by_source = {
    "peptide-cI": "peptide-detail-ci.csv",
    "peptide-cII": "peptide-detail-cii.csv",
    "minigene": "minigene-detail.csv",
}
df = pd.concat(
    [
        pd.read_csv(mira_release_dir / detail_fname)
        .assign(source=source)
        .rename_axis(index="rownum")
        .reset_index()
        for source, detail_fname in detail_fname_by_source.items()
    ],
    axis=0,
)

# Merge in subject metadata
subject_metadata = pd.read_csv(
    mira_release_dir / "subject-metadata.csv",
    encoding="unicode_escape",
)
df = pd.merge(
    df,
    subject_metadata,
    how="left",
    on="Experiment",
    validate="m:1",  # each experiment must appear at most once in the metadata
)
# Left merge: a missing Cohort means the experiment had no metadata row.
assert not df["Cohort"].isna().any()

# Filter to Covid19 patient data
# We're excluding these cohorts: Healthy (No known exposure), COVID-19-B-Non-Acute, COVID-19-Exposed
df = df[df["Cohort"].isin(["COVID-19-Acute", "COVID-19-Convalescent"])]

# Split bioidentity, formatted as "<CDR3 amino acids>+<V gene>+<J gene>"
assert not df["TCR BioIdentity"].isna().any()
bioidentity_parts = df["TCR BioIdentity"].str.split("+", expand=True)
# Guard against malformed entries: extra "+" separators would otherwise add
# stray integer-named columns, and too few would leave the renames below incomplete.
assert (
    bioidentity_parts.shape[1] == 3
), "Expected TCR BioIdentity to have exactly three '+'-separated fields"
df = pd.concat(
    [
        df,
        bioidentity_parts.rename(
            columns={0: "cdr3_seq_aa_q_trim", 1: "v_gene", 2: "j_gene"}
        ),
    ],
    axis=1,
)


# Trim CDR3: remove ends (drop the first and last character)
# and replace field that's entirely space (or empty) with NaN
df["cdr3_seq_aa_q_trim"] = (
    df["cdr3_seq_aa_q_trim"]
    .str.slice(start=1, stop=-1)
    .replace(r"^\s*$", np.nan, regex=True)
)

# Add length
# Note: Adaptive data can have a length cutoff that will potentially exclude some longer CDR3s
df["cdr3_aa_sequence_trim_len"] = df["cdr3_seq_aa_q_trim"].str.len()

In [3]:
# Preview the combined, cohort-filtered table (pandas truncates long/wide frames when rendering).
df

Unnamed: 0,rownum,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome,source,ORF,...,DRB3,DRB3.1,DRB4,DRB4.1,DRB5,DRB5.1,cdr3_seq_aa_q_trim,v_gene,j_gene,cdr3_aa_sequence_trim_len
5,5,CASSLLGWEQLDEQFF+TCRBV27-01+TCRBJ02-01,TCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTT...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,DRB3*02:02:01,,,,,,ASSLLGWEQLDEQF,TCRBV27-01,TCRBJ02-01,14.0
9,9,CASSAGQGASDEQFF+TCRBV07-09+TCRBJ02-01,CAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCA...,eMR15,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,DRB3*02:02:01,,,,DRB5*01:01:01,,ASSAGQGASDEQF,TCRBV07-09,TCRBJ02-01,13.0
13,13,CASSPLEWEGPTEAFF+TCRBV27-01+TCRBJ01-01,TCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTC...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,DRB3*02:02:01,,,,,,ASSPLEWEGPTEAF,TCRBV27-01,TCRBJ01-01,14.0
17,17,CASSSIEWEGPGDEQFF+TCRBV27-01+TCRBJ02-01,CCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTAGTA...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,DRB3*02:02:01,,,,,,ASSSIEWEGPGDEQF,TCRBV27-01,TCRBJ02-01,15.0
19,19,CASSWDGGLASNQPQHF+TCRBV07-09+TCRBJ01-05,ACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCTGGG...,eQD137,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,DRB3*01:01:02,,DRB4*01:01:01,,,,ASSWDGGLASNQPQH,TCRBV07-09,TCRBJ01-05,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161121,6801,CASSLVGQTQPQHF+TCRBV27-01+TCRBJ01-05,CTGGAGTCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCA...,eAV100,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,DRB3*02:02:01,,,,DRB5*01:01:01,,ASSLVGQTQPQH,TCRBV27-01,TCRBJ01-05,12.0
161122,6802,CASSPWFERGKIRRNEQFF+TCRBV05-04+TCRBJ02-01,CTGGACGACTCGGCCCTGTATCTCTGTGCCAGCAGCCCCTGGTTCG...,eNL192,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,,,,,,,ASSPWFERGKIRRNEQF,TCRBV05-04,TCRBJ02-01,17.0
161123,6803,CASSRRDPSTDTQYF+TCRBV06-02+TCRBJ02-03,NTGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCA...,eQD139,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,DRB3*02:02:01,,,,,,ASSRRDPSTDTQY,TCRBV06-02,TCRBJ02-03,13.0
161124,6804,CASSPRGGGPPKEQYF+TCRBV12-X+TCRBJ02-07,CCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGCAGTC...,eQD139,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,DRB3*02:02:01,,,,,,ASSPRGGGPPKEQY,TCRBV12-X,TCRBJ02-07,14.0


In [4]:
# Merge our IgBlast output

In [5]:
# We ran IgBlast ourselves using the "TCR Nucleotide Sequence" field
# Our IgBlast gives some different V gene calls, but generally doesn't provide CDR3 calls for these short sequences.

# Use the V/J gene calls from our IgBlast.
# Keep sequences called productive by our IgBlast.
# Use Adaptive CDR3 call.

# Collect the parsed IgBlast output files for all fasta splits.
# glob() yields files in arbitrary, filesystem-dependent order, so sort them
# to make the order in which they are later concatenated reproducible.
parse_fnames = sorted(
    (
        config.paths.external_raw_data
        / "immunecode_all/mira/ImmuneCODE-MIRA-Release002.1/splits"
    ).glob("*.fasta.part*.fasta.parse.txt.parsed.tsv")
)
len(parse_fnames)

34

In [6]:
# Fail fast: without any parse files the concat below has nothing to read.
if not parse_fnames:
    raise ValueError("No igblast parse files found")

In [7]:
# Read every tab-separated IgBlast parse file and stack them into one table.
parse_frames = [pd.read_csv(parse_fname, sep="\t") for parse_fname in parse_fnames]
df_parse = pd.concat(parse_frames, axis=0)
# Row counts: parsed sequences vs. rows remaining in the filtered MIRA table.
(len(df_parse), len(df))

(162651, 43592)

In [8]:
# The fasta ID has the form "<specimen_label>|<rownum>"; split it into two columns.
fasta_id_parts = df_parse["id"].str.split("|", expand=True)
df_parse[["specimen_label", "rownum"]] = fasta_id_parts
df_parse["rownum"] = df_parse["rownum"].astype(int)
# Row numbers must be unique within each specimen label.
assert not any(
    label_rows["rownum"].duplicated().any()
    for _, label_rows in df_parse.groupby("specimen_label")
)

In [9]:
# Count parsed sequences per specimen label (the first field of the fasta ID).
df_parse["specimen_label"].value_counts()

peptide-detail-ci     154319
peptide-detail-cii      6809
minigene-detail         1523
Name: specimen_label, dtype: int64

In [10]:
# Derive "source" from the specimen label, using the same labels that were
# assigned to the MIRA detail tables, so both frames share the join key.
specimen_label_to_source = {
    "peptide-detail-ci": "peptide-cI",
    "peptide-detail-cii": "peptide-cII",
    "minigene-detail": "minigene",
}
df_parse["source"] = df_parse["specimen_label"].replace(specimen_label_to_source)
df_parse["source"].value_counts()

peptide-cI     154319
peptide-cII      6809
minigene         1523
Name: source, dtype: int64

In [11]:
# Attach our IgBlast calls to each MIRA row, matching on (source, rownum).
# The BioIdentity-derived v_gene / j_gene columns are dropped first.
orig_shape = df.shape
merge_keys = ["source", "rownum"]
igblast_calls = df_parse[merge_keys + ["v_segment", "j_segment", "productive"]]
df = df.drop(columns=["v_gene", "j_gene"]).merge(
    igblast_calls,
    on=merge_keys,
    how="inner",
    validate="1:1",  # each (source, rownum) pair may occur at most once on either side
)
# Sanity check: the inner join should keep every row of the smaller table.
assert len(df) == min(orig_shape[0], len(df_parse))

In [12]:
# Spot-check the merged table: v_segment, j_segment and productive are the columns taken from df_parse.
df.head()

Unnamed: 0,rownum,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome,source,ORF,...,DRB3.1,DRB4,DRB4.1,DRB5,DRB5.1,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len,v_segment,j_segment,productive
0,5,CASSLLGWEQLDEQFF+TCRBV27-01+TCRBJ02-01,TCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTT...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,,,,,,ASSLLGWEQLDEQF,14.0,TRBV27*01,TRBJ2-1*01,True
1,9,CASSAGQGASDEQFF+TCRBV07-09+TCRBJ02-01,CAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCA...,eMR15,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,,,,DRB5*01:01:01,,ASSAGQGASDEQF,13.0,TRBV7-9*01,TRBJ2-1*01,True
2,13,CASSPLEWEGPTEAFF+TCRBV27-01+TCRBJ01-01,TCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTC...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,,,,,,ASSPLEWEGPTEAF,14.0,TRBV27*01,TRBJ1-1*01,True
3,17,CASSSIEWEGPGDEQFF+TCRBV27-01+TCRBJ02-01,CCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTAGTA...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,,,,,,ASSSIEWEGPGDEQF,15.0,TRBV27*01,TRBJ2-1*01,True
4,19,CASSWDGGLASNQPQHF+TCRBV07-09+TCRBJ01-05,ACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCTGGG...,eQD137,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,,DRB4*01:01:01,,,,ASSWDGGLASNQPQH,15.0,TRBV7-9*01,TRBJ1-5*01,True


In [13]:
# Filter to TRB sequencing data by looking at V gene name.
# na=False: a row without a V gene call is treated as "not TRB" instead of
# putting NaN into the boolean mask (which makes the indexing raise).
# .copy(): take an independent frame so the column assignment below does not
# write to a slice of the merged table (avoids SettingWithCopyWarning).
df = df[df["v_segment"].str.startswith("TRBV", na=False)].copy()

# set isotype flag
df["extracted_isotype"] = "TCRB"

df.shape

(43592, 47)

In [14]:
# Tally productive vs. non-productive IgBlast calls (value_counts skips missing values by default).
df["productive"].value_counts()

True     41977
False     1614
Name: productive, dtype: int64

In [15]:
# Count rows that have no "productive" call at all.
# productive field is missing for one sequence, no big deal
df["productive"].isna().value_counts()

False    43591
True         1
Name: productive, dtype: int64

In [16]:
# Drop rows that have no "productive" call.
# Reassign instead of using inplace=True so the cell never mutates a frame that
# may be a flagged slice; a list-valued `subset` also works on older pandas.
df = df.dropna(subset=["productive"])
df.shape

(43591, 47)

In [17]:
# Keep only the sequences our IgBlast run called productive.
is_productive = df["productive"]
df = df.loc[is_productive].copy()
df.shape

(41977, 47)

In [18]:
# Derive the standard malid columns for TCR data.
# Among other things, v_segment / j_segment (with alleles) are converted into
# v_gene / j_gene columns (no alleles).
# NOTE(review): the recorded run shows fewer rows after this call, so it
# apparently also filters rows — confirm in etl._compute_columns.
df = etl._compute_columns(gene_locus=GeneLocus.TCR, df=df)
df.shape

2022-12-28 17:34:47,843 - malid.etl - INFO - Number of rows missing a CDR1, by v_segment: {'TRBV7-5*01': 1}


(41633, 55)

In [19]:
# Deprecated logic for working with Adaptive's own V/J gene calls:

# # Filter to TRB
# df = df[df["v_gene"].str.startswith("TCRBV")]

# # Change from Adaptive V/J gene nomenclature to IMGT
# # See https://tcrdist3.readthedocs.io/en/latest/adaptive.html
# df["v_gene"] = df["v_gene"].apply(lambda vgene: adaptive_to_imgt["human"].get(vgene))
# df["j_gene"] = df["j_gene"].apply(lambda vgene: adaptive_to_imgt["human"].get(vgene))

# # Remove alleles
# df["v_gene"] = df["v_gene"].str.split("*").str[0]
# df["j_gene"] = df["j_gene"].str.split("*").str[0]

# # Drop N/A and duplicates
# df = df.dropna(subset=["v_gene", "j_gene", "cdr3_seq_aa_q_trim"]).drop_duplicates(
#     subset=["v_gene", "j_gene", "cdr3_seq_aa_q_trim"]
# )

# # Make categorical
# df["v_gene"] = df["v_gene"].astype("category")
# df["j_gene"] = df["j_gene"].astype("category")

In [20]:
# Downselect only to V genes that are in our data
# We will never get matches on the rest. Clustering will always fail.
# Look up the observed TCR V genes once and reuse them for both the report and the filter.
observed_v_genes = set(helpers.all_observed_v_genes()[GeneLocus.TCR])
invalid_v_genes = set(df["v_gene"].unique()) - observed_v_genes
# Only warn when something is actually going to be dropped.
if invalid_v_genes:
    logger.warning(f"Dropping MIRA V genes that aren't in our data: {invalid_v_genes}")
df = df[df["v_gene"].isin(observed_v_genes)]



# Export

In [21]:
# Collapse rows sharing the same V gene, J gene and trimmed CDR3, keeping the
# first occurrence; print the shape before and after to show how many were removed.
dedup_keys = ["v_gene", "j_gene", "cdr3_seq_aa_q_trim"]
print(df.shape)
is_repeat = df.duplicated(subset=dedup_keys)
df = df[~is_repeat]
print(df.shape)

(41311, 55)
(37591, 55)


In [22]:
# Inspect the deduplicated table (pandas truncates long/wide frames when rendering).
df

Unnamed: 0,rownum,TCR BioIdentity,TCR Nucleotide Sequence,Experiment,ORF Coverage,Amino Acids,Start Index in Genome,End Index in Genome,source,ORF,...,productive,extracted_isotype,isotype_supergroup,v_gene,j_gene,cdr1_seq_aa_q,cdr2_seq_aa_q,cdr1_seq_aa_q_trim,cdr2_seq_aa_q_trim,v_mut
0,5,CASSLLGWEQLDEQFF+TCRBV27-01+TCRBJ02-01,TCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTT...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,True,TCRB,TCRB,TRBV27,TRBJ2-1,MNHEY,SMNVEV,MNHEY,SMNVEV,0.0
1,9,CASSAGQGASDEQFF+TCRBV07-09+TCRBJ02-01,CAGCGCACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCA...,eMR15,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,True,TCRB,TCRB,TRBV7-9,TRBJ2-1,SEHNR,FQNEAQ,SEHNR,FQNEAQ,0.0
2,13,CASSPLEWEGPTEAFF+TCRBV27-01+TCRBJ01-01,TCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTC...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,True,TCRB,TCRB,TRBV27,TRBJ1-1,MNHEY,SMNVEV,MNHEY,SMNVEV,0.0
3,17,CASSSIEWEGPGDEQFF+TCRBV27-01+TCRBJ02-01,CCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCAGCAGTAGTA...,eMR16,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,True,TCRB,TCRB,TRBV27,TRBJ2-1,MNHEY,SMNVEV,MNHEY,SMNVEV,0.0
4,19,CASSWDGGLASNQPQHF+TCRBV07-09+TCRBJ01-05,ACAGAGCAGGGGGACTCGGCCATGTATCTCTGTGCCAGCAGCTGGG...,eQD137,"ORF1ab,surface glycoprotein","ADAGFIKQY,AELEGIQY,LADAGFIKQY,TLADAGFIK",533,24073,peptide-cI,,...,True,TCRB,TCRB,TRBV7-9,TRBJ1-5,SEHNR,FQNEAQ,SEHNR,FQNEAQ,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43587,6801,CASSLVGQTQPQHF+TCRBV27-01+TCRBJ01-05,CTGGAGTCGCCCAGCCCCAACCAGACCTCTCTGTACTTCTGTGCCA...,eAV100,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,True,TCRB,TCRB,TRBV27,TRBJ1-5,MNHEY,SMNVEV,MNHEY,SMNVEV,0.0
43588,6802,CASSPWFERGKIRRNEQFF+TCRBV05-04+TCRBJ02-01,CTGGACGACTCGGCCCTGTATCTCTGTGCCAGCAGCCCCTGGTTCG...,eNL192,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,True,TCRB,TCRB,TRBV5-4,TRBJ2-1,SGHNT,YYREEE,SGHNT,YYREEE,0.0
43589,6803,CASSRRDPSTDTQYF+TCRBV06-02+TCRBJ02-03,NTGTCGGCTGCTCCCTCCCAAACATCTGTGTACTTCTGTGCCAGCA...,eQD139,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,True,TCRB,TCRB,TRBV6-2,TRBJ2-3,MNHEY,SVGEGT,MNHEY,SVGEGT,0.0
43590,6804,CASSPRGGGPPKEQYF+TCRBV12-X+TCRBJ02-07,CCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGCAGTC...,eQD139,nucleocapsid phosphoprotein,"DETQALPQRQKKQQTVTLL,KKDKKKKADETQALPQRQK,KTFPPT...",29354,29530,peptide-cII,,...,True,TCRB,TCRB,TRBV12-3,TRBJ2-7,SGHNS,FNNNVP,SGHNS,FNNNVP,0.0


In [23]:
# List the columns present in the final table.
df.columns

Index(['rownum', 'TCR BioIdentity', 'TCR Nucleotide Sequence', 'Experiment',
       'ORF Coverage', 'Amino Acids', 'Start Index in Genome',
       'End Index in Genome', 'source', 'ORF', 'ORF Genebank ID', 'Amino Acid',
       'Subject', 'Cell Type', 'Target Type', 'Cohort', 'Age', 'Gender',
       'Race', 'HLA-A', 'HLA-A.1', 'HLA-B', 'HLA-B.1', 'HLA-C', 'HLA-C.1',
       'DPA1', 'DPA1.1', 'DPB1', 'DPB1.1', 'DQA1', 'DQA1.1', 'DQB1', 'DQB1.1',
       'DRB1', 'DRB1.1', 'DRB3', 'DRB3.1', 'DRB4', 'DRB4.1', 'DRB5', 'DRB5.1',
       'cdr3_seq_aa_q_trim', 'cdr3_aa_sequence_trim_len', 'v_segment',
       'j_segment', 'productive', 'extracted_isotype', 'isotype_supergroup',
       'v_gene', 'j_gene', 'cdr1_seq_aa_q', 'cdr2_seq_aa_q',
       'cdr1_seq_aa_q_trim', 'cdr2_seq_aa_q_trim', 'v_mut'],
      dtype='object')

In [24]:
# Write the filtered, deduplicated table as a tab-separated file next to the raw MIRA release.
output_fname = (
    config.paths.external_raw_data
    / "immunecode_all/mira/ImmuneCODE-MIRA-Release002.1"
    / "mira_combined.filtered.tsv"
)
df.to_csv(
    output_fname,
    sep="\t",
    index=False,  # documented boolean form; do not write the row index
)