In [1]:
import numpy as np
import pandas as pd
from malid import config, io, helpers
from malid.datamodels import healthy_label, GeneLocus, TargetObsColumnEnum

In [2]:
# Uses data from vgene_usage_stats.ipynb


def get_dirs(gene_locus: GeneLocus):
    """Return the two per-locus directories used by this notebook.

    Both paths end in a subdirectory named ``gene_locus.name``.

    Returns:
        Tuple ``(output_dir, highres_output_dir)``: the first lives under
        ``config.paths.model_interpretations_output_dir``; the second under
        ``config.paths.high_res_outputs_dir / "model_interpretations"``.
    """
    locus_name = gene_locus.name
    return (
        config.paths.model_interpretations_output_dir / locus_name,
        config.paths.high_res_outputs_dir / "model_interpretations" / locus_name,
    )


def import_v_gene_counts(gene_locus: GeneLocus):
    """Load per-specimen V gene counts for one locus, keeping test-fold rows only.

    Reads ``v_gene_counts_by_specimen.tsv.gz`` from the locus's high-res output
    directory and ``meaningful_v_genes.txt`` from its regular output directory
    (both directories come from ``get_dirs``).

    Returns:
        Tuple ``(counts, v_gene_cols, v_gene_cols_filtered)``:
        - ``counts``: DataFrame indexed by ``specimen_label``, with the
          ``fold_id`` / ``fold_label`` columns removed and missing values
          replaced by 0.
        - ``v_gene_cols``: all columns of ``counts`` except ``"disease"``.
        - ``v_gene_cols_filtered``: array of the ``"v_gene"`` values read from
          ``meaningful_v_genes.txt``.

    Raises:
        AssertionError: if a specimen appears more than once among the
            test-fold rows, or if a filtered V gene is not one of
            ``v_gene_cols``.
    """
    output_dir, highres_output_dir = get_dirs(gene_locus)

    all_folds = pd.read_csv(
        highres_output_dir / "v_gene_counts_by_specimen.tsv.gz", sep="\t"
    )

    # Keep test-fold rows only (which also excludes global fold -1).
    is_test_row = all_folds["fold_label"] == "test"
    test_rows = all_folds.loc[is_test_row]

    # After the fold filter, no specimen may appear twice.
    assert not test_rows["specimen_label"].duplicated().any()

    # Index by specimen, drop fold bookkeeping columns, and fill N/As with 0.
    counts = (
        test_rows.set_index("specimen_label")
        .drop(columns=["fold_id", "fold_label"])
        .fillna(0)
    )

    # Every remaining column other than "disease" is treated as a V gene column.
    remaining_cols = counts.columns
    v_gene_cols = remaining_cols[~remaining_cols.isin(["disease"])]

    # Filtered subset of V genes, produced previously.
    # TODO: switch to V genes from model1's choices?
    meaningful_v_genes = pd.read_csv(output_dir / "meaningful_v_genes.txt")
    v_gene_cols_filtered = meaningful_v_genes["v_gene"].values

    # Sanity check: every filtered V gene must be one of the count columns.
    unknown_v_genes = [
        vgene for vgene in v_gene_cols_filtered if vgene not in v_gene_cols
    ]
    assert not unknown_v_genes

    return counts, v_gene_cols, v_gene_cols_filtered

In [3]:
# Per-specimen totals, one Series per gene locus, keyed by locus name.
totals_by_locus = {}
for gene_locus in config.gene_loci_used:
    print(gene_locus)
    locus_counts, locus_v_gene_cols, _ = import_v_gene_counts(gene_locus=gene_locus)
    # Sum across all V gene columns to get one total per specimen.
    per_specimen_total = locus_counts[locus_v_gene_cols].sum(axis=1)
    totals_by_locus[gene_locus.name] = per_specimen_total.astype(int)

# One column per locus; a specimen absent from a locus gets 0 rather than NaN.
totals = pd.DataFrame(totals_by_locus).fillna(0).astype(int)
totals

GeneLocus.BCR
GeneLocus.TCR


Unnamed: 0_level_0,BCR,TCR
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1
M111-S001,67904,30156
M111-S002,65301,64634
M111-S003,56522,66207
M111-S004,57568,45237
M111-S005,50086,42019
...,...,...
M64-110,43981,69474
M64-111,14112,32660
M64-112,21480,40749
M64-113,125228,94335


In [4]:
# For each locus column: does any specimen have a total of 0 there?
# (have some BCR only specimens, as expected, so TCR can contain zeros)
(totals == 0).any(axis=0)

BCR    False
TCR     True
dtype: bool

In [5]:
# List the specimens whose total is 0 in at least one locus column
# (have some BCR only specimens, as expected)
totals.loc[(totals == 0).any(axis=1)]

Unnamed: 0_level_0,BCR,TCR
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1
M281redo-S001,6748,0
M281redo-S002,5283,0
M281redo-S003,14555,0
M281redo-S004,10394,0
M281redo-S005,10912,0
...,...,...
M404-S002,57194,0
M404-S005,56246,0
M404-S008,69789,0
M404-S011,51472,0


In [6]:
# Remember the locus column names of `totals`; later cells use `cols` to
# select just these count columns.
cols = totals.columns
cols

Index(['BCR', 'TCR'], dtype='object')

In [7]:
# Per-specimen grand total: sum across all columns of `totals`
total = totals.sum(axis=1)
total

specimen_label
M111-S001     98060
M111-S002    129935
M111-S003    122729
M111-S004    102805
M111-S005     92105
              ...  
M64-110      113455
M64-111       46772
M64-112       62229
M64-113      219563
M64-114      223398
Length: 480, dtype: int64

In [8]:
# Attach specimen metadata (disease, study, participant, training-set flag)
# to the per-locus totals.
#
# The left side is `totals[cols]` (the locus count columns only) so that this
# cell can be re-run safely: merging the already-merged frame again would
# produce suffixed duplicates such as disease_x / disease_y and break the
# cells below.
orig_shape = totals.shape
totals = pd.merge(
    totals[cols],
    helpers.get_all_specimen_info().set_index("specimen_label")[
        ["disease", "study_name", "participant_label", "in_training_set"]
    ],
    left_index=True,
    right_index=True,
    validate="1:1",
    how="inner",
)
# An inner join drops specimens with no metadata row; confirm none were lost.
assert totals.shape[0] == orig_shape[0]
totals

Unnamed: 0_level_0,BCR,TCR,disease,study_name,participant_label,in_training_set
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M111-S001,67904,30156,HIV,HIV,BFI-0003462,True
M111-S002,65301,64634,HIV,HIV,BFI-0003481,True
M111-S003,56522,66207,HIV,HIV,BFI-0003466,True
M111-S004,57568,45237,HIV,HIV,BFI-0003460,True
M111-S005,50086,42019,HIV,HIV,BFI-0003463,True
...,...,...,...,...,...,...
M64-110,43981,69474,Healthy/Background,Healthy-StanfordBloodCenter,BFI-0003159,True
M64-111,14112,32660,Healthy/Background,Healthy-StanfordBloodCenter,BFI-0003160,True
M64-112,21480,40749,Healthy/Background,Healthy-StanfordBloodCenter,BFI-0003161,True
M64-113,125228,94335,Healthy/Background,Healthy-StanfordBloodCenter,BFI-0003162,True


In [9]:
# Every specimen counted above must be flagged in_training_set in the metadata
assert totals["in_training_set"].all(), "sanity check"

In [10]:
# num clones: sum of each locus column within every (disease, study_name) group
totals.groupby(["disease", "study_name"], observed=True)[cols].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,BCR,TCR
disease,study_name,Unnamed: 2_level_1,Unnamed: 3_level_1
Covid19,Covid19-Seattle,276076,0
Covid19,Covid19-Stanford,403562,654000
Covid19,Covid19-buffycoat,256655,193568
HIV,HIV,2762764,3164681
Healthy/Background,HIV,832374,1472515
Healthy/Background,Healthy-StanfordBloodCenter,4740876,5803482
Healthy/Background,Lupus,365431,0
Healthy/Background,New Lupus Paxgene,117351,377830
Healthy/Background,New Lupus RNA,125576,107635
Healthy/Background,healthy_children,1134937,3834725


In [11]:
# num patients: number of distinct participant_label values
# within every (disease, study_name) group
totals.groupby(["disease", "study_name"], observed=True)[
    "participant_label"
].nunique().to_frame(name="number of individuals")

Unnamed: 0_level_0,Unnamed: 1_level_0,number of individuals
disease,study_name,Unnamed: 2_level_1
Covid19,Covid19-Seattle,5
Covid19,Covid19-Stanford,48
Covid19,Covid19-buffycoat,10
HIV,HIV,95
Healthy/Background,HIV,43
Healthy/Background,Healthy-StanfordBloodCenter,102
Healthy/Background,Lupus,23
Healthy/Background,New Lupus Paxgene,2
Healthy/Background,New Lupus RNA,4
Healthy/Background,healthy_children,43


In [12]:
# num specimens: number of rows of `totals` (one row per specimen_label)
# within every (disease, study_name) group
totals.groupby(["disease", "study_name"], observed=True).size().to_frame(
    name="number of specimens"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,number of specimens
disease,study_name,Unnamed: 2_level_1
Covid19,Covid19-Seattle,5
Covid19,Covid19-Stanford,48
Covid19,Covid19-buffycoat,10
HIV,HIV,98
Healthy/Background,HIV,43
Healthy/Background,Healthy-StanfordBloodCenter,102
Healthy/Background,Lupus,27
Healthy/Background,New Lupus Paxgene,2
Healthy/Background,New Lupus RNA,4
Healthy/Background,healthy_children,43


In [13]:
# make a table of all: one row per (disease, study_name) batch
batches = totals.groupby(["disease", "study_name"], observed=True)

# num patients
individuals_per_batch = (
    batches["participant_label"].nunique().to_frame(name="number of individuals")
)
# num specimens
specimens_per_batch = batches.size().to_frame(name="number of specimens")
# num clones
clones_per_batch = batches[cols].sum()

# Place the three summaries side by side, aligned on the group index.
df_all = pd.concat(
    [individuals_per_batch, specimens_per_batch, clones_per_batch],
    axis=1,
)
df_all.to_csv(config.paths.output_dir / "size_of_each_disease_batch.tsv", sep="\t")
df_all

Unnamed: 0_level_0,Unnamed: 1_level_0,number of individuals,number of specimens,BCR,TCR
disease,study_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Covid19,Covid19-Seattle,5,5,276076,0
Covid19,Covid19-Stanford,48,48,403562,654000
Covid19,Covid19-buffycoat,10,10,256655,193568
HIV,HIV,95,98,2762764,3164681
Healthy/Background,HIV,43,43,832374,1472515
Healthy/Background,Healthy-StanfordBloodCenter,102,102,4740876,5803482
Healthy/Background,Lupus,23,27,365431,0
Healthy/Background,New Lupus Paxgene,2,2,117351,377830
Healthy/Background,New Lupus RNA,4,4,125576,107635
Healthy/Background,healthy_children,43,43,1134937,3834725


In [14]:
### SANITY CHECKS

In [15]:
def import_cdr3_length_counts(gene_locus: GeneLocus):
    """Load per-specimen CDR3 length counts for one locus, keeping test-fold rows only.

    Reads ``cdr3_length_counts_by_specimen.tsv.gz`` from the locus's high-res
    output directory (located via ``get_dirs``).

    Returns:
        Tuple ``(counts, cdr3_length_cols)``:
        - ``counts``: DataFrame indexed by ``specimen_label``, with the
          ``fold_id`` / ``fold_label`` columns removed, all-N/A columns
          dropped, remaining N/As replaced by 0, length column labels
          converted with ``int()``, and an all-zero column added for every
          length between the smallest and largest label that had no column.
        - ``cdr3_length_cols``: all columns of ``counts`` except ``"disease"``.

    Raises:
        AssertionError: if a specimen appears more than once among the
            test-fold rows.
    """

    def length_columns(frame):
        # Every column label of `frame` other than "disease".
        labels = frame.columns
        return labels[~labels.isin(["disease"])]

    _, highres_output_dir = get_dirs(gene_locus)

    all_folds = pd.read_csv(
        highres_output_dir / "cdr3_length_counts_by_specimen.tsv.gz", sep="\t"
    )

    # Keep test-fold rows only (which also excludes global fold -1).
    test_rows = all_folds.loc[all_folds["fold_label"] == "test"]

    # After the fold filter, no specimen may appear twice.
    assert not test_rows["specimen_label"].duplicated().any()

    # Index by specimen, drop fold bookkeeping columns, drop columns that are
    # entirely N/A, then fill the remaining N/As with 0.
    counts = (
        test_rows.set_index("specimen_label")
        .drop(columns=["fold_id", "fold_label"])
        .dropna(axis=1, how="all")
        .fillna(0)
    )

    # Convert the length column labels to ints.
    counts = counts.rename(
        columns={label: int(label) for label in length_columns(counts)}
    )

    # Fill in skipped lengths as all-0 columns. The upper bound is exclusive,
    # which is fine because the maximum label already has a column.
    observed_lengths = length_columns(counts)
    for cdr3_len in np.arange(min(observed_lengths), max(observed_lengths)):
        if cdr3_len not in observed_lengths:
            counts[cdr3_len] = 0.0

    # Re-read the column list so it includes any columns added above.
    return counts, length_columns(counts)

In [16]:
# BCR: per-specimen total summed over all V gene count columns.
# Note this reassigns `total`, which earlier held the cross-locus total.
df, v_gene_cols, _ = import_v_gene_counts(gene_locus=GeneLocus.BCR)
total = df[v_gene_cols].sum(axis=1).astype(int)
total

specimen_label
M111-S003     56522
M111-S007     35322
M111-S009     43108
M111-S011     28770
M111-S016     33151
              ...  
M64-100       71108
M64-104       28736
M64-111       14112
M64-113      125228
M64-114      120576
Length: 480, dtype: int64

In [17]:
# BCR: the same per-specimen total, this time summed over the CDR3 length
# count columns.
cdr3_length_counts_df, cdr3_length_cols = import_cdr3_length_counts(
    gene_locus=GeneLocus.BCR
)
total2 = cdr3_length_counts_df[cdr3_length_cols].sum(axis=1).astype(int)
total2

specimen_label
M111-S003     56522
M111-S007     35322
M111-S009     43108
M111-S011     28770
M111-S016     33151
              ...  
M64-100       71108
M64-104       28736
M64-111       14112
M64-113      125228
M64-114      120576
Length: 480, dtype: int64

In [18]:
# The V gene totals and the CDR3 length totals must agree for every specimen.
# Comparing two Series with == requires identical labels in identical order,
# so first check both cover the same specimens, then align total2 to total's
# index so a mere difference in row order cannot make this cell error out.
assert set(total.index) == set(total2.index)
assert (total == total2.loc[total.index]).all()

In [19]:
# Per-specimen isotype counts, used as a third way of computing the total.
isotype_counts_path = (
    config.paths.dataset_specific_metadata / "isotype_counts_by_specimen.tsv"
)
isotype_counts_all_folds = pd.read_csv(isotype_counts_path, sep="\t")

# Keep test-fold rows only, which must leave each specimen at most once.
isotype_counts_test_only = isotype_counts_all_folds.loc[
    isotype_counts_all_folds["fold_label"] == "test"
]
assert not isotype_counts_test_only["specimen_label"].duplicated().any()

# Index by specimen and keep just the three isotype count columns.
isotype_cols = ["IGHD-M", "IGHA", "IGHG"]
specimen_isotype_counts_df = isotype_counts_test_only.set_index("specimen_label")[
    isotype_cols
]

# Per-specimen total across the isotype columns.
total3 = specimen_isotype_counts_df.sum(axis=1)
total3

specimen_label
M111-S003     56522
M111-S007     35322
M111-S009     43108
M111-S011     28770
M111-S016     33151
              ...  
M64-100       71108
M64-104       28736
M64-111       14112
M64-113      125228
M64-114      120576
Length: 480, dtype: int64

In [20]:
# Specimens present in only one of `total` / `total3`;
# an empty set means both cover exactly the same specimens.
set(total.index).symmetric_difference(set(total3.index))

set()

In [21]:
# Isotype totals must match the V gene totals for every specimen;
# total3 is reordered to total's index before the element-wise comparison.
assert (total == total3.loc[total.index]).all()

In [22]:
# TCR: per-specimen total summed over all V gene count columns
# (reassigns `df`, `v_gene_cols` and `total` from the BCR checks above).
df, v_gene_cols, _ = import_v_gene_counts(gene_locus=GeneLocus.TCR)
total = df[v_gene_cols].sum(axis=1)
total

specimen_label
M111-S003     66207.0
M111-S007     28364.0
M111-S009     94863.0
M111-S011     49706.0
M111-S016     32850.0
               ...   
M64-100       42102.0
M64-104       46856.0
M64-111       32660.0
M64-113       94335.0
M64-114      102822.0
Length: 414, dtype: float64

In [23]:
# TCR: the same per-specimen total, summed over the CDR3 length count columns.
# NOTE: despite its name, `v_gene_cols` here holds the CDR3 length columns
# returned by import_cdr3_length_counts, not V gene columns.
df, v_gene_cols = import_cdr3_length_counts(gene_locus=GeneLocus.TCR)
total2 = df[v_gene_cols].sum(axis=1)
total2

specimen_label
M111-S003     66207.0
M111-S007     28364.0
M111-S009     94863.0
M111-S011     49706.0
M111-S016     32850.0
               ...   
M64-100       42102.0
M64-104       46856.0
M64-111       32660.0
M64-113       94335.0
M64-114      102822.0
Length: 414, dtype: float64

In [24]:
# The TCR V gene totals and CDR3 length totals must agree for every specimen.
# Comparing two Series with == requires identical labels in identical order,
# so first check both cover the same specimens, then align total2 to total's
# index so a mere difference in row order cannot make this cell error out.
assert set(total.index) == set(total2.index)
assert (total == total2.loc[total.index]).all()