In [1]:
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import os
from joblib import Parallel, delayed

In [2]:
from malid import config
from malid.datamodels import GeneLocus, healthy_label

## Instructions for adding more

**Merging cell type subsets from the same sample:** `specimen_label` should be consistent across entries with different `replicate_label`'s.

**If we have external cohorts that are BCR+TCR:**

We should have one row per locus per replicate-of-a-specimen. Often specimens are single-replicate, so we can think of this as: one row per locus per specimen.

Set a `specimen_label_by_locus` column that is the globally-unique specimen label tailored to a particular locus, e.g. `$SPECIMENLABEL-IGH` or `$SPECIMENLABEL-TRB` format. (TODO: update this to be replicate_label_by_locus?)

And set a `specimen_label` column that is equivalent across different-loci rows for that specimen.

The row's `gene_locus` column should be set to the locus of that row (must be the name of a valid `GeneLocus` enum value), and the `fname` column should be set to the path to the file containing the data for that locus.

_NOTE: The above is not so relevant for Adaptive -- we see below that this data is almost entirely TCR, and we will just ignore the BCR._

## Adaptive Covid TCR specimens

In [3]:
# ImmuneCode Covid-19 TCR specimen metadata, generated by covid_tcr_immunecode_metadata.ipynb.
# That notebook already filtered to peak disease samples.
immunecode_specimens_path = (
    config.paths.metadata_dir
    / "adaptive"
    / "generated.immunecode_covid_tcr.specimens.tsv"
)
adaptive_tcr_covid_specimens = pd.read_csv(immunecode_specimens_path, sep="\t")

# Tag every row with its study and species, and use the shorter "ethnicity" column name.
adaptive_tcr_covid_specimens = adaptive_tcr_covid_specimens.assign(
    study="immunecode", species="Human"
).rename(columns={"ethnicity_condensed": "ethnicity"})

# sample_name is simply a copy of specimen_label for this cohort.
adaptive_tcr_covid_specimens = adaptive_tcr_covid_specimens.assign(
    sample_name=adaptive_tcr_covid_specimens["specimen_label"]
)

adaptive_tcr_covid_specimens

Unnamed: 0,participant_label,specimen_label,disease,disease_subtype,age,sex,ethnicity,sequencing_type,locus,study,species,sample_name
0,ImmuneCode-190921,860011232_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,53,F,Hispanic/Latino,gDNA,TCRB,immunecode,Human,860011232_TCRB
1,ImmuneCode-026,INCOV026-AC-3_TCRB,Covid19,COVID-19-ISB,33,M,Hispanic/Latino,gDNA,TCRB,immunecode,Human,INCOV026-AC-3_TCRB
2,ImmuneCode-321977,860011116_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,52,F,Caucasian,gDNA,TCRB,immunecode,Human,860011116_TCRB
3,ImmuneCode-087,INCOV087-BL-3_TCRB,Covid19,COVID-19-ISB,56,M,Hispanic/Latino,gDNA,TCRB,immunecode,Human,INCOV087-BL-3_TCRB
4,ImmuneCode-0000051,BS-EQ-0014-T2-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,55,M,Caucasian,gDNA,TCRB,immunecode,Human,BS-EQ-0014-T2-replacement_TCRB
...,...,...,...,...,...,...,...,...,...,...,...,...
83,ImmuneCode-304752,860011117_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized - ICU,60,M,Caucasian,gDNA,TCRB,immunecode,Human,860011117_TCRB
84,ImmuneCode-775827,860011106_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,57,F,Caucasian,gDNA,TCRB,immunecode,Human,860011106_TCRB
85,ImmuneCode-0000446,BS-GIGI_10-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,75,M,Caucasian,gDNA,TCRB,immunecode,Human,BS-GIGI_10-replacement_TCRB
86,ImmuneCode-0000160,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized - ICU,72,M,Caucasian,gDNA,TCRB,immunecode,Human,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB


# All others

First a general survey:

In [4]:
# Collects one metadata frame per study; concatenated into a single frame at the end of this cell.
dfs_adaptive = []

# Columns kept by _load_metadata_df when filter_cols=True.
# Columns that a given study's file does not have are skipped.
cols_desired = [
    "sample_name",
    "locus",
    "sample_tags",
    "counting_method",
    "primer_set",
    "species",
    "study",
]


def _load_metadata_df(study: str, filter_cols: bool = True) -> pd.DataFrame:
    """Load the Adaptive metadata table for one study.

    Reads ``<config.paths.metadata_dir>/adaptive/<study>.tsv`` as a
    tab-separated file and sets a ``study`` column to ``study`` on every row.

    Parameters
    ----------
    study : str
        Study name; also the stem of the TSV file to read.
    filter_cols : bool, default True
        If True, keep only those columns of the module-level ``cols_desired``
        that are present in the file, in ``cols_desired`` order. If False,
        return every column.

    Returns
    -------
    pd.DataFrame
        The loaded metadata. When ``filter_cols`` is True this is an explicit
        copy, so callers can add columns to it safely.
    """
    df = pd.read_csv(
        config.paths.metadata_dir / "adaptive" / f"{study}.tsv", sep="\t"
    ).assign(study=study)
    if filter_cols:
        # Filter down to any matching cols
        available_cols = [c for c in cols_desired if c in df.columns]
        # .copy(): later cells assign new columns onto the returned frame; without it,
        # pandas treats the column subset as a slice and emits SettingWithCopyWarning.
        return df[available_cols].copy()
    return df


# Studies to include in the general survey (one metadata TSV per study).
survey_studies = (
    "emerson-2017-natgen",
    "mitchell-2022-jcii",
    "mustjoki-2017-natcomms",
    "ramesh-2015-ci",
    "TCRBv4-control",
    "towlerton-2022-hiv",
)
for study in survey_studies:
    dfs_adaptive.append(_load_metadata_df(study))

# Stack all studies into one frame with a fresh 0..N-1 index.
dfs_adaptive = pd.concat(dfs_adaptive, axis=0, ignore_index=True)
dfs_adaptive

Unnamed: 0,sample_name,locus,sample_tags,study,counting_method,primer_set,species
0,HIP15860,TCRB,"Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B...",emerson-2017-natgen,,,
1,HIP14363,TCRB,"Cohort 01, HLA-A*02, HLA-A*11, HLA-B*35, Human...",emerson-2017-natgen,,,
2,HIP14178,TCRB,"Cohort 01, HLA-A*01, HLA-A*03, HLA-B*08, HLA-B...",emerson-2017-natgen,,,
3,HIP13944,TCRB,"Cohort 01, HLA-A*02, HLA-A*03, HLA-B*18, HLA-B...",emerson-2017-natgen,,,
4,HIP13911,TCRB,"22 Years, Caucasian, Cohort 01, Cytomegaloviru...",emerson-2017-natgen,,,
...,...,...,...,...,...,...,...
1645,015V09001205_CFAR,TCRB,"-0.539726027 years_rel_to_art, 41.89589041 age...",towlerton-2022-hiv,v3,Human-TCRB-PD1x,Human
1646,015V09003735_CFAR,TCRB,"0.073972603 years_rel_to_art, 41.89589041 age_...",towlerton-2022-hiv,v3,Human-TCRB-PD1x,Human
1647,015V10006273_CFAR,TCRB,"1.2 years_rel_to_art, 41.89589041 age_at_min_e...",towlerton-2022-hiv,v3,Human-TCRB-PD1x,Human
1648,015V12002996_CFAR,TCRB,"2.950819672 years_rel_to_art, 41.89589041 age_...",towlerton-2022-hiv,v3,Human-TCRB-PD1x,Human


In [5]:
dfs_adaptive["species"].value_counts()

Human    864
Name: species, dtype: int64

In [6]:
dfs_adaptive.groupby(["locus", "primer_set", "counting_method"]).size()

locus  primer_set        counting_method
TCRB   Human-TCRB-PD1x   v2                  36
                         v3                 250
       Human-TCRB-PD4bx  v4                 354
       Human-TCRB-PD4x   v4                 163
       Human-TCRB-none   v1                  61
dtype: int64

In [7]:
# One row per (sample, tag): split the comma-separated tag string and explode it.
# The exploded rows keep the index of the sample they came from.
exploded_tags = dfs_adaptive["sample_tags"].str.split(",").explode()

# Link each tag back to its study through the shared sample index.
sample_tags = pd.merge(
    exploded_tags,
    dfs_adaptive["study"],
    left_index=True,
    right_index=True,
    how="left",
    validate="m:1",
)

# Drop samples that had no tags, trim whitespace left over from the split,
# and keep each (tag, study) pair only once.
sample_tags = sample_tags.dropna(subset="sample_tags")
sample_tags = sample_tags.assign(
    sample_tags=sample_tags["sample_tags"].str.strip()
).drop_duplicates()
sample_tags

Unnamed: 0,sample_tags,study
0,Cohort 01,emerson-2017-natgen
0,HLA-A*01,emerson-2017-natgen
0,HLA-A*68,emerson-2017-natgen
0,HLA-B*14,emerson-2017-natgen
0,HLA-B*57,emerson-2017-natgen
...,...,...
1644,-0.873972603 years_rel_to_art,towlerton-2022-hiv
1645,-0.539726027 years_rel_to_art,towlerton-2022-hiv
1646,0.073972603 years_rel_to_art,towlerton-2022-hiv
1647,1.2 years_rel_to_art,towlerton-2022-hiv


In [8]:
# Print the distinct tags seen in each study, one study per paragraph.
# Series.items() is used instead of Series.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
tags_by_study = sample_tags.groupby("study")["sample_tags"].unique()
for k, v in tags_by_study.items():
    print(k)
    print(", ".join(v))
    print("")



TCRBv4-control
67 Years, Blood, Cohort 3, Healthy, Hispanic, Inactive, Male, Non-smoker, 60 Years, Caucasian, Female, 40 Years, Smoker, 29 Years, Black or African American, 61 Years, Head and Neck Cancer, Never Smoked, PBMC, 38 Years, Unknown Smoking Status, 68 Years, Ex-smoker, Lung Cancer, 36 Years, 27 Years, 52 Years, 45 Years, 55 Years, 69 Years, Cigar Smoker, 66 Years, 32 Years, 43 Years, 56 Years, 30 Years, 59 Years, Asian or Pacific Islander, 54 Years, 84 Years, 20 Years, 24 Years, 64 Years, 46 Years, 72 Years, 80 Years, 73 Years, 75 Years, Colorectal Cancer, 50 Years, FFPE, Melanoma, 35 Years, 48 Years, Native American or Alaska Native, 31 Years, 57 Years, 65 Years, 63 Years, 33 Years, 25 Years, 23 Years, 37 Years, 34 Years, 21 Years, 51 Years, 49 Years, 26 Years, Smokeless Tobacco User, 58 Years, 47 Years, 62 Years, 74 Years, 78 Years, 53 Years, 42 Years, 81 Years

emerson-2017-natgen
Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B*57, Human, Inferred HLA-A*01, Inferred HLA-A*3

## Let's annotate each study manually, one by one.

In [9]:
dfs_adaptive_manual = []

In [10]:
"""
Emerson healthy TCR specimens, CMV+ and CMV-:

ImmuneAccess says not to run "Export Sample", and instead use their dedicated zip file link. That sounds fine, and we could use ImmuneAccess's metadata from the Sample Overview screen as usual...

except ImmuneAccess has somehow lost the "HIP"-prefixed labels for cohort 1, which are necessary to match up to CMV positivity metadata from SI Table 1. Instead ImmuneAccess now labels cohort 1 samples from 1 to N, basically.

But we found an older zip file online that has the correct "HIP-" labels: https://s3-us-west-2.amazonaws.com/publishedproject-supplements/emerson-2017-natgen/emerson-2017-natgen.zip

And someone kindly exported the Sample Overview ImmuneAccess metadata a while back and uploaded it here: https://github.com/jostmey/dkm/blob/7839937413af23203a442d5291e311ccb034dce7/repertoire-classification-problem/dataset/downloads/SampleOverview_08-19-2017_12-59-19_PM.tsv
This is saved in our repo as emerson-2017-natgen.tsv.

From brief inspection, all that has changed is the sample labels are now mangled in ImmuneAccess. And the order is different. Bummer.

We're going to use the older exports, but merge in some important fields from the newer metadata export.

One more important note: We have split the source files into two study names corresponding to the Emerson training and validation cohorts.
The metadata files here are under the master study name, but the source files are under the two split study names.
"HIP" samples: emerson-2017-natgen_train
"Keck" samples: emerson-2017-natgen_validation
"""

df = _load_metadata_df("emerson-2017-natgen", filter_cols=False)
df.sort_values("sample_name")

Unnamed: 0,sample_name,total_templates,total_reads,productive_templates,total_productive_reads,fraction_productive,total_rearrangements,productive_rearrangements,productive_clonality,max_productive_frequency,locus,sample_tags,study
315,HIP00110,224859,4059338,179411,3238889,0.7979,130940,104850,0.1007,1.371890,TCRB,"55 Years, Cohort 01, Cytomegalovirus -, HLA-A*...",emerson-2017-natgen
239,HIP00169,164698,1338914,138676,1127369,0.8420,115919,96869,0.0690,2.641903,TCRB,"41 Years, Cohort 01, Cytomegalovirus -, HLA-A*...",emerson-2017-natgen
206,HIP00594,356131,2918797,289407,2371942,0.8126,198830,161024,0.1787,2.311060,TCRB,"21 Years, Caucasian, Cohort 01, Cytomegaloviru...",emerson-2017-natgen
542,HIP00602,427705,9588747,335488,7521347,0.7844,248959,197856,0.1145,0.673071,TCRB,"45 Years, Cohort 01, Cytomegalovirus -, Female...",emerson-2017-natgen
773,HIP00614,na,1937837,na,1585816,0.8183,128274,105597,0.0944,1.085183,TCRB,"27 Years, Caucasian, Cohort 01, Cytomegaloviru...",emerson-2017-natgen
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,Keck0116_MC1,611470,na,494052,na,0.8080,438842,357423,0.0495,0.627464,TCRB,"19 Years, Asian or Pacific Islander, Cohort 02...",emerson-2017-natgen
54,Keck0117_MC1,638224,na,513475,na,0.8045,331272,272565,0.1364,5.619358,TCRB,"34 Years, Caucasian, Cohort 02, Cytomegaloviru...",emerson-2017-natgen
93,Keck0118_MC1,848716,na,660975,na,0.7788,397497,315296,0.1585,1.969061,TCRB,"28 Years, Caucasian, Cohort 02, Cytomegaloviru...",emerson-2017-natgen
94,Keck0119_MC1,590033,na,487792,na,0.8267,421166,353573,0.0425,1.139010,TCRB,"24 Years, Asian or Pacific Islander, Cohort 02...",emerson-2017-natgen


In [11]:
df["sample_tags"].str.split(",").explode().str.strip().value_counts().head(n=50)

Inferred CMV -                        796
gDNA                                  786
T cells                               786
Peripheral blood lymphocytes (PBL)    786
Human                                 786
PBMC                                  786
Cohort 01                             666
Inferred CMV +                        606
Non-Hispanic or Latino                529
Caucasian                             465
Cytomegalovirus -                     421
Male                                  392
Female                                370
Cytomegalovirus +                     340
HLA-A*02                              294
Inferred HLA-A*02                     282
Unknown racial group                  252
Unknown Ethnicity                     226
Inferred HLA-A*01                     195
HLA-A*01                              186
Inferred HLA-B*07                     159
Inferred HLA-B*44                     158
HLA-B*07                              156
Inferred HLA-A*03                 

In [12]:
# counting_method is not available in the older immuneaccess sampleoverview we found online (see note above)
# we'll use the wrong-labeled one to check this:
df_m_wrong_labels = _load_metadata_df(
    "emerson-2017-natgen.wrong_labels", filter_cols=False
)
df_m_wrong_labels["counting_method"].value_counts()

v2    590
v3    120
v1     76
Name: counting_method, dtype: int64

In [13]:
# same shape, different orders
df_m_wrong_labels.shape, df.shape

((786, 22), (786, 13))

In [14]:
cols_desired

['sample_name',
 'locus',
 'sample_tags',
 'counting_method',
 'primer_set',
 'species',
 'study']

In [15]:
df.shape

(786, 13)

In [16]:
# The newer export has mangled sample labels (see note above), so the two exports
# are linked by their per-sample read-count statistics instead of by sample name.
merge_keys = [
    "total_templates",
    "total_reads",
    "productive_templates",
    "total_productive_reads",
    "total_rearrangements",
    "productive_rearrangements",
]
n_samples_before_merge = df.shape[0]
df = pd.merge(
    df,
    # Only the fields missing from the older export are brought over.
    df_m_wrong_labels[merge_keys + ["counting_method", "primer_set", "species"]],
    on=merge_keys,
    how="inner",
    validate="1:1",
)
# validate="1:1" guards against duplicated keys, but an inner merge would still
# silently drop samples whose statistics do not match. Fail loudly instead.
assert (
    df.shape[0] == n_samples_before_merge
), "Some samples were lost when merging the two metadata exports"
df.shape

(786, 16)

In [17]:
# Keep only the standard metadata columns.
# Take an explicit copy: later cells add columns to df, and assigning onto a
# column subset triggers pandas' SettingWithCopyWarning.
df = df[cols_desired].copy()
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study
0,HIP15860,TCRB,"Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen
1,HIP14363,TCRB,"Cohort 01, HLA-A*02, HLA-A*11, HLA-B*35, Human...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen
2,HIP14178,TCRB,"Cohort 01, HLA-A*01, HLA-A*03, HLA-B*08, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen
3,HIP13944,TCRB,"Cohort 01, HLA-A*02, HLA-A*03, HLA-B*18, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen
4,HIP13911,TCRB,"22 Years, Caucasian, Cohort 01, Cytomegaloviru...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen
...,...,...,...,...,...,...,...
781,HIP01765,TCRB,"48 Years, Cohort 01, Cytomegalovirus +, Female...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen
782,HIP13760,TCRB,"Cohort 01, Cytomegalovirus -, HLA-A*11, HLA-A*...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen
783,HIP08821,TCRB,"55 Years, Caucasian, Cohort 01, Cytomegaloviru...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen
784,HIP05552,TCRB,"53 Years, Caucasian, Cohort 01, Cytomegaloviru...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen


In [18]:
# sanity check
df["counting_method"].value_counts()

v2    590
v3    120
v1     76
Name: counting_method, dtype: int64

In [19]:
sample_names = df["sample_name"]
df = df.assign(
    sequencing_type="gDNA",
    # The specimen label is the full sample name, e.g. Keck0116_MC1.
    specimen_label=sample_names,
    # The participant label is the part before the first underscore,
    # e.g. Keck0116_MC1 becomes Keck0116.
    participant_label=sample_names.str.split("_").str[0],
)

# Each participant should appear exactly once.
samples_per_participant = df["participant_label"].value_counts()
assert samples_per_participant.eq(1).all()
df["participant_label"].sort_values()

315    HIP00110
239    HIP00169
206    HIP00594
542    HIP00602
773    HIP00614
         ...   
92     Keck0116
54     Keck0117
93     Keck0118
94     Keck0119
55     Keck0120
Name: participant_label, Length: 786, dtype: object

In [20]:
# Load subject-level metadata from SI Table 1 of the Emerson paper.
# The workbook's first two sheets are read in a single call;
# read_excel returns a dict keyed by sheet position.
si_table_1_path = (
    config.paths.metadata_dir / "adaptive" / "emerson-2017-natgen.si_table_1.xlsx"
)
si_table_1_sheets = pd.read_excel(si_table_1_path, sheet_name=[0, 1])
df_m1 = si_table_1_sheets[0]
df_m2 = si_table_1_sheets[1]
df_m1

Unnamed: 0,Subject ID,Sex,Age,Race and ethnicity,Known CMV status,Inferred CMV status (all data),Inferred CMV status (cross validation),Known HLA alleles,Inferred HLA alleles (cross validation)
0,HIP00110,Male,54.557153,Unknown,-,-,-,"A24, A3, B7, B7","A24, A3, B7"
1,HIP00169,Male,40.681725,Unknown,-,-,-,"A1, A2, B37, B27","A2, B27, B7"
2,HIP00594,Male,20.632444,"White, Not Hispanic or Latino",+,+,+,"A2, A32, B7, B61","A2, A3, B44, B7"
3,HIP00602,Female,44.678987,Unknown,-,-,-,"A11, A2, B35, B51","A11, A2, B35, B51"
4,HIP00614,Male,26.688569,"White, Not Hispanic or Latino",-,-,-,"A3, A29, B7, B7","A3, B7"
...,...,...,...,...,...,...,...,...,...
661,HIP17887,Unknown,Unknown,Unknown,Unknown,ND,ND,"A2, A29, B7, B44","A29, A2, B7"
662,HIP19048,Female,57.404517,Unknown,-,-,-,"A1, A2, B57, B51","A1, A2"
663,HIP19089,Unknown,Unknown,Unknown,Unknown,ND,ND,"A1, A80, B13, B44","A1, A29, B44"
664,HIP19716,Unknown,Unknown,Unknown,Unknown,ND,ND,"A1, A1, B8, B8","A1, B8"


In [21]:
df_m2

Unnamed: 0,Subject ID,Sex,Age,Race and ethnicity,Known CMV status,Inferred CMV status (trained on Cohort 1),Unnamed: 6,Unnamed: 7,Unnamed: 9,.1,.2
0,Keck0001,Female,22,"White, Not Hispanic or Latino",+,-,,,,,
1,Keck0002,Female,21,"White, Not Hispanic or Latino",-,-,,,,,
2,Keck0003,Female,25,"American Indian or Alaska Native, Not Hispanic...",-,-,,,,,
3,Keck0004,Female,24,"White, Not Hispanic or Latino",+,+,,,,,
4,Keck0005,Male,24,"White, Not Hispanic or Latino",-,-,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
115,Keck0116,Female,19,"Asian, Not Hispanic or Latino",-,-,,,,,
116,Keck0117,Male,34,"White, Not Hispanic or Latino",+,+,,,,,
117,Keck0118,Male,28,"White, Not Hispanic or Latino",+,+,,,,,
118,Keck0119,Female,24,"Asian, Not Hispanic or Latino",+,+,,,,,


In [22]:
df_m = pd.concat([df_m1, df_m2], axis=0)
df_m

Unnamed: 0,Subject ID,Sex,Age,Race and ethnicity,Known CMV status,Inferred CMV status (all data),Inferred CMV status (cross validation),Known HLA alleles,Inferred HLA alleles (cross validation),Inferred CMV status (trained on Cohort 1),Unnamed: 6,Unnamed: 7,Unnamed: 13,.1,.2
0,HIP00110,Male,54.557153,Unknown,-,-,-,"A24, A3, B7, B7","A24, A3, B7",,,,,,
1,HIP00169,Male,40.681725,Unknown,-,-,-,"A1, A2, B37, B27","A2, B27, B7",,,,,,
2,HIP00594,Male,20.632444,"White, Not Hispanic or Latino",+,+,+,"A2, A32, B7, B61","A2, A3, B44, B7",,,,,,
3,HIP00602,Female,44.678987,Unknown,-,-,-,"A11, A2, B35, B51","A11, A2, B35, B51",,,,,,
4,HIP00614,Male,26.688569,"White, Not Hispanic or Latino",-,-,-,"A3, A29, B7, B7","A3, B7",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,Keck0116,Female,19,"Asian, Not Hispanic or Latino",-,,,,,-,,,,,
116,Keck0117,Male,34,"White, Not Hispanic or Latino",+,,,,,+,,,,,
117,Keck0118,Male,28,"White, Not Hispanic or Latino",+,,,,,+,,,,,
118,Keck0119,Female,24,"Asian, Not Hispanic or Latino",+,,,,,+,,,,,


In [23]:
df_m["Known CMV status"].value_counts()

-          421
+          340
Unknown     25
Name: Known CMV status, dtype: int64

In [24]:
# Translate the +/- CMV flags into explicit labels.
# Any other value (e.g. "Unknown") is not in the mapping and becomes NaN.
cmv_status_labels = {"+": "CMV+", "-": "CMV-"}
df_m = df_m.assign(cmv=df_m["Known CMV status"].map(cmv_status_labels))

# Standardize the column names.
# NOTE: the trailing space in "Race and ethnicity " is intentional; it matches the spreadsheet header.
df_m = df_m.rename(
    columns={
        "Subject ID": "participant_label",
        "Race and ethnicity ": "Ethnicity",
    }
)

# Keep only the fields we merge into the specimen metadata.
df_m = df_m[["participant_label", "Sex", "Age", "Ethnicity", "cmv"]]
df_m

Unnamed: 0,participant_label,Sex,Age,Ethnicity,cmv
0,HIP00110,Male,54.557153,Unknown,CMV-
1,HIP00169,Male,40.681725,Unknown,CMV-
2,HIP00594,Male,20.632444,"White, Not Hispanic or Latino",CMV+
3,HIP00602,Female,44.678987,Unknown,CMV-
4,HIP00614,Male,26.688569,"White, Not Hispanic or Latino",CMV-
...,...,...,...,...,...
115,Keck0116,Female,19,"Asian, Not Hispanic or Latino",CMV-
116,Keck0117,Male,34,"White, Not Hispanic or Latino",CMV+
117,Keck0118,Male,28,"White, Not Hispanic or Latino",CMV+
118,Keck0119,Female,24,"Asian, Not Hispanic or Latino",CMV+


In [25]:
# Attach the SI Table 1 demographics and CMV status to each specimen.
# Left join: specimens with no SI Table 1 entry are kept, with NaN in the new columns.
df = df.merge(df_m, on="participant_label", how="left", validate="1:1")
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,participant_label,Sex,Age,Ethnicity,cmv
0,HIP15860,TCRB,"Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP15860,HIP15860,Unknown,Unknown,Unknown,
1,HIP14363,TCRB,"Cohort 01, HLA-A*02, HLA-A*11, HLA-B*35, Human...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP14363,HIP14363,Unknown,Unknown,Unknown,
2,HIP14178,TCRB,"Cohort 01, HLA-A*01, HLA-A*03, HLA-B*08, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP14178,HIP14178,Unknown,Unknown,Unknown,
3,HIP13944,TCRB,"Cohort 01, HLA-A*02, HLA-A*03, HLA-B*18, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP13944,HIP13944,Unknown,Unknown,Unknown,
4,HIP13911,TCRB,"22 Years, Caucasian, Cohort 01, Cytomegaloviru...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP13911,HIP13911,Male,22.469541,"White, Not Hispanic or Latino",CMV+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,HIP01765,TCRB,"48 Years, Cohort 01, Cytomegalovirus +, Female...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP01765,HIP01765,Female,47.843943,Unknown,CMV+
782,HIP13760,TCRB,"Cohort 01, Cytomegalovirus -, HLA-A*11, HLA-A*...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP13760,HIP13760,Male,Unknown,Unknown,CMV-
783,HIP08821,TCRB,"55 Years, Caucasian, Cohort 01, Cytomegaloviru...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP08821,HIP08821,Female,54.631075,"White, Not Hispanic or Latino",CMV+
784,HIP05552,TCRB,"53 Years, Caucasian, Cohort 01, Cytomegaloviru...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen,gDNA,HIP05552,HIP05552,Female,52.835044,"White, Not Hispanic or Latino",CMV-


In [26]:
df["cmv"].isna().value_counts()

False    760
True      26
Name: cmv, dtype: int64

In [27]:
df["cmv"].value_counts()

CMV-    420
CMV+    340
Name: cmv, dtype: int64

In [28]:
df["disease"] = healthy_label
df["disease_subtype"] = f"{healthy_label} - " + df["cmv"].fillna("CMV Unknown")
df["disease_subtype"].value_counts()

Healthy/Background - CMV-           420
Healthy/Background - CMV+           340
Healthy/Background - CMV Unknown     26
Name: disease_subtype, dtype: int64

In [29]:
# The source files are split across two study names, matching the Emerson training
# and validation cohorts, while the metadata files use the single master study name:
#   "HIP" samples  -> emerson-2017-natgen_train
#   "Keck" samples -> emerson-2017-natgen_validation
# So the study name is rewritten here to match the source files.
is_validation_cohort = df["participant_label"].str.startswith("Keck")
cohort_suffix = is_validation_cohort.map({True: "validation", False: "train"})
df["study"] = df["study"] + "_" + cohort_suffix
df["study"].value_counts()

emerson-2017-natgen_train         666
emerson-2017-natgen_validation    120
Name: study, dtype: int64

In [30]:
dfs_adaptive_manual.append(df)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,participant_label,Sex,Age,Ethnicity,cmv,disease,disease_subtype
0,HIP15860,TCRB,"Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP15860,HIP15860,Unknown,Unknown,Unknown,,Healthy/Background,Healthy/Background - CMV Unknown
1,HIP14363,TCRB,"Cohort 01, HLA-A*02, HLA-A*11, HLA-B*35, Human...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP14363,HIP14363,Unknown,Unknown,Unknown,,Healthy/Background,Healthy/Background - CMV Unknown
2,HIP14178,TCRB,"Cohort 01, HLA-A*01, HLA-A*03, HLA-B*08, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP14178,HIP14178,Unknown,Unknown,Unknown,,Healthy/Background,Healthy/Background - CMV Unknown
3,HIP13944,TCRB,"Cohort 01, HLA-A*02, HLA-A*03, HLA-B*18, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP13944,HIP13944,Unknown,Unknown,Unknown,,Healthy/Background,Healthy/Background - CMV Unknown
4,HIP13911,TCRB,"22 Years, Caucasian, Cohort 01, Cytomegaloviru...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP13911,HIP13911,Male,22.469541,"White, Not Hispanic or Latino",CMV+,Healthy/Background,Healthy/Background - CMV+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,HIP01765,TCRB,"48 Years, Cohort 01, Cytomegalovirus +, Female...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP01765,HIP01765,Female,47.843943,Unknown,CMV+,Healthy/Background,Healthy/Background - CMV+
782,HIP13760,TCRB,"Cohort 01, Cytomegalovirus -, HLA-A*11, HLA-A*...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP13760,HIP13760,Male,Unknown,Unknown,CMV-,Healthy/Background,Healthy/Background - CMV-
783,HIP08821,TCRB,"55 Years, Caucasian, Cohort 01, Cytomegaloviru...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP08821,HIP08821,Female,54.631075,"White, Not Hispanic or Latino",CMV+,Healthy/Background,Healthy/Background - CMV+
784,HIP05552,TCRB,"53 Years, Caucasian, Cohort 01, Cytomegaloviru...",v1,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP05552,HIP05552,Female,52.835044,"White, Not Hispanic or Latino",CMV-,Healthy/Background,Healthy/Background - CMV-


In [31]:
df = _load_metadata_df("mitchell-2022-jcii")

"""
https://insight.jci.org/articles/view/161885
Childhood T1D:

143 new-onset T1D: "As an independent validation, we sequenced and analyzed TCRβ repertoires from a cohort of new-onset T1D patients (n=143)"

25 HHC (4 samples each)
29 children that progress to T1D (4 samples each)
"longitudinal peripheral blood DNA samples at four time points beginning early in life (median age of 1.4 years) from children that progressed to T1D (n=29) and age/sex-matched islet autoantibody negative controls (n=25)"
"""

df["sequencing_type"] = "gDNA"

df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
0,310114_TCRB,TCRB,"03 Years, 12.6575342465753 Years at diagnosis,...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA
1,310251_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA
2,310113_TCRB,TCRB,"03 Years, 11.5095890410959 Years at diagnosis,...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA
3,310205_TCRB,TCRB,"0 Years at visit, 09 Months, Caucasian, Contro...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA
4,310283_TCRB,TCRB,"08 Years, 12.6575342465753 Years at diagnosis,...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA
...,...,...,...,...,...,...,...,...
354,DenverT1D-170_TCRB,TCRB,"17 Years, 17 Years at diagnosis, 18-24 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA
355,DenverT1D-106_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA
356,DenverT1D-256_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA
357,DenverT1D-055_TCRB,TCRB,"02-12 (Childhood), 07 Years, 7 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA


In [32]:
df["specimen_label"] = df["sample_name"].str.split("_TCRB").str[0]
df["specimen_label"]

0             310114
1             310251
2             310113
3             310205
4             310283
           ...      
354    DenverT1D-170
355    DenverT1D-106
356    DenverT1D-256
357    DenverT1D-055
358    DenverT1D-271
Name: specimen_label, Length: 359, dtype: object

In [33]:
# new onset n=143 cohort is "Denver"
df["sample_name"].str.contains("Denver").value_counts()

False    216
True     143
Name: sample_name, dtype: int64

In [34]:
# The other 216 samples
29 * 4 + 25 * 4

216

In [35]:
df["sample_tags"].str.split(",").explode().str.strip().value_counts().head(n=20)

Caucasian                 354
Non-Hispanic or Latino    310
HLA-DQB1*0302             238
HLA-DQA1*0301             216
Male                      214
HLA-DPA1*0103             212
IA-2 0                    182
HLA-DQB1*0201             178
HLA-DRB1*0301             178
HLA-DQA1*0501             178
GAD65 0                   161
HLA-DRB1*0401             150
HLA-DPB1*0401             148
Female                    145
Type 1 Diabetes           143
Case                      116
ZnT8 NR                   109
HLA-A*0201                108
IAA 0                     108
Whole Blood               104
Name: sample_tags, dtype: int64

In [36]:
# Now: import subject IDs, age, sex, ancestry from https://insight.jci.org/articles/view/161885/sd/2
# Notice that the 29 children that progress to T1D don't necessarily have T1D diagnosis prior to the last sample
# So only include the new-onset T1D samples and the 25 healthy control children (with repeated samples)

In [37]:
control_samples = pd.read_excel(
    config.paths.metadata_dir / "adaptive" / "jci.insight.161885.sdd1.xlsx",
    sheet_name="DAISY Control",
)

# first row per person has Sex/Race/Ethnicity not null
not_null_demographics = control_samples["Sex"].notna()

# double check that all those rows have sample timepoint = 1
assert (control_samples.loc[not_null_demographics, "Sample Timepoint"] == 1).all()


# Assign 'participant_label' as the cumulative sum of 'not_null_demographics', starting from 1
control_samples[
    "participant_label"
] = "control_" + not_null_demographics.cumsum().astype(str)
control_samples["disease"] = healthy_label
control_samples["disease_subtype"] = f"{healthy_label} - T1D negative"

# Transfer demographics to the rest of the rows per person.
# Both fills run inside each participant's group, so a value can never leak from one
# participant into the next. A global ffill() after a grouped bfill() would copy the
# previous participant's value into someone whose first row is missing it.
# ffill() is the key part; bfill() is a fallback in case the populated row is not the
# first row of the group.
for col in ["Sex", "Race", "Ethnicity"]:
    control_samples[col] = control_samples.groupby("participant_label")[
        col
    ].transform(lambda values: values.ffill().bfill())

control_samples

Unnamed: 0,Adaptive ID,Age at Visit (years),Sample Timepoint,Sex,Race,Ethnicity,GAD65 (nl < 20 DK Units),IA-2 (nl < 5 DK Units),IAA (nl < 0.011)*,ZnT8 (nl < 0.021),...,PPI TCR 9,PPI TCR 10,PPI TCR 11,PPI TCR 12,PPI TCR 13,PPI TCR 14,PPI TCR 15,participant_label,disease,disease_subtype
0,310101,2.913073,1,Female,White,Non-Hispanic,0,5,0.006,NR,...,0,0,0,0,0,0,0,control_1,Healthy/Background,Healthy/Background - T1D negative
1,310103,3.961670,2,Female,White,Non-Hispanic,3,0,0.002,NR,...,0,0,0,0,0,0,0,control_1,Healthy/Background,Healthy/Background - T1D negative
2,310216,8.570000,3,Female,White,Non-Hispanic,0,0,0.000,NR,...,0,0,0,0,0,0,0,control_1,Healthy/Background,Healthy/Background - T1D negative
3,310146,12.720000,4,Female,White,Non-Hispanic,0,0,0.000,NR,...,0,0,0,0,0,0,0,control_1,Healthy/Background,Healthy/Background - T1D negative
4,310104,1.256673,1,Male,White,Non-Hispanic,0,0,0.009,NR,...,0,0,0,0,0,0,0,control_2,Healthy/Background,Healthy/Background - T1D negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,310163,10.570000,4,Male,White,Non-Hispanic,0,0,0.000,NR,...,0,1,0,0,0,0,0,control_24,Healthy/Background,Healthy/Background - T1D negative
96,310219,0.860000,1,Male,White,Non-Hispanic,0,0,0.000,NR,...,0,0,0,0,0,0,0,control_25,Healthy/Background,Healthy/Background - T1D negative
97,310172,9.875427,2,Male,White,Non-Hispanic,0,0,0.002,NR,...,0,0,0,0,0,0,0,control_25,Healthy/Background,Healthy/Background - T1D negative
98,310182,11.767282,3,Male,White,Non-Hispanic,0,3,0.004,NR,...,0,0,0,0,0,0,0,control_25,Healthy/Background,Healthy/Background - T1D negative


In [38]:
# Fields carried forward from the DAISY control sheet.
control_columns = [
    "Adaptive ID",
    "Age at Visit (years)",
    "Sample Timepoint",
    "Sex",
    "Race",
    "Ethnicity",
    "participant_label",
    "disease",
    "disease_subtype",
]
control_samples = (
    control_samples[control_columns]
    .rename(columns={"Age at Visit (years)": "Age"})
    # Cast the IDs to strings so they can be matched against string specimen labels.
    .astype({"Adaptive ID": str})
)
control_samples

Unnamed: 0,Adaptive ID,Age,Sample Timepoint,Sex,Race,Ethnicity,participant_label,disease,disease_subtype
0,310101,2.913073,1,Female,White,Non-Hispanic,control_1,Healthy/Background,Healthy/Background - T1D negative
1,310103,3.961670,2,Female,White,Non-Hispanic,control_1,Healthy/Background,Healthy/Background - T1D negative
2,310216,8.570000,3,Female,White,Non-Hispanic,control_1,Healthy/Background,Healthy/Background - T1D negative
3,310146,12.720000,4,Female,White,Non-Hispanic,control_1,Healthy/Background,Healthy/Background - T1D negative
4,310104,1.256673,1,Male,White,Non-Hispanic,control_2,Healthy/Background,Healthy/Background - T1D negative
...,...,...,...,...,...,...,...,...,...
95,310163,10.570000,4,Male,White,Non-Hispanic,control_24,Healthy/Background,Healthy/Background - T1D negative
96,310219,0.860000,1,Male,White,Non-Hispanic,control_25,Healthy/Background,Healthy/Background - T1D negative
97,310172,9.875427,2,Male,White,Non-Hispanic,control_25,Healthy/Background,Healthy/Background - T1D negative
98,310182,11.767282,3,Male,White,Non-Hispanic,control_25,Healthy/Background,Healthy/Background - T1D negative


In [39]:
# Attach the sequencing metadata in `df` to the control specimens. Controls are the
# rows of `df` whose sample_name does not contain "Denver".
is_denver_sample = df["sample_name"].str.contains("Denver")
control_samples = pd.merge(
    df[~is_denver_sample],
    control_samples,
    how="inner",
    left_on="specimen_label",
    right_on="Adaptive ID",
    # Each specimen must match at most one metadata row and vice versa.
    validate="1:1",
).drop(columns="Adaptive ID")
# The inner join silently drops unmatched rows, so enforce the expected size
# (25 control participants x 4 timepoints) rather than only noting it in a comment.
assert control_samples.shape[0] == 25 * 4, control_samples.shape
control_samples

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,Age,Sample Timepoint,Sex,Race,Ethnicity,participant_label,disease,disease_subtype
0,310251_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310251,6.661190,4,Female,White,Hispanic,control_16,Healthy/Background,Healthy/Background - T1D negative
1,310205_TCRB,TCRB,"0 Years at visit, 09 Months, Caucasian, Contro...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310205,0.829568,1,Female,White,Non-Hispanic,control_6,Healthy/Background,Healthy/Background - T1D negative
2,310168_TCRB,TCRB,"05 Years, 5 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310168,5.670088,3,Male,White,Non-Hispanic,control_13,Healthy/Background,Healthy/Background - T1D negative
3,310170_TCRB,TCRB,"08 Years, 8 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310170,8.407939,2,Male,White,Non-Hispanic,control_12,Healthy/Background,Healthy/Background - T1D negative
4,310148_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310148,6.269678,4,Male,White,Non-Hispanic,control_9,Healthy/Background,Healthy/Background - T1D negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,310137_TCRB,TCRB,"07 Years, 7 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310137,7.140314,3,Female,White,Non-Hispanic,control_14,Healthy/Background,Healthy/Background - T1D negative
96,310217_TCRB,TCRB,"03 Years, 3 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310217,3.300000,3,Male,White,Non-Hispanic,control_20,Healthy/Background,Healthy/Background - T1D negative
97,310188_TCRB,TCRB,"09 Years, 9 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310188,9.607118,4,Male,White,Non-Hispanic,control_22,Healthy/Background,Healthy/Background - T1D negative
98,310106_TCRB,TCRB,"01 Years, 1 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310106,1.212867,1,Male,White,Non-Hispanic,control_23,Healthy/Background,Healthy/Background - T1D negative


In [40]:
# Count merged specimens per control participant.
control_specimens_per_participant = control_samples["participant_label"].value_counts()
control_specimens_per_participant

control_16    4
control_7     4
control_14    4
control_23    4
control_24    4
control_2     4
control_25    4
control_4     4
control_10    4
control_22    4
control_5     4
control_15    4
control_1     4
control_6     4
control_20    4
control_18    4
control_17    4
control_11    4
control_19    4
control_3     4
control_21    4
control_9     4
control_12    4
control_13    4
control_8     4
Name: participant_label, dtype: int64

In [41]:
# Load the "New-onset T1D" sheet of the supplementary spreadsheet.
new_onset_xlsx = config.paths.metadata_dir / "adaptive" / "jci.insight.161885.sdd1.xlsx"
new_onset_samples = pd.read_excel(new_onset_xlsx, sheet_name="New-onset T1D")
new_onset_samples = new_onset_samples.assign(
    # Left zero-pad the ID to three digits, e.g. 9 -> "DenverT1D-009".
    participant_label="DenverT1D-"
    + new_onset_samples["Adaptive ID"].astype(str).str.zfill(3),
    disease="T1D",
    disease_subtype="T1D - new onset",
)
new_onset_samples

Unnamed: 0,Adaptive ID,Age at Diagnosis (years),Sex,Race,Ethnicity,T1D Duration (days),GAD65 (nl < 20 DK Units),IA-2 (nl < 5 DK Units),IAA (nl < 0.011),ZnT8 (nl < 0.021),...,PPI TCR 9,PPI TCR 10,PPI TCR 11,PPI TCR 12,PPI TCR 13,PPI TCR 14,PPI TCR 15,participant_label,disease,disease_subtype
0,9,13.5,Female,White,Non-Hispanic,1,23.0,212.0,0.007,0.621,...,,0,0,,,0,0,DenverT1D-009,T1D,T1D - new onset
1,41,14.9,Female,White,Non-Hispanic,15,36.0,2.0,0.000,0.005,...,,0,0,,,0,0,DenverT1D-041,T1D,T1D - new onset
2,42,9.0,Female,White,Non-Hispanic,0,703.0,196.0,0.011,0.398,...,,0,0,,,0,0,DenverT1D-042,T1D,T1D - new onset
3,43,5.5,Female,White,Non-Hispanic,1,9.0,246.0,0.006,0.030,...,,0,0,,,0,0,DenverT1D-043,T1D,T1D - new onset
4,44,16.5,Male,White,Non-Hispanic,1,5.0,239.0,0.001,0.270,...,,0,0,,,0,0,DenverT1D-044,T1D,T1D - new onset
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,382,11.6,Male,Black,Non-Hispanic,55,168.0,381.0,0.011,0.021,...,,0,0,,,0,0,DenverT1D-382,T1D,T1D - new onset
139,383,10.8,Female,White,Non-Hispanic,2,68.0,0.0,0.000,0.458,...,,0,0,,,0,0,DenverT1D-383,T1D,T1D - new onset
140,384,10.9,Male,White,Non-Hispanic,3,613.0,348.0,0.031,0.208,...,,0,0,,,0,0,DenverT1D-384,T1D,T1D - new onset
141,386,10.1,Female,White,Non-Hispanic,3,279.0,271.0,0.096,0.000,...,,0,0,,,0,0,DenverT1D-386,T1D,T1D - new onset


In [42]:
# Summary statistics of T1D duration in days: these are really new onset.
t1d_duration_days = new_onset_samples["T1D Duration (days)"]
t1d_duration_days.describe()

count    143.000000
mean       4.881119
std        9.059653
min        0.000000
25%        1.000000
50%        2.000000
75%        5.000000
max       55.000000
Name: T1D Duration (days), dtype: float64

In [43]:
# Keep the participant label, demographics, and disease labels; age at diagnosis
# becomes the generic "Age" column.
new_onset_columns = [
    "participant_label",
    "Age at Diagnosis (years)",
    "Sex",
    "Race",
    "Ethnicity",
    "disease",
    "disease_subtype",
]
new_onset_samples = new_onset_samples.loc[:, new_onset_columns].rename(
    columns={"Age at Diagnosis (years)": "Age"}
)
new_onset_samples

Unnamed: 0,participant_label,Age,Sex,Race,Ethnicity,disease,disease_subtype
0,DenverT1D-009,13.5,Female,White,Non-Hispanic,T1D,T1D - new onset
1,DenverT1D-041,14.9,Female,White,Non-Hispanic,T1D,T1D - new onset
2,DenverT1D-042,9.0,Female,White,Non-Hispanic,T1D,T1D - new onset
3,DenverT1D-043,5.5,Female,White,Non-Hispanic,T1D,T1D - new onset
4,DenverT1D-044,16.5,Male,White,Non-Hispanic,T1D,T1D - new onset
...,...,...,...,...,...,...,...
138,DenverT1D-382,11.6,Male,Black,Non-Hispanic,T1D,T1D - new onset
139,DenverT1D-383,10.8,Female,White,Non-Hispanic,T1D,T1D - new onset
140,DenverT1D-384,10.9,Male,White,Non-Hispanic,T1D,T1D - new onset
141,DenverT1D-386,10.1,Female,White,Non-Hispanic,T1D,T1D - new onset


In [44]:
# Sequencing metadata rows whose sample_name contains "Denver".
denver_rows = df["sample_name"].str.contains("Denver")
df.loc[denver_rows]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label
216,DenverT1D-045_TCRB,TCRB,"12-18 Years (Adolescence), 15 Years, 15 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-045
217,DenverT1D-041_TCRB,TCRB,"12-18 Years (Adolescence), 14 Years, 14 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-041
218,DenverT1D-103_TCRB,TCRB,"02-12 (Childhood), 09 Years, 9 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-103
219,DenverT1D-332_TCRB,TCRB,"02-12 (Childhood), 08 Years, 8 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-332
220,DenverT1D-179_TCRB,TCRB,"12 Years, 12 Years at diagnosis, 12-18 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-179
...,...,...,...,...,...,...,...,...,...
354,DenverT1D-170_TCRB,TCRB,"17 Years, 17 Years at diagnosis, 18-24 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-170
355,DenverT1D-106_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-106
356,DenverT1D-256_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-256
357,DenverT1D-055_TCRB,TCRB,"02-12 (Childhood), 07 Years, 7 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-055


In [45]:
# Attach the sequencing metadata in `df` to the new-onset T1D participants. For these
# rows the specimen label is the same string as the participant label built earlier.
is_denver_sample = df["sample_name"].str.contains("Denver")
new_onset_samples = pd.merge(
    df[is_denver_sample],
    new_onset_samples,
    how="inner",
    left_on="specimen_label",
    right_on="participant_label",
    # Each specimen must match at most one participant row and vice versa.
    validate="1:1",
)
# The inner join silently drops unmatched rows, so enforce the expected size
# (143 new-onset samples) rather than only noting it in a comment.
assert new_onset_samples.shape[0] == 143, new_onset_samples.shape
new_onset_samples

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,participant_label,Age,Sex,Race,Ethnicity,disease,disease_subtype
0,DenverT1D-045_TCRB,TCRB,"12-18 Years (Adolescence), 15 Years, 15 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-045,DenverT1D-045,15.6,Female,White,Non-Hispanic,T1D,T1D - new onset
1,DenverT1D-041_TCRB,TCRB,"12-18 Years (Adolescence), 14 Years, 14 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-041,DenverT1D-041,14.9,Female,White,Non-Hispanic,T1D,T1D - new onset
2,DenverT1D-103_TCRB,TCRB,"02-12 (Childhood), 09 Years, 9 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-103,DenverT1D-103,9.2,Female,White,Hispanic,T1D,T1D - new onset
3,DenverT1D-332_TCRB,TCRB,"02-12 (Childhood), 08 Years, 8 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-332,DenverT1D-332,8.1,Female,White,Non-Hispanic,T1D,T1D - new onset
4,DenverT1D-179_TCRB,TCRB,"12 Years, 12 Years at diagnosis, 12-18 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-179,DenverT1D-179,12.8,Male,White,Non-Hispanic,T1D,T1D - new onset
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,DenverT1D-170_TCRB,TCRB,"17 Years, 17 Years at diagnosis, 18-24 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-170,DenverT1D-170,17.9,Male,White,Non-Hispanic,T1D,T1D - new onset
139,DenverT1D-106_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-106,DenverT1D-106,16.4,Male,White,Non-Hispanic,T1D,T1D - new onset
140,DenverT1D-256_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-256,DenverT1D-256,16.1,Male,White,Non-Hispanic,T1D,T1D - new onset
141,DenverT1D-055_TCRB,TCRB,"02-12 (Childhood), 07 Years, 7 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-055,DenverT1D-055,7.5,Female,White,Non-Hispanic,T1D,T1D - new onset


In [46]:
# Recombine controls and new-onset T1D samples into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the two merged
# frames' indexes (both starting at 0) would be stacked, producing duplicate row labels.
# Columns present in only one input (e.g. "Sample Timepoint") are NaN for the other's rows.
df = pd.concat([control_samples, new_onset_samples], axis=0, ignore_index=True)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,Age,Sample Timepoint,Sex,Race,Ethnicity,participant_label,disease,disease_subtype
0,310251_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310251,6.661190,4.0,Female,White,Hispanic,control_16,Healthy/Background,Healthy/Background - T1D negative
1,310205_TCRB,TCRB,"0 Years at visit, 09 Months, Caucasian, Contro...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310205,0.829568,1.0,Female,White,Non-Hispanic,control_6,Healthy/Background,Healthy/Background - T1D negative
2,310168_TCRB,TCRB,"05 Years, 5 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310168,5.670088,3.0,Male,White,Non-Hispanic,control_13,Healthy/Background,Healthy/Background - T1D negative
3,310170_TCRB,TCRB,"08 Years, 8 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310170,8.407939,2.0,Male,White,Non-Hispanic,control_12,Healthy/Background,Healthy/Background - T1D negative
4,310148_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310148,6.269678,4.0,Male,White,Non-Hispanic,control_9,Healthy/Background,Healthy/Background - T1D negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,DenverT1D-170_TCRB,TCRB,"17 Years, 17 Years at diagnosis, 18-24 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-170,17.900000,,Male,White,Non-Hispanic,DenverT1D-170,T1D,T1D - new onset
139,DenverT1D-106_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-106,16.400000,,Male,White,Non-Hispanic,DenverT1D-106,T1D,T1D - new onset
140,DenverT1D-256_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-256,16.100000,,Male,White,Non-Hispanic,DenverT1D-256,T1D,T1D - new onset
141,DenverT1D-055_TCRB,TCRB,"02-12 (Childhood), 07 Years, 7 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-055,7.500000,,Female,White,Non-Hispanic,DenverT1D-055,T1D,T1D - new onset


In [47]:
# Register this study's finished metadata in the running collection of per-study frames.
# NOTE: this mutates `dfs_adaptive_manual` (defined outside this cell); re-running the
# cell appends the same frame again.
dfs_adaptive_manual.append(df)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,Age,Sample Timepoint,Sex,Race,Ethnicity,participant_label,disease,disease_subtype
0,310251_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310251,6.661190,4.0,Female,White,Hispanic,control_16,Healthy/Background,Healthy/Background - T1D negative
1,310205_TCRB,TCRB,"0 Years at visit, 09 Months, Caucasian, Contro...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310205,0.829568,1.0,Female,White,Non-Hispanic,control_6,Healthy/Background,Healthy/Background - T1D negative
2,310168_TCRB,TCRB,"05 Years, 5 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310168,5.670088,3.0,Male,White,Non-Hispanic,control_13,Healthy/Background,Healthy/Background - T1D negative
3,310170_TCRB,TCRB,"08 Years, 8 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310170,8.407939,2.0,Male,White,Non-Hispanic,control_12,Healthy/Background,Healthy/Background - T1D negative
4,310148_TCRB,TCRB,"06 Years, 6 Years at visit, Caucasian, Control...",v4,Human-TCRB-PD4bx,Human,mitchell-2022-jcii,gDNA,310148,6.269678,4.0,Male,White,Non-Hispanic,control_9,Healthy/Background,Healthy/Background - T1D negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,DenverT1D-170_TCRB,TCRB,"17 Years, 17 Years at diagnosis, 18-24 Years (...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-170,17.900000,,Male,White,Non-Hispanic,DenverT1D-170,T1D,T1D - new onset
139,DenverT1D-106_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-106,16.400000,,Male,White,Non-Hispanic,DenverT1D-106,T1D,T1D - new onset
140,DenverT1D-256_TCRB,TCRB,"12-18 Years (Adolescence), 16 Years, 16 Years ...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-256,16.100000,,Male,White,Non-Hispanic,DenverT1D-256,T1D,T1D - new onset
141,DenverT1D-055_TCRB,TCRB,"02-12 (Childhood), 07 Years, 7 Years at diagno...",v4,Human-TCRB-PD4x,Human,mitchell-2022-jcii,gDNA,DenverT1D-055,7.500000,,Female,White,Non-Hispanic,DenverT1D-055,T1D,T1D - new onset


In [48]:
# Load the sample listing for this study; the bare string below holds the study notes
# (it is a no-op expression, not displayed).
df = _load_metadata_df("mustjoki-2017-natcomms")

"""
https://www.nature.com/articles/ncomms15869
RA, newly diagnosed. Some seronegative, some seropositive. A few patients have on-treatment followup
See kelkka-2020-jci above for a follow-up study.

Peripheral-blood mononuclear cells (PBMCs) were separated from EDTA blood using Ficoll gradient separation (Ficoll-Paque PLUS, GE Healthcare, cat. no 17-1440-03).
CD4+ and CD8+ cells were separated with magnetic bead sorting using positive selection for both fractions

DNA was extracted from CD4- and CD8-enriched samples or from the PBMC fraction
TCRB deep sequencing was performed from 65 patients, accompanied by sequencing of 20 healthy controls
Genomic DNA was used in all cases.

Totals: 82 newly diagnosed RA patients, among whom 25 patients were sequenced with the immunogene panel. The 20 healthy controls were also included in the immunogene panel.

Table 1: Immunogene panel sequencing was performed on both CD4+ and CD8+ cells of 25 newly diagnosed RA patients and 20 healthy controls (HCs)

More info at https://static-content.springer.com/esm/art%3A10.1038%2Fncomms15869/MediaObjects/41467_2017_BFncomms15869_MOESM218_ESM.pdf SI Table 2
"""

df = df.assign(
    # Per the notes above, genomic DNA was used in all cases.
    sequencing_type="gDNA",
    # Participant ID is the part of sample_name before the first "-",
    # e.g. "RA3-3years" -> "RA3"; names without "-" are kept whole.
    participant_label=df["sample_name"].str.split("-").str[0],
)

df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label
0,RA3-3years,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA3
1,RA1-7months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA1
2,RA2-18months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA2
3,HC8,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC8
4,HC9,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC9
...,...,...,...,...,...,...,...,...,...
89,RA7,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA7
90,RA8,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA8
91,RA14,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA14
92,RA4,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA4


In [49]:
# Distinct participant IDs, in sorted (string) order.
sorted_participant_labels = df["participant_label"].sort_values()
sorted_participant_labels.unique()

array(['HC1', 'HC10', 'HC11', 'HC12', 'HC13', 'HC14', 'HC15', 'HC16',
       'HC17', 'HC18', 'HC19', 'HC2', 'HC20', 'HC3', 'HC4', 'HC5', 'HC6',
       'HC7', 'HC8', 'HC9', 'RA1', 'RA10', 'RA11', 'RA12', 'RA13', 'RA14',
       'RA15', 'RA16', 'RA17', 'RA18', 'RA19', 'RA2', 'RA20', 'RA21',
       'RA22', 'RA23', 'RA24', 'RA25', 'RA26', 'RA28', 'RA29', 'RA3',
       'RA30', 'RA31', 'RA32', 'RA33', 'RA37', 'RA4', 'RA40', 'RA41',
       'RA46', 'RA47', 'RA48', 'RA49', 'RA5', 'RA50', 'RA51', 'RA52',
       'RA53', 'RA54', 'RA55', 'RA56', 'RA57', 'RA58', 'RA59', 'RA6',
       'RA60', 'RA62', 'RA63', 'RA65', 'RA66', 'RA68', 'RA69', 'RA7',
       'RA70', 'RA72', 'RA74', 'RA75', 'RA76', 'RA79', 'RA8', 'RA80',
       'RA81', 'RA82', 'RA9'], dtype=object)

In [50]:
# Tally individual tags: split the comma-separated sample_tags into one tag per row
# and trim surrounding whitespace before counting.
individual_tags = df["sample_tags"].str.split(",").explode().str.strip()
individual_tags.value_counts()

T cells                                            35
Rheumatoid Arthritis                               27
CD8+                                               24
HLA MHC Class I                                    12
Peripheral blood lymphocytes (PBL)                 11
CD4-                                                8
Synovial fluid                                      3
CD4+                                                2
T Cell Large Granular Lymphocyte Leukemia (LGL)     2
CD8-                                                1
Name: sample_tags, dtype: int64

In [51]:
# Drop specimens tagged as synovial fluid.
# na=False makes rows with missing sample_tags count as "not synovial fluid" and
# guarantees a boolean mask, so the `~` below is a logical NOT. (Negating the result of
# `.fillna(False)` instead relies on pandas downcasting an object column to bool.)
is_synovial_fluid = df["sample_tags"].str.contains("Synovial fluid", na=False)
df = df[~is_synovial_fluid].copy()

# sanity check: no remaining sample name contains "-SF-"
assert not df["sample_name"].str.contains("-SF-").any()

df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label
0,RA3-3years,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA3
1,RA1-7months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA1
2,RA2-18months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA2
3,HC8,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC8
4,HC9,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC9
...,...,...,...,...,...,...,...,...,...
89,RA7,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA7
90,RA8,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA8
91,RA14,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA14
92,RA4,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA4


In [52]:
# table 1 has the HCs
table_1_path = (
    config.paths.metadata_dir / "adaptive" / "mustjoki-2017-natcomms.table_1.csv"
)
df1 = pd.read_csv(table_1_path)
df1

Unnamed: 0,Patient ID,Sex,Age at diagnosis,Seropositive,DAS28,Shared epitope,Clonality,Largest CD8+ clone (%)
0,1,M,75,Yes,3.5,Yes/no,0.63,33.9
1,2,F,72,Yes,3.5,Yes/yes,0.28,7.8
2,3,M,44,Yes,,Yes/no,0.6,51.0
3,4,F,74,No,,No/no,0.24,13.2
4,5,F,59,Yes,,No/no,0.44,29.0
5,6,M,75,Yes,4.0,Yes/yes,0.41,11.1
6,7,M,66,Yes,3.9,No/no,0.35,13.8
7,8,F,61,Yes,3.1,No/no,0.08,2.0
8,9,M,45,Yes,4.0,Yes/no,0.28,17.2
9,10,M,69,Yes,3.3,Yes/no,0.46,16.2


In [53]:
# Keep only the healthy-control rows (Patient ID starting with "HC"), reduce them to
# ID / sex / age, and label them as healthy, RA-negative.
is_healthy_control = df1["Patient ID"].str.startswith("HC")
df1 = (
    df1.loc[is_healthy_control]
    .rename(columns={"Age at diagnosis": "Age"})
    .loc[:, ["Patient ID", "Sex", "Age"]]
    .assign(
        disease=healthy_label,
        disease_subtype=f"{healthy_label} - RA negative",
    )
)
df1

Unnamed: 0,Patient ID,Sex,Age,disease,disease_subtype
25,HC1,F,58,Healthy/Background,Healthy/Background - RA negative
26,HC2,M,61,Healthy/Background,Healthy/Background - RA negative
27,HC3,F,50,Healthy/Background,Healthy/Background - RA negative
28,HC4,M,58,Healthy/Background,Healthy/Background - RA negative
29,HC5,F,65,Healthy/Background,Healthy/Background - RA negative
30,HC6,F,50,Healthy/Background,Healthy/Background - RA negative
31,HC7,F,62,Healthy/Background,Healthy/Background - RA negative
32,HC8,F,44,Healthy/Background,Healthy/Background - RA negative
33,HC9,M,55,Healthy/Background,Healthy/Background - RA negative
34,HC10,M,52,Healthy/Background,Healthy/Background - RA negative


In [54]:
# SI table 2 has all the RAs
si_table_2_path = (
    config.paths.metadata_dir / "adaptive" / "mustjoki-2017-natcomms.si_table_2.csv"
)
df2 = pd.read_csv(si_table_2_path)
df2

Unnamed: 0,Patient ID,Gender,Largest CD8+ clone (% of CD8+),Age at diagnosis,Hb,ESR,Leucocytes,Lymphocytes,Neutrophils,Monocytes,...,RF,Seropositive,ACPA,DAS28,tender joints,swollen joints,pat_global,rtg_erosions,HAQ,Duration
0,1,M,33.9,74.7,132,33,6.4,1.1,4.4,0.8,...,189.0,yes,160,3.46,5,0,3,no,2.000,5.0
1,2,F,7.8,72.0,126,47,7.1,2.1,4.1,0.6,...,19.0,yes,300,3.54,1,3,40,,1.300,
2,3,M,51.0,44.3,156,12,8.3,3.2,4.2,0.7,...,,yes,200,,11,1,55,yes,0.625,
3,4,F,13.2,73.6,150,31,7.0,2.2,4.1,0.6,...,10.0,no,6,,1,15,18,,0.630,
4,5,F,29.0,58.8,134,5,9.1,3.2,4.6,1.0,...,5.0,yes,204,,4,1,47,,,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,F,,39.1,142,8,9.2,2.1,6.1,0.8,...,120.0,yes,340,4.65,14,5,68,,1.380,
78,79,F,10.3,76.6,113,68,8.1,1.7,5.5,0.7,...,11.0,no,6,5.17,1,5,50,no,0.880,8.0
79,80,F,2.0,44.2,146,27,7.8,2.1,5.0,0.6,...,18.0,yes,301,3.87,2,2,27,yes,1.000,14.0
80,81,F,19.1,59.0,122,24,5.1,1.5,2.9,0.5,...,216.0,yes,120,5.28,13,6,92,,0.500,


In [55]:
# Reduce SI table 2 to ID / sex / serostatus / age, using the column names "Sex" and "Age".
df2 = df2.rename(columns={"Age at diagnosis": "Age", "Gender": "Sex"}).loc[
    :, ["Patient ID", "Sex", "Seropositive", "Age"]
]
# Serostatus values other than "yes"/"no" map to NaN, which makes the subtype NaN too.
seropositivity_to_subtype = {"yes": "sero-positive", "no": "sero-negative"}
df2 = df2.assign(
    # Prefix the patient number with "RA", e.g. 1 -> "RA1".
    **{"Patient ID": "RA" + df2["Patient ID"].astype(str)},
    disease="RA",
    disease_subtype="RA - " + df2["Seropositive"].map(seropositivity_to_subtype),
)
df2

Unnamed: 0,Patient ID,Sex,Seropositive,Age,disease,disease_subtype
0,RA1,M,yes,74.7,RA,RA - sero-positive
1,RA2,F,yes,72.0,RA,RA - sero-positive
2,RA3,M,yes,44.3,RA,RA - sero-positive
3,RA4,F,no,73.6,RA,RA - sero-negative
4,RA5,F,yes,58.8,RA,RA - sero-positive
...,...,...,...,...,...,...
77,RA78,F,yes,39.1,RA,RA - sero-positive
78,RA79,F,no,76.6,RA,RA - sero-negative
79,RA80,F,yes,44.2,RA,RA - sero-positive
80,RA81,F,yes,59.0,RA,RA - sero-positive


In [56]:
# Stack the healthy-control and RA patient tables into one lookup keyed by "Patient ID".
patient_metadata = pd.concat([df1, df2], axis=0).drop(columns="Seropositive")
# Left join keeps every sample row; "m:1" checks that each Patient ID occurs at most
# once in the lookup while allowing several samples per participant.
df = pd.merge(
    df,
    patient_metadata,
    how="left",
    left_on="participant_label",
    right_on="Patient ID",
    validate="m:1",
)
df = df.drop(columns="Patient ID")
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
0,RA3-3years,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA3,M,44.3,RA,RA - sero-positive
1,RA1-7months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA1,M,74.7,RA,RA - sero-positive
2,RA2-18months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA2,F,72.0,RA,RA - sero-positive
3,HC8,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC8,F,44.0,Healthy/Background,Healthy/Background - RA negative
4,HC9,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC9,M,55.0,Healthy/Background,Healthy/Background - RA negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,RA7,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA7,M,66.1,RA,RA - sero-positive
87,RA8,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA8,F,60.5,RA,RA - sero-positive
88,RA14,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA14,F,73.5,RA,RA - sero-positive
89,RA4,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA4,F,73.6,RA,RA - sero-negative


In [57]:
# Every sample must have received a disease label from the left join.
assert df["disease"].notna().all()

In [58]:
# The ten participants with the most samples.
samples_per_participant = df["participant_label"].value_counts()
samples_per_participant.head(10)

RA2     3
RA3     2
RA6     2
RA23    2
RA1     2
RA47    1
HC5     1
HC6     1
HC7     1
RA51    1
Name: participant_label, dtype: int64

In [59]:
# people with multiple samples
samples_per_participant = df["participant_label"].value_counts()
repeat_participants = samples_per_participant[samples_per_participant > 1].index
has_multiple_samples = df["participant_label"].isin(repeat_participants)
df[has_multiple_samples].sort_values("sample_name")

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
80,RA1,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA1,M,74.7,RA,RA - sero-positive
1,RA1-7months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA1,M,74.7,RA,RA - sero-positive
85,RA2,TCRB,"CD8+, HLA MHC Class I, Rheumatoid Arthritis, T...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA2,F,72.0,RA,RA - sero-positive
2,RA2-18months,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA2,F,72.0,RA,RA - sero-positive
79,RA2-CD8+Vb1+,TCRB,"CD8+, T Cell Large Granular Lymphocyte Leukemi...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA2,F,72.0,RA,RA - sero-positive
65,RA23,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative
66,RA23-CD4,TCRB,"CD4+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative
84,RA3,TCRB,"CD8+, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA3,M,44.3,RA,RA - sero-positive
0,RA3-3years,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA3,M,44.3,RA,RA - sero-positive
77,RA6,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive


In [60]:
# Remove late timepoints from RA 1, 2, and 3. We want newly diagnosed, treatment naive.
late_timepoint_samples = ["RA1-7months", "RA2-18months", "RA3-3years"]
is_late_timepoint = df["sample_name"].isin(late_timepoint_samples)
df = df.loc[~is_late_timepoint]

In [61]:
# Remove RA patient 2's special extra sort:
# Patient 2 was a 72-year-old female who also had palindromic rheumatism and a previous history of other inflammatory disorders: asthma, lichen ruber and atrophic gastritis. At the time of RA diagnosis, flow cytometric screening identified two unusually large populations of CD8+ T cells: Vβ1+ (14%) and Vβ13.6+ (11%)
is_extra_sort = df["sample_name"].isin(["RA2-CD8+Vb1+"])
df = df.loc[~is_extra_sort]

In [62]:
# Recheck: people with multiple samples
# We see some CD4 separated out of CD8
samples_per_participant = df["participant_label"].value_counts()
repeat_participants = samples_per_participant[samples_per_participant > 1].index
has_multiple_samples = df["participant_label"].isin(repeat_participants)
df[has_multiple_samples].sort_values("sample_name")

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
65,RA23,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative
66,RA23-CD4,TCRB,"CD4+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative
77,RA6,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive
76,RA6-CD4,TCRB,"CD8-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive


In [63]:
# Samples tagged "CD4+". The pattern is a raw string so that "\+" reaches the regex
# engine as an escaped literal plus (in a plain string "\+" is an invalid escape
# sequence). na=False treats samples with missing tags as non-matching.
df[df["sample_tags"].str.contains(r"CD4\+", na=False)]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
66,RA23-CD4,TCRB,"CD4+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative


In [64]:
# Samples tagged "CD8-". na=False treats samples with missing tags as non-matching and
# yields a boolean mask directly, instead of filling an object-dtype result afterwards.
df[df["sample_tags"].str.contains("CD8-", na=False)]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
76,RA6-CD4,TCRB,"CD8-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive


In [65]:
# Samples tagged "CD4-". na=False treats samples with missing tags as non-matching and
# yields a boolean mask directly, instead of filling an object-dtype result afterwards.
df[df["sample_tags"].str.contains("CD4-", na=False)]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
69,RA20,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA20,F,46.1,RA,RA - sero-positive
70,RA19,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA19,F,57.9,RA,RA - sero-positive
71,RA18,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA18,F,65.1,RA,RA - sero-positive
72,RA17,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA17,F,70.7,RA,RA - sero-negative
73,RA15,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA15,F,25.0,RA,RA - sero-negative
74,RA13,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA13,F,65.7,RA,RA - sero-positive
75,RA12,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA12,M,50.6,RA,RA - sero-positive
77,RA6,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive


In [66]:
# Samples tagged "CD8+". The pattern is a raw string so that "\+" reaches the regex
# engine as an escaped literal plus (in a plain string "\+" is an invalid escape
# sequence). na=False treats samples with missing tags as non-matching.
df[df["sample_tags"].str.contains(r"CD8\+", na=False)]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
58,HC7,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC7,F,62.0,Healthy/Background,Healthy/Background - RA negative
59,HC6,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC6,F,50.0,Healthy/Background,Healthy/Background - RA negative
60,HC5,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC5,F,65.0,Healthy/Background,Healthy/Background - RA negative
61,HC4,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC4,M,58.0,Healthy/Background,Healthy/Background - RA negative
62,HC3,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC3,F,50.0,Healthy/Background,Healthy/Background - RA negative
63,HC2,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC2,M,61.0,Healthy/Background,Healthy/Background - RA negative
64,HC1,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC1,F,58.0,Healthy/Background,Healthy/Background - RA negative
65,RA23,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative
67,RA22,TCRB,"CD8+, HLA MHC Class I, Peripheral blood lympho...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA22,F,68.2,RA,RA - sero-positive
68,RA21,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA21,F,48.2,RA,RA - sero-negative


In [67]:
# Many samples are full PBMC though: neither a CD8 nor a CD4 tag is present.
# na=False counts missing sample_tags as "tag absent" and yields a real boolean mask,
# so `~` never operates on an object-dtype result that fillna() had to downcast.
is_cd8_tagged = df["sample_tags"].str.contains("CD8", na=False)
is_cd4_tagged = df["sample_tags"].str.contains("CD4", na=False)
df[~is_cd8_tagged & ~is_cd4_tagged]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype
3,HC8,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC8,F,44.0,Healthy/Background,Healthy/Background - RA negative
4,HC9,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC9,M,55.0,Healthy/Background,Healthy/Background - RA negative
5,HC11,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC11,M,48.0,Healthy/Background,Healthy/Background - RA negative
6,HC17,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC17,F,21.0,Healthy/Background,Healthy/Background - RA negative
7,HC10,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC10,M,52.0,Healthy/Background,Healthy/Background - RA negative
8,HC18,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC18,M,60.0,Healthy/Background,Healthy/Background - RA negative
9,HC19,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC19,M,43.0,Healthy/Background,Healthy/Background - RA negative
10,HC20,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC20,F,66.0,Healthy/Background,Healthy/Background - RA negative
11,RA72,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA72,F,61.6,RA,RA - sero-positive
12,RA74,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA74,F,43.4,RA,RA - sero-positive


In [68]:
# Mark the CD8/CD4 separations.
# The cell type is kept inside the replicate label, to be merged within the ETL flow later.
df["specimen_label"] = df["participant_label"]
df["replicate_label"] = df["sample_name"]

# Samples that were sequenced as separate CD4 / CD8 fractions of the same participant.
fraction_replicate_labels = {
    "RA23-CD4": "RA23_CD4",
    "RA23": "RA23_CD8",
    "RA6-CD4": "RA6_CD4",
    "RA6": "RA6_CD8",
}
for _sample_name, _replicate_label in fraction_replicate_labels.items():
    df.loc[df["sample_name"] == _sample_name, "replicate_label"] = _replicate_label

# Recheck: people with multiple samples
# We see some CD4 separated out of CD8
samples_per_participant = df["participant_label"].value_counts()
multi_sample_participants = samples_per_participant[samples_per_participant > 1].index
df[df["participant_label"].isin(multi_sample_participants)].sort_values("sample_name")

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype,specimen_label,replicate_label
65,RA23,TCRB,"CD8+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative,RA23,RA23_CD8
66,RA23-CD4,TCRB,"CD4+, Peripheral blood lymphocytes (PBL), Rheu...",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA23,F,24.4,RA,RA - sero-negative,RA23,RA23_CD4
77,RA6,TCRB,"CD4-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive,RA6,RA6_CD8
76,RA6-CD4,TCRB,"CD8-, Rheumatoid Arthritis, T cells",v2,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,RA6,M,74.5,RA,RA - sero-positive,RA6,RA6_CD4


In [69]:
# Keep full PBMC samples only, for consistency.
# na=False makes a missing sample_tags value count as "no such tag" and returns a true
# boolean mask, instead of relying on fillna() to downcast an object-dtype result
# before it is negated.
_tags = df["sample_tags"]
is_sorted_fraction = (
    _tags.str.contains("CD8", na=False)
    | _tags.str.contains("CD4", na=False)
    # this would indicate CD8:
    | _tags.str.contains("HLA MHC Class I", na=False)
)
# also keep the ones where we have both fractions - see previous cell
has_both_fractions = df["replicate_label"].isin(
    ["RA23_CD8", "RA23_CD4", "RA6_CD8", "RA6_CD4"]
)
df = pd.concat([df[~is_sorted_fraction], df[has_both_fractions]], axis=0)
df = df.sort_values("replicate_label")
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype,specimen_label,replicate_label
7,HC10,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC10,M,52.0,Healthy/Background,Healthy/Background - RA negative,HC10,HC10
5,HC11,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC11,M,48.0,Healthy/Background,Healthy/Background - RA negative,HC11,HC11
21,HC12,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC12,F,57.0,Healthy/Background,Healthy/Background - RA negative,HC12,HC12
22,HC13,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC13,F,56.0,Healthy/Background,Healthy/Background - RA negative,HC13,HC13
23,HC14,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC14,M,70.0,Healthy/Background,Healthy/Background - RA negative,HC14,HC14
24,HC15,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC15,F,47.0,Healthy/Background,Healthy/Background - RA negative,HC15,HC15
25,HC16,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC16,M,48.0,Healthy/Background,Healthy/Background - RA negative,HC16,HC16
6,HC17,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC17,F,21.0,Healthy/Background,Healthy/Background - RA negative,HC17,HC17
8,HC18,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC18,M,60.0,Healthy/Background,Healthy/Background - RA negative,HC18,HC18
9,HC19,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC19,M,43.0,Healthy/Background,Healthy/Background - RA negative,HC19,HC19


In [70]:
# Register the filtered mustjoki-2017-natcomms metadata with the other manually
# curated Adaptive cohorts, then display it.
# NOTE(review): .append() is not idempotent — re-running this cell would add the same
# frame to dfs_adaptive_manual a second time.
dfs_adaptive_manual.append(df)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,Sex,Age,disease,disease_subtype,specimen_label,replicate_label
7,HC10,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC10,M,52.0,Healthy/Background,Healthy/Background - RA negative,HC10,HC10
5,HC11,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC11,M,48.0,Healthy/Background,Healthy/Background - RA negative,HC11,HC11
21,HC12,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC12,F,57.0,Healthy/Background,Healthy/Background - RA negative,HC12,HC12
22,HC13,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC13,F,56.0,Healthy/Background,Healthy/Background - RA negative,HC13,HC13
23,HC14,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC14,M,70.0,Healthy/Background,Healthy/Background - RA negative,HC14,HC14
24,HC15,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC15,F,47.0,Healthy/Background,Healthy/Background - RA negative,HC15,HC15
25,HC16,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC16,M,48.0,Healthy/Background,Healthy/Background - RA negative,HC16,HC16
6,HC17,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC17,F,21.0,Healthy/Background,Healthy/Background - RA negative,HC17,HC17
8,HC18,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC18,M,60.0,Healthy/Background,Healthy/Background - RA negative,HC18,HC18
9,HC19,TCRB,,v3,Human-TCRB-PD1x,Human,mustjoki-2017-natcomms,gDNA,HC19,M,43.0,Healthy/Background,Healthy/Background - RA negative,HC19,HC19


In [71]:
# Study notes for ramesh-2015-ci:
#
# CVID as a T cell defect, not just B cell
# https://www.sciencedirect.com/science/article/abs/pii/S1521661615000042?via%3Dihub
# up to 44 CVID, 22 HC? see https://ars.els-cdn.com/content/image/1-s2.0-S1521661615000042-gr6_lrg.jpg for the participant names
#
# Peripheral blood DNA samples of 44 CVID subjects, 15 females and 29 males of ages 9 to 64 years with a mean of 40,
# and 22 healthy adult volunteers, 12 females and 10 males of ages 23 to 66 years with a mean of 34
#
# Genomic DNA
df = _load_metadata_df("ramesh-2015-ci").assign(sequencing_type="gDNA")
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
0,155,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
1,213,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
2,374,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
3,320,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
4,228,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
...,...,...,...,...,...,...,...,...
56,N18,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
57,N7,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
58,N9,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
59,14_Mt. Sinai-Cunningham-Rundles-P01,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA


In [72]:
# Tally how often each individual comma-separated tag occurs (top 50).
individual_tags = df["sample_tags"].str.split(",").explode().str.strip()
individual_tags.value_counts().head(n=50)

PBMC               61
HLA MHC Class I     6
Blood               1
Name: sample_tags, dtype: int64

In [73]:
# Not sure: What are the last two?
# (The two "*_Mt. Sinai-Cunningham-Rundles-P01" names follow neither the numeric
# nor the "N"-prefixed naming pattern of the other samples.)
df["sample_name"].unique()

array(['155', '213', '374', '320', '228', '130', '345', '131', '211',
       '289', '279', '335', '334', '386', '317', '378', '441', '172',
       '139', '110', '299', '203', '462', '329', '272', '102', 'N12',
       '400', '251', '218', '268', '161', '366', '248', '263', '232',
       '311', '175', 'N8', 'N17', 'N13', 'N14', 'N4', 'N16', 'N15', 'N2',
       'N11', 'N5', 'N20', 'N10', 'N1', 'N3', 'N21', 'N6', 'N22', 'N19',
       'N18', 'N7', 'N9', '14_Mt. Sinai-Cunningham-Rundles-P01',
       '2_Mt. Sinai-Cunningham-Rundles-P01'], dtype=object)

In [74]:
# Drop those to be safe.
# .copy() gives an independent frame so the column assignments that follow
# do not write into a slice of the original.
is_unidentified_sample = df["sample_name"].str.contains("Rundles")
df = df.loc[~is_unidentified_sample].copy()
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
0,155,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
1,213,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
2,374,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
3,320,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
4,228,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
5,130,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
6,345,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
7,131,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
8,211,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA
9,289,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA


In [75]:
# The sample name serves as both the specimen label and the participant label.
for _label_column in ("specimen_label", "participant_label"):
    df[_label_column] = df["sample_name"]

# Sample names starting with "N" are mapped to the healthy label; all others to CVID.
is_n_prefixed = df["sample_name"].str.startswith("N")
df["disease"] = is_n_prefixed.map({True: healthy_label, False: "CVID"})

# Healthy rows get a study-specific subtype; CVID rows keep "CVID" as their subtype.
df["disease_subtype"] = df["disease"].replace(
    {healthy_label: f"{healthy_label} - CVID negative"}
)

In [76]:
# Register the ramesh-2015-ci metadata with the other manually curated Adaptive
# cohorts, then display it.
# NOTE(review): re-running this cell appends the same frame again.
dfs_adaptive_manual.append(df)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,participant_label,disease,disease_subtype
0,155,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,155,155,CVID,CVID
1,213,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,213,213,CVID,CVID
2,374,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,374,374,CVID,CVID
3,320,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,320,320,CVID,CVID
4,228,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,228,228,CVID,CVID
5,130,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,130,130,CVID,CVID
6,345,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,345,345,CVID,CVID
7,131,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,131,131,CVID,CVID
8,211,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,211,211,CVID,CVID
9,289,TCRB,PBMC,v1,Human-TCRB-none,Human,ramesh-2015-ci,gDNA,289,289,CVID,CVID


In [77]:
# Study notes for TCRBv4-control:
#
# colorectal cancer, lung cancer, head and neck cancer, and healthy control
# 58 cancer, 88 hhc, pre-covid
# ignore the 11 FFPE samples
# add the HHCs, don't do cancer for now
df = _load_metadata_df("TCRBv4-control").assign(sequencing_type="gDNA")
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
0,Subject_76,TCRB,"67 Years, Blood, Cohort 3, Healthy, Hispanic, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
1,Subject_57,TCRB,"60 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
2,Subject_53,TCRB,"40 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
3,Subject_104,TCRB,"29 Years, Black or African American, Blood, Co...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
4,Subject_146,TCRB,"61 Years, Caucasian, Head and Neck Cancer, Mal...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
...,...,...,...,...,...,...,...,...
153,Subject_141,TCRB,"60 Years, Caucasian, Colorectal Cancer, Male, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
154,Subject_49,TCRB,"72 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
155,Subject_58,TCRB,"40 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
156,Subject_111,TCRB,"81 Years, Ex-smoker, Lung Cancer, Male, Native...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA


In [78]:
# Tally how often each individual comma-separated tag occurs (top 50).
individual_tags = df["sample_tags"].str.split(",").explode().str.strip()
individual_tags.value_counts().head(n=50)

Male                                94
Blood                               88
Cohort 3                            88
Healthy                             88
Inactive                            88
Caucasian                           70
Female                              64
PBMC                                59
Non-smoker                          53
Black or African American           43
Smoker                              34
Ex-smoker                           27
Never Smoked                        25
Hispanic                            22
Lung Cancer                         22
Colorectal Cancer                   20
Head and Neck Cancer                20
Unknown Smoking Status              17
Asian or Pacific Islander           16
FFPE                                11
Melanoma                             8
66 Years                             8
30 Years                             7
57 Years                             7
51 Years                             6
63 Years                 

In [79]:
# almost always, a row's sample tag will contain EITHER cancer or healthy
# except for these FFPE rows
has_cancer_tag = df["sample_tags"].str.contains("Cancer")
has_healthy_tag = df["sample_tags"].str.contains("Healthy")
# Rows where both flags agree carry either both tags or neither of them.
df[has_cancer_tag == has_healthy_tag]

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
39,Subject_3,TCRB,"Caucasian, FFPE, Male, Melanoma, Unknown Smoki...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
40,Subject_6,TCRB,"Caucasian, FFPE, Male, Melanoma, Never Smoked",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
48,Subject_4,TCRB,"Caucasian, FFPE, Male, Melanoma, Unknown Smoki...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
66,Subject_7,TCRB,"Caucasian, FFPE, Male, Melanoma, Unknown Smoki...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
76,Subject_5,TCRB,"Caucasian, FFPE, Male, Melanoma, Unknown Smoki...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
83,Subject_2,TCRB,"Caucasian, FFPE, Female, Melanoma, Unknown Smo...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
89,Subject_8,TCRB,"Caucasian, FFPE, Male, Melanoma, Unknown Smoki...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
112,Subject_9,TCRB,"Caucasian, FFPE, Female, Melanoma, Unknown Smo...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA


In [80]:
# remove FFPE
is_ffpe = df["sample_tags"].str.contains("FFPE")
df = df.loc[~is_ffpe]

In [81]:
# All remaining rows either marked Blood or PBMC (or both)
is_blood_or_pbmc = df["sample_tags"].str.contains("Blood|PBMC")
assert is_blood_or_pbmc.all()

In [82]:
# now that we filtered to PBMC: a row's sample tag will contain EITHER cancer or healthy, NEVER both
has_cancer_tag = df["sample_tags"].str.contains("Cancer")
has_healthy_tag = df["sample_tags"].str.contains("Healthy")
# Exclusive-or: exactly one of the two tags must be present on every row.
assert (has_cancer_tag ^ has_healthy_tag).all()

In [83]:
# filter to healthy only. remove cancer
# .copy() makes the result an independent frame: the cells below add columns to it
# (disease, disease_subtype, participant_label, specimen_label), and assigning into
# a filtered slice would raise SettingWithCopyWarning.
is_healthy = df["sample_tags"].str.contains("Healthy")
df = df[is_healthy].copy()
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
0,Subject_76,TCRB,"67 Years, Blood, Cohort 3, Healthy, Hispanic, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
1,Subject_57,TCRB,"60 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
2,Subject_53,TCRB,"40 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
3,Subject_104,TCRB,"29 Years, Black or African American, Blood, Co...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
5,Subject_152,TCRB,"38 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
...,...,...,...,...,...,...,...,...
149,Subject_63,TCRB,"57 Years, Blood, Cohort 3, Healthy, Hispanic, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
150,Subject_18,TCRB,"38 Years, Asian or Pacific Islander, Blood, Co...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
151,Subject_60,TCRB,"57 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA
154,Subject_49,TCRB,"72 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA


In [84]:
# TODO: Pull out age, sex, ancestry
# df['sample_tags'].unique()

In [85]:
# Every remaining row is a healthy control; record the study in the subtype.
df = df.assign(
    disease=healthy_label,
    disease_subtype=f"{healthy_label} - TCRBv4-control",
)

In [86]:
# Use the sample name (e.g. "Subject_76") as the participant label, then list the
# distinct participant labels.
df["participant_label"] = df["sample_name"]
df["participant_label"].unique()

array(['Subject_76', 'Subject_57', 'Subject_53', 'Subject_104',
       'Subject_152', 'Subject_158', 'Subject_73', 'Subject_98',
       'Subject_151', 'Subject_92', 'Subject_52', 'Subject_75',
       'Subject_85', 'Subject_56', 'Subject_54', 'Subject_15',
       'Subject_82', 'Subject_97', 'Subject_87', 'Subject_65',
       'Subject_48', 'Subject_41', 'Subject_26', 'Subject_16',
       'Subject_68', 'Subject_86', 'Subject_102', 'Subject_47',
       'Subject_19', 'Subject_39', 'Subject_72', 'Subject_70',
       'Subject_78', 'Subject_74', 'Subject_69', 'Subject_99',
       'Subject_44', 'Subject_12', 'Subject_66', 'Subject_88',
       'Subject_46', 'Subject_50', 'Subject_91', 'Subject_93',
       'Subject_27', 'Subject_17', 'Subject_40', 'Subject_90',
       'Subject_103', 'Subject_101', 'Subject_80', 'Subject_43',
       'Subject_96', 'Subject_59', 'Subject_71', 'Subject_14',
       'Subject_81', 'Subject_64', 'Subject_94', 'Subject_55',
       'Subject_155', 'Subject_61', 'Subject_42'

In [87]:
# The sample name also serves as the specimen label.
df["specimen_label"] = df["sample_name"]

In [88]:
# Register the healthy TCRBv4-control metadata with the other manually curated
# Adaptive cohorts, then display it.
# NOTE(review): re-running this cell appends the same frame again.
dfs_adaptive_manual.append(df)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,disease,disease_subtype,participant_label,specimen_label
0,Subject_76,TCRB,"67 Years, Blood, Cohort 3, Healthy, Hispanic, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_76,Subject_76
1,Subject_57,TCRB,"60 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_57,Subject_57
2,Subject_53,TCRB,"40 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_53,Subject_53
3,Subject_104,TCRB,"29 Years, Black or African American, Blood, Co...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_104,Subject_104
5,Subject_152,TCRB,"38 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_152,Subject_152
...,...,...,...,...,...,...,...,...,...,...,...,...
149,Subject_63,TCRB,"57 Years, Blood, Cohort 3, Healthy, Hispanic, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_63,Subject_63
150,Subject_18,TCRB,"38 Years, Asian or Pacific Islander, Blood, Co...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_18,Subject_18
151,Subject_60,TCRB,"57 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_60,Subject_60
154,Subject_49,TCRB,"72 Years, Blood, Caucasian, Cohort 3, Female, ...",v4,Human-TCRB-PD4bx,Human,TCRBv4-control,gDNA,Healthy/Background,Healthy/Background - TCRBv4-control,Subject_49,Subject_49


In [89]:
# Study notes for towlerton-2022-hiv:
#
# HIV on long term therapy, before and after
# 30 adults, 192 samples?
# 30 adults with HIV infection before and after the initiation of ART
#
# A total of 192 cryopreserved PBMC samples serially collected over a mean of 6 (range, 1-12) years from 30 adults with confirmed HIV infection (median, 7 samples per subject) were received from the University of Washington CFAR Biorepository.
# PBMC samples collected at 1-4 timepoints before and 2-8 time points after the initiation of ART were available from each subject.
# genomic DNA was extracted from unsorted PBMC
#
# The blood samples from PLHIV were obtained between 2004 and 2017: pre-pandemic
#
# Metadata:
# https://www.frontiersin.org/articles/10.3389/fimmu.2022.879190/full#supplementary-material
# https://github.com/shashidhar22/airr_seq_pipelines/blob/main/metadata/cfar/CFAR_Dean_metadata.xlsx
# (identical?)
df = _load_metadata_df("towlerton-2022-hiv").assign(sequencing_type="gDNA")
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type
0,015V15001607_CFAR,TCRB,"4.906849315 years_rel_to_art, 40.48219178 age_...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
1,015V15000803_CFAR,TCRB,"22.38630137 age_at_min_evidence, 25.95890411 a...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
2,015V15009714_CFAR,TCRB,"39.70491803 age_at_min_evidence, 43.71038251 a...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
3,015V15000746_CFAR,TCRB,"40.1420765 age_at_min_evidence, 42.09041096 ag...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
4,015V16003146_CFAR,TCRB,"40.60547945 age_at_min_evidence, 45.6147541 ag...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
...,...,...,...,...,...,...,...,...
187,015V09001205_CFAR,TCRB,"-0.539726027 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
188,015V09003735_CFAR,TCRB,"0.073972603 years_rel_to_art, 41.89589041 age_...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
189,015V10006273_CFAR,TCRB,"1.2 years_rel_to_art, 41.89589041 age_at_min_e...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA
190,015V12002996_CFAR,TCRB,"2.950819672 years_rel_to_art, 41.89589041 age_...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA


In [90]:
# Tally how often each individual comma-separated tag occurs (top 50).
individual_tags = df["sample_tags"].str.split(",").explode().str.strip()
individual_tags.value_counts().head(n=50)

Positive HIV_status                 192
NA age_at_death                     178
DPA1*010301 DPA1allele1             156
Post-ART timePoint                  131
DPA1*010301 DPA1allele2             123
Male                                117
DPB1*040101 DPB1allele1              76
Female                               75
Pre-ART timePoint                    61
DQA1*010201 DQA1allele1              61
DRB3*010102 DRB345allele1            56
DRB5*010101 DRB345allele2            54
C*040101 cAllele1                    52
A*020101 aAllele1                    52
DPB1*040101 DPB1allele2              51
DRB345*Not_Present DRB345allele1     49
DRB4*010301 DRB345allele2            46
DRB1*030101 DRB1allele1              45
DQB1*020101 DQB1allele1              45
DRB1*150101 DRB1allele2              45
A*010101 aAllele1                    45
DQA1*050101 DQA1allele2              45
DQB1*060201 DQB1allele2              38
DPA1*020101 DPA1allele2              37
DPB1*030101 DPB1allele1              37


In [91]:
# each entry has a pre or post ART timepoint identified
mentions_art = df["sample_tags"].str.contains("ART")
mentions_art.all()

True

In [92]:
# Collect the distinct individual tags, then list those that mention ART.
every_tag = df["sample_tags"].str.split(",").explode().str.strip()
sample_tags = every_tag.drop_duplicates()
is_art_tag = sample_tags.str.contains("ART")
sample_tags[is_art_tag].sort_values()

32       <= 0 month Pre-ART time_group
25       <= 1 year Post-ART time_group
51        <= 1 year Pre-ART time_group
12     <= 10 years Post-ART time_group
28      <= 2 years Post-ART time_group
62       <= 2 years Pre-ART time_group
24     <= 3 months Post-ART time_group
27      <= 3 months Pre-ART time_group
34      <= 3 years Post-ART time_group
139      <= 3 years Pre-ART time_group
13      <= 4 years Post-ART time_group
153      <= 4 years Pre-ART time_group
0       <= 5 years Post-ART time_group
31     <= 6 months Post-ART time_group
50      <= 6 months Pre-ART time_group
1       <= 6 years Post-ART time_group
2       <= 7 years Post-ART time_group
5       <= 8 years Post-ART time_group
4       <= 9 years Post-ART time_group
20      > 10 years Post-ART time_group
0                   Post-ART timePoint
27                   Pre-ART timePoint
Name: sample_tags, dtype: object

In [93]:
# Per-sample supplementary metadata sheet for the towlerton-2022-hiv study.
cfar_metadata_path = (
    config.paths.metadata_dir
    / "adaptive"
    / "towlerton-2022-hiv.CFAR_Dean_metadata.xlsx"
)
df_m = pd.read_excel(cfar_metadata_path, sheet_name="CFAR_Sample_info")
df_m

Unnamed: 0,sampleID,patientID,years_rel_to_art,timePoint,time_group,sex,age_at_collection
0,015V07002801_CFAR,1001,-2.630137,Pre-ART,<= 2 years Pre-ART,M,45
1,015V08002889_CFAR,1001,-0.873973,Pre-ART,<= 6 months Pre-ART,M,46
2,015V09001205_CFAR,1001,-0.539726,Pre-ART,<= 6 months Pre-ART,M,47
3,015V09003735_CFAR,1001,0.073973,Post-ART,<= 0 month Pre-ART,M,47
4,015V10006273_CFAR,1001,1.200000,Post-ART,<= 2 years Post-ART,M,48
...,...,...,...,...,...,...,...
187,015V15015350_CFAR,1029,4.387978,Post-ART,<= 5 years Post-ART,F,54
188,015V16002103_CFAR,1029,5.065753,Post-ART,<= 5 years Post-ART,F,55
189,015V13001607_CFAR,1030,0.000000,Pre-ART,<= 0 month Pre-ART,F,27
190,015V13009411_CFAR,1030,0.567123,Post-ART,<= 6 months Post-ART,F,27


In [94]:
# Shape before joining the supplementary metadata, for comparison after the merge.
df.shape

(192, 8)

In [95]:
# Attach the supplementary per-sample metadata (participant, ART timing, sex, age).
# validate="1:1" guards against duplicated sample IDs on either side.
# indicator=True lets us verify that every sample found a metadata row: with a left
# merge, unmatched samples would otherwise get NaN metadata and then be silently
# dropped by the timePoint filter further down.
df = pd.merge(
    df,
    df_m,
    left_on="sample_name",
    right_on="sampleID",
    validate="1:1",
    how="left",
    indicator=True,
)
unmatched_samples = df.loc[df["_merge"] != "both", "sample_name"]
assert (
    unmatched_samples.empty
), f"Samples missing from CFAR metadata: {unmatched_samples.tolist()}"
df = df.drop(columns=["sampleID", "_merge"])
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,patientID,years_rel_to_art,timePoint,time_group,sex,age_at_collection
0,015V15001607_CFAR,TCRB,"4.906849315 years_rel_to_art, 40.48219178 age_...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1016,4.906849,Post-ART,<= 5 years Post-ART,F,52
1,015V15000803_CFAR,TCRB,"22.38630137 age_at_min_evidence, 25.95890411 a...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1024,5.312329,Post-ART,<= 6 years Post-ART,M,31
2,015V15009714_CFAR,TCRB,"39.70491803 age_at_min_evidence, 43.71038251 a...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1009,6.917808,Post-ART,<= 7 years Post-ART,M,50
3,015V15000746_CFAR,TCRB,"40.1420765 age_at_min_evidence, 42.09041096 ag...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1023,5.150685,Post-ART,<= 6 years Post-ART,F,47
4,015V16003146_CFAR,TCRB,"40.60547945 age_at_min_evidence, 45.6147541 ag...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1007,8.180822,Post-ART,<= 9 years Post-ART,M,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,015V09001205_CFAR,TCRB,"-0.539726027 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-0.539726,Pre-ART,<= 6 months Pre-ART,M,47
188,015V09003735_CFAR,TCRB,"0.073972603 years_rel_to_art, 41.89589041 age_...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,0.073973,Post-ART,<= 0 month Pre-ART,M,47
189,015V10006273_CFAR,TCRB,"1.2 years_rel_to_art, 41.89589041 age_at_min_e...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,1.200000,Post-ART,<= 2 years Post-ART,M,48
190,015V12002996_CFAR,TCRB,"2.950819672 years_rel_to_art, 41.89589041 age_...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,2.950820,Post-ART,<= 3 years Post-ART,M,50


In [96]:
# Number of samples collected before vs. after the start of ART.
df["timePoint"].value_counts()

Post-ART    131
Pre-ART      61
Name: timePoint, dtype: int64

In [97]:
# Keep only the samples collected before the start of ART.
is_pre_art = df["timePoint"].eq("Pre-ART")
df = df.loc[is_pre_art]
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,patientID,years_rel_to_art,timePoint,time_group,sex,age_at_collection
27,015V11001386_CFAR,TCRB,"-0.345205479 years_rel_to_art, 32 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1028,-0.345205,Pre-ART,<= 3 months Pre-ART,M,32
32,015V13001607_CFAR,TCRB,"0 years_rel_to_art, 27 age_at_collection, 27.1...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1030,0.000000,Pre-ART,<= 0 month Pre-ART,F,27
35,015V11001839_CFAR,TCRB,"-0.090410959 years_rel_to_art, 50 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1029,-0.090411,Pre-ART,<= 0 month Pre-ART,F,50
41,015V11002805_CFAR,TCRB,"-0.054794521 years_rel_to_art, 32 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1028,-0.054795,Pre-ART,<= 0 month Pre-ART,M,32
49,015V09002862_CFAR,TCRB,"-0.04109589 years_rel_to_art, 27 age_at_collec...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1027,-0.041096,Pre-ART,<= 0 month Pre-ART,F,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,015V06004712_CFAR,TCRB,"-0.643835616 years_rel_to_art, 37.20821918 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1002,-0.643836,Pre-ART,<= 6 months Pre-ART,F,49
184,015V06009959_CFAR,TCRB,"-0.317808219 years_rel_to_art, 37.20821918 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1002,-0.317808,Pre-ART,<= 3 months Pre-ART,F,49
185,015V07002801_CFAR,TCRB,"-2.630136986 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-2.630137,Pre-ART,<= 2 years Pre-ART,M,45
186,015V08002889_CFAR,TCRB,"-0.873972603 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-0.873973,Pre-ART,<= 6 months Pre-ART,M,46


In [98]:
# Distribution of the remaining samples across the time_group buckets.
df["time_group"].value_counts()

<= 0 month Pre-ART     15
<= 1 year Pre-ART      14
<= 6 months Pre-ART    12
<= 3 months Pre-ART    10
<= 2 years Pre-ART      5
<= 3 years Pre-ART      3
<= 4 years Pre-ART      2
Name: time_group, dtype: int64

In [99]:
# Rename the study's column names to the ones used for the other cohorts in this
# notebook, and use the sample name as the specimen label.
column_renames = {
    "patientID": "participant_label",
    "age_at_collection": "Age",
    "sex": "Sex",
}
df = df.rename(columns=column_renames).assign(
    specimen_label=lambda renamed: renamed["sample_name"]
)

In [100]:
# All retained samples are pre-ART HIV samples.
df = df.assign(
    disease="HIV",
    disease_subtype="HIV - before anti-retroviral therapy",
)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,years_rel_to_art,timePoint,time_group,Sex,Age,specimen_label,disease,disease_subtype
27,015V11001386_CFAR,TCRB,"-0.345205479 years_rel_to_art, 32 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1028,-0.345205,Pre-ART,<= 3 months Pre-ART,M,32,015V11001386_CFAR,HIV,HIV - before anti-retroviral therapy
32,015V13001607_CFAR,TCRB,"0 years_rel_to_art, 27 age_at_collection, 27.1...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1030,0.000000,Pre-ART,<= 0 month Pre-ART,F,27,015V13001607_CFAR,HIV,HIV - before anti-retroviral therapy
35,015V11001839_CFAR,TCRB,"-0.090410959 years_rel_to_art, 50 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1029,-0.090411,Pre-ART,<= 0 month Pre-ART,F,50,015V11001839_CFAR,HIV,HIV - before anti-retroviral therapy
41,015V11002805_CFAR,TCRB,"-0.054794521 years_rel_to_art, 32 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1028,-0.054795,Pre-ART,<= 0 month Pre-ART,M,32,015V11002805_CFAR,HIV,HIV - before anti-retroviral therapy
49,015V09002862_CFAR,TCRB,"-0.04109589 years_rel_to_art, 27 age_at_collec...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1027,-0.041096,Pre-ART,<= 0 month Pre-ART,F,27,015V09002862_CFAR,HIV,HIV - before anti-retroviral therapy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,015V06004712_CFAR,TCRB,"-0.643835616 years_rel_to_art, 37.20821918 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1002,-0.643836,Pre-ART,<= 6 months Pre-ART,F,49,015V06004712_CFAR,HIV,HIV - before anti-retroviral therapy
184,015V06009959_CFAR,TCRB,"-0.317808219 years_rel_to_art, 37.20821918 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1002,-0.317808,Pre-ART,<= 3 months Pre-ART,F,49,015V06009959_CFAR,HIV,HIV - before anti-retroviral therapy
185,015V07002801_CFAR,TCRB,"-2.630136986 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-2.630137,Pre-ART,<= 2 years Pre-ART,M,45,015V07002801_CFAR,HIV,HIV - before anti-retroviral therapy
186,015V08002889_CFAR,TCRB,"-0.873972603 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-0.873973,Pre-ART,<= 6 months Pre-ART,M,46,015V08002889_CFAR,HIV,HIV - before anti-retroviral therapy


In [101]:
# How many samples per person?
# Reading the result: the index is the number of specimens a participant has,
# and the value is how many participants have that many specimens.
df["participant_label"].value_counts().value_counts()

1    14
3     7
2     5
4     4
Name: participant_label, dtype: int64

In [102]:
# Register this cohort in the running list of manually annotated study frames,
# then display it for a final visual check.
# Note: list.append mutates the list, so re-running this cell adds the frame a second time.
dfs_adaptive_manual.append(df)
df

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,participant_label,years_rel_to_art,timePoint,time_group,Sex,Age,specimen_label,disease,disease_subtype
27,015V11001386_CFAR,TCRB,"-0.345205479 years_rel_to_art, 32 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1028,-0.345205,Pre-ART,<= 3 months Pre-ART,M,32,015V11001386_CFAR,HIV,HIV - before anti-retroviral therapy
32,015V13001607_CFAR,TCRB,"0 years_rel_to_art, 27 age_at_collection, 27.1...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1030,0.000000,Pre-ART,<= 0 month Pre-ART,F,27,015V13001607_CFAR,HIV,HIV - before anti-retroviral therapy
35,015V11001839_CFAR,TCRB,"-0.090410959 years_rel_to_art, 50 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1029,-0.090411,Pre-ART,<= 0 month Pre-ART,F,50,015V11001839_CFAR,HIV,HIV - before anti-retroviral therapy
41,015V11002805_CFAR,TCRB,"-0.054794521 years_rel_to_art, 32 age_at_colle...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1028,-0.054795,Pre-ART,<= 0 month Pre-ART,M,32,015V11002805_CFAR,HIV,HIV - before anti-retroviral therapy
49,015V09002862_CFAR,TCRB,"-0.04109589 years_rel_to_art, 27 age_at_collec...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1027,-0.041096,Pre-ART,<= 0 month Pre-ART,F,27,015V09002862_CFAR,HIV,HIV - before anti-retroviral therapy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,015V06004712_CFAR,TCRB,"-0.643835616 years_rel_to_art, 37.20821918 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1002,-0.643836,Pre-ART,<= 6 months Pre-ART,F,49,015V06004712_CFAR,HIV,HIV - before anti-retroviral therapy
184,015V06009959_CFAR,TCRB,"-0.317808219 years_rel_to_art, 37.20821918 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1002,-0.317808,Pre-ART,<= 3 months Pre-ART,F,49,015V06009959_CFAR,HIV,HIV - before anti-retroviral therapy
185,015V07002801_CFAR,TCRB,"-2.630136986 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-2.630137,Pre-ART,<= 2 years Pre-ART,M,45,015V07002801_CFAR,HIV,HIV - before anti-retroviral therapy
186,015V08002889_CFAR,TCRB,"-0.873972603 years_rel_to_art, 41.89589041 age...",v3,Human-TCRB-PD1x,Human,towlerton-2022-hiv,gDNA,1001,-0.873973,Pre-ART,<= 6 months Pre-ART,M,46,015V08002889_CFAR,HIV,HIV - before anti-retroviral therapy


# Combine all manually annotated studies

In [103]:
# Number of per-study frames collected so far (ImmuneCode is appended in the next cell).
len(dfs_adaptive_manual)

6

In [104]:
# Also add ImmuneCode from the very top (adaptive_tcr_covid_specimens, loaded in the first section).
# Note: list.append mutates the list, so re-running this cell adds a duplicate entry.
dfs_adaptive_manual.append(adaptive_tcr_covid_specimens)

In [105]:
# Stack all per-study frames into one frame with a fresh 0..n-1 index.
# From here on dfs_adaptive_manual is a DataFrame rather than a list, so this
# cell cannot be re-run without rebuilding the list first.
dfs_adaptive_manual = pd.concat(dfs_adaptive_manual, axis=0, ignore_index=True)
# Keep an untouched copy so later mutating cells can be reset while debugging.
dfs_adaptive_manual_bak = dfs_adaptive_manual.copy()

In [106]:
# Restore the untouched concatenated frame saved in dfs_adaptive_manual_bak, so the
# mutating cells below can be re-run from a clean state.
dfs_adaptive_manual = dfs_adaptive_manual_bak.copy()  # for easy reset when debugging
dfs_adaptive_manual

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,participant_label,...,disease_subtype,Sample Timepoint,Race,replicate_label,years_rel_to_art,timePoint,time_group,age,sex,ethnicity
0,HIP15860,TCRB,"Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP15860,HIP15860,...,Healthy/Background - CMV Unknown,,,,,,,,,
1,HIP14363,TCRB,"Cohort 01, HLA-A*02, HLA-A*11, HLA-B*35, Human...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP14363,HIP14363,...,Healthy/Background - CMV Unknown,,,,,,,,,
2,HIP14178,TCRB,"Cohort 01, HLA-A*01, HLA-A*03, HLA-B*08, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP14178,HIP14178,...,Healthy/Background - CMV Unknown,,,,,,,,,
3,HIP13944,TCRB,"Cohort 01, HLA-A*02, HLA-A*03, HLA-B*18, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP13944,HIP13944,...,Healthy/Background - CMV Unknown,,,,,,,,,
4,HIP13911,TCRB,"22 Years, Caucasian, Cohort 01, Cytomegaloviru...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,HIP13911,HIP13911,...,Healthy/Background - CMV+,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,860011117_TCRB,TCRB,,,,Human,immunecode,gDNA,860011117_TCRB,ImmuneCode-304752,...,COVID-19-HUniv12Oct - Hospitalized - ICU,,,,,,,60.0,M,Caucasian
1373,860011106_TCRB,TCRB,,,,Human,immunecode,gDNA,860011106_TCRB,ImmuneCode-775827,...,COVID-19-HUniv12Oct - Hospitalized,,,,,,,57.0,F,Caucasian
1374,BS-GIGI_10-replacement_TCRB,TCRB,,,,Human,immunecode,gDNA,BS-GIGI_10-replacement_TCRB,ImmuneCode-0000446,...,COVID-19-NIH/NIAID - Hospitalized,,,,,,,75.0,M,Caucasian
1375,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,TCRB,,,,Human,immunecode,gDNA,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,ImmuneCode-0000160,...,COVID-19-NIH/NIAID - Hospitalized - ICU,,,,,,,72.0,M,Caucasian


In [107]:
# From here on only the combined dfs_adaptive_manual frame is used.
del df  # unused variable now - let's get it out of scope to be safe

In [108]:
# Which sequencing types are present? (A later cell filters on sequencing_type == "gDNA".)
dfs_adaptive_manual["sequencing_type"].value_counts()

gDNA    1377
Name: sequencing_type, dtype: int64

In [109]:
# Every row must declare its sequencing type.
assert dfs_adaptive_manual["sequencing_type"].notna().all()

In [110]:
# Specimen counts per disease across all combined studies.
dfs_adaptive_manual["disease"].value_counts()

Healthy/Background    1009
T1D                    143
Covid19                 88
HIV                     61
RA                      39
CVID                    37
Name: disease, dtype: int64

In [111]:
# These identifying columns must be populated on every row.
for required_col in ("study", "participant_label", "specimen_label"):
    assert not dfs_adaptive_manual[required_col].isna().any()

In [112]:
# To be consistent with boydlab columns, we'll add amplification_label, which here will always equal specimen_label.
# See sample_sequences.py for more details on how this gets used.
#
# Note: the fillna results are assigned back to the column instead of calling
# fillna(inplace=True) on a column selection. The in-place form operates on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
if "amplification_label" not in dfs_adaptive_manual.columns:
    dfs_adaptive_manual["amplification_label"] = dfs_adaptive_manual["specimen_label"]
else:
    # fill NA
    dfs_adaptive_manual["amplification_label"] = dfs_adaptive_manual[
        "amplification_label"
    ].fillna(dfs_adaptive_manual["specimen_label"])

# Fill replicate_label: rows without an explicit replicate label reuse their specimen label.
dfs_adaptive_manual["replicate_label"] = dfs_adaptive_manual["replicate_label"].fillna(
    dfs_adaptive_manual["specimen_label"]
)

In [113]:
# Prefix each label column with the study name so labels are unique across studies.
# Caution: re-running this cell without first resetting the frame adds the prefix twice.
study_prefix = dfs_adaptive_manual["study"] + "_"
for col in (
    "participant_label",
    "specimen_label",
    "amplification_label",
    "replicate_label",
):
    dfs_adaptive_manual[col] = study_prefix + dfs_adaptive_manual[col].astype(str)

In [114]:
# Each (locus, replicate_label) pair must appear exactly once.
# - A specimen may still have several replicates (e.g. cell type subsets that get merged).
# - A participant may still have several specimens (e.g. separate time points).
rows_per_replicate = dfs_adaptive_manual.groupby(["locus", "replicate_label"]).size()
assert rows_per_replicate.eq(1).all()

In [115]:
# Unique participants per (sequencing type, locus, disease), smallest groups first.
(
    dfs_adaptive_manual.groupby(
        ["sequencing_type", "locus", "disease"], observed=True
    )["participant_label"]
    .nunique()
    .to_frame()
    .sort_values("participant_label")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,participant_label
sequencing_type,locus,disease,Unnamed: 3_level_1
gDNA,TCRB,HIV,30
gDNA,TCRB,CVID,37
gDNA,TCRB,RA,37
gDNA,TCRB,Covid19,88
gDNA,TCRB,T1D,143
gDNA,TCRB,Healthy/Background,934


In [116]:
# Same breakdown, additionally split by counting_method.
(
    dfs_adaptive_manual.groupby(
        ["sequencing_type", "locus", "counting_method", "disease"], observed=True
    )["participant_label"]
    .nunique()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,participant_label
sequencing_type,locus,counting_method,disease,Unnamed: 4_level_1
gDNA,TCRB,v1,CVID,37
gDNA,TCRB,v1,Healthy/Background,98
gDNA,TCRB,v2,Healthy/Background,590
gDNA,TCRB,v2,RA,2
gDNA,TCRB,v3,HIV,30
gDNA,TCRB,v3,Healthy/Background,133
gDNA,TCRB,v3,RA,35
gDNA,TCRB,v4,Healthy/Background,113
gDNA,TCRB,v4,T1D,143


In [117]:
# Is any disease_subtype missing? (A later cell fills gaps from the disease column.)
dfs_adaptive_manual["disease_subtype"].isna().any()

False

In [118]:
# Check disease for gaps as well, since it is used below to fill missing disease_subtype values.
dfs_adaptive_manual["disease"].isna().any()

False

In [119]:
# Fall back to the disease name wherever no subtype was annotated.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
dfs_adaptive_manual["disease_subtype"] = dfs_adaptive_manual["disease_subtype"].fillna(
    dfs_adaptive_manual["disease"]
)

In [120]:
# List only the columns that contain at least one missing value.
dfs_adaptive_manual.isna().any().loc[lambda has_na: has_na]

sample_tags         True
counting_method     True
primer_set          True
Sex                 True
Age                 True
Ethnicity           True
cmv                 True
Sample Timepoint    True
Race                True
years_rel_to_art    True
timePoint           True
time_group          True
age                 True
sex                 True
ethnicity           True
dtype: bool

In [121]:
# Specimen counts per disease_subtype.
dfs_adaptive_manual["disease_subtype"].value_counts()

Healthy/Background - CMV-                   420
Healthy/Background - CMV+                   340
T1D - new onset                             143
Healthy/Background - T1D negative           100
Healthy/Background - TCRBv4-control          88
HIV - before anti-retroviral therapy         61
COVID-19-NIH/NIAID - Hospitalized            39
CVID                                         37
RA - sero-positive                           32
Healthy/Background - CMV Unknown             26
COVID-19-HUniv12Oct - Hospitalized           26
Healthy/Background - CVID negative           22
Healthy/Background - RA negative             13
COVID-19-ISB                                  9
RA - sero-negative                            7
COVID-19-NIH/NIAID - Hospitalized - ICU       7
COVID-19-HUniv12Oct - Hospitalized - ICU      6
COVID-19-HUniv12Oct                           1
Name: disease_subtype, dtype: int64

In [122]:
# Which studies (if any) have rows whose disease_subtype equals healthy_label exactly?
subtype_is_healthy_label = dfs_adaptive_manual["disease_subtype"] == healthy_label
dfs_adaptive_manual.loc[subtype_is_healthy_label, "study"].value_counts()

Series([], Name: study, dtype: int64)

In [123]:
# Unique participants per (locus, disease, disease_subtype).
(
    dfs_adaptive_manual.groupby(
        ["locus", "disease", "disease_subtype"], observed=True
    )["participant_label"]
    .nunique()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,participant_label
locus,disease,disease_subtype,Unnamed: 3_level_1
TCRB,CVID,CVID,37
TCRB,Covid19,COVID-19-HUniv12Oct,1
TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,26
TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized - ICU,6
TCRB,Covid19,COVID-19-ISB,9
TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,39
TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized - ICU,7
TCRB,HIV,HIV - before anti-retroviral therapy,30
TCRB,Healthy/Background,Healthy/Background - CMV Unknown,26
TCRB,Healthy/Background,Healthy/Background - CMV+,340


In [124]:
# Unique participants per (locus, counting_method, disease, disease_subtype, study).
(
    dfs_adaptive_manual.groupby(
        ["locus", "counting_method", "disease", "disease_subtype", "study"],
        observed=True,
    )["participant_label"]
    .nunique()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,participant_label
locus,counting_method,disease,disease_subtype,study,Unnamed: 5_level_1
TCRB,v1,CVID,CVID,ramesh-2015-ci,37
TCRB,v1,Healthy/Background,Healthy/Background - CMV Unknown,emerson-2017-natgen_train,2
TCRB,v1,Healthy/Background,Healthy/Background - CMV+,emerson-2017-natgen_train,28
TCRB,v1,Healthy/Background,Healthy/Background - CMV-,emerson-2017-natgen_train,46
TCRB,v1,Healthy/Background,Healthy/Background - CVID negative,ramesh-2015-ci,22
TCRB,v2,Healthy/Background,Healthy/Background - CMV Unknown,emerson-2017-natgen_train,24
TCRB,v2,Healthy/Background,Healthy/Background - CMV+,emerson-2017-natgen_train,261
TCRB,v2,Healthy/Background,Healthy/Background - CMV-,emerson-2017-natgen_train,305
TCRB,v2,RA,RA - sero-negative,mustjoki-2017-natcomms,1
TCRB,v2,RA,RA - sero-positive,mustjoki-2017-natcomms,1


In [125]:
# Keep only gDNA-sequenced TCRB rows.
# Take an explicit copy: later cells add and overwrite columns on this frame, and
# doing that on a filtered selection of another frame triggers
# SettingWithCopyWarning and is not guaranteed to take effect.
is_gdna = dfs_adaptive_manual["sequencing_type"] == "gDNA"
is_tcrb = dfs_adaptive_manual["locus"] == "TCRB"
dfs_adaptive_manual = dfs_adaptive_manual.loc[is_gdna & is_tcrb].copy()
dfs_adaptive_manual

Unnamed: 0,sample_name,locus,sample_tags,counting_method,primer_set,species,study,sequencing_type,specimen_label,participant_label,...,Sample Timepoint,Race,replicate_label,years_rel_to_art,timePoint,time_group,age,sex,ethnicity,amplification_label
0,HIP15860,TCRB,"Cohort 01, HLA-A*01, HLA-A*68, HLA-B*14, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,emerson-2017-natgen_train_HIP15860,emerson-2017-natgen_train_HIP15860,...,,,emerson-2017-natgen_train_HIP15860,,,,,,,emerson-2017-natgen_train_HIP15860
1,HIP14363,TCRB,"Cohort 01, HLA-A*02, HLA-A*11, HLA-B*35, Human...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,emerson-2017-natgen_train_HIP14363,emerson-2017-natgen_train_HIP14363,...,,,emerson-2017-natgen_train_HIP14363,,,,,,,emerson-2017-natgen_train_HIP14363
2,HIP14178,TCRB,"Cohort 01, HLA-A*01, HLA-A*03, HLA-B*08, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,emerson-2017-natgen_train_HIP14178,emerson-2017-natgen_train_HIP14178,...,,,emerson-2017-natgen_train_HIP14178,,,,,,,emerson-2017-natgen_train_HIP14178
3,HIP13944,TCRB,"Cohort 01, HLA-A*02, HLA-A*03, HLA-B*18, HLA-B...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,emerson-2017-natgen_train_HIP13944,emerson-2017-natgen_train_HIP13944,...,,,emerson-2017-natgen_train_HIP13944,,,,,,,emerson-2017-natgen_train_HIP13944
4,HIP13911,TCRB,"22 Years, Caucasian, Cohort 01, Cytomegaloviru...",v2,Human-TCRB-PD1x,Human,emerson-2017-natgen_train,gDNA,emerson-2017-natgen_train_HIP13911,emerson-2017-natgen_train_HIP13911,...,,,emerson-2017-natgen_train_HIP13911,,,,,,,emerson-2017-natgen_train_HIP13911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,860011117_TCRB,TCRB,,,,Human,immunecode,gDNA,immunecode_860011117_TCRB,immunecode_ImmuneCode-304752,...,,,immunecode_860011117_TCRB,,,,60.0,M,Caucasian,immunecode_860011117_TCRB
1373,860011106_TCRB,TCRB,,,,Human,immunecode,gDNA,immunecode_860011106_TCRB,immunecode_ImmuneCode-775827,...,,,immunecode_860011106_TCRB,,,,57.0,F,Caucasian,immunecode_860011106_TCRB
1374,BS-GIGI_10-replacement_TCRB,TCRB,,,,Human,immunecode,gDNA,immunecode_BS-GIGI_10-replacement_TCRB,immunecode_ImmuneCode-0000446,...,,,immunecode_BS-GIGI_10-replacement_TCRB,,,,75.0,M,Caucasian,immunecode_BS-GIGI_10-replacement_TCRB
1375,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,TCRB,,,,Human,immunecode,gDNA,immunecode_BS-EQ-25-T1_BS-GIGI-71-replacement_...,immunecode_ImmuneCode-0000160,...,,,immunecode_BS-EQ-25-T1_BS-GIGI-71-replacement_...,,,,72.0,M,Caucasian,immunecode_BS-EQ-25-T1_BS-GIGI-71-replacement_...


In [126]:
# Unique participants per disease after the gDNA / TCRB filter, smallest first.
dfs_adaptive_manual.groupby("disease")["participant_label"].nunique().sort_values()

disease
HIV                    30
CVID                   37
RA                     37
Covid19                88
T1D                   143
Healthy/Background    934
Name: participant_label, dtype: int64

In [127]:
# Review which replicates are getting combined into which specimens:
# show the rows whose replicate label differs from their specimen label.
replicate_differs_from_specimen = (
    dfs_adaptive_manual["replicate_label"] != dfs_adaptive_manual["specimen_label"]
)
dfs_adaptive_manual.loc[
    replicate_differs_from_specimen, ["specimen_label", "replicate_label"]
]

Unnamed: 0,specimen_label,replicate_label
1042,mustjoki-2017-natcomms_RA23,mustjoki-2017-natcomms_RA23_CD4
1043,mustjoki-2017-natcomms_RA23,mustjoki-2017-natcomms_RA23_CD8
1070,mustjoki-2017-natcomms_RA6,mustjoki-2017-natcomms_RA6_CD4
1071,mustjoki-2017-natcomms_RA6,mustjoki-2017-natcomms_RA6_CD8


In [128]:
# Review which replicates are getting combined into which specimens:
# one row per specimen, one column per distinct replicate label.
has_distinct_replicate_label = (
    dfs_adaptive_manual["replicate_label"] != dfs_adaptive_manual["specimen_label"]
)
replicates_being_merged_into_same_specimen = (
    dfs_adaptive_manual.loc[has_distinct_replicate_label]
    .groupby("specimen_label")["replicate_label"]
    .unique()
    .apply(pd.Series)
)
# Drop specimens with a single replicate that merely carries a different label:
# nothing is merged for those.
n_replicates_per_specimen = replicates_being_merged_into_same_specimen.notna().sum(
    axis=1
)
replicates_being_merged_into_same_specimen = replicates_being_merged_into_same_specimen[
    n_replicates_per_specimen > 1
]
replicates_being_merged_into_same_specimen

Unnamed: 0_level_0,0,1
specimen_label,Unnamed: 1_level_1,Unnamed: 2_level_1
mustjoki-2017-natcomms_RA23,mustjoki-2017-natcomms_RA23_CD4,mustjoki-2017-natcomms_RA23_CD8
mustjoki-2017-natcomms_RA6,mustjoki-2017-natcomms_RA6_CD4,mustjoki-2017-natcomms_RA6_CD8


In [129]:
# # Also review the cell_type helper column we made for ourselves:
# (TODO: bring this back)
# dfs_adaptive_manual["cell_type"].value_counts()

In [130]:
# Sanity check: which species are present?
dfs_adaptive_manual["species"].value_counts()

Human    1377
Name: species, dtype: int64

In [131]:
# all available columns, in case-insensitive sorted order
# (this puts columns that differ only by case, e.g. age / Age, next to each other;
# those pairs are merged in the cells below)
dfs_adaptive_manual.columns.sort_values(key=lambda idx: idx.str.lower())

Index(['age', 'Age', 'amplification_label', 'cmv', 'counting_method',
       'disease', 'disease_subtype', 'ethnicity', 'Ethnicity', 'locus',
       'participant_label', 'primer_set', 'Race', 'replicate_label',
       'Sample Timepoint', 'sample_name', 'sample_tags', 'sequencing_type',
       'Sex', 'sex', 'species', 'specimen_label', 'study', 'time_group',
       'timePoint', 'years_rel_to_art'],
      dtype='object')

In [132]:
# Symptom metadata columns. These are kept in the final column subset and
# renamed with a "symptoms_" prefix there.
# Add more here if we add more studies.
symptoms_columns = ["cmv"]

In [133]:
# if future studies have participant-level description fields, rename those columns into "participant_description"

# specimen description can come in several fields, listed here in priority order
# (the first non-null field in this list wins):
specimen_description_fields = ["time_group", "timePoint"]

# Sanity check: at least one row has a description field populated.
# Note that a row may have more than one of these fields set at once (the
# towlerton-2022-hiv rows carry both time_group and timePoint), which is why the
# order of specimen_description_fields matters.
n_description_fields_set = (
    dfs_adaptive_manual[specimen_description_fields].notna().sum(axis=1)
)
assert n_description_fields_set.max() >= 1

# Take the first non-null value (if any) per row from these columns
# (https://stackoverflow.com/a/37938780/130164): back-fill across columns so the
# first column holds the first non-null entry of each row, then keep that column.
dfs_adaptive_manual["specimen_description"] = (
    dfs_adaptive_manual[specimen_description_fields].bfill(axis=1).iloc[:, 0]
)
dfs_adaptive_manual["specimen_description"].value_counts()

<= 0 month Pre-ART     15
<= 1 year Pre-ART      14
<= 6 months Pre-ART    12
<= 3 months Pre-ART    10
<= 2 years Pre-ART      5
<= 3 years Pre-ART      3
<= 4 years Pre-ART      2
Name: specimen_description, dtype: int64

In [134]:
# the two age columns are never set at the same time, so we can fillna to combine
assert (
    dfs_adaptive_manual["age"].notna().astype(int)
    + dfs_adaptive_manual["Age"].notna().astype(int)
).max() == 1
# Assign the result back instead of fillna(inplace=True) on a column selection:
# the in-place form does not update the frame under pandas Copy-on-Write.
dfs_adaptive_manual["Age"] = dfs_adaptive_manual["Age"].fillna(
    dfs_adaptive_manual["age"]
)

In [135]:
# the two sex columns are never set at the same time, so we can fillna to combine
assert (
    dfs_adaptive_manual["sex"].notna().astype(int)
    + dfs_adaptive_manual["Sex"].notna().astype(int)
).max() == 1
# Assign the result back instead of fillna(inplace=True) on a column selection:
# the in-place form does not update the frame under pandas Copy-on-Write.
dfs_adaptive_manual["Sex"] = dfs_adaptive_manual["Sex"].fillna(
    dfs_adaptive_manual["sex"]
)

In [136]:
# ethnicity and Ethnicity columns are never set at the same time, so we can fillna to combine
assert (
    dfs_adaptive_manual["ethnicity"].notna().astype(int)
    + dfs_adaptive_manual["Ethnicity"].notna().astype(int)
).max() == 1
# Assign the result back instead of fillna(inplace=True) on a column selection:
# the in-place form does not update the frame under pandas Copy-on-Write.
dfs_adaptive_manual["Ethnicity"] = dfs_adaptive_manual["Ethnicity"].fillna(
    dfs_adaptive_manual["ethnicity"]
)

In [137]:
# Unlike the pairs above, Race and Ethnicity can both be set on the same row,
# so a plain fillna would lose information and we need fancier logic to combine.
n_race_ethnicity_set = dfs_adaptive_manual[["Race", "Ethnicity"]].notna().sum(axis=1)
assert n_race_ethnicity_set.max() > 1


def _combine_race_ethnicity_cols(row):
    """Join the non-null entries of ``row`` into a single " - "-separated label.

    Parameters
    ----------
    row : pandas.Series
        The cells to combine (used below on the Race and Ethnicity cells of one row).

    Returns
    -------
    str or float
        The non-null entries joined in their original order, or NaN when every
        entry is missing.
    """
    present_values = row[row.notna()]
    if present_values.empty:
        return np.nan
    return " - ".join(present_values)


# Fold Race into Ethnicity row by row, then inspect the combined labels.
race_and_ethnicity = dfs_adaptive_manual[["Race", "Ethnicity"]]
dfs_adaptive_manual["Ethnicity"] = race_and_ethnicity.apply(
    _combine_race_ethnicity_cols, axis=1
)
dfs_adaptive_manual["Ethnicity"].value_counts()

White, Not Hispanic or Latino                                        460
Unknown                                                              226
White - Non-Hispanic                                                 200
Caucasian                                                             74
Asian, Not Hispanic or Latino                                         42
Unknown, Hispanic or Latino                                           26
White - Hispanic                                                      19
White - Hispanic                                                      16
Black or African American, Not Hispanic or Latino                     11
Hispanic/Latino                                                       10
American Indian or Alaska Native, Not Hispanic or Latino              10
Asian - Non-Hispanic                                                   4
Native Hawaiian or other Pacific Islander, Not Hispanic or Latino      3
White, Hispanic or Latino                          

In [138]:
# Keep only the columns that survive into the final metadata, plus the symptom
# columns, which get a "symptoms_" prefix.
surviving_columns = [
    "study",
    "sample_name",
    "locus",
    "counting_method",
    "disease",
    "sequencing_type",
    "disease_subtype",
    "participant_label",
    "specimen_label",
    "amplification_label",
    "replicate_label",
    # "cell_type",  # optional
    "Sex",
    "Age",
    "Ethnicity",
    "specimen_description",
    # "participant_description",
    "primer_set",
]
symptom_column_renames = {col: f"symptoms_{col}" for col in symptoms_columns}
dfs_adaptive_manual = dfs_adaptive_manual[surviving_columns + symptoms_columns].rename(
    columns=symptom_column_renames
)
dfs_adaptive_manual

Unnamed: 0,study,sample_name,locus,counting_method,disease,sequencing_type,disease_subtype,participant_label,specimen_label,amplification_label,replicate_label,Sex,Age,Ethnicity,specimen_description,primer_set,symptoms_cmv
0,emerson-2017-natgen_train,HIP15860,TCRB,v2,Healthy/Background,gDNA,Healthy/Background - CMV Unknown,emerson-2017-natgen_train_HIP15860,emerson-2017-natgen_train_HIP15860,emerson-2017-natgen_train_HIP15860,emerson-2017-natgen_train_HIP15860,Unknown,Unknown,Unknown,,Human-TCRB-PD1x,
1,emerson-2017-natgen_train,HIP14363,TCRB,v2,Healthy/Background,gDNA,Healthy/Background - CMV Unknown,emerson-2017-natgen_train_HIP14363,emerson-2017-natgen_train_HIP14363,emerson-2017-natgen_train_HIP14363,emerson-2017-natgen_train_HIP14363,Unknown,Unknown,Unknown,,Human-TCRB-PD1x,
2,emerson-2017-natgen_train,HIP14178,TCRB,v2,Healthy/Background,gDNA,Healthy/Background - CMV Unknown,emerson-2017-natgen_train_HIP14178,emerson-2017-natgen_train_HIP14178,emerson-2017-natgen_train_HIP14178,emerson-2017-natgen_train_HIP14178,Unknown,Unknown,Unknown,,Human-TCRB-PD1x,
3,emerson-2017-natgen_train,HIP13944,TCRB,v2,Healthy/Background,gDNA,Healthy/Background - CMV Unknown,emerson-2017-natgen_train_HIP13944,emerson-2017-natgen_train_HIP13944,emerson-2017-natgen_train_HIP13944,emerson-2017-natgen_train_HIP13944,Unknown,Unknown,Unknown,,Human-TCRB-PD1x,
4,emerson-2017-natgen_train,HIP13911,TCRB,v2,Healthy/Background,gDNA,Healthy/Background - CMV+,emerson-2017-natgen_train_HIP13911,emerson-2017-natgen_train_HIP13911,emerson-2017-natgen_train_HIP13911,emerson-2017-natgen_train_HIP13911,Male,22.469541,"White, Not Hispanic or Latino",,Human-TCRB-PD1x,CMV+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,immunecode,860011117_TCRB,TCRB,,Covid19,gDNA,COVID-19-HUniv12Oct - Hospitalized - ICU,immunecode_ImmuneCode-304752,immunecode_860011117_TCRB,immunecode_860011117_TCRB,immunecode_860011117_TCRB,M,60.0,Caucasian,,,
1373,immunecode,860011106_TCRB,TCRB,,Covid19,gDNA,COVID-19-HUniv12Oct - Hospitalized,immunecode_ImmuneCode-775827,immunecode_860011106_TCRB,immunecode_860011106_TCRB,immunecode_860011106_TCRB,F,57.0,Caucasian,,,
1374,immunecode,BS-GIGI_10-replacement_TCRB,TCRB,,Covid19,gDNA,COVID-19-NIH/NIAID - Hospitalized,immunecode_ImmuneCode-0000446,immunecode_BS-GIGI_10-replacement_TCRB,immunecode_BS-GIGI_10-replacement_TCRB,immunecode_BS-GIGI_10-replacement_TCRB,M,75.0,Caucasian,,,
1375,immunecode,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,TCRB,,Covid19,gDNA,COVID-19-NIH/NIAID - Hospitalized - ICU,immunecode_ImmuneCode-0000160,immunecode_BS-EQ-25-T1_BS-GIGI-71-replacement_...,immunecode_BS-EQ-25-T1_BS-GIGI-71-replacement_...,immunecode_BS-EQ-25-T1_BS-GIGI-71-replacement_...,M,72.0,Caucasian,,,


In [139]:
# From here on, work under the name all_specimens. This binds the same object (no copy);
# the rename in the next cell is what produces a new frame.
all_specimens = dfs_adaptive_manual

# Make metadata columns consistent with standard Boydlab pipeline

In [140]:
# Switch to the lowercase demographic column names and "study_name", and flag
# every specimen as TCR-only.
standard_column_names = {
    "Age": "age",
    "Sex": "sex",
    "Ethnicity": "ethnicity",
    "study": "study_name",
}
all_specimens = all_specimens.rename(columns=standard_column_names).assign(
    has_BCR=False, has_TCR=True
)

In [141]:
# Raw sex labels: the studies use different spellings, which are consolidated in the next cell.
all_specimens["sex"].value_counts()

Male       533
Female     471
M          109
F           92
Unknown     24
Name: sex, dtype: int64

In [142]:
# Consolidate the different spellings of sex into "M" / "F".
all_specimens["sex"] = (
    all_specimens["sex"].str.upper().str.strip().replace({"MALE": "M", "FEMALE": "F"})
)
# Treat "UNKNOWN" as missing. The result is assigned back rather than using
# mask(inplace=True) on a column selection, which does not update the frame
# under pandas Copy-on-Write.
all_specimens["sex"] = all_specimens["sex"].mask(all_specimens["sex"] == "UNKNOWN")

In [143]:
# Confirm that only the consolidated labels remain.
all_specimens["sex"].value_counts()

M    642
F    563
Name: sex, dtype: int64

In [144]:
# How many specimens lack any ethnicity annotation?
all_specimens["ethnicity"].isna().value_counts()

False    1113
True      264
Name: ethnicity, dtype: int64

In [145]:
# Specimens without an ethnicity annotation, broken down by disease.
ethnicity_is_missing = all_specimens["ethnicity"].isna()
all_specimens.loc[ethnicity_is_missing, "disease"].value_counts()

Healthy/Background    124
HIV                    61
RA                     39
CVID                   37
Covid19                 3
Name: disease, dtype: int64

In [146]:
# Specimens without an ethnicity annotation, broken down by study.
ethnicity_is_missing = all_specimens["ethnicity"].isna()
all_specimens.loc[ethnicity_is_missing, "study_name"].value_counts()

TCRBv4-control               88
towlerton-2022-hiv           61
ramesh-2015-ci               59
mustjoki-2017-natcomms       52
immunecode                    3
emerson-2017-natgen_train     1
Name: study_name, dtype: int64

In [147]:
# Raw ethnicity labels, to decide how to condense them in the next cell.
all_specimens["ethnicity"].value_counts()

White, Not Hispanic or Latino                                        460
Unknown                                                              226
White - Non-Hispanic                                                 200
Caucasian                                                             74
Asian, Not Hispanic or Latino                                         42
Unknown, Hispanic or Latino                                           26
White - Hispanic                                                      19
White - Hispanic                                                      16
Black or African American, Not Hispanic or Latino                     11
Hispanic/Latino                                                       10
American Indian or Alaska Native, Not Hispanic or Latino              10
Asian - Non-Hispanic                                                   4
Native Hawaiian or other Pacific Islander, Not Hispanic or Latino      3
White, Hispanic or Latino                          

In [148]:
# Condense rare ethnicity names into a small set of categories.
# Labels mapped to NaN are treated as unknown; labels not listed here pass
# through unchanged.
ethnicity_condensing_rules = {
    "Black - Non-Hispanic": "African",
    "Black or African American, Not Hispanic or Latino": "African",
    #
    "White, Hispanic or Latino": "Hispanic/Latino",
    "White - Hispanic": "Hispanic/Latino",
    "Hispanic - Hispanic": "Hispanic/Latino",
    "Unknown, Hispanic or Latino": "Hispanic/Latino",
    "Other, Hispanic or Latino": "Hispanic/Latino",
    #
    "Asian - Non-Hispanic": "Asian",
    "Asian, Not Hispanic or Latino": "Asian",
    #
    "American Indian or Alaska Native, Not Hispanic or Latino": "Native American",
    #
    "Native Hawaiian or other Pacific Islander, Not Hispanic or Latino": "Pacific Islander",
    #
    "White - Non-Hispanic": "Caucasian",
    "White, Not Hispanic or Latino": "Caucasian",
    #
    "Not reported - Not reported": np.nan,
    "Unknown": np.nan,
    "Other, Not Hispanic or Latino": np.nan,
    "Asian, Hispanic or Latino": np.nan,
}
# Strip surrounding whitespace first so that labels differing only by padding
# match the same rule.
stripped_ethnicity = all_specimens["ethnicity"].str.strip()
all_specimens["ethnicity_condensed"] = stripped_ethnicity.replace(
    ethnicity_condensing_rules
)

In [149]:
# Resulting condensed ethnicity categories.
all_specimens["ethnicity_condensed"].value_counts()

Caucasian           734
Hispanic/Latino      76
Asian                47
African              13
Native American      10
Pacific Islander      3
Name: ethnicity_condensed, dtype: int64

In [150]:
# How many specimens end up without a condensed ethnicity?
all_specimens["ethnicity_condensed"].isna().value_counts()

False    883
True     494
Name: ethnicity_condensed, dtype: int64

In [151]:
# Specimens without ethnicity_condensed, broken down by disease.
condensed_is_missing = all_specimens["ethnicity_condensed"].isna()
all_specimens.loc[condensed_is_missing, "disease"].value_counts()

Healthy/Background    353
HIV                    61
RA                     39
CVID                   37
Covid19                 3
T1D                     1
Name: disease, dtype: int64

In [152]:
# Raw ethnicity labels of the specimens that ended up without ethnicity_condensed.
# *Important*: If we see entries here that can be resolved, update the ethnicity_condensed rules above.
condensed_is_missing = all_specimens["ethnicity_condensed"].isna()
all_specimens.loc[condensed_is_missing, "ethnicity"].value_counts()

Unknown                          226
Other, Not Hispanic or Latino      2
Asian, Hispanic or Latino          1
Not reported - Not reported        1
Name: ethnicity, dtype: int64

In [153]:
# Specimens without ethnicity_condensed, broken down by study.
condensed_is_missing = all_specimens["ethnicity_condensed"].isna()
all_specimens.loc[condensed_is_missing, "study_name"].value_counts()

emerson-2017-natgen_train         227
TCRBv4-control                     88
towlerton-2022-hiv                 61
ramesh-2015-ci                     59
mustjoki-2017-natcomms             52
emerson-2017-natgen_validation      3
immunecode                          3
mitchell-2022-jcii                  1
Name: study_name, dtype: int64

In [154]:
# Versus total counts: all specimens per disease, for comparison with the
# missing-ethnicity breakdowns above.
all_specimens["disease"].value_counts()

Healthy/Background    1009
T1D                    143
Covid19                 88
HIV                     61
RA                      39
CVID                    37
Name: disease, dtype: int64

In [155]:
# Specimen counts per (condensed ethnicity, disease).
# Rows with a missing ethnicity_condensed do not appear, since groupby drops NaN keys by default.
all_specimens.groupby(["ethnicity_condensed", "disease"]).size()

ethnicity_condensed  disease           
African              Healthy/Background     11
                     T1D                     2
Asian                Covid19                 1
                     Healthy/Background     42
                     T1D                     4
Caucasian            Covid19                74
                     Healthy/Background    544
                     T1D                   116
Hispanic/Latino      Covid19                10
                     Healthy/Background     46
                     T1D                    20
Native American      Healthy/Background     10
Pacific Islander     Healthy/Background      3
dtype: int64

In [156]:
# Peek at the non-null ages. The column has object dtype at this point because
# some entries are the string "Unknown" rather than numbers.
all_specimens["age"].dropna()

0         Unknown
1         Unknown
2         Unknown
3         Unknown
4       22.469541
          ...    
1372         60.0
1373         57.0
1374         75.0
1375         72.0
1376         78.0
Name: age, Length: 1229, dtype: object

In [157]:
# Remove "Unknown": replace the placeholder string with NaN.
# The result is assigned back rather than using mask(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
all_specimens["age"] = all_specimens["age"].mask(all_specimens["age"] == "Unknown")

In [158]:
# Now we can convert to float.
# astype(float) raises if any non-numeric string is left, so this also acts as a
# check that "Unknown" was the only placeholder.
all_specimens["age"] = all_specimens["age"].astype(float)

In [159]:
# Inspect the age distribution before binning it into an age_group column in the
# next cell (age_group is set just as in assemble_etl_metadata).
all_specimens["age"].describe()

count    1118.000000
mean       33.563947
std        18.811632
min         0.717316
25%        18.277259
50%        33.567420
75%        47.724846
max        89.000000
Name: age, dtype: float64

In [160]:
# Bucket the numeric ages into labelled age bands.
# right=False makes each interval closed on the left and open on the right,
# so an age of exactly 20.0 lands in "20-30" rather than "<20".
# Missing ages stay missing in age_group.
all_specimens["age_group"] = pd.cut(
    x=all_specimens["age"],
    bins=[0, 20, 30, 40, 50, 60, 70, 80, 100],
    right=False,
    labels=["<20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80+"],
)
# Show how many specimens fall in each band
all_specimens["age_group"].value_counts()

<20      293
30-40    200
40-50    200
20-30    195
50-60    142
60-70     53
70-80     24
80+       11
Name: age_group, dtype: int64

In [161]:
# Category labels of the age_group column, in their defined order
all_specimens["age_group"].dtype.categories

Index(['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+'], dtype='object')

In [162]:
# How many specimens have a numeric age (False) versus a missing age (True)?
all_specimens["age"].isna().value_counts()

False    1118
True      259
Name: age, dtype: int64

In [163]:
# Same check for age_group. Compare against the missing-age counts: equal counts
# mean no known age fell outside the bins.
all_specimens["age_group"].isna().value_counts()

False    1118
True      259
Name: age_group, dtype: int64

In [164]:
# Check the bin edges: print the youngest and oldest age found in each age_group
for age_group, specimens_in_group in all_specimens.groupby("age_group"):
    ages = specimens_in_group["age"]
    print(age_group, ages.min(), ages.max())

<20 0.717316 19.843942
20-30 20.0 29.987679671457904
30-40 30.0 39.99452429842574
40-50 40.0 49.91375770020534
50-60 50.0 59.97262149212868
60-70 60.0 69.6
70-80 70.0 79.0
80+ 80.0 89.0


In [165]:
# Mirrors assemble_etl_metadata:
# blank out the "age_group" value for extreme ages with small sample size.
#
# The specimens themselves stay in the table. A NaN age_group only disables
# their use for demographics-controlling models.

orig_shapes = (len(all_specimens), all_specimens["age_group"].isna().sum())
mask = all_specimens["age_group"].isin(["80+"])
all_specimens.loc[mask, "age_group"] = np.nan
new_shapes = (len(all_specimens), all_specimens["age_group"].isna().sum())

# sanity checks on (row count, null age_group count) before vs. after
rows_before, nulls_before = orig_shapes
rows_after, nulls_after = new_shapes
# - the number of specimens is unchanged
assert rows_before == rows_after
# - at least one age_group entry was nulled out
assert nulls_before < nulls_after
# - the increase in nulls equals the number of masked rows
assert nulls_after - nulls_before == mask.sum()

In [166]:
# NOTE(review): this cell is disabled. Presumably the fill is not needed here
# because the `locus` column of this table contains only TCRB, i.e. there are
# no combined BCR+TCR cohorts. Confirm before re-enabling or deleting.
# # Fillna for cohorts that are single-locus
# if "specimen_label_by_locus" not in all_specimens:
#     # in case we had no BCR+TCR combined cohorts that set this field already
#     all_specimens["specimen_label_by_locus"] = all_specimens["specimen_label"]
# else:
#     all_specimens["specimen_label_by_locus"].fillna(
#         all_specimens["specimen_label"], inplace=True
#     )

In [167]:
# # make sure input fnames exist
# assert all_specimens["fname"].apply(os.path.exists).all()

In [168]:
# Table size as (number of specimens, number of columns) ahead of the final summaries and export
all_specimens.shape

(1377, 21)

In [169]:
# NOTE(review): this uniqueness check is disabled. It relies on the
# `specimen_label_by_locus` and `gene_locus` columns. Whether this table has
# them cannot be confirmed from this cell, so verify before re-enabling.
# # confirm all specimen labels are unique within each locus (may have one BCR and one TCR line per specimen)
# # TODO: in the future, allow for replicates of each specimen
# assert not all_specimens["specimen_label_by_locus"].duplicated().any()
# for locus, grp in all_specimens.groupby("gene_locus"):
#     assert not grp["specimen_label"].duplicated().any()

In [170]:
# # Which specimens are in multiple loci?
# all_specimens[all_specimens["specimen_label"].duplicated(keep=False)]

In [171]:
# Number of specimens contributed by each source study
all_specimens["study_name"].value_counts()

emerson-2017-natgen_train         666
mitchell-2022-jcii                243
emerson-2017-natgen_validation    120
TCRBv4-control                     88
immunecode                         88
towlerton-2022-hiv                 61
ramesh-2015-ci                     59
mustjoki-2017-natcomms             52
Name: study_name, dtype: int64

In [172]:
# Number of specimens per disease label in the table that will be exported
all_specimens["disease"].value_counts()

Healthy/Background    1009
T1D                    143
Covid19                 88
HIV                     61
RA                      39
CVID                    37
Name: disease, dtype: int64

In [173]:
# Which sequencing loci are present, and how many specimens does each have?
all_specimens["locus"].value_counts()

TCRB    1377
Name: locus, dtype: int64

In [174]:
# Number of specimens per disease_subtype, pooled over all diseases
all_specimens["disease_subtype"].value_counts()

Healthy/Background - CMV-                   420
Healthy/Background - CMV+                   340
T1D - new onset                             143
Healthy/Background - T1D negative           100
Healthy/Background - TCRBv4-control          88
HIV - before anti-retroviral therapy         61
COVID-19-NIH/NIAID - Hospitalized            39
CVID                                         37
RA - sero-positive                           32
Healthy/Background - CMV Unknown             26
COVID-19-HUniv12Oct - Hospitalized           26
Healthy/Background - CVID negative           22
Healthy/Background - RA negative             13
COVID-19-ISB                                  9
RA - sero-negative                            7
COVID-19-NIH/NIAID - Hospitalized - ICU       7
COVID-19-HUniv12Oct - Hospitalized - ICU      6
COVID-19-HUniv12Oct                           1
Name: disease_subtype, dtype: int64

In [175]:
# For each disease, print its name, then the disease_subtype tallies, then a blank line
for disease_name, disease_specimens in all_specimens.groupby("disease"):
    subtype_counts = disease_specimens["disease_subtype"].value_counts()
    print(disease_name, subtype_counts, "", sep="\n")

CVID
CVID    37
Name: disease_subtype, dtype: int64

Covid19
COVID-19-NIH/NIAID - Hospitalized           39
COVID-19-HUniv12Oct - Hospitalized          26
COVID-19-ISB                                 9
COVID-19-NIH/NIAID - Hospitalized - ICU      7
COVID-19-HUniv12Oct - Hospitalized - ICU     6
COVID-19-HUniv12Oct                          1
Name: disease_subtype, dtype: int64

HIV
HIV - before anti-retroviral therapy    61
Name: disease_subtype, dtype: int64

Healthy/Background
Healthy/Background - CMV-              420
Healthy/Background - CMV+              340
Healthy/Background - T1D negative      100
Healthy/Background - TCRBv4-control     88
Healthy/Background - CMV Unknown        26
Healthy/Background - CVID negative      22
Healthy/Background - RA negative        13
Name: disease_subtype, dtype: int64

RA
RA - sero-positive    32
RA - sero-negative     7
Name: disease_subtype, dtype: int64

T1D
T1D - new onset    143
Name: disease_subtype, dtype: int64



In [176]:
# For each disease, print its name, then the specimen_description tallies, then a blank line
for disease_name, disease_specimens in all_specimens.groupby("disease"):
    description_counts = disease_specimens["specimen_description"].value_counts()
    print(f"{disease_name}\n{description_counts}\n")

CVID
Series([], Name: specimen_description, dtype: int64)

Covid19
Series([], Name: specimen_description, dtype: int64)

HIV
<= 0 month Pre-ART     15
<= 1 year Pre-ART      14
<= 6 months Pre-ART    12
<= 3 months Pre-ART    10
<= 2 years Pre-ART      5
<= 3 years Pre-ART      3
<= 4 years Pre-ART      2
Name: specimen_description, dtype: int64

Healthy/Background
Series([], Name: specimen_description, dtype: int64)

RA
Series([], Name: specimen_description, dtype: int64)

T1D
Series([], Name: specimen_description, dtype: int64)



In [177]:
# For every demographics field, show the distribution of its values followed by
# how many specimens have the field missing (True) versus present (False)
for demographics_column in ["age", "age_group", "sex", "ethnicity_condensed"]:
    column_values = all_specimens[demographics_column]
    missing_flags = column_values.isna()
    print(demographics_column)
    print(column_values.value_counts())
    print(missing_flags.value_counts())
    print()

age
27.000000    15
25.000000    15
23.000000    14
24.000000    13
22.000000    12
             ..
14.918549     1
54.067077     1
64.082136     1
44.136893     1
72.000000     1
Name: age, Length: 830, dtype: int64
False    1118
True      259
Name: age, dtype: int64

age_group
<20      293
30-40    200
40-50    200
20-30    195
50-60    142
60-70     53
70-80     24
80+        0
Name: age_group, dtype: int64
False    1107
True      270
Name: age_group, dtype: int64

sex
M    642
F    563
Name: sex, dtype: int64
False    1205
True      172
Name: sex, dtype: int64

ethnicity_condensed
Caucasian           734
Hispanic/Latino      76
Asian                47
African              13
Native American      10
Pacific Islander      3
Name: ethnicity_condensed, dtype: int64
False    883
True     494
Name: ethnicity_condensed, dtype: int64



In [178]:
# Write the combined specimen metadata as a tab-separated file.
# The "ethnicity" column is dropped from the exported copy; all_specimens itself
# is not modified.
# index=False (rather than None) is the documented way to omit the row index:
# `index` is a boolean flag, and None only works because it is falsy.
all_specimens.drop(columns=["ethnicity"]).to_csv(
    config.paths.metadata_dir / "adaptive" / "generated.adaptive_external_cohorts.tsv",
    sep="\t",
    index=False,
)