In [1]:
import sys
import os
import numpy as np
import pandas as pd

In [2]:
import glob

In [3]:
from malid import config, helpers
from malid.datamodels import healthy_label

# produce metadata about external cohorts

This notebook produces the following files:

- all cohorts
    - `metadata/generated.external_cohorts.all_bcr.participant_metadata.tsv`
- covid
    - `metadata/generated.external_cohorts.covid19_bcr.specimen_metadata_extra.tsv`
    - `metadata/generated.external_cohorts.covid19_bcr.participant_metadata.tsv`
- healthy
    - `metadata/generated.external_cohorts.healthy_bcr.participant_metadata.tsv`

# external covid cohorts

## load metadata

`repertoire_id` values act like specimen IDs: they are many-to-one with patient IDs (several repertoires can belong to the same patient). Below, we introduce actual patient IDs for these external cohorts.

In [4]:
# Read the exported repertoire metadata table (tab-separated) for the external
# Covid cohorts, then report its (rows, columns) shape.
df = pd.read_csv(
    filepath_or_buffer=(
        config.paths.base_data_dir
        / "external_cohorts/raw_data/covid_external_as_part_tables/exported.metadata.tsv"
    ),
    delimiter="\t",
)
df.shape

(96, 110)

In [5]:
# Preview the first five metadata rows.
df.head(n=5)

Unnamed: 0,repertoire_id,repertoire_name,repertoire_description,study.study_id,study.study_title,study.study_type.label,study.study_type.id,study.study_description,study.inclusion_exclusion_criteria,study.grants,...,data_processing.0.germline_database,data_processing.0.analysis_provenance_id,study.keywords_study.1,study.vdjserver_uuid,subject.vdjserver_uuid,sample.0.tissue.value,sample.0.cell_subset,sample.0.cell_species,sample.0.vdjserver_uuid,data_processing.0.vdjserver_uuid
0,5f21e814e1adeb2edc12613c,,,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,...,IMGT,,,,,,,,,
1,5f21e814e1adeb2edc12613d,,,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,...,IMGT,,,,,,,,,
2,5f21e815e1adeb2edc12613e,,,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,...,IMGT,,,,,,,,,
3,5f21e815e1adeb2edc12613f,,,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,...,IMGT,,,,,,,,,
4,5f21e815e1adeb2edc126140,,,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,...,IMGT,,,,,,,,,


In [6]:
# Preview only the identifier and collection-timepoint columns.
df.loc[
    :,
    [
        "repertoire_id",
        "study.study_id",
        "subject.subject_id",
        "sample.0.collection_time_point_relative",
        "sample.0.collection_time_point_reference",
    ],
].head(n=5)

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,sample.0.collection_time_point_relative,sample.0.collection_time_point_reference
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,d11,Symptom onset
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,d17,Symptom onset
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,d45,Symptom onset
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,d10,Symptom onset
4,5f21e815e1adeb2edc126140,PRJNA648677,B,d19,Symptom onset


In [7]:
# Tally the raw relative-timepoint strings to see which formats occur.
df.loc[:, "sample.0.collection_time_point_relative"].value_counts()

8 days     7
7 days     6
15 days    5
34 days    5
14 days    4
71 days    4
22 days    4
38 days    3
39 days    3
10 days    3
18 days    3
11 days    3
13 days    3
41 days    3
36 days    3
30 days    3
37 days    2
43 days    2
3 days     2
53 days    2
d6         2
44 days    2
32 days    2
28 days    1
5 days     1
16 days    1
d11        1
27 days    1
d17        1
2 days     1
d22        1
d9         1
d36        1
d14        1
d99        1
d44        1
d23        1
d28        1
d15        1
d19        1
d10        1
d45        1
6 days     1
Name: sample.0.collection_time_point_relative, dtype: int64

In [8]:
# Tally the reference events that the relative timepoints are measured from.
df.loc[:, "sample.0.collection_time_point_reference"].value_counts()

appearance of clinical symptoms    80
Symptom onset                      16
Name: sample.0.collection_time_point_reference, dtype: int64

In [9]:
# Every row must have both a relative timepoint and its reference event.
assert df["sample.0.collection_time_point_relative"].notna().all()
assert df["sample.0.collection_time_point_reference"].notna().all()

In [10]:
# Show the column index of the metadata table (pandas abbreviates long indexes).
df.columns

Index(['repertoire_id', 'repertoire_name', 'repertoire_description',
       'study.study_id', 'study.study_title', 'study.study_type.label',
       'study.study_type.id', 'study.study_description',
       'study.inclusion_exclusion_criteria', 'study.grants',
       ...
       'data_processing.0.germline_database',
       'data_processing.0.analysis_provenance_id', 'study.keywords_study.1',
       'study.vdjserver_uuid', 'subject.vdjserver_uuid',
       'sample.0.tissue.value', 'sample.0.cell_subset',
       'sample.0.cell_species', 'sample.0.vdjserver_uuid',
       'data_processing.0.vdjserver_uuid'],
      dtype='object', length=110)

In [11]:
# Print every column name on its own line so none are elided.
print(*df.columns, sep="\n")

repertoire_id
repertoire_name
repertoire_description
study.study_id
study.study_title
study.study_type.label
study.study_type.id
study.study_description
study.inclusion_exclusion_criteria
study.grants
study.collected_by
study.lab_name
study.lab_address
study.submitted_by
study.pub_ids
study.keywords_study.0
subject.subject_id
subject.synthetic
subject.species.label
subject.species.id
subject.organism.label
subject.organism.id
subject.sex
subject.age
subject.age_min
subject.age_max
subject.age_unit.label
subject.age_unit.id
subject.age_event
subject.ancestry_population
subject.ethnicity
subject.race
subject.strain_name
subject.linked_subjects
subject.link_type
subject.diagnosis.0.study_group_description
subject.diagnosis.0.disease_diagnosis.label
subject.diagnosis.0.disease_diagnosis.id
subject.diagnosis.0.disease_length
subject.diagnosis.0.disease_stage
subject.diagnosis.0.prior_therapies
subject.diagnosis.0.immunogen
subject.diagnosis.0.intervention
subject.diagnosis.0.medical_history

In [12]:
# Study-level fields repeat on every row; collapse them to the distinct combinations.
df.loc[:, df.columns.str.startswith("study.")].drop_duplicates()

Unnamed: 0,study.study_id,study.study_title,study.study_type.label,study.study_type.id,study.study_description,study.inclusion_exclusion_criteria,study.grants,study.collected_by,study.lab_name,study.lab_address,study.submitted_by,study.pub_ids,study.keywords_study.0,study.keywords_study.1,study.vdjserver_uuid
0,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,Sunghoon Kwon skwon@snu.ac.kr; Junho Chung jj...,Department of Biochemistry and Molecular Biology,Seoul National University College of Medicine,"Sang Il Kim, Jinsung Noh, Sunghoon Kwon, Junho...",DOI: 10.1101/2020.06.26.174557,contains_ig,,
16,PRJNA645245,Dynamics of B-cell repertoires and emergence o...,Longitudinal Study,NCIT:C15273,COVID-19 patients show varying severity of the...,,This work was supported by DFG grant (SFB1310)...,"Zachary Montague and Huibin Lv, University of ...",Armita Nourmohammad and Chris Ka Pun Mok,Fred Hutchinson Cancer Research Center and The...,"Scott Christley, scott.christley@utsouthwester...",https://doi.org/10.1101/2020.07.13.20153114,contains_ig,,2827127721591172630-242ac116-0001-012


In [13]:
# Short study label for each study ID present in the metadata.
study_names = dict(PRJNA648677="Kim", PRJNA645245="Montague")
study_names

{'PRJNA648677': 'Kim', 'PRJNA645245': 'Montague'}

In [14]:
# For each study, show the repertoire/study IDs together with every `subject.*`
# column that has at least one value for that study.
for study_id in study_names:
    display(
        df.loc[
            df["study.study_id"].eq(study_id),
            ["repertoire_id", "study.study_id"]
            + [column for column in df.columns if column.startswith("subject.")],
        ]
        .dropna(axis="columns", how="all")
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.synthetic,subject.species.label,subject.species.id,subject.sex,subject.age_min,subject.age_max,subject.age_unit.label,subject.age_unit.id,subject.age_event,subject.race,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,subject.diagnosis.0.disease_diagnosis.id,subject.diagnosis.0.disease_length,subject.diagnosis.0.disease_stage,subject.diagnosis.0.prior_therapies,subject.diagnosis.0.medical_history
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,f,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,11 days since symptom onset,Extensive Pneumonic infiltrates,"Oxygen therapy, Lopinavir/ritonavir, Levofloxacin",
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,f,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,17 days since symptom onset,Extensive Pneumonic infiltrates,"Oxygen therapy, Lopinavir/ritonavir, Levofloxacin",
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,f,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,45 days since symptom onset,Extensive Pneumonic infiltrates,"Oxygen therapy, Lopinavir/ritonavir, Levofloxacin",
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,f,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,10 days since symptom onset,Limited Pneumonic infiltrates,,"diabetes mellitus, dyslipidemia, hypertension"
4,5f21e815e1adeb2edc126140,PRJNA648677,B,f,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,19 days since symptom onset,Limited Pneumonic infiltrates,,"diabetes mellitus, dyslipidemia, hypertension"
5,5f21e816e1adeb2edc126141,PRJNA648677,C,f,Homo sapiens,NCBITAXON:9606,female,53.0,53.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,6 days since symptom onset,Limited Pneumonic infiltrates,,
6,5f21e816e1adeb2edc126142,PRJNA648677,C,f,Homo sapiens,NCBITAXON:9606,female,53.0,53.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,15 days since symptom onset,Limited Pneumonic infiltrates,,
7,5f21e816e1adeb2edc126143,PRJNA648677,D,f,Homo sapiens,NCBITAXON:9606,male,24.0,24.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,6 days since symptom onset,Limited Pneumonic infiltrates,,
8,5f21e817e1adeb2edc126144,PRJNA648677,D,f,Homo sapiens,NCBITAXON:9606,male,24.0,24.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,28 days since symptom onset,Limited Pneumonic infiltrates,,
9,5f21e817e1adeb2edc126145,PRJNA648677,E,f,Homo sapiens,NCBITAXON:9606,male,48.0,48.0,year,UO:0000036,Sample Collection,Chinese,Case,COVID-19,DOID:0080600,23 days since symptom onset,Extensive Pneumonic infiltrates,Lopinavir/ritonavir,


Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.synthetic,subject.species.label,subject.species.id,subject.sex,subject.age_min,subject.age_max,subject.age_unit.label,subject.age_unit.id,subject.age_event,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,subject.diagnosis.0.disease_diagnosis.id,subject.vdjserver_uuid
16,5549400184724853226-242ac116-0001-012,PRJNA645245,1,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Mild,COVID-19,DOID:0080600,2208259225820533226-242ac116-0001-012
17,5563272929090933226-242ac116-0001-012,PRJNA645245,1,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Mild,COVID-19,DOID:0080600,2208259225820533226-242ac116-0001-012
18,5578047616589173226-242ac116-0001-012,PRJNA645245,1,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Mild,COVID-19,DOID:0080600,2208259225820533226-242ac116-0001-012
19,5594669140024693226-242ac116-0001-012,PRJNA645245,1,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Mild,COVID-19,DOID:0080600,2208259225820533226-242ac116-0001-012
20,5609272028831093226-242ac116-0001-012,PRJNA645245,2,f,Homo sapiens,NCBITAXON:9606,female,37.0,37.0,year,UO:0000036,sampling,Mild,COVID-19,DOID:0080600,2223377510702453226-242ac116-0001-012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Severe,COVID-19,DOID:0080600,2491383469972853226-242ac116-0001-012
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Severe,COVID-19,DOID:0080600,2491383469972853226-242ac116-0001-012
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Severe,COVID-19,DOID:0080600,2491383469972853226-242ac116-0001-012
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,f,Homo sapiens,NCBITAXON:9606,female,62.0,62.0,year,UO:0000036,sampling,Severe,COVID-19,DOID:0080600,2491383469972853226-242ac116-0001-012


In [15]:
# For each study, show the identifier columns together with every `sample.*`
# column that has at least one value for that study.
for study_id in study_names:
    display(
        df.loc[
            df["study.study_id"].eq(study_id),
            ["repertoire_id", "study.study_id", "subject.subject_id"]
            + [column for column in df.columns if column.startswith("sample.")],
        ]
        .dropna(axis="columns", how="all")
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,sample.0.sample_processing_id,sample.0.sample_id,sample.0.sample_type,sample.0.tissue.label,sample.0.tissue.id,sample.0.collection_time_point_relative,sample.0.collection_time_point_reference,...,sample.0.template_amount,sample.0.library_generation_method,sample.0.library_generation_protocol,sample.0.pcr_target.0.pcr_target_locus,sample.0.pcr_target.0.forward_pcr_primer_target_location,sample.0.pcr_target.0.reverse_pcr_primer_target_location,sample.0.complete_sequences,sample.0.physical_linkage,sample.0.total_reads_passing_qc_filter,sample.0.sequencing_platform
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,5f21e814e1adeb2edc12613c,A_d11,chronological blood samples were drawn,blood,UBERON:0000178,d11,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,294633.0,Illumina MiSeq
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,5f21e814e1adeb2edc12613d,A_d17,chronological blood samples were drawn,blood,UBERON:0000178,d17,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,240012.0,Illumina MiSeq
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,5f21e815e1adeb2edc12613e,A_d45,chronological blood samples were drawn,blood,UBERON:0000178,d45,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,215355.0,Illumina MiSeq
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,5f21e815e1adeb2edc12613f,B_d10,chronological blood samples were drawn,blood,UBERON:0000178,d10,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,97819.0,Illumina MiSeq
4,5f21e815e1adeb2edc126140,PRJNA648677,B,5f21e815e1adeb2edc126140,B_d19,chronological blood samples were drawn,blood,UBERON:0000178,d19,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,250000.0,Illumina MiSeq
5,5f21e816e1adeb2edc126141,PRJNA648677,C,5f21e816e1adeb2edc126141,C_d6,chronological blood samples were drawn,blood,UBERON:0000178,d6,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,246327.0,Illumina MiSeq
6,5f21e816e1adeb2edc126142,PRJNA648677,C,5f21e816e1adeb2edc126142,C_d15,chronological blood samples were drawn,blood,UBERON:0000178,d15,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,250000.0,Illumina MiSeq
7,5f21e816e1adeb2edc126143,PRJNA648677,D,5f21e816e1adeb2edc126143,D_d6,chronological blood samples were drawn,blood,UBERON:0000178,d6,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,246333.0,Illumina MiSeq
8,5f21e817e1adeb2edc126144,PRJNA648677,D,5f21e817e1adeb2edc126144,D_d28,chronological blood samples were drawn,blood,UBERON:0000178,d28,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,142798.0,Illumina MiSeq
9,5f21e817e1adeb2edc126145,PRJNA648677,E,5f21e817e1adeb2edc126145,E_d23,chronological blood samples were drawn,blood,UBERON:0000178,d23,Symptom onset,...,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,205326.0,Illumina MiSeq


Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,sample.0.sample_id,sample.0.sample_type,sample.0.tissue.id,sample.0.collection_time_point_relative,sample.0.collection_time_point_reference,sample.0.biomaterial_provider,sample.0.single_cell,...,sample.0.sequencing_platform,sample.0.sequencing_files.file_type,sample.0.sequencing_files.filename,sample.0.sequencing_files.read_length,sample.0.sequencing_files.paired_filename,sample.0.sequencing_files.paired_read_length,sample.0.tissue.value,sample.0.cell_subset,sample.0.cell_species,sample.0.vdjserver_uuid
16,5549400184724853226-242ac116-0001-012,PRJNA645245,1,IgG24-2,patient 1 day 43 replicate 1,UBERON:0000178,43 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190283_1.fastq,301.0,SRR12190283_2.fastq,301.0,blood,{},{},2625944795356533226-242ac116-0001-012
17,5563272929090933226-242ac116-0001-012,PRJNA645245,1,IgG24-1,patient 1 day 43 replicate 0,UBERON:0000178,43 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190285_1.fastq,301.0,SRR12190285_2.fastq,301.0,blood,{},{},2643339412905333226-242ac116-0001-012
18,5578047616589173226-242ac116-0001-012,PRJNA645245,1,IgG21-2,patient 1 day 22 replicate 1,UBERON:0000178,22 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190290_1.fastq,301.0,SRR12190290_2.fastq,301.0,blood,{},{},2659574389284213226-242ac116-0001-012
19,5594669140024693226-242ac116-0001-012,PRJNA645245,1,IgG21-1,patient 1 day 22 replicate 0,UBERON:0000178,22 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190291_1.fastq,301.0,SRR12190291_2.fastq,301.0,blood,{},{},2675852315336053226-242ac116-0001-012
20,5609272028831093226-242ac116-0001-012,PRJNA645245,2,IgG6-0,patient 2 day 2 replicate 0,UBERON:0000178,2 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190251_1.fastq,300.0,SRR12190251_2.fastq,300.0,blood,{},{},2695093768822133226-242ac116-0001-012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,IgG17-1,patient 18 day 30 replicate 1,UBERON:0000178,30 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190299_1.fastq,301.0,SRR12190299_2.fastq,301.0,blood,{},{},4022453411650933226-242ac116-0001-012
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,IgG17-0,patient 18 day 30 replicate 0,UBERON:0000178,30 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190300_1.fastq,300.0,SRR12190300_2.fastq,300.0,blood,{},{},4042253210885493226-242ac116-0001-012
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,IgG12-2,patient 18 day 8 replicate 1,UBERON:0000178,8 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190310_1.fastq,301.0,SRR12190310_2.fastq,301.0,blood,{},{},4070642944712053226-242ac116-0001-012
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,IgG12-1,patient 18 day 8 replicate 0,UBERON:0000178,8 days,appearance of clinical symptoms,"Chris Ka Pun Mok, 2. HKU-Pasteur Research Pole...",f,...,Illumina HiSeq 3000,fastq,SRR12190311_1.fastq,301.0,SRR12190311_2.fastq,301.0,blood,{},{},4090786341330293226-242ac116-0001-012


In [16]:
# For each study, narrow the view to a handful of subject and sample fields;
# columns that are entirely empty for the study are dropped from the display.
for study_id in study_names:
    display(
        df.loc[
            df["study.study_id"].eq(study_id),
            [
                "repertoire_id",
                "study.study_id",
                "subject.subject_id",
                "subject.sex",
                "subject.age_min",
                "subject.race",
                "subject.diagnosis.0.study_group_description",
                "subject.diagnosis.0.disease_diagnosis.label",
                "sample.0.sample_id",
                "sample.0.collection_time_point_relative",
            ],
        ]
        .dropna(axis="columns", how="all")
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.sex,subject.age_min,subject.race,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,sample.0.sample_id,sample.0.collection_time_point_relative
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,male,55.0,Korean,Case,COVID-19,A_d11,d11
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,male,55.0,Korean,Case,COVID-19,A_d17,d17
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,male,55.0,Korean,Case,COVID-19,A_d45,d45
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,male,55.0,Korean,Case,COVID-19,B_d10,d10
4,5f21e815e1adeb2edc126140,PRJNA648677,B,male,55.0,Korean,Case,COVID-19,B_d19,d19
5,5f21e816e1adeb2edc126141,PRJNA648677,C,female,53.0,Korean,Case,COVID-19,C_d6,d6
6,5f21e816e1adeb2edc126142,PRJNA648677,C,female,53.0,Korean,Case,COVID-19,C_d15,d15
7,5f21e816e1adeb2edc126143,PRJNA648677,D,male,24.0,Korean,Case,COVID-19,D_d6,d6
8,5f21e817e1adeb2edc126144,PRJNA648677,D,male,24.0,Korean,Case,COVID-19,D_d28,d28
9,5f21e817e1adeb2edc126145,PRJNA648677,E,male,48.0,Chinese,Case,COVID-19,E_d23,d23


Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.sex,subject.age_min,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,sample.0.sample_id,sample.0.collection_time_point_relative
16,5549400184724853226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19,IgG24-2,43 days
17,5563272929090933226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19,IgG24-1,43 days
18,5578047616589173226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19,IgG21-2,22 days
19,5594669140024693226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19,IgG21-1,22 days
20,5609272028831093226-242ac116-0001-012,PRJNA645245,2,female,37.0,Mild,COVID-19,IgG6-0,2 days
...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19,IgG17-1,30 days
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19,IgG17-0,30 days
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19,IgG12-2,8 days
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19,IgG12-1,8 days


## create patient IDs, and extract some patient-level metadata

In [17]:
# Find the right columns: per study, show the identifier columns next to the
# candidate patient-level fields (sex, age, race, study group, diagnosis).
for study_id in study_names:
    display(
        df.loc[
            df["study.study_id"].eq(study_id),
            [
                "repertoire_id",
                "study.study_id",
                "subject.subject_id",
                "subject.sex",
                "subject.age_min",
                "subject.race",
                "subject.diagnosis.0.study_group_description",
                "subject.diagnosis.0.disease_diagnosis.label",
            ],
        ]
        .dropna(axis="columns", how="all")
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.sex,subject.age_min,subject.race,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,male,55.0,Korean,Case,COVID-19
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,male,55.0,Korean,Case,COVID-19
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,male,55.0,Korean,Case,COVID-19
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,male,55.0,Korean,Case,COVID-19
4,5f21e815e1adeb2edc126140,PRJNA648677,B,male,55.0,Korean,Case,COVID-19
5,5f21e816e1adeb2edc126141,PRJNA648677,C,female,53.0,Korean,Case,COVID-19
6,5f21e816e1adeb2edc126142,PRJNA648677,C,female,53.0,Korean,Case,COVID-19
7,5f21e816e1adeb2edc126143,PRJNA648677,D,male,24.0,Korean,Case,COVID-19
8,5f21e817e1adeb2edc126144,PRJNA648677,D,male,24.0,Korean,Case,COVID-19
9,5f21e817e1adeb2edc126145,PRJNA648677,E,male,48.0,Chinese,Case,COVID-19


Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.sex,subject.age_min,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label
16,5549400184724853226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19
17,5563272929090933226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19
18,5578047616589173226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19
19,5594669140024693226-242ac116-0001-012,PRJNA645245,1,female,62.0,Mild,COVID-19
20,5609272028831093226-242ac116-0001-012,PRJNA645245,2,female,37.0,Mild,COVID-19
...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,female,62.0,Severe,COVID-19


In [18]:
# Build the specimen table: keep rows from the known studies, select the
# identifier / demographic / diagnosis / timepoint columns, drop columns that are
# entirely empty and exact duplicate rows, then rename to shorter column names.
specimens_df = (
    df.loc[
        df["study.study_id"].isin(list(study_names)),
        [
            "repertoire_id",
            "study.study_id",
            "subject.subject_id",
            "subject.sex",
            "subject.age_min",
            "subject.race",
            "subject.diagnosis.0.study_group_description",
            "subject.diagnosis.0.disease_diagnosis.label",
            "sample.0.collection_time_point_relative",
        ],
    ]
    .dropna(axis="columns", how="all")
    .drop_duplicates()
    .rename(
        columns={
            "repertoire_id": "specimen_label",
            "study.study_id": "study_id",
            "subject.subject_id": "patient_id_within_study",
            "subject.sex": "sex",
            "subject.age_min": "age",
            "subject.race": "ethnicity",
            "subject.diagnosis.0.study_group_description": "disease_subtype",
            "subject.diagnosis.0.disease_diagnosis.label": "disease",
            "sample.0.collection_time_point_relative": "timepoint",
        }
    )
)
# Abbreviate sex to single-letter codes.
specimens_df["sex"] = specimens_df["sex"].replace(
    to_replace={"male": "M", "female": "F"}
)
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d11
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d17
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d45
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d10
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d19
...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30 days
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30 days
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8 days
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8 days


In [19]:
# Count specimens per reported ethnicity (missing values are not counted).
specimens_df.loc[:, "ethnicity"].value_counts()

Korean     11
Chinese     5
Name: ethnicity, dtype: int64

In [20]:
# How many specimens are missing an ethnicity value (True) versus have one (False)?
specimens_df.loc[:, "ethnicity"].isnull().value_counts()

True     80
False    16
Name: ethnicity, dtype: int64

In [21]:
# create ethnicity_condensed
# Use Series.map rather than Series.replace: map turns any ethnicity label that is
# missing from the dictionary into NaN, whereas replace would silently pass the raw
# label through. That makes un-remapped labels detectable by a NaN check on
# ethnicity_condensed. Missing ethnicities stay NaN either way.
specimens_df["ethnicity_condensed"] = specimens_df["ethnicity"].map(
    {"Korean": "Asian", "Chinese": "Asian"}
)
specimens_df["ethnicity_condensed"].value_counts()

Asian    16
Name: ethnicity_condensed, dtype: int64

In [22]:
# How many specimens have no condensed ethnicity (True) versus have one (False)?
specimens_df.loc[:, "ethnicity_condensed"].isnull().value_counts()

True     80
False    16
Name: ethnicity_condensed, dtype: int64

In [23]:
# Are there any non-NaN ethnicity values that we did not remap?
# A label that was not remapped shows up in ethnicity_condensed either as NaN
# (if the column was built with Series.map) or as the unchanged raw label (if it
# was built with Series.replace). Checking only for NaN misses the second case,
# so flag both. An empty result means every reported ethnicity was remapped.
# NOTE: a label deliberately mapped to itself would also be listed here.
specimens_df.loc[
    specimens_df["ethnicity"].notna()
    & (
        specimens_df["ethnicity_condensed"].isna()
        | specimens_df["ethnicity_condensed"].eq(specimens_df["ethnicity"])
    ),
    "ethnicity",
].value_counts()

Series([], Name: ethnicity, dtype: int64)

In [24]:
# Translate each study ID into its short study label.
specimens_df["study_name"] = specimens_df["study_id"].replace(
    to_replace=study_names
)
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d11,Asian,Kim
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d17,Asian,Kim
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d45,Asian,Kim
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d10,Asian,Kim
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d19,Asian,Kim
...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30 days,,Montague
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30 days,,Montague
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8 days,,Montague
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8 days,,Montague


In [25]:
# Build a participant ID as "<study name>_<patient ID within study>", with
# surrounding whitespace stripped from both parts.
specimens_df["participant_label"] = (
    specimens_df["study_name"]
    .str.strip()
    .str.cat(specimens_df["patient_id_within_study"].str.strip(), sep="_")
)
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name,participant_label
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d11,Asian,Kim,Kim_A
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d17,Asian,Kim,Kim_A
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d45,Asian,Kim,Kim_A
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d10,Asian,Kim,Kim_B
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d19,Asian,Kim,Kim_B
...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30 days,,Montague,Montague_18
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30 days,,Montague,Montague_18
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8 days,,Montague,Montague_18
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8 days,,Montague,Montague_18


In [26]:
# extract number
# Raw timepoints come in two spellings ("d11" and "11 days"); keep only the first
# run of digits as an integer.
# - raw string: "\d" in a normal string literal is an invalid escape sequence
#   (DeprecationWarning, SyntaxWarning on newer Python versions).
# - expand=False: return a Series instead of a one-column DataFrame.
# - astype(str) first: lets the cell be re-run after the column is already int.
specimens_df["timepoint"] = (
    specimens_df["timepoint"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype(int)
)
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name,participant_label
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,11,Asian,Kim,Kim_A
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,17,Asian,Kim,Kim_A
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,45,Asian,Kim,Kim_A
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,10,Asian,Kim,Kim_B
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,19,Asian,Kim,Kim_B
...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30,,Montague,Montague_18
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,30,,Montague,Montague_18
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8,,Montague,Montague_18
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,COVID-19,8,,Montague,Montague_18


In [27]:
# Rename the disease value "COVID-19" to "Covid19"; any other value is left unchanged.
disease_renames = {"COVID-19": "Covid19"}
specimens_df["disease"] = specimens_df["disease"].replace(disease_renames)

In [28]:
# (rows, columns) of the specimen table, followed by the number of distinct participants
distinct_participant_count = specimens_df["participant_label"].nunique()
specimens_df.shape, distinct_participant_count

((96, 12), 26)

## Look at timepoints, decide which ones are peak

In [29]:
# Display the specimen table (one row per specimen) to inspect the available timepoints.
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name,participant_label
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,Covid19,11,Asian,Kim,Kim_A
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,Asian,Kim,Kim_A
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,Covid19,45,Asian,Kim,Kim_A
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,Covid19,10,Asian,Kim,Kim_B
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,Asian,Kim,Kim_B
...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,30,,Montague,Montague_18
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,30,,Montague,Montague_18
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,8,,Montague,Montague_18
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,8,,Montague,Montague_18


In [30]:
# Number of specimens (rows) per participant, largest first.
specimens_per_participant = specimens_df.groupby("participant_label").size()
specimens_per_participant.sort_values(ascending=False)

participant_label
Montague_15    9
Montague_2     7
Montague_8     6
Montague_3     6
Montague_17    5
Montague_18    5
Montague_4     4
Montague_1     4
Montague_10    4
Montague_11    4
Montague_12    4
Montague_13    4
Montague_14    4
Montague_16    4
Montague_7     3
Kim_A          3
Kim_E          3
Kim_B          2
Kim_G          2
Kim_F          2
Montague_5     2
Montague_6     2
Kim_D          2
Kim_C          2
Montague_9     2
Montague_19    1
dtype: int64

In [31]:
# Replicate count for each (participant, timepoint) pair; show the five largest.
replicate_counts = specimens_df.groupby(["participant_label", "timepoint"]).size()
replicate_counts.sort_values(ascending=False).to_frame("num_replicates").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_replicates
participant_label,timepoint,Unnamed: 2_level_1
Montague_15,71,4
Montague_2,34,3
Montague_3,38,3
Montague_15,41,3
Montague_3,8,3


In [32]:
# # can't do it this way because this will choose only one row as peak per patient, whereas we want all replicates from the peak timepoint to be marked as peak
# specimens_df['is_peak'] = False
# specimens_df.loc[specimens_df.groupby("participant_label", observed=True)["timepoint"].idxmax(), 'is_peak'] = True
# specimens_df['is_peak'].value_counts()

In [33]:
# Display the specimen table again as a reference point before peak timepoints are selected.
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name,participant_label
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,Covid19,11,Asian,Kim,Kim_A
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,Asian,Kim,Kim_A
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,Covid19,45,Asian,Kim,Kim_A
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,Covid19,10,Asian,Kim,Kim_B
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,Asian,Kim,Kim_B
...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,30,,Montague,Montague_18
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,30,,Montague,Montague_18
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,8,,Montague,Montague_18
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,8,,Montague,Montague_18


In [34]:
# Choose one peak timepoint per patient:
#   - only timepoints between day 10 and day 45 (inclusive) are eligible
#   - patients known to have mild disease are excluded
#     (e.g. from Montague et al study, subjects 1-2 are mild disease)
#   - among a patient's eligible timepoints, take the one closest to day 15
in_peak_window = (specimens_df["timepoint"] >= 10) & (specimens_df["timepoint"] <= 45)
not_mild = specimens_df["disease_subtype"] != "Mild"

# Fresh 0..n-1 index so that the row labels returned by idxmin() below can be passed straight to .loc
peak_candidates = specimens_df[in_peak_window & not_mild].reset_index(drop=True)

# Absolute distance of each candidate timepoint from day 15
peak_candidates["timepoint_diff_from_15"] = (peak_candidates["timepoint"] - 15).abs()

# One row per participant: the row label of the smallest distance within each group
closest_row_labels = peak_candidates.groupby("participant_label", observed=True)[
    "timepoint_diff_from_15"
].idxmin()
peak_timepoint_per_patient = peak_candidates.loc[closest_row_labels].assign(is_peak=True)
peak_timepoint_per_patient

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name,participant_label,timepoint_diff_from_15,is_peak
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,Asian,Kim,Kim_A,2,True
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,Asian,Kim,Kim_B,4,True
5,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,Covid19,15,Asian,Kim,Kim_C,0,True
6,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,Covid19,28,Asian,Kim,Kim_D,13,True
7,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,Covid19,23,Asian,Kim,Kim_E,8,True
9,5f21e818e1adeb2edc126148,PRJNA648677,F,F,40.0,Chinese,Case,Covid19,14,Asian,Kim,Kim_F,1,True
11,5f21e819e1adeb2edc12614b,PRJNA648677,G,F,59.0,Korean,Case,Covid19,22,Asian,Kim,Kim_G,7,True
33,6162506766228853226-242ac116-0001-012,PRJNA645245,10,M,27.0,,Moderate,Covid19,14,,Montague,Montague_10,1,True
35,6235177612877173226-242ac116-0001-012,PRJNA645245,11,F,20.0,,Moderate,Covid19,15,,Montague,Montague_11,0,True
39,6301749605965173226-242ac116-0001-012,PRJNA645245,12,F,48.0,,Moderate,Covid19,11,,Montague,Montague_12,4,True


In [35]:
# Note that peak timepoint may have many replicates!
# Broadcast each participant's chosen peak timepoint to every specimen (replicate) taken at that
# timepoint, by left-joining on (participant_label, timepoint).
specimens_with_peak = pd.merge(
    # Drop any existing flag so that re-running this cell does not produce is_peak_x / is_peak_y.
    specimens_df.drop(columns=["is_peak"], errors="ignore"),
    peak_timepoint_per_patient[["participant_label", "timepoint", "is_peak"]],
    on=["participant_label", "timepoint"],
    how="left",
    # At most one chosen row per (participant, timepoint) on the right, so the join cannot add rows.
    validate="m:1",
)
# Specimens without a match get NaN from the left join; turn that into an explicit boolean False.
# Assign the result back rather than calling fillna(..., inplace=True) on the selected column:
# the in-place call on a column selection is chained assignment and is not guaranteed to
# write through to the DataFrame.
specimens_with_peak["is_peak"] = (
    specimens_with_peak["is_peak"].fillna(False).astype(bool)
)
assert specimens_with_peak.shape[0] == specimens_df.shape[0]
specimens_df = specimens_with_peak
specimens_df

Unnamed: 0,specimen_label,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,ethnicity_condensed,study_name,participant_label,is_peak
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,Covid19,11,Asian,Kim,Kim_A,False
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,Asian,Kim,Kim_A,True
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,Covid19,45,Asian,Kim,Kim_A,False
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,Covid19,10,Asian,Kim,Kim_B,False
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,Asian,Kim,Kim_B,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,6768183054310773226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,30,,Montague,Montague_18,True
92,6781110905871733226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,30,,Montague,Montague_18,True
93,6795112499256693226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,8,,Montague,Montague_18,False
94,6821827195837813226-242ac116-0001-012,PRJNA645245,18,F,62.0,,Severe,Covid19,8,,Montague,Montague_18,False


In [36]:
# Some patients have no peak timepoint chosen: compare the total number of participants
# with the number that have at least one specimen flagged as peak.
total_participants = specimens_df["participant_label"].nunique()
participants_with_peak_count = specimens_df.loc[
    specimens_df["is_peak"], "participant_label"
].nunique()
total_participants, participants_with_peak_count

(26, 22)

In [37]:
# Which participants have no specimen flagged as peak?
every_participant = set(specimens_df["participant_label"])
participants_having_peak = set(
    specimens_df.loc[specimens_df["is_peak"], "participant_label"]
)
every_participant - participants_having_peak

{'Montague_1', 'Montague_14', 'Montague_19', 'Montague_2'}

In [38]:
# Number of replicates flagged as peak for each participant's chosen timepoint, largest first
# (many participants are expected to have more than one replicate).
peak_specimens = specimens_df.loc[specimens_df["is_peak"]]
peak_replicate_counts = peak_specimens.groupby(["participant_label", "timepoint"]).size()
peak_replicate_counts.sort_values(ascending=False)

participant_label  timepoint
Montague_18        30           3
Montague_3         38           3
Montague_15        39           2
Montague_8         14           2
Montague_4         18           2
Montague_10        14           2
Montague_11        15           2
Montague_12        11           2
Montague_13        13           2
Montague_16        44           2
Montague_17        22           2
Montague_7         16           1
Montague_6         13           1
Montague_5         10           1
Kim_A              17           1
Kim_B              19           1
Kim_G              22           1
Kim_F              14           1
Kim_E              23           1
Kim_D              28           1
Kim_C              15           1
Montague_9         18           1
dtype: int64

In [39]:
# List the specimens flagged as peak, ordered by participant and then timepoint.
peak_view_columns = ["participant_label", "timepoint", "is_peak"]
specimens_df.loc[specimens_df["is_peak"], peak_view_columns].sort_values(
    ["participant_label", "timepoint"]
)

Unnamed: 0,participant_label,timepoint,is_peak
1,Kim_A,17,True
4,Kim_B,19,True
6,Kim_C,15,True
8,Kim_D,28,True
9,Kim_E,23,True
12,Kim_F,14,True
15,Kim_G,22,True
52,Montague_10,14,True
53,Montague_10,14,True
56,Montague_11,15,True


## export

In [40]:
# Per-specimen export: specimen ID, owning participant, numeric timepoint and the peak flag,
# de-duplicated and ordered by participant then timepoint. Written as TSV without the index.
specimen_export_columns = ["specimen_label", "participant_label", "timepoint", "is_peak"]
specimen_metadata_extra = specimens_df[specimen_export_columns].drop_duplicates()
specimen_metadata_extra = specimen_metadata_extra.sort_values(
    ["participant_label", "timepoint"]
)

specimen_metadata_path = (
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.specimen_metadata_extra.tsv"
)
specimen_metadata_extra.to_csv(specimen_metadata_path, sep="\t", index=None)
specimen_metadata_extra

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak
0,5f21e814e1adeb2edc12613c,Kim_A,11,False
1,5f21e814e1adeb2edc12613d,Kim_A,17,True
2,5f21e815e1adeb2edc12613e,Kim_A,45,False
3,5f21e815e1adeb2edc12613f,Kim_B,10,False
4,5f21e815e1adeb2edc126140,Kim_B,19,True
...,...,...,...,...
47,6028546736266613226-242ac116-0001-012,Montague_8,32,False
44,5977479575117173226-242ac116-0001-012,Montague_8,37,False
45,5994186997898613226-242ac116-0001-012,Montague_8,37,False
51,6148719921208693226-242ac116-0001-012,Montague_9,5,False


In [41]:
# Per-participant export: collapse the specimen table down to the participant-level columns.
participant_columns = [
    "participant_label",
    "study_id",
    "patient_id_within_study",
    "sex",
    "age",
    "ethnicity_condensed",
    "disease_subtype",
    "disease",
    "study_name",
]
participant_df = specimens_df[participant_columns].drop_duplicates()

# drop_duplicates() only collapses rows that are identical in every selected column. If a
# participant's specimens disagreed on any of these fields, that participant would silently
# get several rows in the exported table, so check the key explicitly.
assert participant_df[
    "participant_label"
].is_unique, "participant-level metadata is inconsistent across a participant's specimens"

participant_df.to_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.participant_metadata.tsv",
    sep="\t",
    # index is documented as a boolean; False omits the row index from the file
    index=False,
)
participant_df

Unnamed: 0,participant_label,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name
0,Kim_A,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
3,Kim_B,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
5,Kim_C,PRJNA648677,C,F,53.0,Asian,Case,Covid19,Kim
7,Kim_D,PRJNA648677,D,M,24.0,Asian,Case,Covid19,Kim
9,Kim_E,PRJNA648677,E,M,48.0,Asian,Case,Covid19,Kim
12,Kim_F,PRJNA648677,F,F,40.0,Asian,Case,Covid19,Kim
14,Kim_G,PRJNA648677,G,F,59.0,Asian,Case,Covid19,Kim
16,Montague_1,PRJNA645245,1,F,62.0,,Mild,Covid19,Montague
20,Montague_2,PRJNA645245,2,F,37.0,,Mild,Covid19,Montague
27,Montague_3,PRJNA645245,3,M,47.0,,Moderate,Covid19,Montague


In [42]:
# (rows, columns) of the two exported tables: participant-level, then specimen-level.
participant_df.shape, specimen_metadata_extra.shape

((26, 9), (96, 4))

# now for briney-healthy

we already have `patient_id` in `ireceptor_data.briney_healthy_sequences`

Soon we will add more replicates (i.e. more samples), though; see the `repertoire_id` TODO comments.

for now, just add some simple metadata to create an "all external participants" metadata file

In [43]:
# Load the Briney healthy-cohort metadata, name the repertoire ID column like the covid cohorts
# ("specimen_label"), and add the constant columns used for healthy donors.
briney_metadata_path = (
    config.paths.base_data_dir
    / "external_cohorts/raw_data/briney_healthy_as_part_tables/exported.metadata.tsv"
)
briney_patients = (
    pd.read_csv(briney_metadata_path, sep="\t")
    .rename(columns={"repertoire_id": "specimen_label"})
    # all healthy are "peak" and 0 timepoint
    .assign(disease=healthy_label, is_peak=True, timepoint=0)
)
print(briney_patients.shape)
briney_patients

(7, 6)


Unnamed: 0,participant_label,specimen_label,study_name,disease,is_peak,timepoint
0,D103,D103_1,Briney,Healthy/Background,True,0
1,326797,326797_1,Briney,Healthy/Background,True,0
2,327059,327059_1,Briney,Healthy/Background,True,0
3,326650,326650_1,Briney,Healthy/Background,True,0
4,326737,326737_1,Briney,Healthy/Background,True,0
5,326780,326780_1,Briney,Healthy/Background,True,0
6,316188,316188_1,Briney,Healthy/Background,True,0


In [44]:
# Original paper table is wrong: 326907 is listed twice with different values; 326737 is missing. One of the dupes should be 326737.
# Fixed based on:
# https://www.ncbi.nlm.nih.gov/biosample/10331432
# https://www.ncbi.nlm.nih.gov/biosample/10331429
demographics_path = config.paths.metadata_dir / "briney_demographics.csv"
briney_demographics = pd.read_csv(demographics_path)

# Each subject must appear exactly once in the corrected table.
subject_is_repeated = briney_demographics["subject"].duplicated()
assert not subject_is_repeated.any()
print(briney_demographics.shape)

# Abbreviate sex to the single-letter codes used for the other cohorts.
sex_abbreviations = {"male": "M", "female": "F"}
briney_demographics["sex"] = briney_demographics["sex"].replace(sex_abbreviations)
briney_demographics

(10, 4)


Unnamed: 0,subject,age,sex,ethnicity
0,316188,30,F,African American
1,326650,18,F,Caucasian
2,326651,19,M,African American
3,326713,25,F,African American
4,326780,29,M,Caucasian
5,326797,21,F,Caucasian
6,326737,29,M,Caucasian
7,326907,29,F,African American
8,327059,26,M,African American / Caucasian
9,D103,25,M,Caucasian


In [45]:
# Tally the raw ethnicity values to see which categories need to be mapped.
briney_demographics["ethnicity"].value_counts()

Caucasian                       5
African American                4
African American / Caucasian    1
Name: ethnicity, dtype: int64

In [46]:
# Map raw ethnicity onto the condensed categories. The mixed "African American / Caucasian"
# value is set to missing; values not listed here (e.g. "Caucasian") pass through unchanged.
ethnicity_condensed_map = {
    "African American": "African",
    "African American / Caucasian": np.nan,
}
briney_demographics["ethnicity_condensed"] = briney_demographics["ethnicity"].replace(
    ethnicity_condensed_map
)
briney_demographics["ethnicity_condensed"].value_counts()

Caucasian    5
African      4
Name: ethnicity_condensed, dtype: int64

In [47]:
# Count how many subjects have a missing (True) versus present (False) condensed ethnicity.
briney_demographics["ethnicity_condensed"].isna().value_counts()

False    9
True     1
Name: ethnicity_condensed, dtype: int64

In [48]:
# Are there any non-NaN ethnicity values that we did not remap?
# Show the raw ethnicity of every row whose condensed ethnicity is missing.
condensed_is_missing = briney_demographics["ethnicity_condensed"].isna()
briney_demographics.loc[condensed_is_missing, "ethnicity"].value_counts()

African American / Caucasian    1
Name: ethnicity, dtype: int64

In [49]:
# Attach the demographics (keyed by subject) to each Briney participant.
# The left join keeps every participant row; validate="1:1" makes pandas raise if the join
# keys are duplicated on either side.
demographics_by_subject = briney_demographics.set_index("subject")
briney_patients = briney_patients.merge(
    demographics_by_subject,
    left_on="participant_label",
    right_index=True,
    how="left",
    validate="1:1",
)
briney_patients

Unnamed: 0,participant_label,specimen_label,study_name,disease,is_peak,timepoint,age,sex,ethnicity,ethnicity_condensed
0,D103,D103_1,Briney,Healthy/Background,True,0,25,M,Caucasian,Caucasian
1,326797,326797_1,Briney,Healthy/Background,True,0,21,F,Caucasian,Caucasian
2,327059,327059_1,Briney,Healthy/Background,True,0,26,M,African American / Caucasian,
3,326650,326650_1,Briney,Healthy/Background,True,0,18,F,Caucasian,Caucasian
4,326737,326737_1,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian
5,326780,326780_1,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian
6,316188,316188_1,Briney,Healthy/Background,True,0,30,F,African American,African


In [50]:
# Write the healthy-cohort participant metadata as TSV, without the DataFrame index.
healthy_metadata_path = (
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_bcr.participant_metadata.tsv"
)
briney_patients.to_csv(healthy_metadata_path, sep="\t", index=None)

In [51]:
# Stack the covid and healthy participants, keeping only the columns both tables share.
# The original row indexes of the two frames are kept as-is.
shared_participant_columns = ["participant_label", "disease", "study_name"]
participant_df_plus_briney = pd.concat(
    [cohort[shared_participant_columns] for cohort in (participant_df, briney_patients)],
    axis=0,
)
participant_df_plus_briney

Unnamed: 0,participant_label,disease,study_name
0,Kim_A,Covid19,Kim
3,Kim_B,Covid19,Kim
5,Kim_C,Covid19,Kim
7,Kim_D,Covid19,Kim
9,Kim_E,Covid19,Kim
12,Kim_F,Covid19,Kim
14,Kim_G,Covid19,Kim
16,Montague_1,Covid19,Montague
20,Montague_2,Covid19,Montague
27,Montague_3,Covid19,Montague


In [52]:
# Write the combined (covid + healthy) participant metadata as TSV, without the DataFrame index.
all_participants_path = (
    config.paths.metadata_dir
    / "generated.external_cohorts.all_bcr.participant_metadata.tsv"
)
participant_df_plus_briney.to_csv(all_participants_path, sep="\t", index=None)

# what to do with this metadata

Conceivably we could stick these back into the database for richer joins:

- `participant_df` becomes `ireceptor_data.external_patients`:
   - patient ID primary key
   - extra metadata columns summarized from per-sample metadata entries -- e.g. ethnicity, age.
   - can also include briney-healthy individuals
- `specimen_metadata_extra` can be a complementary table to `ireceptor_data.covid_metadata`:
   - `repertoire_id` primary key that's 1-to-1 with `ireceptor_data.covid_metadata`
   - patient ID foreign key that's many-to-1 with `external_patients`

For now, we'll avoid this and just do the joins in our post-processing in Python.