In [1]:
import sys
import os
import numpy as np
import pandas as pd

In [2]:
import glob

In [3]:
from malid import config, helpers
from malid.datamodels import healthy_label

In [4]:
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# produce metadata about external cohorts

produces:

- all cohorts
    - `metadata/generated.external_cohorts.all_bcr.participant_metadata.tsv`
- covid
    - `metadata/generated.external_cohorts.covid19_bcr.specimen_metadata_extra.tsv`
    - `metadata/generated.external_cohorts.covid19_bcr.participant_metadata.tsv`
- healthy
    - `metadata/generated.external_cohorts.healthy_bcr.participant_metadata.tsv`

# external covid cohorts

## load metadata

`repertoire_id`'s are like specimen IDs. they are many-to-one with patient IDs. below, we will introduce actual patient IDs for these external cohorts.

In [5]:
df = pd.read_csv(
    config.paths.external_raw_data / "Kim" / "airr_covid19_metadata.tsv",
    sep="\t",
)
df.shape

(184, 103)

In [6]:
df.head()

Unnamed: 0,repertoire_id,repertoire_name,repertoire_description,study.study_id,study.study_title,study.study_type.label,study.study_type.id,study.study_description,study.inclusion_exclusion_criteria,study.grants,study.collected_by,study.lab_name,study.lab_address,study.submitted_by,study.pub_ids,study.keywords_study.0,subject.subject_id,subject.synthetic,subject.species.label,subject.species.id,subject.organism.label,subject.organism.id,subject.sex,subject.age,subject.age_min,subject.age_max,subject.age_unit.label,subject.age_unit.id,subject.age_event,subject.ancestry_population,subject.ethnicity,subject.race,subject.strain_name,subject.linked_subjects,subject.link_type,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,subject.diagnosis.0.disease_diagnosis.id,subject.diagnosis.0.disease_length,subject.diagnosis.0.disease_stage,subject.diagnosis.0.prior_therapies,subject.diagnosis.0.immunogen,subject.diagnosis.0.intervention,subject.diagnosis.0.medical_history,sample.0.sample_processing_id,sample.0.sample_id,sample.0.sample_type,sample.0.tissue.label,sample.0.tissue.id,sample.0.anatomic_site,sample.0.disease_state_sample,sample.0.collection_time_point_relative,sample.0.collection_time_point_reference,sample.0.biomaterial_provider,sample.0.tissue_processing,sample.0.cell_subset.label,sample.0.cell_subset.id,sample.0.cell_phenotype,sample.0.cell_species.label,sample.0.cell_species.id,sample.0.single_cell,sample.0.cell_number,sample.0.cells_per_reaction,sample.0.cell_storage,sample.0.cell_quality,sample.0.cell_isolation,sample.0.cell_processing_protocol,sample.0.template_class,sample.0.template_quality,sample.0.template_amount,sample.0.library_generation_method,sample.0.library_generation_protocol,sample.0.library_generation_kit_version,sample.0.pcr_target.0.pcr_target_locus,sample.0.pcr_target.0.forward_pcr_primer_target_location,sample.0.pcr_target.0.reverse_pcr_primer_target_location,sample.0.complete_sequences,sample.0.physical_linkage,sample.0.sequencing_run_id,sample.0.total_reads_passing_qc_filter,sample.0.sequencing_platform,sample.0.sequencing_facility,sample.0.sequencing_run_date,sample.0.sequencing_kit,sample.0.sequencing_files.file_type,sample.0.sequencing_files.filename,sample.0.sequencing_files.read_direction,sample.0.sequencing_files.read_length,sample.0.sequencing_files.paired_filename,sample.0.sequencing_files.paired_read_direction,sample.0.sequencing_files.paired_read_length,data_processing.0.data_processing_id,data_processing.0.primary_annotation,data_processing.0.software_versions,data_processing.0.paired_reads_assembly,data_processing.0.quality_thresholds,data_processing.0.primer_match_cutoffs,data_processing.0.collapsing_method,data_processing.0.data_processing_protocols,data_processing.0.data_processing_files.0,data_processing.0.germline_database,data_processing.0.analysis_provenance_id,study.keywords_study.1
0,5ed6859e99011334ac05e847,,,PRJNA628125,B cell clonal expansion and convergent antibod...,Case-Control Study,NCIT:C15197,COVID-19 Study,,NIH/NIAID T32AI007502-23; NIH/NHLBI K23HL12566...,"S. Boyd, sboyd1@stanford.edu","Scott Boyd Lab, Department of Pathology",Stanford University,"S. Nielsen, F. Yang, S. Boyd",DOI:10.21203/rs.3.rs-27220/v1,contains_ig,7450,False,Homo sapiens,NCBITAXON:9606,,,female,,73.0,73.0,year,UO:0000036,Sample Collection,,,,,,,Case,COVID-19,DOID:0080600,,COVID_active,,,,,5ed6859e99011334ac05e847,M369-S008,Venipuncture blood samples were collected in K...,venous blood,UBERON:0013756,,,0 d,Hospital Admission,"Scott Boyd lab, R214 Edwards, 300 Pasteur Driv...",,,,,Homo sapiens,NCBITAXON:9606,False,355342.0,,False,,,The AllPrep DNA/RNA kit (Qiagen) was used to e...,DNA,,100 ng/library,PCR,"For each blood sample, six independent gDNA li...",,IGH,IGHV_FR1 or IGHV_FR2,IGHJ,partial,none,,289023.0,Illumina MiSeq (PE300),,,,fasta,SRR11610494.fasta,,317.0,,,,5ed6859e99011334ac05e847,True,"fastx-toolkit/0.0.14, IgBLAST/1.16.0",FLASH20,,,,Adaptors and primers removed by authors. Igbla...,SRR11610494.fasta.fmt19-filtered.fmt19,IMGT,,
1,5ed685a099011334ac05e848,,,PRJNA628125,B cell clonal expansion and convergent antibod...,Case-Control Study,NCIT:C15197,COVID-19 Study,,NIH/NIAID T32AI007502-23; NIH/NHLBI K23HL12566...,"S. Boyd, sboyd1@stanford.edu","Scott Boyd Lab, Department of Pathology",Stanford University,"S. Nielsen, F. Yang, S. Boyd",DOI:10.21203/rs.3.rs-27220/v1,contains_ig,7451,False,Homo sapiens,NCBITAXON:9606,,,male,,62.0,62.0,year,UO:0000036,Sample Collection,,,,,,,Case,COVID-19,DOID:0080600,,COVID_active,,,,,5ed685a099011334ac05e848,M369-S009,Venipuncture blood samples were collected in K...,venous blood,UBERON:0013756,,,0 d,Hospital Admission,"Scott Boyd lab, R214 Edwards, 300 Pasteur Driv...",,,,,Homo sapiens,NCBITAXON:9606,False,347058.0,,False,,,The AllPrep DNA/RNA kit (Qiagen) was used to e...,DNA,,100 ng/library,PCR,"For each blood sample, six independent gDNA li...",,IGH,IGHV_FR1 or IGHV_FR2,IGHJ,partial,none,,279041.0,Illumina MiSeq (PE300),,,,fasta,SRR11610493.fasta,,317.0,,,,5ed685a099011334ac05e848,True,"fastx-toolkit/0.0.14, IgBLAST/1.16.0",FLASH20,,,,Adaptors and primers removed by authors. Igbla...,SRR11610493.fasta.fmt19-filtered.fmt19,IMGT,,
2,5ed685a099011334ac05e849,,,PRJNA628125,B cell clonal expansion and convergent antibod...,Case-Control Study,NCIT:C15197,COVID-19 Study,,NIH/NIAID T32AI007502-23; NIH/NHLBI K23HL12566...,"S. Boyd, sboyd1@stanford.edu","Scott Boyd Lab, Department of Pathology",Stanford University,"S. Nielsen, F. Yang, S. Boyd",DOI:10.21203/rs.3.rs-27220/v1,contains_ig,7452,False,Homo sapiens,NCBITAXON:9606,,,female,,61.0,61.0,year,UO:0000036,Sample Collection,,,,,,,Case,COVID-19,DOID:0080600,,COVID_active,,,,,5ed685a099011334ac05e849,M369-S010,Venipuncture blood samples were collected in K...,venous blood,UBERON:0013756,,,0 d,Hospital Admission,"Scott Boyd lab, R214 Edwards, 300 Pasteur Driv...",,,,,Homo sapiens,NCBITAXON:9606,False,365783.0,,False,,,The AllPrep DNA/RNA kit (Qiagen) was used to e...,DNA,,100 ng/library,PCR,"For each blood sample, six independent gDNA li...",,IGH,IGHV_FR1 or IGHV_FR2,IGHJ,partial,none,,283204.0,Illumina MiSeq (PE300),,,,fasta,SRR11610492.fasta,,317.0,,,,5ed685a099011334ac05e849,True,"fastx-toolkit/0.0.14, IgBLAST/1.16.0",FLASH20,,,,Adaptors and primers removed by authors. Igbla...,SRR11610492.fasta.fmt19-filtered.fmt19,IMGT,,
3,5ed685a099011334ac05e84a,,,PRJNA628125,B cell clonal expansion and convergent antibod...,Case-Control Study,NCIT:C15197,COVID-19 Study,,NIH/NIAID T32AI007502-23; NIH/NHLBI K23HL12566...,"S. Boyd, sboyd1@stanford.edu","Scott Boyd Lab, Department of Pathology",Stanford University,"S. Nielsen, F. Yang, S. Boyd",DOI:10.21203/rs.3.rs-27220/v1,contains_ig,7453-D0,False,Homo sapiens,NCBITAXON:9606,,,male,,64.0,64.0,year,UO:0000036,Sample Collection,,,,,,,Case,COVID-19,DOID:0080600,,COVID_active,,,,,5ed685a099011334ac05e84a,M369-S011,Venipuncture blood samples were collected in K...,venous blood,UBERON:0013756,,,0 d,Hospital Admission,"Scott Boyd lab, R214 Edwards, 300 Pasteur Driv...",,,,,Homo sapiens,NCBITAXON:9606,False,349566.0,,False,,,The AllPrep DNA/RNA kit (Qiagen) was used to e...,DNA,,25 ng/library,PCR,"For each blood sample, six independent gDNA li...",,IGH,IGHV_FR1 or IGHV_FR2,IGHJ,partial,none,,276877.0,Illumina MiSeq (PE300),,,,fasta,SRR11610503.fasta,,317.0,,,,5ed685a099011334ac05e84a,True,"fastx-toolkit/0.0.14, IgBLAST/1.16.0",FLASH20,,,,Adaptors and primers removed by authors. Igbla...,SRR11610503.fasta.fmt19-filtered.fmt19,IMGT,,
4,5ed685a099011334ac05e84b,,,PRJNA628125,B cell clonal expansion and convergent antibod...,Case-Control Study,NCIT:C15197,COVID-19 Study,,NIH/NIAID T32AI007502-23; NIH/NHLBI K23HL12566...,"S. Boyd, sboyd1@stanford.edu","Scott Boyd Lab, Department of Pathology",Stanford University,"S. Nielsen, F. Yang, S. Boyd",DOI:10.21203/rs.3.rs-27220/v1,contains_ig,7453-D2,False,Homo sapiens,NCBITAXON:9606,,,male,,64.0,64.0,year,UO:0000036,Sample Collection,,,,,,,Case,COVID-19,DOID:0080600,,COVID_active,,,,,5ed685a099011334ac05e84b,M369-S012,Venipuncture blood samples were collected in K...,venous blood,UBERON:0013756,,,2 d,Hospital Admission,"Scott Boyd lab, R214 Edwards, 300 Pasteur Driv...",,,,,Homo sapiens,NCBITAXON:9606,False,398829.0,,False,,,The AllPrep DNA/RNA kit (Qiagen) was used to e...,DNA,,100 ng/library,PCR,"For each blood sample, six independent gDNA li...",,IGH,IGHV_FR1 or IGHV_FR2,IGHJ,partial,none,,321208.0,Illumina MiSeq (PE300),,,,fasta,SRR11610502.fasta,,317.0,,,,5ed685a099011334ac05e84b,True,"fastx-toolkit/0.0.14, IgBLAST/1.16.0",FLASH20,,,,Adaptors and primers removed by authors. Igbla...,SRR11610502.fasta.fmt19-filtered.fmt19,IMGT,,


In [7]:
df[
    [
        "repertoire_id",
        "study.study_id",
        "subject.subject_id",
        "sample.0.collection_time_point_relative",
        "sample.0.collection_time_point_reference",
    ]
].head()

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,sample.0.collection_time_point_relative,sample.0.collection_time_point_reference
0,5ed6859e99011334ac05e847,PRJNA628125,7450,0 d,Hospital Admission
1,5ed685a099011334ac05e848,PRJNA628125,7451,0 d,Hospital Admission
2,5ed685a099011334ac05e849,PRJNA628125,7452,0 d,Hospital Admission
3,5ed685a099011334ac05e84a,PRJNA628125,7453-D0,0 d,Hospital Admission
4,5ed685a099011334ac05e84b,PRJNA628125,7453-D2,2 d,Hospital Admission


In [8]:
df["sample.0.collection_time_point_relative"].value_counts()

Sample collection     19
0 d                   12
14 d                  12
8 d                    6
25 d                   6
7 d                    6
17 d                   6
13 d                   6
5 d                    6
16 d                   6
d20                    5
Less than 7 days       5
d39                    5
More than 14 days      5
d23                    4
1 d                    4
d11                    3
d16                    3
d45                    3
d6                     3
d37                    3
d9                     3
d15                    3
d36                    2
d46                    2
d51                    2
d14                    2
d28                    2
d18                    2
d22                    2
d44                    2
2 d                    2
d1                     2
d5                     2
d24                    2
d31                    2
d27                    2
d38                    2
d40                    1
d19                    1


In [9]:
df["sample.0.collection_time_point_reference"].value_counts()

first_symptoms                  67
Symptom onset                   58
0 d                             19
Symptom onset                   16
Hospital Admission              14
First negative COVID-19 test    10
Name: sample.0.collection_time_point_reference, dtype: int64

In [10]:
assert not df["sample.0.collection_time_point_relative"].isna().any()
assert not df["sample.0.collection_time_point_reference"].isna().any()

In [11]:
df.columns

Index(['repertoire_id', 'repertoire_name', 'repertoire_description', 'study.study_id', 'study.study_title', 'study.study_type.label', 'study.study_type.id', 'study.study_description', 'study.inclusion_exclusion_criteria', 'study.grants',
       ...
       'data_processing.0.software_versions', 'data_processing.0.paired_reads_assembly', 'data_processing.0.quality_thresholds', 'data_processing.0.primer_match_cutoffs', 'data_processing.0.collapsing_method', 'data_processing.0.data_processing_protocols', 'data_processing.0.data_processing_files.0', 'data_processing.0.germline_database', 'data_processing.0.analysis_provenance_id', 'study.keywords_study.1'], dtype='object', length=103)

In [12]:
print("\n".join(df.columns))

repertoire_id
repertoire_name
repertoire_description
study.study_id
study.study_title
study.study_type.label
study.study_type.id
study.study_description
study.inclusion_exclusion_criteria
study.grants
study.collected_by
study.lab_name
study.lab_address
study.submitted_by
study.pub_ids
study.keywords_study.0
subject.subject_id
subject.synthetic
subject.species.label
subject.species.id
subject.organism.label
subject.organism.id
subject.sex
subject.age
subject.age_min
subject.age_max
subject.age_unit.label
subject.age_unit.id
subject.age_event
subject.ancestry_population
subject.ethnicity
subject.race
subject.strain_name
subject.linked_subjects
subject.link_type
subject.diagnosis.0.study_group_description
subject.diagnosis.0.disease_diagnosis.label
subject.diagnosis.0.disease_diagnosis.id
subject.diagnosis.0.disease_length
subject.diagnosis.0.disease_stage
subject.diagnosis.0.prior_therapies
subject.diagnosis.0.immunogen
subject.diagnosis.0.intervention
subject.diagnosis.0.medical_history

In [13]:
df[df.columns[df.columns.str.startswith("study.")]].drop_duplicates()

Unnamed: 0,study.study_id,study.study_title,study.study_type.label,study.study_type.id,study.study_description,study.inclusion_exclusion_criteria,study.grants,study.collected_by,study.lab_name,study.lab_address,study.submitted_by,study.pub_ids,study.keywords_study.0,study.keywords_study.1
0,PRJNA628125,B cell clonal expansion and convergent antibod...,Case-Control Study,NCIT:C15197,COVID-19 Study,,NIH/NIAID T32AI007502-23; NIH/NHLBI K23HL12566...,"S. Boyd, sboyd1@stanford.edu","Scott Boyd Lab, Department of Pathology",Stanford University,"S. Nielsen, F. Yang, S. Boyd",DOI:10.21203/rs.3.rs-27220/v1,contains_ig,
14,PRJNA638224,Deep sequencing of B cell receptor repertoires...,Case-Control Study,NCIT:C15197,To drive a deeper understanding of the nature ...,,Engineering and Physical Sciences Research Cou...,"jake@alchemab.com, jane@alchemab.com",Alchemab Therapeutics Ltd,"55-56 Russell Square, London, WC1B 4HP, UK","Jacob Galson, jake@alchemab.com",DOI:10.1101/2020.05.20.106294,contains_ig,
33,IR-Binder-000001,Next Generation Sequencing of T and B cell rec...,Study,NCIT:C63536,Immunological profiling of Covid-19 patients w...,,,"Mascha.Binder@uk-halle.de, Donjete.Simnica@uk-...","Department of Internal Medicine IV, Oncology/H...",Martin-Luther-University Halle-Wittenberg,Edith.Willscher@uk-halle.de,DOI: 10.1016/j.immuni.2020.06.024,contains_ig,
100,PRJNA648677,Stereotypic Neutralizing VH Clonotypes Against...,Case-Control Study,NCIT:C15197,"In response to SARS-CoV-2 infection, most huma...",,The National Research Foundation of Korea [NRF...,Sunghoon Kwon skwon@snu.ac.kr; Junho Chung jj...,Department of Biochemistry and Molecular Biology,Seoul National University College of Medicine,"Sang Il Kim, Jinsung Noh, Sunghoon Kwon, Junho...",DOI: 10.1101/2020.06.26.174557,contains_ig,
116,PRJNA630455,Comprehensive mapping of immune perturbations ...,Case-Control Study,NCIT:C15197,Identified extensive induction and activation ...,Two severe COVID-19+ individuals were excluded...,The University of Pennsylvania Institute for I...,"Leticia Kuri-Cervantes, M. Betina Pampena, E. ...",Department of Microbiology and Institute for I...,"Perelman School of Medicine, University of Pen...",Michael R. Betts (betts@pennmedicine.upenn.edu),PMID: 32669287,contains_ig,
174,PRJCA002413,Immune cell profiling of COVID-19 patients in ...,Case-Control Study,NCIT:C15197,An inflammatory immune signature in the early ...,Include: COVID,"National Natural Science Foundation of China, ...","xiaochuanle@126.com, cw0226@foxmail.com, hywan...",National Center for Liver Cancer,"Second Military Medical University, 200438 Sha...","Chuanle Xiao, xiaochuanle@126.com",PMID: 32377375,contains_ig,contains_paired_chain


In [14]:
study_names = {"PRJNA648677": "Kim"}  # could add more here
study_names

{'PRJNA648677': 'Kim'}

In [15]:
for study_id in study_names.keys():
    display(
        df[df["study.study_id"] == study_id][
            ["repertoire_id", "study.study_id"]
            + df.columns[df.columns.str.startswith("subject.")].tolist()
        ]
        .dropna(how="all", axis=1)
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.synthetic,subject.species.label,subject.species.id,subject.sex,subject.age_min,subject.age_max,subject.age_unit.label,subject.age_unit.id,subject.age_event,subject.race,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,subject.diagnosis.0.disease_diagnosis.id,subject.diagnosis.0.disease_length,subject.diagnosis.0.disease_stage,subject.diagnosis.0.prior_therapies,subject.diagnosis.0.medical_history
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,False,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,11 days since symptom onset,Extensive Pneumonic infiltrates,"Oxygen therapy, Lopinavir/ritonavir, Levofloxacin",
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,False,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,17 days since symptom onset,Extensive Pneumonic infiltrates,"Oxygen therapy, Lopinavir/ritonavir, Levofloxacin",
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,False,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,45 days since symptom onset,Extensive Pneumonic infiltrates,"Oxygen therapy, Lopinavir/ritonavir, Levofloxacin",
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,False,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,10 days since symptom onset,Limited Pneumonic infiltrates,,"diabetes mellitus, dyslipidemia, hypertension"
104,5f21e815e1adeb2edc126140,PRJNA648677,B,False,Homo sapiens,NCBITAXON:9606,male,55.0,55.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,19 days since symptom onset,Limited Pneumonic infiltrates,,"diabetes mellitus, dyslipidemia, hypertension"
105,5f21e816e1adeb2edc126141,PRJNA648677,C,False,Homo sapiens,NCBITAXON:9606,female,53.0,53.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,6 days since symptom onset,Limited Pneumonic infiltrates,,
106,5f21e816e1adeb2edc126142,PRJNA648677,C,False,Homo sapiens,NCBITAXON:9606,female,53.0,53.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,15 days since symptom onset,Limited Pneumonic infiltrates,,
107,5f21e816e1adeb2edc126143,PRJNA648677,D,False,Homo sapiens,NCBITAXON:9606,male,24.0,24.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,6 days since symptom onset,Limited Pneumonic infiltrates,,
108,5f21e817e1adeb2edc126144,PRJNA648677,D,False,Homo sapiens,NCBITAXON:9606,male,24.0,24.0,year,UO:0000036,Sample Collection,Korean,Case,COVID-19,DOID:0080600,28 days since symptom onset,Limited Pneumonic infiltrates,,
109,5f21e817e1adeb2edc126145,PRJNA648677,E,False,Homo sapiens,NCBITAXON:9606,male,48.0,48.0,year,UO:0000036,Sample Collection,Chinese,Case,COVID-19,DOID:0080600,23 days since symptom onset,Extensive Pneumonic infiltrates,Lopinavir/ritonavir,


In [16]:
for study_id in study_names.keys():
    display(
        df[df["study.study_id"] == study_id][
            ["repertoire_id", "study.study_id", "subject.subject_id"]
            + df.columns[df.columns.str.startswith("sample.")].tolist()
        ]
        .dropna(how="all", axis=1)
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,sample.0.sample_processing_id,sample.0.sample_id,sample.0.sample_type,sample.0.tissue.label,sample.0.tissue.id,sample.0.collection_time_point_relative,sample.0.collection_time_point_reference,sample.0.biomaterial_provider,sample.0.tissue_processing,sample.0.cell_species.label,sample.0.cell_species.id,sample.0.single_cell,sample.0.cell_storage,sample.0.cell_processing_protocol,sample.0.template_class,sample.0.template_amount,sample.0.library_generation_method,sample.0.library_generation_protocol,sample.0.pcr_target.0.pcr_target_locus,sample.0.pcr_target.0.forward_pcr_primer_target_location,sample.0.pcr_target.0.reverse_pcr_primer_target_location,sample.0.complete_sequences,sample.0.physical_linkage,sample.0.total_reads_passing_qc_filter,sample.0.sequencing_platform
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,5f21e814e1adeb2edc12613c,A_d11,chronological blood samples were drawn,blood,UBERON:0000178,d11,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,294633.0,Illumina MiSeq
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,5f21e814e1adeb2edc12613d,A_d17,chronological blood samples were drawn,blood,UBERON:0000178,d17,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,240012.0,Illumina MiSeq
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,5f21e815e1adeb2edc12613e,A_d45,chronological blood samples were drawn,blood,UBERON:0000178,d45,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,215355.0,Illumina MiSeq
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,5f21e815e1adeb2edc12613f,B_d10,chronological blood samples were drawn,blood,UBERON:0000178,d10,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,97819.0,Illumina MiSeq
104,5f21e815e1adeb2edc126140,PRJNA648677,B,5f21e815e1adeb2edc126140,B_d19,chronological blood samples were drawn,blood,UBERON:0000178,d19,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,250000.0,Illumina MiSeq
105,5f21e816e1adeb2edc126141,PRJNA648677,C,5f21e816e1adeb2edc126141,C_d6,chronological blood samples were drawn,blood,UBERON:0000178,d6,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,246327.0,Illumina MiSeq
106,5f21e816e1adeb2edc126142,PRJNA648677,C,5f21e816e1adeb2edc126142,C_d15,chronological blood samples were drawn,blood,UBERON:0000178,d15,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,250000.0,Illumina MiSeq
107,5f21e816e1adeb2edc126143,PRJNA648677,D,5f21e816e1adeb2edc126143,D_d6,chronological blood samples were drawn,blood,UBERON:0000178,d6,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,246333.0,Illumina MiSeq
108,5f21e817e1adeb2edc126144,PRJNA648677,D,5f21e817e1adeb2edc126144,D_d28,chronological blood samples were drawn,blood,UBERON:0000178,d28,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,142798.0,Illumina MiSeq
109,5f21e817e1adeb2edc126145,PRJNA648677,E,5f21e817e1adeb2edc126145,E_d23,chronological blood samples were drawn,blood,UBERON:0000178,d23,Symptom onset,Seoul National University Hospital,PBMCs and plasma were isolated using Lymphopre...,Homo sapiens,NCBITAXON:9606,False,False,The PBMCs were subjected to total RNA isolatio...,RNA,500 ng,RT(specific+UMI)+PCR,Sampled 5ug of mRNA was used to synthesize cDN...,IGH,IGHV,"IGHC1 of each isotype (IgM, IgD, IgG, IgA, and...",partial,none,205326.0,Illumina MiSeq


In [17]:
for study_id in study_names.keys():
    display(
        df[df["study.study_id"] == study_id][
            ["repertoire_id", "study.study_id", "subject.subject_id"]
            + [
                "subject.sex",
                "subject.age_min",
                "subject.race",
                "subject.diagnosis.0.study_group_description",
                "subject.diagnosis.0.disease_diagnosis.label",
            ]
            + ["sample.0.sample_id", "sample.0.collection_time_point_relative"]
        ]
        .dropna(how="all", axis=1)
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.sex,subject.age_min,subject.race,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label,sample.0.sample_id,sample.0.collection_time_point_relative
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,male,55.0,Korean,Case,COVID-19,A_d11,d11
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,male,55.0,Korean,Case,COVID-19,A_d17,d17
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,male,55.0,Korean,Case,COVID-19,A_d45,d45
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,male,55.0,Korean,Case,COVID-19,B_d10,d10
104,5f21e815e1adeb2edc126140,PRJNA648677,B,male,55.0,Korean,Case,COVID-19,B_d19,d19
105,5f21e816e1adeb2edc126141,PRJNA648677,C,female,53.0,Korean,Case,COVID-19,C_d6,d6
106,5f21e816e1adeb2edc126142,PRJNA648677,C,female,53.0,Korean,Case,COVID-19,C_d15,d15
107,5f21e816e1adeb2edc126143,PRJNA648677,D,male,24.0,Korean,Case,COVID-19,D_d6,d6
108,5f21e817e1adeb2edc126144,PRJNA648677,D,male,24.0,Korean,Case,COVID-19,D_d28,d28
109,5f21e817e1adeb2edc126145,PRJNA648677,E,male,48.0,Chinese,Case,COVID-19,E_d23,d23


## create patient IDs, and extract some patient-level metadata

In [18]:
# find the right columns...
for study_id in study_names.keys():
    display(
        df[df["study.study_id"] == study_id][
            ["repertoire_id", "study.study_id", "subject.subject_id"]
            + [
                "subject.sex",
                "subject.age_min",
                "subject.race",
                "subject.diagnosis.0.study_group_description",
                "subject.diagnosis.0.disease_diagnosis.label",
            ]
        ]
        .dropna(how="all", axis=1)
        .drop_duplicates()
    )

Unnamed: 0,repertoire_id,study.study_id,subject.subject_id,subject.sex,subject.age_min,subject.race,subject.diagnosis.0.study_group_description,subject.diagnosis.0.disease_diagnosis.label
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,male,55.0,Korean,Case,COVID-19
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,male,55.0,Korean,Case,COVID-19
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,male,55.0,Korean,Case,COVID-19
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,male,55.0,Korean,Case,COVID-19
104,5f21e815e1adeb2edc126140,PRJNA648677,B,male,55.0,Korean,Case,COVID-19
105,5f21e816e1adeb2edc126141,PRJNA648677,C,female,53.0,Korean,Case,COVID-19
106,5f21e816e1adeb2edc126142,PRJNA648677,C,female,53.0,Korean,Case,COVID-19
107,5f21e816e1adeb2edc126143,PRJNA648677,D,male,24.0,Korean,Case,COVID-19
108,5f21e817e1adeb2edc126144,PRJNA648677,D,male,24.0,Korean,Case,COVID-19
109,5f21e817e1adeb2edc126145,PRJNA648677,E,male,48.0,Chinese,Case,COVID-19


In [19]:
specimens_df = (
    df[df["study.study_id"].isin(study_names.keys())][
        [
            "repertoire_id",  # the internal repertoire ID
            "study.study_id",
            "subject.subject_id",
            "subject.sex",
            "subject.age_min",
            "subject.race",
            "subject.diagnosis.0.study_group_description",
            "subject.diagnosis.0.disease_diagnosis.label",
            "sample.0.collection_time_point_relative",
            "sample.0.sample_id",
        ]
    ]
    .dropna(how="all", axis=1)
    .drop_duplicates()
    .rename(
        columns={
            "subject.subject_id": "patient_id_within_study",
            "study.study_id": "study_id",
            "subject.sex": "sex",
            "subject.age_min": "age",
            "subject.race": "ethnicity",
            "subject.diagnosis.0.study_group_description": "disease_subtype",
            "subject.diagnosis.0.disease_diagnosis.label": "disease",
            "sample.0.collection_time_point_relative": "timepoint",
            "sample.0.sample_id": "specimen_label",
        }
    )
)
specimens_df["sex"] = specimens_df["sex"].replace({"male": "M", "female": "F"})
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d11,A_d11
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d17,A_d17
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d45,A_d45
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d10,B_d10
104,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d19,B_d19
105,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,d6,C_d6
106,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,d15,C_d15
107,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,d6,D_d6
108,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,d28,D_d28
109,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,COVID-19,d23,E_d23


In [20]:
specimens_df["ethnicity"].value_counts()

Korean     11
Chinese     5
Name: ethnicity, dtype: int64

In [21]:
specimens_df["ethnicity"].isna().value_counts()

False    16
Name: ethnicity, dtype: int64

In [22]:
# create ethnicity_condensed
specimens_df["ethnicity_condensed"] = specimens_df["ethnicity"].replace(
    {"Korean": "Asian", "Chinese": "Asian"}
)
specimens_df["ethnicity_condensed"].value_counts()

Asian    16
Name: ethnicity_condensed, dtype: int64

In [23]:
specimens_df["ethnicity_condensed"].isna().value_counts()

False    16
Name: ethnicity_condensed, dtype: int64

In [24]:
# Are there any non-NaN ethnicity values that we did not remap?
specimens_df[specimens_df["ethnicity_condensed"].isna()]["ethnicity"].value_counts()

Series([], Name: ethnicity, dtype: int64)

In [25]:
specimens_df["study_name"] = specimens_df["study_id"].replace(study_names)
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d11,A_d11,Asian,Kim
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d17,A_d17,Asian,Kim
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d45,A_d45,Asian,Kim
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d10,B_d10,Asian,Kim
104,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d19,B_d19,Asian,Kim
105,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,d6,C_d6,Asian,Kim
106,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,d15,C_d15,Asian,Kim
107,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,d6,D_d6,Asian,Kim
108,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,d28,D_d28,Asian,Kim
109,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,COVID-19,d23,E_d23,Asian,Kim


In [26]:
specimens_df["participant_label"] = (
    specimens_df["study_name"].str.strip()
    + "_"
    + specimens_df["patient_id_within_study"].str.strip()
)
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name,participant_label
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d11,A_d11,Asian,Kim,Kim_A
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d17,A_d17,Asian,Kim,Kim_A
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,d45,A_d45,Asian,Kim,Kim_A
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d10,B_d10,Asian,Kim,Kim_B
104,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,d19,B_d19,Asian,Kim,Kim_B
105,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,d6,C_d6,Asian,Kim,Kim_C
106,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,d15,C_d15,Asian,Kim,Kim_C
107,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,d6,D_d6,Asian,Kim,Kim_D
108,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,d28,D_d28,Asian,Kim,Kim_D
109,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,COVID-19,d23,E_d23,Asian,Kim,Kim_E


In [27]:
# extract number
specimens_df["timepoint"] = specimens_df["timepoint"].str.extract("(\d+)").astype(int)
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name,participant_label
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,11,A_d11,Asian,Kim,Kim_A
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,17,A_d17,Asian,Kim,Kim_A
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,COVID-19,45,A_d45,Asian,Kim,Kim_A
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,10,B_d10,Asian,Kim,Kim_B
104,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,COVID-19,19,B_d19,Asian,Kim,Kim_B
105,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,6,C_d6,Asian,Kim,Kim_C
106,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,COVID-19,15,C_d15,Asian,Kim,Kim_C
107,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,6,D_d6,Asian,Kim,Kim_D
108,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,COVID-19,28,D_d28,Asian,Kim,Kim_D
109,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,COVID-19,23,E_d23,Asian,Kim,Kim_E


In [28]:
specimens_df["disease"] = specimens_df["disease"].replace({"COVID-19": "Covid19"})

In [29]:
specimens_df.shape, specimens_df["participant_label"].nunique()

((16, 13), 7)

## Look at timepoints, decide which ones are peak

In [30]:
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name,participant_label
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,Covid19,11,A_d11,Asian,Kim,Kim_A
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,A_d17,Asian,Kim,Kim_A
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,Covid19,45,A_d45,Asian,Kim,Kim_A
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,Covid19,10,B_d10,Asian,Kim,Kim_B
104,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,B_d19,Asian,Kim,Kim_B
105,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,Covid19,6,C_d6,Asian,Kim,Kim_C
106,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,Covid19,15,C_d15,Asian,Kim,Kim_C
107,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,Covid19,6,D_d6,Asian,Kim,Kim_D
108,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,Covid19,28,D_d28,Asian,Kim,Kim_D
109,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,Covid19,23,E_d23,Asian,Kim,Kim_E


In [31]:
specimens_df.groupby("participant_label").size().sort_values(ascending=False)

participant_label
Kim_A    3
Kim_E    3
Kim_B    2
Kim_C    2
Kim_D    2
Kim_F    2
Kim_G    2
dtype: int64

In [32]:
specimens_df.groupby(["participant_label", "timepoint"]).size().sort_values(
    ascending=False
).to_frame("num_replicates").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_replicates
participant_label,timepoint,Unnamed: 2_level_1
Kim_A,11,1
Kim_A,17,1
Kim_A,45,1
Kim_B,10,1
Kim_B,19,1


In [33]:
# # can't do it this way because this will choose only one row as peak per patient, whereas we want all replicates from the peak timepoint to be marked as peak
# specimens_df['is_peak'] = False
# specimens_df.loc[specimens_df.groupby("participant_label", observed=True)["timepoint"].idxmax(), 'is_peak'] = True
# specimens_df['is_peak'].value_counts()

In [34]:
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name,participant_label
100,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,Covid19,11,A_d11,Asian,Kim,Kim_A
101,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,A_d17,Asian,Kim,Kim_A
102,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,Covid19,45,A_d45,Asian,Kim,Kim_A
103,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,Covid19,10,B_d10,Asian,Kim,Kim_B
104,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,B_d19,Asian,Kim,Kim_B
105,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,Covid19,6,C_d6,Asian,Kim,Kim_C
106,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,Covid19,15,C_d15,Asian,Kim,Kim_C
107,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,Covid19,6,D_d6,Asian,Kim,Kim_D
108,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,Covid19,28,D_d28,Asian,Kim,Kim_D
109,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,Covid19,23,E_d23,Asian,Kim,Kim_E


In [35]:
# choose peak timepoints per patient, with constraints on the timepoint range
# and exclude patients known to have mild disease (e.g. from Montague et al study, subjects 1-2 are mild disease)

# reset index to make sure .loc[idxmin] works properly
peak_timepoint_per_patient = (
    specimens_df[
        (specimens_df["timepoint"] >= 10)
        & (specimens_df["timepoint"] <= 45)
        & (specimens_df["disease_subtype"] != "Mild")
    ]
).reset_index(drop=True)


# choose the timepoint closest to day 15
# choose one row per group
peak_timepoint_per_patient["timepoint_diff_from_15"] = (
    peak_timepoint_per_patient["timepoint"] - 15
).abs()
peak_timepoint_per_patient = peak_timepoint_per_patient.loc[
    peak_timepoint_per_patient.groupby("participant_label", observed=True)[
        "timepoint_diff_from_15"
    ].idxmin()
].assign(is_peak=True)
peak_timepoint_per_patient

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name,participant_label,timepoint_diff_from_15,is_peak
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,A_d17,Asian,Kim,Kim_A,2,True
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,B_d19,Asian,Kim,Kim_B,4,True
5,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,Covid19,15,C_d15,Asian,Kim,Kim_C,0,True
6,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,Covid19,28,D_d28,Asian,Kim,Kim_D,13,True
7,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,Covid19,23,E_d23,Asian,Kim,Kim_E,8,True
9,5f21e818e1adeb2edc126148,PRJNA648677,F,F,40.0,Chinese,Case,Covid19,14,F_d14,Asian,Kim,Kim_F,1,True
11,5f21e819e1adeb2edc12614b,PRJNA648677,G,F,59.0,Korean,Case,Covid19,22,G_d22,Asian,Kim,Kim_G,7,True


In [36]:
# Note that peak timepoint may have many replicates!
specimens_df2 = pd.merge(
    specimens_df,
    peak_timepoint_per_patient[["participant_label", "timepoint", "is_peak"]],
    on=["participant_label", "timepoint"],
    how="left",
)
specimens_df2["is_peak"].fillna(False, inplace=True)
assert specimens_df2.shape[0] == specimens_df.shape[0]
specimens_df = specimens_df2
specimens_df

Unnamed: 0,repertoire_id,study_id,patient_id_within_study,sex,age,ethnicity,disease_subtype,disease,timepoint,specimen_label,ethnicity_condensed,study_name,participant_label,is_peak
0,5f21e814e1adeb2edc12613c,PRJNA648677,A,M,55.0,Korean,Case,Covid19,11,A_d11,Asian,Kim,Kim_A,False
1,5f21e814e1adeb2edc12613d,PRJNA648677,A,M,55.0,Korean,Case,Covid19,17,A_d17,Asian,Kim,Kim_A,True
2,5f21e815e1adeb2edc12613e,PRJNA648677,A,M,55.0,Korean,Case,Covid19,45,A_d45,Asian,Kim,Kim_A,False
3,5f21e815e1adeb2edc12613f,PRJNA648677,B,M,55.0,Korean,Case,Covid19,10,B_d10,Asian,Kim,Kim_B,False
4,5f21e815e1adeb2edc126140,PRJNA648677,B,M,55.0,Korean,Case,Covid19,19,B_d19,Asian,Kim,Kim_B,True
5,5f21e816e1adeb2edc126141,PRJNA648677,C,F,53.0,Korean,Case,Covid19,6,C_d6,Asian,Kim,Kim_C,False
6,5f21e816e1adeb2edc126142,PRJNA648677,C,F,53.0,Korean,Case,Covid19,15,C_d15,Asian,Kim,Kim_C,True
7,5f21e816e1adeb2edc126143,PRJNA648677,D,M,24.0,Korean,Case,Covid19,6,D_d6,Asian,Kim,Kim_D,False
8,5f21e817e1adeb2edc126144,PRJNA648677,D,M,24.0,Korean,Case,Covid19,28,D_d28,Asian,Kim,Kim_D,True
9,5f21e817e1adeb2edc126145,PRJNA648677,E,M,48.0,Chinese,Case,Covid19,23,E_d23,Asian,Kim,Kim_E,True


In [37]:
# not all patients have any peak timepoints chosen
specimens_df["participant_label"].nunique(), specimens_df[specimens_df["is_peak"]][
    "participant_label"
].nunique()

(7, 7)

In [38]:
# not all patients have any peak timepoints chosen
set(specimens_df["participant_label"]) - set(
    specimens_df[specimens_df["is_peak"]]["participant_label"]
)

set()

In [39]:
# how many replicates chosen as peak per patient (should be more than 1 replicate for many)
specimens_df[specimens_df["is_peak"]].groupby(
    ["participant_label", "timepoint"]
).size().sort_values(ascending=False)

participant_label  timepoint
Kim_A              17           1
Kim_B              19           1
Kim_C              15           1
Kim_D              28           1
Kim_E              23           1
Kim_F              14           1
Kim_G              22           1
dtype: int64

In [40]:
# which were chosen
specimens_df[specimens_df["is_peak"]][
    ["participant_label", "timepoint", "is_peak"]
].sort_values(["participant_label", "timepoint"])

Unnamed: 0,participant_label,timepoint,is_peak
1,Kim_A,17,True
4,Kim_B,19,True
6,Kim_C,15,True
8,Kim_D,28,True
9,Kim_E,23,True
12,Kim_F,14,True
15,Kim_G,22,True


## export

In [41]:
specimen_metadata_extra = (
    specimens_df[["specimen_label", "participant_label", "timepoint", "is_peak"]]
    .drop_duplicates()
    .sort_values(["participant_label", "timepoint"])
)
specimen_metadata_extra.to_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.specimen_metadata_extra.tsv",
    sep="\t",
    index=None,
)
specimen_metadata_extra

Unnamed: 0,specimen_label,participant_label,timepoint,is_peak
0,A_d11,Kim_A,11,False
1,A_d17,Kim_A,17,True
2,A_d45,Kim_A,45,False
3,B_d10,Kim_B,10,False
4,B_d19,Kim_B,19,True
5,C_d6,Kim_C,6,False
6,C_d15,Kim_C,15,True
7,D_d6,Kim_D,6,False
8,D_d28,Kim_D,28,True
9,E_d23,Kim_E,23,True


In [42]:
participant_df = specimens_df[
    [
        "participant_label",
        "study_id",
        "patient_id_within_study",
        "sex",
        "age",
        "ethnicity_condensed",
        "disease_subtype",
        "disease",
        "study_name",
    ]
].drop_duplicates()
participant_df.to_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.covid19_bcr.participant_metadata.tsv",
    sep="\t",
    index=None,
)
participant_df

Unnamed: 0,participant_label,study_id,patient_id_within_study,sex,age,ethnicity_condensed,disease_subtype,disease,study_name
0,Kim_A,PRJNA648677,A,M,55.0,Asian,Case,Covid19,Kim
3,Kim_B,PRJNA648677,B,M,55.0,Asian,Case,Covid19,Kim
5,Kim_C,PRJNA648677,C,F,53.0,Asian,Case,Covid19,Kim
7,Kim_D,PRJNA648677,D,M,24.0,Asian,Case,Covid19,Kim
9,Kim_E,PRJNA648677,E,M,48.0,Asian,Case,Covid19,Kim
12,Kim_F,PRJNA648677,F,F,40.0,Asian,Case,Covid19,Kim
14,Kim_G,PRJNA648677,G,F,59.0,Asian,Case,Covid19,Kim


In [43]:
participant_df.shape, specimen_metadata_extra.shape

((7, 9), (16, 4))

# now for briney-healthy

we already have `patient_id` in `ireceptor_data.briney_healthy_sequences`

soon we will add more replicates i.e. more samples though. see `repertoire_id` TODO comments.

for now, just add some simple metadata to create an "all external participants" metadata file

In [44]:
briney_patients = pd.DataFrame(
    {
        "specimen_label": [
            "D103_1",
            "326780_1",
            "326650_1",
            "326737_1",
            "327059_1",
            "326907_1",
            "316188_1",
            "326797_1",
        ]
    }
)
briney_patients["participant_label"] = (
    briney_patients["specimen_label"].str.split("_").str[0]
)
briney_patients["study_name"] = "Briney"
briney_patients["disease"] = healthy_label
# all healthy are "peak" and 0 timepoint
briney_patients["is_peak"] = True
briney_patients["timepoint"] = 0
print(briney_patients.shape)
briney_patients

(8, 6)


Unnamed: 0,specimen_label,participant_label,study_name,disease,is_peak,timepoint
0,D103_1,D103,Briney,Healthy/Background,True,0
1,326780_1,326780,Briney,Healthy/Background,True,0
2,326650_1,326650,Briney,Healthy/Background,True,0
3,326737_1,326737,Briney,Healthy/Background,True,0
4,327059_1,327059,Briney,Healthy/Background,True,0
5,326907_1,326907,Briney,Healthy/Background,True,0
6,316188_1,316188,Briney,Healthy/Background,True,0
7,326797_1,326797,Briney,Healthy/Background,True,0


In [45]:
# Original paper table is wrong: 326907 is listed twice with different values; 326737 is missing. One of the dupes should be 326737.
# Fixed based on:
# https://www.ncbi.nlm.nih.gov/biosample/10331432
# https://www.ncbi.nlm.nih.gov/biosample/10331429
briney_demographics = pd.read_csv(config.paths.metadata_dir / "briney_demographics.csv")
assert not briney_demographics["subject"].duplicated().any()
print(briney_demographics.shape)
briney_demographics["sex"] = briney_demographics["sex"].replace(
    {"male": "M", "female": "F"}
)
briney_demographics

(10, 4)


Unnamed: 0,subject,age,sex,ethnicity
0,316188,30,F,African American
1,326650,18,F,Caucasian
2,326651,19,M,African American
3,326713,25,F,African American
4,326780,29,M,Caucasian
5,326797,21,F,Caucasian
6,326737,29,M,Caucasian
7,326907,29,F,African American
8,327059,26,M,African American / Caucasian
9,D103,25,M,Caucasian


In [46]:
briney_demographics["ethnicity"].value_counts()

Caucasian                       5
African American                4
African American / Caucasian    1
Name: ethnicity, dtype: int64

In [47]:
briney_demographics["ethnicity_condensed"] = briney_demographics["ethnicity"].replace(
    {"African American": "African", "African American / Caucasian": np.nan}
)
briney_demographics["ethnicity_condensed"].value_counts()

Caucasian    5
African      4
Name: ethnicity_condensed, dtype: int64

In [48]:
briney_demographics["ethnicity_condensed"].isna().value_counts()

False    9
True     1
Name: ethnicity_condensed, dtype: int64

In [49]:
# Are there any non-NaN ethnicity values that we did not remap?
briney_demographics[briney_demographics["ethnicity_condensed"].isna()][
    "ethnicity"
].value_counts()

African American / Caucasian    1
Name: ethnicity, dtype: int64

In [50]:
briney_patients = pd.merge(
    briney_patients,
    briney_demographics.set_index("subject"),
    how="left",
    validate="1:1",
    left_on="participant_label",
    right_index=True,
)
briney_patients

Unnamed: 0,specimen_label,participant_label,study_name,disease,is_peak,timepoint,age,sex,ethnicity,ethnicity_condensed
0,D103_1,D103,Briney,Healthy/Background,True,0,25,M,Caucasian,Caucasian
1,326780_1,326780,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian
2,326650_1,326650,Briney,Healthy/Background,True,0,18,F,Caucasian,Caucasian
3,326737_1,326737,Briney,Healthy/Background,True,0,29,M,Caucasian,Caucasian
4,327059_1,327059,Briney,Healthy/Background,True,0,26,M,African American / Caucasian,
5,326907_1,326907,Briney,Healthy/Background,True,0,29,F,African American,African
6,316188_1,316188,Briney,Healthy/Background,True,0,30,F,African American,African
7,326797_1,326797,Briney,Healthy/Background,True,0,21,F,Caucasian,Caucasian


In [51]:
briney_patients.to_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_bcr.participant_metadata.tsv",
    sep="\t",
    index=None,
)

In [52]:
participant_df_plus_briney = pd.concat(
    [
        participant_df[["participant_label", "disease", "study_name"]],
        briney_patients[["participant_label", "disease", "study_name"]],
    ],
    axis=0,
)
participant_df_plus_briney

Unnamed: 0,participant_label,disease,study_name
0,Kim_A,Covid19,Kim
3,Kim_B,Covid19,Kim
5,Kim_C,Covid19,Kim
7,Kim_D,Covid19,Kim
9,Kim_E,Covid19,Kim
12,Kim_F,Covid19,Kim
14,Kim_G,Covid19,Kim
0,D103,Healthy/Background,Briney
1,326780,Healthy/Background,Briney
2,326650,Healthy/Background,Briney


In [53]:
participant_df_plus_briney.to_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.all_bcr.participant_metadata.tsv",
    sep="\t",
    index=None,
)