In [1]:
from malid import config
import pandas as pd
import numpy as np

# Filter ImmuneCODE dataset to "Covid peak-timepoint" samples for external validation.

Goal is no comorbidities with immune effects

In [2]:
df = pd.read_csv(
    config.paths.external_raw_data
    / "immunecode"
    / "ImmuneCODE-Repertoire-Tags-002.2.tsv",
    sep="\t",
)
df.shape

(1486, 158)

In [3]:
df

Unnamed: 0,ImmuneCODERelease,Dataset,sample_name,subject_id,Virus Diseases,Age,Biological Sex,Racial Group,Tissue Source,Ethnic Group,...,uses_ace_inhibitor,uses_arb,uses_asthma_quick_relief,uses_autoimmune_medications,uses_corticosteroids_for_asthma,uses_immunosuppressant,uses_nsaid,visit,weight_kg,who_ordinal_scale
0,2.0,COVID-19-NIH/NIAID,BS-GIGI_61-replacement_TCRB,0000105,COVID-19 Positive,71 Years,Male,Caucasian,"Blood,gDNA",,...,,,,,,,,,,
1,2.0,COVID-19-HUniv12Oct,860011221_TCRB,109731,COVID-19 Positive,64 Years,Male,Hispanic,Blood,,...,,,,,,,,,,
2,2.0,COVID-19-NIH/NIAID,1246-BM-1050_TCRB,,,,,,"Buffy Coat,gDNA",,...,,,,,,,,,,
3,2.0,COVID-19-Adaptive,ADIRP0000106_TCRB,ADIRP0000106,COVID-19 Positive,41 Years,Female,Hispanic,Blood,,...,,,,,,False,,1.0,,
4,2.0,COVID-19-NIH/NIAID,BS-HS-185_TCRB,,,,,,"Blood,gDNA",,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,2.0,COVID-19-NIH/NIAID,BS-HS-189_TCRB,,,,,,"Blood,gDNA",,...,,,,,,,,,,
1482,2.0,COVID-19-HUniv12Oct,860011252_TCRB,391599,COVID-19 Positive,35 Years,Male,Caucasian,Blood,,...,,,,,,,,,,
1483,2.0,COVID-19-NIH/NIAID,1127-BA-739_TCRB,,,,,,"Buffy Coat,gDNA",,...,,,,,,,,,,
1484,2.0,COVID-19-Adaptive,ADIRP0001311_TCRB,ADIRP0001311,COVID-19 Positive,36 Years,Female,Caucasian,Blood,,...,,,,,,False,,1.0,,


In [4]:
df["Dataset"].value_counts()

COVID-19-DLS                     433
COVID-19-NIH/NIAID               357
COVID-19-HUniv12Oct              193
COVID-19-Adaptive                160
COVID-19-ISB                     157
COVID-19-Adaptive-MIRAMatched     72
COVID-19-IRST/AUSL                64
COVID-19-BWNW                     50
Name: Dataset, dtype: int64

In [5]:
df = df[df["Virus Diseases"] == "COVID-19 Positive"]
df.shape

(1170, 158)

In [6]:
df = df[df["days_from_symptom_onset_to_sample"] >= 11]
df.shape

(388, 158)

In [7]:
df = df[df["days_from_symptom_onset_to_sample"] <= 21]
df.shape

(137, 158)

In [8]:
df["Tissue Source"].value_counts()

Blood         74
Blood,gDNA    62
PBMC           1
Name: Tissue Source, dtype: int64

In [9]:
df["Dataset"].value_counts()

COVID-19-NIH/NIAID               62
COVID-19-HUniv12Oct              34
COVID-19-ISB                     29
COVID-19-Adaptive                11
COVID-19-Adaptive-MIRAMatched     1
Name: Dataset, dtype: int64

In [10]:
df["diseases"].str.split(";").explode().value_counts()

covid--positive                      136
hypertension--positive                11
ckd--positive                          4
t2d--positive                          4
coronary_artery_disease--positive      3
copd--positive                         3
cll--positive                          2
t1d--positive                          1
Name: diseases, dtype: int64

In [11]:
df["diseases"].str.startswith("covid--positive").value_counts()

True    136
Name: diseases, dtype: int64

In [12]:
# remove other diseases
df = df[df["diseases"] == "covid--positive"]
df.shape

(120, 158)

In [13]:
df["Dataset"].value_counts()

COVID-19-NIH/NIAID     62
COVID-19-HUniv12Oct    34
COVID-19-ISB           13
COVID-19-Adaptive      11
Name: Dataset, dtype: int64

Filter by other tags, which are not universal to all cohorts:

In [14]:
df = df[df["covid_category"] != "Recovered"]
df.shape

(118, 158)

In [15]:
df = df[df["covid_category"] != "Exposed"]
df.shape

(118, 158)

In [16]:
df = df[df["cancer_diagnosed"] != True]
df.shape

(118, 158)

In [17]:
df = df[df["cancer_type"].isna()]
df.shape

(118, 158)

In [18]:
df = df[df["describe_autoimmune_diagnoses"].isna()]
df.shape

(118, 158)

In [19]:
df = df[df["describe_autoimmune_medications"].isna()]
df.shape

(116, 158)

In [20]:
df = df[df["describe_cancers"].isna()]
df.shape

(116, 158)

In [21]:
df = df[df["describe_immunosupressants"].replace("None", np.nan).isna()]
df.shape

(116, 158)

In [22]:
df = df[df["describe_other_diagnoses"].isna()]
df.shape

(116, 158)

In [23]:
df = df[df["diabetes_type"].replace("No", np.nan).isna()]
df.shape

(116, 158)

In [24]:
df = df[df["has_hiv"] != True]
df.shape

(116, 158)

In [25]:
df = df[df["is_immunocompromised"] != True]
df.shape

(116, 158)

In [26]:
df = df[df["selected_autoimmune_diagnoses"].replace("None", np.nan).isna()]
df.shape

(116, 158)

In [27]:
df = df[df["selected_other_diagnoses"].replace("None", np.nan).isna()]
df.shape

(114, 158)

In [28]:
df = df[df["uses_autoimmune_medications"].isna()]
df.shape

(114, 158)

In [29]:
df = df[df["uses_immunosuppressant"] != True]
df.shape

(114, 158)

In [30]:
df["Dataset"].value_counts()

COVID-19-NIH/NIAID     62
COVID-19-HUniv12Oct    34
COVID-19-ISB           13
COVID-19-Adaptive       5
Name: Dataset, dtype: int64

In [31]:
# Maybe we should go further and guarantee that not HIV? We have that info for COVID-19-ISB only
# But we can avoid that, because we are going to use Covid-vs-healthy model only.

In [32]:
# Set subtype
df[["Dataset", "hospitalized", "icu_admit"]]

Unnamed: 0,Dataset,hospitalized,icu_admit
6,COVID-19-HUniv12Oct,True,False
8,COVID-19-ISB,,
36,COVID-19-HUniv12Oct,True,False
37,COVID-19-ISB,,
51,COVID-19-NIH/NIAID,True,False
...,...,...,...
1455,COVID-19-ISB,,
1457,COVID-19-NIH/NIAID,True,False
1464,COVID-19-NIH/NIAID,True,True
1477,COVID-19-NIH/NIAID,True,True


In [33]:
df["icu_admit"] = df["icu_admit"].map({True: "ICU"})
df["hospitalized"] = df["hospitalized"].map({True: "Hospitalized"})
df[["Dataset", "hospitalized", "icu_admit"]]

Unnamed: 0,Dataset,hospitalized,icu_admit
6,COVID-19-HUniv12Oct,Hospitalized,
8,COVID-19-ISB,,
36,COVID-19-HUniv12Oct,Hospitalized,
37,COVID-19-ISB,,
51,COVID-19-NIH/NIAID,Hospitalized,
...,...,...,...
1455,COVID-19-ISB,,
1457,COVID-19-NIH/NIAID,Hospitalized,
1464,COVID-19-NIH/NIAID,Hospitalized,ICU
1477,COVID-19-NIH/NIAID,Hospitalized,ICU


In [34]:
df["disease_subtype"] = df[["Dataset", "hospitalized", "icu_admit"]].apply(
    lambda row: " - ".join(row.dropna()), axis=1
)
df["disease_subtype"]

6            COVID-19-HUniv12Oct - Hospitalized
8                                  COVID-19-ISB
36           COVID-19-HUniv12Oct - Hospitalized
37                                 COVID-19-ISB
51            COVID-19-NIH/NIAID - Hospitalized
                         ...                   
1455                               COVID-19-ISB
1457          COVID-19-NIH/NIAID - Hospitalized
1464    COVID-19-NIH/NIAID - Hospitalized - ICU
1477    COVID-19-NIH/NIAID - Hospitalized - ICU
1485         COVID-19-HUniv12Oct - Hospitalized
Name: disease_subtype, Length: 114, dtype: object

In [35]:
# Get first row for each subject_id (can have multiple sample_name's)
df = df.groupby("subject_id").head(n=1)
df.shape

(93, 159)

In [36]:
# Set identifiers
df = df.assign(
    participant_label="ImmuneCode-" + df["subject_id"].astype(str),
    disease="Covid19",
)

df.rename(columns={"sample_name": "specimen_label"}, inplace=True)

assert not df["specimen_label"].duplicated().any()
assert not df["participant_label"].duplicated().any()

In [37]:
df["sex"] = df["Biological Sex"].str.lower().replace({"male": "M", "female": "F"})

In [38]:
# extract number of years
df["age"] = df["Age"].str.extract("(\d+)")
df["age"].value_counts()

53    5
59    5
52    4
60    4
55    4
54    4
49    3
66    3
56    3
75    2
58    2
48    2
88    2
89    2
50    2
79    2
84    2
34    2
39    2
70    2
76    2
78    2
77    2
85    2
57    2
40    2
71    2
29    1
82    1
51    1
08    1
62    1
38    1
45    1
46    1
27    1
42    1
35    1
24    1
80    1
67    1
23    1
26    1
63    1
30    1
36    1
33    1
61    1
72    1
Name: age, dtype: int64

In [39]:
df["ethnicity"] = (
    df["Racial Group"].fillna("Unknown") + " - " + df["Ethnic Group"].fillna("Unknown")
)
df["ethnicity"].value_counts()

Caucasian - Unknown                                   73
Hispanic - Unknown                                     8
Unknown racial group - Hispanic or Latino              4
Caucasian - Non-Hispanic or Latino                     3
Unknown - Unknown                                      2
Unknown racial group - Non-Hispanic or Latino          1
Asian or Pacific Islander - Non-Hispanic or Latino     1
Asian or Pacific Islander - Unknown                    1
Name: ethnicity, dtype: int64

In [40]:
df["ethnicity_condensed"] = df["ethnicity"].replace(
    {
        "Caucasian - Unknown": "Caucasian",
        "Hispanic - Unknown": "Hispanic/Latino",
        "Unknown racial group - Hispanic or Latino": "Hispanic/Latino",
        "Caucasian - Non-Hispanic or Latino": "Caucasian",
        "Unknown - Unknown": np.nan,
        "Unknown racial group - Non-Hispanic or Latino": np.nan,
        "Asian or Pacific Islander - Non-Hispanic or Latino": "Asian",
        "Asian or Pacific Islander - Unknown": "Asian",
    }
)
df["ethnicity_condensed"].value_counts()

Caucasian          76
Hispanic/Latino    12
Asian               2
Name: ethnicity_condensed, dtype: int64

In [41]:
df = df[
    [
        "participant_label",
        "specimen_label",
        "disease",
        "disease_subtype",
        "age",
        "sex",
        "ethnicity_condensed",
    ]
]
df

Unnamed: 0,participant_label,specimen_label,disease,disease_subtype,age,sex,ethnicity_condensed
6,ImmuneCode-190921,860011232_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,53,F,Hispanic/Latino
8,ImmuneCode-026,INCOV026-AC-3_TCRB,Covid19,COVID-19-ISB,33,M,Hispanic/Latino
36,ImmuneCode-321977,860011116_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,52,F,Caucasian
37,ImmuneCode-087,INCOV087-BL-3_TCRB,Covid19,COVID-19-ISB,56,M,Hispanic/Latino
51,ImmuneCode-0000051,BS-EQ-0014-T2-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,55,M,Caucasian
...,...,...,...,...,...,...,...
1422,ImmuneCode-ADIRP0001958,ADIRP0001958_TCRB,Covid19,COVID-19-Adaptive,49,M,Caucasian
1434,ImmuneCode-775827,860011106_TCRB,Covid19,COVID-19-HUniv12Oct - Hospitalized,57,F,Caucasian
1457,ImmuneCode-0000446,BS-GIGI_10-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized,75,M,Caucasian
1477,ImmuneCode-0000160,BS-EQ-25-T1_BS-GIGI-71-replacement_TCRB,Covid19,COVID-19-NIH/NIAID - Hospitalized - ICU,72,M,Caucasian


In [42]:
df.to_csv(
    config.paths.metadata_dir
    / "generated.external_cohorts.adaptive_covid_tcr.specimens.tsv",
    sep="\t",
    index=None,
)