In [1]:
import pandas as pd
from malid import config
from malid.datamodels import healthy_label

In [2]:
# Locate the raw files for this external cohort.
base_dir = config.paths.external_raw_data / "chudakov_aging"
metadata_path = base_dir / "metadata.txt"

# metadata.txt is tab-separated and provides age and sex data per sample.
# The A* in the sample identifier is the batch ID.
# Samples having age "0" are umbilical cord blood samples.
external_metadata = pd.read_csv(metadata_path, sep="\t")
# Every sample in this cohort gets the same condensed ethnicity value.
external_metadata = external_metadata.assign(ethnicity_condensed="Caucasian")
external_metadata

Unnamed: 0,file_name,sample_id,sex,age,label,..filter..,ethnicity_condensed
0,A3-i101.txt,A3-i101,F,36,p1-F36,conv:MiTcr,Caucasian
1,A3-i102.txt,A3-i102,F,43,p2-F43,conv:MiTcr,Caucasian
2,A3-i106.txt,A3-i106,F,43,p3-F43,conv:MiTcr,Caucasian
3,A3-i107.txt,A3-i107,F,39,p4-F39,conv:MiTcr,Caucasian
4,A3-i110.txt,A3-i110,F,34,p5-F34,conv:MiTcr,Caucasian
...,...,...,...,...,...,...,...
74,A6-I204ob.txt,A6-I204ob,F,99,p74-F99,conv:MiTcr,Caucasian
75,A6-I205ob.txt,A6-I205ob,F,95,p75-F95,conv:MiTcr,Caucasian
76,A6-I200ob.txt,A6-I200ob,M,27,p76-M27,conv:MiTcr,Caucasian
77,A6-I201ob.txt,A6-I201ob,M,30,p76-M30,conv:MiTcr,Caucasian


In [3]:
# Does any full label value appear on more than one row?
label_is_repeat = external_metadata["label"].duplicated()
label_is_repeat.any()

True

In [4]:
# Number of rows per full label value, most frequent first.
label_counts = external_metadata["label"].value_counts()
label_counts

p36-M25    2
p7-F87     2
p1-F36     1
p51-F92    1
p58-M0     1
          ..
p27-M64    1
p26-M66    1
p25-M85    1
p24-F89    1
p53-M47    1
Name: label, Length: 77, dtype: int64

In [5]:
# Take the part of each label before the first "-" and check whether any
# such prefix occurs on more than one row.
external_metadata["label"].str.split("-").str.get(0).duplicated().any()

True

In [6]:
# Number of rows per label prefix (the part before the first "-").
label_prefix = external_metadata["label"].str.split("-").str.get(0)
label_prefix.value_counts()

p76    2
p36    2
p7     2
p53    2
p50    1
      ..
p27    1
p26    1
p25    1
p24    1
p39    1
Name: label, Length: 75, dtype: int64

In [7]:
# Show every row whose label prefix is shared with at least one other row.
# p53 and p76 look suspicious because their two rows are years apart in age,
# but this checks out against paper 2's explanation of two time points.
# The other two (p7, p36) must be the two individuals listed in the paper
# with replicate samples - yes, the ages match.
shares_prefix = (
    external_metadata["label"].str.split("-").str.get(0).duplicated(keep=False)
)
external_metadata[shares_prefix]

Unnamed: 0,file_name,sample_id,sex,age,label,..filter..,ethnicity_condensed
6,A3-i150.txt,A3-i150,F,87,p7-F87,conv:MiTcr,Caucasian
7,A3-i151.txt,A3-i151,F,87,p7-F87,conv:MiTcr,Caucasian
35,A4-i189.txt,A4-i189,M,25,p36-M25,conv:MiTcr,Caucasian
36,A4-i190.txt,A4-i190,M,25,p36-M25,conv:MiTcr,Caucasian
53,A5-S23.txt,A5-S23,M,50,p53-M50,conv:MiTcr,Caucasian
76,A6-I200ob.txt,A6-I200ob,M,27,p76-M27,conv:MiTcr,Caucasian
77,A6-I201ob.txt,A6-I201ob,M,30,p76-M30,conv:MiTcr,Caucasian
78,A6-I202ob.txt,A6-I202ob,M,47,p53-M47,conv:MiTcr,Caucasian


In [8]:
# Use the label prefix (the part before the first "-") as the participant identifier.
external_metadata = external_metadata.assign(
    participant_label=external_metadata["label"].str.split("-").str.get(0)
)
external_metadata

Unnamed: 0,file_name,sample_id,sex,age,label,..filter..,ethnicity_condensed,participant_label
0,A3-i101.txt,A3-i101,F,36,p1-F36,conv:MiTcr,Caucasian,p1
1,A3-i102.txt,A3-i102,F,43,p2-F43,conv:MiTcr,Caucasian,p2
2,A3-i106.txt,A3-i106,F,43,p3-F43,conv:MiTcr,Caucasian,p3
3,A3-i107.txt,A3-i107,F,39,p4-F39,conv:MiTcr,Caucasian,p4
4,A3-i110.txt,A3-i110,F,34,p5-F34,conv:MiTcr,Caucasian,p5
...,...,...,...,...,...,...,...,...
74,A6-I204ob.txt,A6-I204ob,F,99,p74-F99,conv:MiTcr,Caucasian,p74
75,A6-I205ob.txt,A6-I205ob,F,95,p75-F95,conv:MiTcr,Caucasian,p75
76,A6-I200ob.txt,A6-I200ob,M,27,p76-M27,conv:MiTcr,Caucasian,p76
77,A6-I201ob.txt,A6-I201ob,M,30,p76-M30,conv:MiTcr,Caucasian,p76


In [9]:
# Sanity check: every file name is the sample ID followed by ".txt".
expected_file_names = external_metadata["sample_id"] + ".txt"
assert (external_metadata["file_name"] == expected_file_names).all()

In [10]:
# Inspect which values the "..filter.." column takes.
filter_counts = external_metadata["..filter.."].value_counts()
filter_counts

conv:MiTcr    79
Name: ..filter.., dtype: int64

In [11]:
# Summarize the age distribution; extreme ages (like cord blood at age 0)
# are the ones to get rid of.
age_column = external_metadata["age"]
age_column.describe()

count     79.000000
mean      54.177215
std       32.747973
min        0.000000
25%       26.000000
50%       51.000000
75%       87.500000
max      103.000000
Name: age, dtype: float64

In [12]:
# Keep only samples aged 20 through 80 (both bounds included), then
# re-check the age distribution of what remains.
age_in_range = external_metadata["age"].between(20, 80)
external_metadata = external_metadata[age_in_range]
external_metadata["age"].describe()

count    39.000000
mean     46.769231
std      15.827128
min      20.000000
25%      36.000000
50%      50.000000
75%      61.000000
max      75.000000
Name: age, dtype: float64

In [13]:
# Constant columns attached to every row: all samples are labeled with the
# healthy disease label and tagged with the study name.
cohort_annotations = {
    "disease": healthy_label,
    "study_name": "Britanova",
    "disease_subtype": f"{healthy_label} - Britanova",
}
# Rename sample_id to specimen_label, then append the constant columns.
external_metadata = external_metadata.rename(
    columns={"sample_id": "specimen_label"}
).assign(**cohort_annotations)
external_metadata

Unnamed: 0,file_name,specimen_label,sex,age,label,..filter..,ethnicity_condensed,participant_label,disease,study_name,disease_subtype
0,A3-i101.txt,A3-i101,F,36,p1-F36,conv:MiTcr,Caucasian,p1,Healthy/Background,Britanova,Healthy/Background - Britanova
1,A3-i102.txt,A3-i102,F,43,p2-F43,conv:MiTcr,Caucasian,p2,Healthy/Background,Britanova,Healthy/Background - Britanova
2,A3-i106.txt,A3-i106,F,43,p3-F43,conv:MiTcr,Caucasian,p3,Healthy/Background,Britanova,Healthy/Background - Britanova
3,A3-i107.txt,A3-i107,F,39,p4-F39,conv:MiTcr,Caucasian,p4,Healthy/Background,Britanova,Healthy/Background - Britanova
4,A3-i110.txt,A3-i110,F,34,p5-F34,conv:MiTcr,Caucasian,p5,Healthy/Background,Britanova,Healthy/Background - Britanova
14,A2-i138.txt,A2-i138,F,74,p15-F74,conv:MiTcr,Caucasian,p15,Healthy/Background,Britanova,Healthy/Background - Britanova
15,A2-i139.txt,A2-i139,M,75,p16-M75,conv:MiTcr,Caucasian,p16,Healthy/Background,Britanova,Healthy/Background - Britanova
16,A2-i140.txt,A2-i140,F,73,p17-F73,conv:MiTcr,Caucasian,p17,Healthy/Background,Britanova,Healthy/Background - Britanova
17,A2-i141.txt,A2-i141,M,71,p18-M71,conv:MiTcr,Caucasian,p18,Healthy/Background,Britanova,Healthy/Background - Britanova
18,A4-i101.txt,A4-i101,M,36,p19-M36,conv:MiTcr,Caucasian,p19,Healthy/Background,Britanova,Healthy/Background - Britanova


In [14]:
# Columns to export, in output order; the remaining columns
# (file_name, label, "..filter..") are left out.
export_columns = [
    "specimen_label",
    "sex",
    "age",
    "ethnicity_condensed",
    "participant_label",
    "disease",
    "study_name",
    "disease_subtype",
]
external_metadata_export = external_metadata[export_columns]
external_metadata_export

Unnamed: 0,specimen_label,sex,age,ethnicity_condensed,participant_label,disease,study_name,disease_subtype
0,A3-i101,F,36,Caucasian,p1,Healthy/Background,Britanova,Healthy/Background - Britanova
1,A3-i102,F,43,Caucasian,p2,Healthy/Background,Britanova,Healthy/Background - Britanova
2,A3-i106,F,43,Caucasian,p3,Healthy/Background,Britanova,Healthy/Background - Britanova
3,A3-i107,F,39,Caucasian,p4,Healthy/Background,Britanova,Healthy/Background - Britanova
4,A3-i110,F,34,Caucasian,p5,Healthy/Background,Britanova,Healthy/Background - Britanova
14,A2-i138,F,74,Caucasian,p15,Healthy/Background,Britanova,Healthy/Background - Britanova
15,A2-i139,M,75,Caucasian,p16,Healthy/Background,Britanova,Healthy/Background - Britanova
16,A2-i140,F,73,Caucasian,p17,Healthy/Background,Britanova,Healthy/Background - Britanova
17,A2-i141,M,71,Caucasian,p18,Healthy/Background,Britanova,Healthy/Background - Britanova
18,A4-i101,M,36,Caucasian,p19,Healthy/Background,Britanova,Healthy/Background - Britanova


In [15]:
# Remember: some participants have multiple samples, as investigated above.
participant_ids = external_metadata_export["participant_label"]
assert participant_ids.duplicated().any()

# Display all rows belonging to participants that appear more than once.
has_multiple_samples = participant_ids.duplicated(keep=False)
external_metadata_export[has_multiple_samples].sort_values("participant_label")

Unnamed: 0,specimen_label,sex,age,ethnicity_condensed,participant_label,disease,study_name,disease_subtype
35,A4-i189,M,25,Caucasian,p36,Healthy/Background,Britanova,Healthy/Background - Britanova
36,A4-i190,M,25,Caucasian,p36,Healthy/Background,Britanova,Healthy/Background - Britanova
53,A5-S23,M,50,Caucasian,p53,Healthy/Background,Britanova,Healthy/Background - Britanova
78,A6-I202ob,M,47,Caucasian,p53,Healthy/Background,Britanova,Healthy/Background - Britanova
76,A6-I200ob,M,27,Caucasian,p76,Healthy/Background,Britanova,Healthy/Background - Britanova
77,A6-I201ob,M,30,Caucasian,p76,Healthy/Background,Britanova,Healthy/Background - Britanova


In [16]:
# ...but each specimen must be listed exactly once.
specimen_is_repeat = external_metadata_export["specimen_label"].duplicated()
assert not specimen_is_repeat.any()

In [17]:
# Write the participant metadata as a tab-separated file without the row index.
# `index` is documented as a bool, so pass False explicitly; the previous
# `index=None` only suppressed the index column because None is falsy.
output_path = (
    config.paths.metadata_dir
    / "generated.external_cohorts.healthy_tcr_britanova.participant_metadata.tsv"
)
external_metadata_export.to_csv(output_path, sep="\t", index=False)