In [1]:
import numpy as np
import matplotlib.pyplot as plt
import genetools
import seaborn as sns

# Use seaborn's "dark" style for any figures drawn in this notebook.
sns.set_style(style="dark")

In [2]:
import pandas as pd

In [3]:
import dask
import dask.dataframe as dd

In [4]:
import os

In [5]:
from malid import config

Raise worker connection timeouts (see other dask notebooks):

In [6]:
import distributed

# Comm timeout / retry overrides applied to the dask config of this process.
# These only seem to be picked up by scheduler, not by individual workers.
scheduler_comm_overrides = {
    "distributed.comm.timeouts.tcp": "120s",
    "distributed.comm.timeouts.connect": "120s",
    "distributed.comm.retry.count": 5,
}

dask.config.set(scheduler_comm_overrides)

<dask.config.set at 0x7feaf92f37f0>

In [7]:
# These will be picked up by individual workers.
# The same comm retry/timeout values as the in-process overrides are written
# to the user-level dask config file.
# NOTE: this overwrites any existing distributed.yaml at that location.
worker_config_path = os.path.expandvars("$HOME/.config/dask/distributed.yaml")

# On a fresh machine ~/.config/dask may not exist yet, in which case
# open(..., "w") would fail with FileNotFoundError; create it first.
os.makedirs(os.path.dirname(worker_config_path), exist_ok=True)

with open(worker_config_path, "w") as w:
    w.write(
        """distributed:
  comm:
    retry:
      count: 5
    timeouts:
      connect: 120s          # time before connecting fails
      tcp: 120s              # time before calling an unresponsive connection dead
    """
    )

In [8]:
from dask.distributed import Client

# Multi-processing backend: several worker processes on this machine.
# Access the dashboard at http://127.0.0.1:61083
# If a cluster was already opened from another notebook, see https://stackoverflow.com/questions/60115736/dask-how-to-connect-to-running-cluster-scheduler-and-access-total-occupancy
local_cluster_options = {
    "scheduler_port": 61084,
    "dashboard_address": ":61083",
    "n_workers": 7,
    "processes": True,
    "threads_per_worker": 8,
    "memory_limit": "auto",
    # start worker dashboards on random ports
    "worker_dashboard_address": ":0",
}
client = Client(**local_cluster_options)
display(client)
# for debugging: client.restart()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:61083/status,

0,1
Dashboard: http://127.0.0.1:61083/status,Workers: 7
Total threads: 56,Total memory: 1.15 TiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61084,Workers: 7
Dashboard: http://127.0.0.1:61083/status,Total threads: 56
Started: Just now,Total memory: 1.15 TiB

0,1
Comm: tcp://127.0.0.1:36185,Total threads: 8
Dashboard: http://127.0.0.1:45089/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:45929,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-sn7lw9tp,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-sn7lw9tp
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:39897,Total threads: 8
Dashboard: http://127.0.0.1:45243/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:39633,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-ty8lxwdx,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-ty8lxwdx
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:33523,Total threads: 8
Dashboard: http://127.0.0.1:35909/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:36787,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-tgbday5k,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-tgbday5k
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:33607,Total threads: 8
Dashboard: http://127.0.0.1:35323/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:40155,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-rbed373f,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-rbed373f
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:34507,Total threads: 8
Dashboard: http://127.0.0.1:39869/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:33489,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-8juidt25,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-8juidt25
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:42649,Total threads: 8
Dashboard: http://127.0.0.1:32857/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:44993,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-tn5d4bms,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-tn5d4bms
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:33667,Total threads: 8
Dashboard: http://127.0.0.1:37095/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:42467,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-kpeonepv,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-kpeonepv
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB


In [9]:
# Metadata columns to load from the sequences dataset (order is preserved).
desired_cols = (
    # identifiers
    ["participant_label", "specimen_label"]
    # disease labels
    + ["disease", "disease_subtype"]
    # specimen / participant annotations
    + ["specimen_time_point", "participant_age", "participant_description"]
)

In [10]:
# Optional row filters forwarded as `filters=` to the parquet read below.
# None means no filtering (load everything).
debug_filters = None
# For a quick debugging run, restrict to a single participant instead:
# debug_filters = [("participant_label", "==", "BFI-0007450")]

In [11]:
# Read only the metadata columns, lazily, with the pyarrow engine.
# Don't use fastparquet, because it changes specimen labels like M54-049 to 2049-01-01 00:00:54 -- i.e. it coerces partition names to numbers or dates
df = dd.read_parquet(
    config.paths.sequences,
    engine="pyarrow",
    columns=desired_cols,
    filters=debug_filters,
)

In [12]:
# Show the lazy dask DataFrame: column dtypes and partition count only.
df

Unnamed: 0_level_0,participant_label,specimen_label,disease,disease_subtype,specimen_time_point,participant_age,participant_description
npartitions=717,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,category[known],category[known],category[unknown],category[unknown],object,object,object
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [13]:
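# Alternative kept for reference (not used): a global drop_duplicates over the whole dask DataFrame.
# df = df.drop_duplicates()
# display(df)
# dask.visualize(df)
# The global version has an aggregation step in its task graph.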

In [14]:
# De-duplicate each partition independently with map_partitions: this avoids
# the aggregation step, which is unnecessary based on our partitioning strategy.


def drop_partition_duplicates(part):
    """Return one partition with its exact-duplicate rows removed."""
    return part.drop_duplicates()


metadata_df = df.map_partitions(drop_partition_duplicates)

In [15]:
# Still lazy: show the structure of the per-partition de-duplicated frame.
metadata_df

Unnamed: 0_level_0,participant_label,specimen_label,disease,disease_subtype,specimen_time_point,participant_age,participant_description
npartitions=717,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,category[known],category[known],category[unknown],category[unknown],object,object,object
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [16]:
# dask.visualize(metadata_df, filename="participant_specimen_metadata.dask_task_graph.pdf")

In [17]:
# Execute the task graph and collect the de-duplicated metadata into a single
# in-memory result (the "_c" suffix marks the computed version).
metadata_df_c = metadata_df.compute()

In [18]:
# Inspect the computed metadata table.
metadata_df_c

Unnamed: 0,participant_label,specimen_label,disease,disease_subtype,specimen_time_point,participant_age,participant_description
0,BFI-0000234,M124-S014,Healthy/Background,Healthy/Background - HIV Negative,,27,Location: USA
74701,BFI-0000234,M132-S014,Healthy/Background,Healthy/Background - HIV Negative,,27,Location: USA
0,BFI-0000254,M111-S037,HIV,HIV Broad Neutralizing,,48,Location: Tanzania
98181,BFI-0000254,M114-S037,HIV,HIV Broad Neutralizing,,48,Location: Tanzania
0,BFI-0000255,M111-S033,HIV,HIV Broad Neutralizing,,33,Location: Tanzania
...,...,...,...,...,...,...,...
0,BFI-0010241,M464-S042,Healthy/Background,Healthy/Background (children),,15,
0,BFI-0010242,M464-S043,Healthy/Background,Healthy/Background (children),,14,
0,BFI-0010243,M464-S044,Healthy/Background,Healthy/Background (children),,13,
0,BFI-0010244,M464-S045,Healthy/Background,Healthy/Background (children),,12,


In [19]:
# Order rows by disease, then by participant, and show the result.
sort_keys = ["disease", "participant_label"]
metadata_df_c = metadata_df_c.sort_values(by=sort_keys)
metadata_df_c

Unnamed: 0,participant_label,specimen_label,disease,disease_subtype,specimen_time_point,participant_age,participant_description
0,BFI-0007450,M369-S001,Covid19,Covid19 - Sero-positive (ICU),9 days,,COVID-19 project
495704,BFI-0007450,M371-S003,Covid19,Covid19 - Sero-positive (ICU),22 days,,COVID-19 project
348716,BFI-0007450,M371-S014,Covid19,Covid19 - Sero-positive (ICU),27 days,,COVID-19 project
460228,BFI-0007453,M369-S004,Covid19,Covid19 - Sero-negative (ICU),8 days,,COVID-19 project
0,BFI-0007453,M369-S005,Covid19,Covid19 - Sero-negative (ICU),11 days,,COVID-19 project
...,...,...,...,...,...,...,...
0,BFI-0010060,M456-S001,Lupus,SLE Patient,,51,SLE Patient
0,BFI-0010061,M456-S002,Lupus,SLE Patient,,34,SLE Patient
0,BFI-0010065,M456-S006,Lupus,SLE Patient,,46,SLE Patient
0,BFI-0010067,M456-S008,Lupus,SLE Patient,,49,SLE Patient


In [20]:
# Sanity check: every (participant, specimen) pair must appear exactly once.
rows_per_specimen = metadata_df_c.groupby(
    ["participant_label", "specimen_label"], observed=True
).size()
assert (rows_per_specimen == 1).all()

In [21]:
# Persist the participant/specimen -> disease map as a tab-separated file.
# The index is not written. `index` is documented as a bool, so pass False
# explicitly rather than relying on None being falsy.
metadata_df_c.to_csv(
    config.paths.dataset_specific_metadata / "participant_specimen_disease_map.tsv",
    sep="\t",
    index=False,
)

In [22]:
# Reload the table that was just written, so the cells below work from exactly
# what is on disk.
# NOTE(review): dtypes are re-inferred by the TSV round trip -- confirm that is intended.
disease_map_path = (
    config.paths.dataset_specific_metadata / "participant_specimen_disease_map.tsv"
)
metadata_df_c = pd.read_csv(disease_map_path, sep="\t")

In [23]:
# Number of rows (one per specimen, per the sanity check above) for each disease subtype.
metadata_df_c["disease_subtype"].value_counts()

HIV Non Neutralizing                               105
HIV Broad Neutralizing                              92
Healthy/Background - HIV Negative                   86
Healthy/Background - CMV Negative                   61
Healthy/Background - CMV Positive                   52
Covid19 - Admit                                     52
Healthy/Background (children)                       46
SLE Patient                                         43
Healthy/Background - SLE Negative                   31
Pediatric SLE - nephritis                           23
Pediatric SLE - no nephritis                        20
Covid19 - ICU                                       18
Covid19 - Sero-positive (ICU)                       18
SLE Multiple aAbs / SLE dsDNA WITH Nephritis        10
SLE Multiple aAbs                                    9
SLE Multiple aAbs / SLE dsDNA WITHOUT Nephritis      9
Covid19 - Sero-positive (Admit)                      6
Unaffected Control                                   6
Covid19 - 

Confirm HIV patient numbers -- we expect:

```
43	HIV Negative
46	HIV Broad Neutralizing
50	HIV Non Neutralizing
```

In [24]:
# Specimens: number of HIV specimens per subtype.
is_hiv_row = metadata_df_c["disease"] == "HIV"
hiv_specimen_subtypes = metadata_df_c.loc[is_hiv_row, "disease_subtype"].astype(
    "category"
)
hiv_specimen_subtypes.cat.remove_unused_categories().value_counts()

HIV Non Neutralizing      105
HIV Broad Neutralizing     92
Name: disease_subtype, dtype: int64

In [25]:
# Patients: number of distinct HIV participants per subtype.
hiv_rows = metadata_df_c.loc[metadata_df_c["disease"] == "HIV"]
hiv_rows.groupby("disease_subtype", observed=True)["participant_label"].nunique()

disease_subtype
HIV Broad Neutralizing    46
HIV Non Neutralizing      50
Name: participant_label, dtype: int64

In [26]:
# Healthy specimens: number of Healthy/Background specimens per subtype.
is_healthy_row = metadata_df_c["disease"] == "Healthy/Background"
healthy_specimen_subtypes = metadata_df_c.loc[
    is_healthy_row, "disease_subtype"
].astype("category")
healthy_specimen_subtypes.cat.remove_unused_categories().value_counts()

Healthy/Background - HIV Negative    86
Healthy/Background - CMV Negative    61
Healthy/Background - CMV Positive    52
Healthy/Background (children)        46
Healthy/Background - SLE Negative    31
Unaffected Control                    6
Healthy/Background - CMV Unknown      1
Name: disease_subtype, dtype: int64

In [27]:
# Healthy patients: number of distinct Healthy/Background participants per subtype.
healthy_rows = metadata_df_c.loc[metadata_df_c["disease"] == "Healthy/Background"]
healthy_rows.groupby("disease_subtype", observed=True)["participant_label"].nunique()

disease_subtype
Healthy/Background (children)        46
Healthy/Background - CMV Negative    61
Healthy/Background - CMV Positive    52
Healthy/Background - CMV Unknown      1
Healthy/Background - HIV Negative    43
Healthy/Background - SLE Negative    24
Unaffected Control                    6
Name: participant_label, dtype: int64

In [28]:
# Look at the row(s) labelled with the "CMV Unknown" healthy subtype.
is_cmv_unknown = (
    metadata_df_c["disease_subtype"] == "Healthy/Background - CMV Unknown"
)
metadata_df_c.loc[is_cmv_unknown]

Unnamed: 0,participant_label,specimen_label,disease,disease_subtype,specimen_time_point,participant_age,participant_description
425,BFI-0003144,M64-095,Healthy/Background,Healthy/Background - CMV Unknown,,22.0,Healthy Human Control


In [29]:
# Covid specimens: number of Covid19 specimens per subtype.
is_covid_row = metadata_df_c["disease"] == "Covid19"
covid_specimen_subtypes = metadata_df_c.loc[is_covid_row, "disease_subtype"].astype(
    "category"
)
covid_specimen_subtypes.cat.remove_unused_categories().value_counts()

Covid19 - Admit                    52
Covid19 - ICU                      18
Covid19 - Sero-positive (ICU)      18
Covid19 - Sero-positive (Admit)     6
Covid19 - Acute 1                   5
Covid19 - Acute 2                   5
Covid19 - Convalescence             5
Covid19 - Sero-negative (Admit)     3
Covid19 - Sero-negative (ICU)       2
Name: disease_subtype, dtype: int64

In [30]:
# Covid patients: number of distinct Covid19 participants per subtype.
covid_rows = metadata_df_c.loc[metadata_df_c["disease"] == "Covid19"]
covid_rows.groupby("disease_subtype", observed=True)["participant_label"].nunique()

disease_subtype
Covid19 - Acute 1                   5
Covid19 - Acute 2                   5
Covid19 - Admit                    41
Covid19 - Convalescence             5
Covid19 - ICU                      15
Covid19 - Sero-negative (Admit)     1
Covid19 - Sero-negative (ICU)       1
Covid19 - Sero-positive (Admit)     3
Covid19 - Sero-positive (ICU)       7
Name: participant_label, dtype: int64

In [31]:
# Finished with dask: shut down the client created earlier in this notebook.
client.shutdown()