# Record which specimens remain after QC filtering in `sample_sequences.ipynb`

In [1]:
import numpy as np
import pandas as pd
from malid import config, helpers, logger

In [2]:
import dask
import dask.dataframe as dd

In [3]:
from dask.distributed import Client

# Start a local Dask cluster that uses worker processes (multi-processing backend).
# If a cluster was already opened from another notebook, see https://stackoverflow.com/questions/60115736/dask-how-to-connect-to-running-cluster-scheduler-and-access-total-occupancy
dask_client_options = dict(
    scheduler_port=config.dask_scheduler_port,
    dashboard_address=config.dask_dashboard_address,
    n_workers=config.dask_n_workers,
    processes=True,
    threads_per_worker=8,
    memory_limit="125GB",  # limit applies to each worker individually
)
client = Client(**dask_client_options)
display(client)
# To reset the workers while debugging: client.restart()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:61093/status,

0,1
Dashboard: http://127.0.0.1:61093/status,Workers: 8
Total threads: 64,Total memory: 0.91 TiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61094,Workers: 8
Dashboard: http://127.0.0.1:61093/status,Total threads: 64
Started: Just now,Total memory: 0.91 TiB

0,1
Comm: tcp://127.0.0.1:39089,Total threads: 8
Dashboard: http://127.0.0.1:40639/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:42127,
Local directory: /tmp/dask-worker-space/worker-y4tf84gj,Local directory: /tmp/dask-worker-space/worker-y4tf84gj

0,1
Comm: tcp://127.0.0.1:42271,Total threads: 8
Dashboard: http://127.0.0.1:36251/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:44389,
Local directory: /tmp/dask-worker-space/worker-j5a0nqf2,Local directory: /tmp/dask-worker-space/worker-j5a0nqf2

0,1
Comm: tcp://127.0.0.1:39993,Total threads: 8
Dashboard: http://127.0.0.1:43095/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:40301,
Local directory: /tmp/dask-worker-space/worker-77wj7i7m,Local directory: /tmp/dask-worker-space/worker-77wj7i7m

0,1
Comm: tcp://127.0.0.1:45217,Total threads: 8
Dashboard: http://127.0.0.1:41195/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:36877,
Local directory: /tmp/dask-worker-space/worker-_xz3oy98,Local directory: /tmp/dask-worker-space/worker-_xz3oy98

0,1
Comm: tcp://127.0.0.1:45657,Total threads: 8
Dashboard: http://127.0.0.1:45201/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:46265,
Local directory: /tmp/dask-worker-space/worker-vn32451q,Local directory: /tmp/dask-worker-space/worker-vn32451q

0,1
Comm: tcp://127.0.0.1:43351,Total threads: 8
Dashboard: http://127.0.0.1:32863/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:39007,
Local directory: /tmp/dask-worker-space/worker-9u9bty76,Local directory: /tmp/dask-worker-space/worker-9u9bty76

0,1
Comm: tcp://127.0.0.1:43569,Total threads: 8
Dashboard: http://127.0.0.1:35745/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:45829,
Local directory: /tmp/dask-worker-space/worker-bwfi7ski,Local directory: /tmp/dask-worker-space/worker-bwfi7ski

0,1
Comm: tcp://127.0.0.1:42145,Total threads: 8
Dashboard: http://127.0.0.1:46191/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:46503,
Local directory: /tmp/dask-worker-space/worker-n2irujrj,Local directory: /tmp/dask-worker-space/worker-n2irujrj


In [4]:
# Read with the pyarrow engine rather than fastparquet: fastparquet coerces partition names
# to numbers or dates, e.g. it changes a specimen label like M54-049 to 2049-01-01 00:00:54.
sampled_sequences_path = config.paths.sequences_sampled
df = dd.read_parquet(sampled_sequences_path, engine="pyarrow")
df

Unnamed: 0_level_0,amplification_label,v_gene,j_gene,disease,disease_subtype,fr1_seq_aa_q_trim,cdr1_seq_aa_q_trim,fr2_seq_aa_q_trim,cdr2_seq_aa_q_trim,fr3_seq_aa_q_trim,cdr3_seq_aa_q_trim,post_seq_aa_q_trim,cdr3_aa_sequence_trim_len,extracted_isotype,isotype_supergroup,v_mut,num_reads,igh_or_tcrb_clone_id,total_clone_num_reads,num_clone_members,participant_label,specimen_label
npartitions=2329,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,category[unknown],category[unknown],category[unknown],category[unknown],category[unknown],object,object,object,object,object,object,object,int64,category[unknown],category[unknown],float64,int64,int64,int64,int64,category[known],category[known]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [5]:
# Each partition is a specimen, so the partition count is the number of specimens loaded.
# (A later cell asserts that the per-specimen summary table has exactly this many rows.)
df.npartitions

2329

In [6]:
# Show which columns are available in the sampled sequences dataframe.
df.columns

Index(['amplification_label', 'v_gene', 'j_gene', 'disease', 'disease_subtype',
       'fr1_seq_aa_q_trim', 'cdr1_seq_aa_q_trim', 'fr2_seq_aa_q_trim',
       'cdr2_seq_aa_q_trim', 'fr3_seq_aa_q_trim', 'cdr3_seq_aa_q_trim',
       'post_seq_aa_q_trim', 'cdr3_aa_sequence_trim_len', 'extracted_isotype',
       'isotype_supergroup', 'v_mut', 'num_reads', 'igh_or_tcrb_clone_id',
       'total_clone_num_reads', 'num_clone_members', 'participant_label',
       'specimen_label'],
      dtype='object')

# Get all specimens available from ETL - meaning the ones that passed `sample_sequences` filters

In [7]:
# Total sequence count for every (participant, specimen, disease) combination.
grouping_columns = ["participant_label", "specimen_label", "disease"]
# observed=True: only emit category combinations that actually occur in the data
sequence_counts = df.groupby(grouping_columns, observed=True).size()
specimens = sequence_counts.rename("total_sequence_count").reset_index()
specimens

Unnamed: 0_level_0,participant_label,specimen_label,disease,total_sequence_count
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,category[known],category[known],category[unknown],int64
,...,...,...,...


In [8]:
# Materialize the lazy Dask result as an in-memory dataframe.
# dask.compute() passes non-Dask objects through unchanged, so re-running this cell after
# `specimens` has already been computed is a no-op rather than an AttributeError
# (the computed pandas DataFrame has no .compute() method).
(specimens,) = dask.compute(specimens)
specimens

Unnamed: 0,participant_label,specimen_label,disease,total_sequence_count
0,BFI-0000234,M124-S014,Healthy/Background,67139
1,BFI-0000254,M111-S037,HIV,73771
2,BFI-0000255,M111-S033,HIV,69192
3,BFI-0000256,M111-S038,HIV,99595
4,BFI-0000258,M111-S055,HIV,105617
...,...,...,...,...
2324,towlerton-2022-hiv_1026,towlerton-2022-hiv_015V08002633_CFAR,HIV,25999
2325,towlerton-2022-hiv_1027,towlerton-2022-hiv_015V09002862_CFAR,HIV,44435
2326,towlerton-2022-hiv_1028,towlerton-2022-hiv_015V11002805_CFAR,HIV,23025
2327,towlerton-2022-hiv_1029,towlerton-2022-hiv_015V11001839_CFAR,HIV,47858


In [9]:
# Sanity check: the summary table has exactly one row per partition of the sequences dataframe.
assert len(specimens) == df.npartitions

In [10]:
# Sanity check: no specimen label appears in more than one row.
assert specimens["specimen_label"].is_unique

In [11]:
# Export list of specimens remaining after QC filtering in sample_sequences.ipynb.
# Not all specimens survived to this step - some are thrown out for not having enough sequences or not having all isotypes.
# However, these aren't yet filtered to is_selected_for_cv_strategy specimens that are particular to the selected cross validation strategy.
specimens.to_csv(
    config.paths.dataset_specific_metadata
    / "specimens_that_survived_qc_filters_in_sample_sequences_notebook.tsv",
    sep="\t",
    # `index` is documented as a bool; use an explicit False instead of relying on None being falsy.
    index=False,
)