# Sample sequences from our full ETL load

- subselect sequences
- subselect columns
- remove specimens that violate some constraints: too few sequences, or not all isotypes found

Both peak and off-peak specimens are still included after this step.

In [1]:
from malid import config, helpers
from malid.sample_sequences import sample_sequences

**If regenerating, clear `config.paths.sequences_sampled` first with `rm -r`**

In [2]:
# Show the path of the sampled-sequences dataset (the location the note above says to clear before regenerating).
config.paths.sequences_sampled

PosixPath('/users/maximz/code/boyd-immune-repertoire-classification/data/data_v_20221224/sequences.sampled.parquet')

In [3]:
import dask
import dask.dataframe as dd
import time

In [4]:
from dask.distributed import Client

# Start a local dask cluster that runs its workers as separate processes.
# The dashboard is served at http://127.0.0.1:61083
client = Client(
    processes=True,
    n_workers=4,
    threads_per_worker=8,
    memory_limit="125GB",  # limit applies to each worker
    scheduler_port=61084,
    dashboard_address=":61083",
)
display(client)
# To reset the workers while debugging: client.restart()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:61083/status,

0,1
Dashboard: http://127.0.0.1:61083/status,Workers: 4
Total threads: 32,Total memory: 465.66 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61084,Workers: 4
Dashboard: http://127.0.0.1:61083/status,Total threads: 32
Started: Just now,Total memory: 465.66 GiB

0,1
Comm: tcp://127.0.0.1:33435,Total threads: 8
Dashboard: http://127.0.0.1:33061/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:35009,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-ht8xg5nb,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-ht8xg5nb
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:36403,Total threads: 8
Dashboard: http://127.0.0.1:44987/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:43767,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-dvmkddm4,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-dvmkddm4
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:37291,Total threads: 8
Dashboard: http://127.0.0.1:37977/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:34087,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-q8pvapcr,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-q8pvapcr
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:40167,Total threads: 8
Dashboard: http://127.0.0.1:35689/status,Memory: 116.42 GiB
Nanny: tcp://127.0.0.1:37391,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-l1sm0pco,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-l1sm0pco
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB


In [5]:
# Columns to load from the input dataset, grouped by role.
# The concatenation keeps the original column order.
desired_columns = (
    # specimen and participant identifiers
    ["specimen_label", "participant_label"]
    # gene segment calls
    + ["v_gene", "j_gene"]
    # disease labels
    + ["disease", "disease_subtype"]
    # CDR sequence columns and CDR3 length
    + [
        "cdr1_seq_aa_q_trim",
        "cdr2_seq_aa_q_trim",
        "cdr3_seq_aa_q_trim",
        "cdr3_aa_sequence_trim_len",
    ]
    # isotype columns
    + ["extracted_isotype", "isotype_supergroup"]
    # remaining per-sequence columns and clone identifier
    + ["v_mut", "num_reads", "igh_or_tcrb_clone_id"]
)

In [6]:
# Read with pyarrow rather than fastparquet: fastparquet coerces partition names
# to numbers or dates, e.g. specimen label M54-049 turns into 2049-01-01 00:00:54.
df = dd.read_parquet(
    config.paths.sequences,
    engine="pyarrow",
    columns=desired_columns,
)

In [7]:
# Number of partitions in the loaded dask dataframe; each partition is a specimen
df.npartitions

717

In [8]:
# Preview the dask dataframe structure: column names, dtypes and partition count (values are not shown)
df

Unnamed: 0_level_0,specimen_label,participant_label,v_gene,j_gene,disease,disease_subtype,cdr1_seq_aa_q_trim,cdr2_seq_aa_q_trim,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len,extracted_isotype,isotype_supergroup,v_mut,num_reads,igh_or_tcrb_clone_id
npartitions=717,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,category[known],category[known],category[unknown],category[unknown],category[unknown],category[unknown],object,object,object,int64,category[unknown],category[unknown],float64,int64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [9]:
# The required gene loci can differ from specimen to specimen, so rather than the
# single global setting (required_gene_loci=config.gene_loci_used) we look up the
# loci available for each specimen in the ETL metadata.
# The result is a Series indexed by specimen_label; it is converted to a dict later.
etl_metadata = helpers._load_etl_metadata()
required_gene_loci = etl_metadata["available_gene_loci"]
required_gene_loci

specimen_label
M418-S001    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M418-S007    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M418-S008    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M418-S009    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M418-S010    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
                              ...                  
M464-S042    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M464-S043    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M464-S044    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M464-S045    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
M464-S046    (((GeneLocus.BCR)), ((GeneLocus.TCR)))
Name: available_gene_loci, Length: 717, dtype: object

In [10]:
# Count how many specimens have each combination of available gene loci
required_gene_loci.value_counts()

(((GeneLocus.BCR)), ((GeneLocus.TCR)))    634
(((GeneLocus.BCR)))                        83
Name: available_gene_loci, dtype: int64

In [11]:
# Describe the output schema for dask via `meta`: an empty dataframe with the
# input columns plus the new columns that sample_sequences creates.
meta = df.head(0).assign(total_clone_num_reads=0, num_clone_members=0)
df_sampled = df.map_partitions(
    sample_sequences,
    meta=meta,
    required_gene_loci=required_gene_loci.to_dict(),
)
df_sampled

Unnamed: 0_level_0,specimen_label,participant_label,v_gene,j_gene,disease,disease_subtype,cdr1_seq_aa_q_trim,cdr2_seq_aa_q_trim,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len,extracted_isotype,isotype_supergroup,v_mut,num_reads,igh_or_tcrb_clone_id,total_clone_num_reads,num_clone_members
npartitions=717,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
,category[known],category[known],category[known],category[known],category[known],category[known],object,object,object,int64,category[known],category[known],float64,int64,int64,int64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
# Time the write with a monotonic, high-resolution clock: time.time() follows the
# wall clock and can jump if the system clock is adjusted during this long step.
itime = time.perf_counter()

# This can behave weirdly with empty partitions. https://github.com/dask/dask/issues/8832

# Run the sampling graph and write the result, one directory level per
# participant and specimen (see partition_on).
df_sampled.to_parquet(
    config.paths.sequences_sampled,
    overwrite=True,
    compression="snappy",  # alternative: gzip
    engine="pyarrow",
    # The schema argument is only accepted by the pyarrow engine.
    # Use schema="infer" when any partitions are empty and the engine is pyarrow.
    # schema="infer" is no longer slow as of https://github.com/dask/dask/pull/9131
    # schema=None breaks downstream readers.
    schema="infer",
    # Open question: do empty partitions make it to disk, or are they eliminated?
    # They seem to be eliminated.
    write_metadata_file=False,
    partition_on=["participant_label", "specimen_label"],
)

# Report the elapsed time with an explicit unit.
print(f"Elapsed: {time.perf_counter() - itime:.1f} seconds")

2023-01-14 03:27:49,896 - malid.sample_sequences - INFO - Participant BFI-0000234 specimen M132-S014 is empty


2023-01-14 03:27:53,930 - malid.sample_sequences - INFO - Participant BFI-0003708 specimen M132-S009 is empty


2023-01-14 03:27:54,285 - malid.sample_sequences - INFO - Participant BFI-0003709 specimen M132-S010 is empty
2023-01-14 03:27:54,414 - malid.sample_sequences - INFO - Participant BFI-0003707 specimen M132-S008 is empty


2023-01-14 03:27:55,094 - malid.sample_sequences - INFO - Participant BFI-0003710 specimen M132-S011 is empty
2023-01-14 03:27:55,122 - malid.sample_sequences - INFO - Participant BFI-0003711 specimen M132-S013 is empty


2023-01-14 03:27:58,423 - malid.sample_sequences - INFO - Participant BFI-0002859 specimen M114-S034 is empty


2023-01-14 03:28:04,010 - malid.sample_sequences - INFO - Participant BFI-0003712 specimen M132-S015 is empty


2023-01-14 03:28:08,233 - malid.sample_sequences - INFO - Participant BFI-0002868 specimen M132-S038 is empty
2023-01-14 03:28:08,388 - malid.sample_sequences - INFO - Removing BFI-0005432 specimen M281redo-S033 because it did not have enough clones. Clone count by isotype: {'IGHG': 17, 'IGHA': 47, 'IGHD-M': 4064}
2023-01-14 03:28:08,437 - malid.sample_sequences - INFO - Removing BFI-0005431 specimen M281redo-S032 because it did not have enough clones. Clone count by isotype: {'IGHG': 24, 'IGHA': 7, 'IGHD-M': 1944}


2023-01-14 03:28:26,365 - malid.sample_sequences - INFO - Removing BFI-0005442 specimen M281redo-S043 because it did not have enough clones. Clone count by isotype: {'IGHG': 1, 'IGHA': 1610, 'IGHD-M': 23571}


2023-01-14 03:28:43,072 - malid.sample_sequences - INFO - Removing BFI-0005448 specimen M281redo-S049 because it did not have enough clones. Clone count by isotype: {'IGHG': 0.0, 'IGHA': 10.0, 'IGHD-M': 3420.0}


2023-01-14 03:28:45,911 - malid.sample_sequences - INFO - Participant BFI-0002870 specimen M114-S035 is empty


2023-01-14 03:28:50,173 - malid.sample_sequences - INFO - Removing BFI-0005453 specimen M281redo-S055 because it did not have enough clones. Clone count by isotype: {'IGHG': 87, 'IGHA': 131, 'IGHD-M': 13993}


2023-01-14 03:28:54,577 - malid.sample_sequences - INFO - Participant BFI-0003713 specimen M132-S016 is empty
2023-01-14 03:28:54,763 - malid.sample_sequences - INFO - Participant BFI-0003714 specimen M132-S017 is empty


2023-01-14 03:28:56,795 - malid.sample_sequences - INFO - Participant BFI-0003715 specimen M132-S018 is empty


2023-01-14 03:28:58,468 - malid.sample_sequences - INFO - Participant BFI-0003716 specimen M132-S019 is empty


2023-01-14 03:29:03,170 - malid.sample_sequences - INFO - Participant BFI-0003717 specimen M132-S020 is empty


2023-01-14 03:29:08,361 - malid.sample_sequences - INFO - Participant BFI-0003718 specimen M132-S021 is empty


2023-01-14 03:29:13,789 - malid.sample_sequences - INFO - Removing BFI-0003105 specimen M64-056 because it did not have enough clones. Clone count by isotype: {'IGHG': 75, 'IGHA': 123, 'IGHD-M': 195, 'TCRB': 1663}


2023-01-14 03:29:14,357 - malid.sample_sequences - INFO - Participant BFI-0000258 specimen M132-S070 is empty


2023-01-14 03:29:39,478 - malid.sample_sequences - INFO - Participant BFI-0003719 specimen M132-S022 is empty


2023-01-14 03:29:48,389 - malid.sample_sequences - INFO - Participant BFI-0003720 specimen M132-S023 is empty


2023-01-14 03:29:58,844 - malid.sample_sequences - INFO - Participant BFI-0003721 specimen M132-S024 is empty
2023-01-14 03:29:58,858 - malid.sample_sequences - INFO - Participant BFI-0002861 specimen M132-S037 is empty


2023-01-14 03:30:03,140 - malid.sample_sequences - INFO - Participant BFI-0003722 specimen M132-S025 is empty


2023-01-14 03:30:05,618 - malid.sample_sequences - INFO - Participant BFI-0003723 specimen M132-S026 is empty


2023-01-14 03:30:11,945 - malid.sample_sequences - INFO - Participant BFI-0003724 specimen M132-S027 is empty


2023-01-14 03:30:16,647 - malid.sample_sequences - INFO - Participant BFI-0003725 specimen M132-S028 is empty


2023-01-14 03:30:20,511 - malid.sample_sequences - INFO - Participant BFI-0003726 specimen M132-S029 is empty


2023-01-14 03:30:24,738 - malid.sample_sequences - INFO - Participant BFI-0000254 specimen M114-S037 is empty


2023-01-14 03:30:25,557 - malid.sample_sequences - INFO - Removing BFI-0003115 specimen M64-066 because it did not have enough clones. Clone count by isotype: {'IGHG': 60, 'IGHA': 181, 'IGHD-M': 279, 'TCRB': 39954}


2023-01-14 03:30:29,851 - malid.sample_sequences - INFO - Participant BFI-0003727 specimen M132-S030 is empty


2023-01-14 03:30:40,834 - malid.sample_sequences - INFO - Participant BFI-0003728 specimen M132-S031 is empty


2023-01-14 03:30:47,024 - malid.sample_sequences - INFO - Participant BFI-0003729 specimen M132-S032 is empty


2023-01-14 03:30:47,592 - malid.sample_sequences - INFO - Participant BFI-0003730 specimen M132-S033 is empty


2023-01-14 03:30:50,393 - malid.sample_sequences - INFO - Participant BFI-0002862 specimen M132-S036 is empty
2023-01-14 03:30:50,480 - malid.sample_sequences - INFO - Participant BFI-0003731 specimen M132-S034 is empty


2023-01-14 03:30:51,026 - malid.sample_sequences - INFO - Participant BFI-0003732 specimen M132-S041 is empty


2023-01-14 03:30:53,626 - malid.sample_sequences - INFO - Participant BFI-0003733 specimen M132-S044 is empty


2023-01-14 03:31:10,221 - malid.sample_sequences - INFO - Participant BFI-0003735 specimen M132-S046 is empty


2023-01-14 03:31:11,856 - malid.sample_sequences - INFO - Participant BFI-0003736 specimen M132-S047 is empty


2023-01-14 03:31:14,245 - malid.sample_sequences - INFO - Participant BFI-0003737 specimen M132-S048 is empty


2023-01-14 03:31:20,564 - malid.sample_sequences - INFO - Participant BFI-0002850 specimen M132-S040 is empty


2023-01-14 03:31:22,639 - malid.sample_sequences - INFO - Participant BFI-0003738 specimen M132-S049 is empty


2023-01-14 03:31:34,008 - malid.sample_sequences - INFO - Participant BFI-0003740 specimen M132-S051 is empty


2023-01-14 03:31:40,156 - malid.sample_sequences - INFO - Participant BFI-0003460 specimen M114-S004 is empty


2023-01-14 03:31:43,058 - malid.sample_sequences - INFO - Participant BFI-0003450 specimen M114-S022 is empty


2023-01-14 03:31:48,907 - malid.sample_sequences - INFO - Participant BFI-0002854 specimen M114-S042 is empty


2023-01-14 03:31:51,063 - malid.sample_sequences - INFO - Participant BFI-0002852 specimen M132-S012 is empty


2023-01-14 03:31:51,718 - malid.sample_sequences - INFO - Participant BFI-0003466 specimen M114-S003 is empty


2023-01-14 03:31:58,980 - malid.sample_sequences - INFO - Participant BFI-0003741 specimen M132-S052 is empty


2023-01-14 03:32:01,095 - malid.sample_sequences - INFO - Participant BFI-0002863 specimen M132-S035 is empty


2023-01-14 03:32:01,306 - malid.sample_sequences - INFO - Participant BFI-0003454 specimen M114-S030 is empty


2023-01-14 03:32:02,357 - malid.sample_sequences - INFO - Participant BFI-0003743 specimen M132-S054 is empty


2023-01-14 03:32:03,468 - malid.sample_sequences - INFO - Participant BFI-0003745 specimen M132-S056 is empty


2023-01-14 03:32:05,779 - malid.sample_sequences - INFO - Participant BFI-0003746 specimen M132-S057 is empty


2023-01-14 03:32:14,204 - malid.sample_sequences - INFO - Participant BFI-0003747 specimen M132-S058 is empty


2023-01-14 03:32:18,426 - malid.sample_sequences - INFO - Participant BFI-0003455 specimen M114-S026 is empty


2023-01-14 03:32:25,820 - malid.sample_sequences - INFO - Participant BFI-0003750 specimen M132-S061 is empty


2023-01-14 03:32:28,424 - malid.sample_sequences - INFO - Participant BFI-0003751 specimen M132-S062 is empty


2023-01-14 03:32:30,970 - malid.sample_sequences - INFO - Participant BFI-0003456 specimen M114-S032 is empty


2023-01-14 03:32:34,856 - malid.sample_sequences - INFO - Participant BFI-0003488 specimen M114-S029 is empty


2023-01-14 03:32:36,181 - malid.sample_sequences - INFO - Participant BFI-0003752 specimen M132-S063 is empty


2023-01-14 03:32:41,643 - malid.sample_sequences - INFO - Participant BFI-0003754 specimen M132-S065 is empty


2023-01-14 03:32:51,854 - malid.sample_sequences - INFO - Participant BFI-0003457 specimen M114-S031 is empty
2023-01-14 03:32:51,884 - malid.sample_sequences - INFO - Participant BFI-0003755 specimen M132-S066 is empty


2023-01-14 03:32:53,846 - malid.sample_sequences - INFO - Participant BFI-0003462 specimen M114-S001 is empty


2023-01-14 03:32:54,820 - malid.sample_sequences - INFO - Participant BFI-0003465 specimen M114-S024 is empty


2023-01-14 03:32:56,729 - malid.sample_sequences - INFO - Participant BFI-0003756 specimen M132-S067 is empty


2023-01-14 03:32:57,384 - malid.sample_sequences - INFO - Participant BFI-0003470 specimen M114-S045 is empty


2023-01-14 03:33:00,513 - malid.sample_sequences - INFO - Participant BFI-0003758 specimen M132-S069 is empty


2023-01-14 03:33:02,959 - malid.sample_sequences - INFO - Participant BFI-0003479 specimen M114-S015 is empty


2023-01-14 03:33:07,309 - malid.sample_sequences - INFO - Participant BFI-0003481 specimen M114-S002 is empty


2023-01-14 03:33:08,378 - malid.sample_sequences - INFO - Participant BFI-0003483 specimen M114-S012 is empty


2023-01-14 03:33:13,877 - malid.sample_sequences - INFO - Participant BFI-0003704 specimen M132-S005 is empty


2023-01-14 03:33:14,945 - malid.sample_sequences - INFO - Participant BFI-0003748 specimen M132-S059 is empty


2023-01-14 03:34:00,734 - malid.sample_sequences - INFO - Participant BFI-0003463 specimen M114-S005 is empty


2023-01-14 03:34:11,192 - malid.sample_sequences - INFO - Participant BFI-0003749 specimen M132-S060 is empty


2023-01-14 03:34:17,761 - malid.sample_sequences - INFO - Removing BFI-0007455 specimen M371-S032 because it did not have enough clones. Clone count by isotype: {'IGHG': 34, 'IGHA': 90, 'IGHD-M': 535, 'TCRB': 1900}


2023-01-14 03:34:28,816 - malid.sample_sequences - INFO - Participant BFI-0002851 specimen M132-S039 is empty


2023-01-14 03:34:50,024 - malid.sample_sequences - INFO - Participant BFI-0003757 specimen M132-S068 is empty


2023-01-14 03:34:54,453 - malid.sample_sequences - INFO - Participant BFI-0003471 specimen M114-S044 is empty


2023-01-14 03:34:54,660 - malid.sample_sequences - INFO - Participant BFI-0003760 specimen M132-S072 is empty


2023-01-14 03:34:57,483 - malid.sample_sequences - INFO - Removing BFI-0003151 specimen M64-102 because it did not have enough clones. Clone count by isotype: {'IGHG': 796, 'IGHA': 2236, 'IGHD-M': 13673, 'TCRB': 13}


2023-01-14 03:34:58,596 - malid.sample_sequences - INFO - Participant BFI-0003765 specimen M132-S077 is empty


2023-01-14 03:34:59,841 - malid.sample_sequences - INFO - Participant BFI-0002865 specimen M114-S041 is empty


2023-01-14 03:35:01,464 - malid.sample_sequences - INFO - Participant BFI-0003482 specimen M114-S048 is empty


2023-01-14 03:35:03,228 - malid.sample_sequences - INFO - Participant BFI-0003451 specimen M114-S019 is empty


2023-01-14 03:35:05,411 - malid.sample_sequences - INFO - Removing BFI-0007482 specimen M371-S020 because it did not have enough clones. Clone count by isotype: {'IGHG': 118.0, 'IGHA': 188.0, 'IGHD-M': 1710.0, 'TCRB': 0.0}


2023-01-14 03:35:06,147 - malid.sample_sequences - INFO - Participant BFI-0003487 specimen M114-S049 is empty


2023-01-14 03:35:09,112 - malid.sample_sequences - INFO - Participant BFI-0003468 specimen M114-S023 is empty


2023-01-14 03:35:09,790 - malid.sample_sequences - INFO - Participant BFI-0003472 specimen M114-S006 is empty


2023-01-14 03:35:13,143 - malid.sample_sequences - INFO - Participant BFI-0003452 specimen M114-S013 is empty


2023-01-14 03:35:14,798 - malid.sample_sequences - INFO - Participant BFI-0003484 specimen M114-S028 is empty


2023-01-14 03:35:15,558 - malid.sample_sequences - INFO - Participant BFI-0003480 specimen M114-S010 is empty


2023-01-14 03:35:24,663 - malid.sample_sequences - INFO - Participant BFI-0003734 specimen M132-S045 is empty


2023-01-14 03:35:30,276 - malid.sample_sequences - INFO - Participant BFI-0003739 specimen M132-S050 is empty


2023-01-14 03:35:31,537 - malid.sample_sequences - INFO - Participant BFI-0003742 specimen M132-S053 is empty


2023-01-14 03:35:37,411 - malid.sample_sequences - INFO - Removing BFI-0005466 specimen M281redo-S067 because it did not have enough clones. Clone count by isotype: {'IGHG': 169, 'IGHA': 37, 'IGHD-M': 7818}


2023-01-14 03:35:37,986 - malid.sample_sequences - INFO - Participant BFI-0002864 specimen M114-S040 is empty


2023-01-14 03:35:43,248 - malid.sample_sequences - INFO - Participant BFI-0003768 specimen M132-S080 is empty


2023-01-14 03:35:49,803 - malid.sample_sequences - INFO - Participant BFI-0003770 specimen M132-S082 is empty


2023-01-14 03:36:04,274 - malid.sample_sequences - INFO - Participant BFI-0003701 specimen M132-S002 is empty


2023-01-14 03:36:11,044 - malid.sample_sequences - INFO - Participant BFI-0003773 specimen M132-S064 is empty


2023-01-14 03:36:16,303 - malid.sample_sequences - INFO - Participant BFI-0003706 specimen M132-S007 is empty


2023-01-14 03:36:17,723 - malid.sample_sequences - INFO - Removing BFI-0005414 specimen M281redo-S015 because it did not have enough clones. Clone count by isotype: {'IGHG': 333, 'IGHA': 16, 'IGHD-M': 7111}


2023-01-14 03:36:18,274 - malid.sample_sequences - INFO - Participant BFI-0003759 specimen M132-S071 is empty


2023-01-14 03:36:19,560 - malid.sample_sequences - INFO - Participant BFI-0003453 specimen M114-S009 is empty


2023-01-14 03:36:23,019 - malid.sample_sequences - INFO - Participant BFI-0003761 specimen M132-S073 is empty


2023-01-14 03:36:23,536 - malid.sample_sequences - INFO - Participant BFI-0003762 specimen M132-S074 is empty


2023-01-14 03:36:24,185 - malid.sample_sequences - INFO - Removing BFI-0009029 specimen M418-S223 because it did not have enough clones. Clone count by isotype: {'IGHG': 158, 'IGHA': 32, 'IGHD-M': 11330, 'TCRB': 19718}


2023-01-14 03:36:24,448 - malid.sample_sequences - INFO - Participant BFI-0002871 specimen M114-S025 is empty


2023-01-14 03:36:25,854 - malid.sample_sequences - INFO - Participant BFI-0003475 specimen M114-S054 is empty
2023-01-14 03:36:25,910 - malid.sample_sequences - INFO - Participant BFI-0002856 specimen M114-S043 is empty


2023-01-14 03:36:29,436 - malid.sample_sequences - INFO - Participant BFI-0003763 specimen M132-S075 is empty


2023-01-14 03:36:29,674 - malid.sample_sequences - INFO - Participant BFI-0003764 specimen M132-S076 is empty


2023-01-14 03:36:33,720 - malid.sample_sequences - INFO - Participant BFI-0003766 specimen M132-S078 is empty


2023-01-14 03:36:35,387 - malid.sample_sequences - INFO - Participant BFI-0003767 specimen M132-S079 is empty


2023-01-14 03:36:42,821 - malid.sample_sequences - INFO - Participant BFI-0002875 specimen M114-S011 is empty


2023-01-14 03:36:50,538 - malid.sample_sequences - INFO - Participant BFI-0003769 specimen M132-S081 is empty


2023-01-14 03:37:07,666 - malid.sample_sequences - INFO - Participant BFI-0000255 specimen M114-S033 is empty


2023-01-14 03:37:10,438 - malid.sample_sequences - INFO - Participant BFI-0003771 specimen M132-S083 is empty


2023-01-14 03:37:13,403 - malid.sample_sequences - INFO - Participant BFI-0003772 specimen M132-S084 is empty
2023-01-14 03:37:13,413 - malid.sample_sequences - INFO - Participant BFI-0002866 specimen M132-S042 is empty


2023-01-14 03:37:14,308 - malid.sample_sequences - INFO - Participant BFI-0003458 specimen M114-S020 is empty


2023-01-14 03:37:15,187 - malid.sample_sequences - INFO - Participant BFI-0003775 specimen M132-S085 is empty


2023-01-14 03:37:18,402 - malid.sample_sequences - INFO - Participant BFI-0003775 specimen M132-S086 is empty
2023-01-14 03:37:18,513 - malid.sample_sequences - INFO - Participant BFI-0003459 specimen M114-S027 is empty


2023-01-14 03:37:18,975 - malid.sample_sequences - INFO - Participant BFI-0002877 specimen M114-S036 is empty
2023-01-14 03:37:19,143 - malid.sample_sequences - INFO - Participant BFI-0003461 specimen M114-S008 is empty


2023-01-14 03:37:22,178 - malid.sample_sequences - INFO - Removing BFI-0003464 specimen M111-S014 because it did not have enough clones. Clone count by isotype: {'IGHG': 3, 'IGHA': 9295, 'IGHD-M': 28806, 'TCRB': 57692}


2023-01-14 03:37:26,123 - malid.sample_sequences - INFO - Participant BFI-0003464 specimen M114-S014 is empty


2023-01-14 03:37:27,964 - malid.sample_sequences - INFO - Participant BFI-0003469 specimen M114-S007 is empty


2023-01-14 03:37:28,407 - malid.sample_sequences - INFO - Participant BFI-0002855 specimen M114-S016 is empty


2023-01-14 03:37:28,817 - malid.sample_sequences - INFO - Participant BFI-0003467 specimen M114-S053 is empty


2023-01-14 03:37:29,480 - malid.sample_sequences - INFO - Participant BFI-0003478 specimen M114-S017 is empty


2023-01-14 03:37:31,265 - malid.sample_sequences - INFO - Participant BFI-0003473 specimen M114-S021 is empty


2023-01-14 03:37:34,794 - malid.sample_sequences - INFO - Participant BFI-0003474 specimen M114-S046 is empty


2023-01-14 03:37:43,639 - malid.sample_sequences - INFO - Participant BFI-0003476 specimen M114-S047 is empty


2023-01-14 03:37:45,647 - malid.sample_sequences - INFO - Removing BFI-0003056 specimen M64-007 because it did not have enough clones. Clone count by isotype: {'IGHG': 44, 'IGHA': 60, 'IGHD-M': 70, 'TCRB': 20312}


2023-01-14 03:37:48,756 - malid.sample_sequences - INFO - Participant BFI-0003477 specimen M114-S050 is empty


2023-01-14 03:37:59,345 - malid.sample_sequences - INFO - Removing BFI-0010011 specimen M454-S012 because it did not have enough clones. Clone count by isotype: {'IGHG': 8, 'IGHA': 33, 'IGHD-M': 2110, 'TCRB': 16221}
2023-01-14 03:37:59,502 - malid.sample_sequences - INFO - Participant BFI-0002867 specimen M132-S043 is empty


2023-01-14 03:38:02,055 - malid.sample_sequences - INFO - Removing BFI-0010015 specimen M454-S016 because it did not have enough clones. Clone count by isotype: {'IGHG': 3, 'IGHA': 5, 'IGHD-M': 4, 'TCRB': 1918}


2023-01-14 03:38:02,397 - malid.sample_sequences - INFO - Participant BFI-0003705 specimen M132-S006 is empty


2023-01-14 03:38:10,850 - malid.sample_sequences - INFO - Removing BFI-0009052 specimen M418-S104 because it did not have enough clones. Clone count by isotype: {'IGHG': 177, 'IGHA': 56, 'IGHD-M': 889, 'TCRB': 2407}


2023-01-14 03:38:14,531 - malid.sample_sequences - INFO - Removing BFI-0003050 specimen M64-001 because it did not have enough clones. Clone count by isotype: {'IGHG': 22, 'IGHA': 41, 'IGHD-M': 90, 'TCRB': 29241}


2023-01-14 03:38:19,047 - malid.sample_sequences - INFO - Participant BFI-0002879 specimen M114-S039 is empty


2023-01-14 03:38:48,711 - malid.sample_sequences - INFO - Participant BFI-0003485 specimen M114-S051 is empty


2023-01-14 03:39:01,090 - malid.sample_sequences - INFO - Participant BFI-0003486 specimen M114-S052 is empty


2023-01-14 03:39:03,975 - malid.sample_sequences - INFO - Participant BFI-0002857 specimen M114-S018 is empty


2023-01-14 03:39:04,490 - malid.sample_sequences - INFO - Participant BFI-0003700 specimen M132-S001 is empty
2023-01-14 03:39:04,646 - malid.sample_sequences - INFO - Removing BFI-0009023 specimen M418-S217 because it did not have enough clones. Clone count by isotype: {'IGHG': 25, 'IGHA': 37, 'IGHD-M': 395, 'TCRB': 1549}


2023-01-14 03:39:06,669 - malid.sample_sequences - INFO - Removing BFI-0009038 specimen M418-S234 because it did not have enough clones. Clone count by isotype: {'IGHG': 80, 'IGHA': 209, 'IGHD-M': 2827, 'TCRB': 4809}


2023-01-14 03:39:10,681 - malid.sample_sequences - INFO - Participant BFI-0003703 specimen M132-S004 is empty
2023-01-14 03:39:10,803 - malid.sample_sequences - INFO - Participant BFI-0003702 specimen M132-S003 is empty


2023-01-14 03:39:10,948 - malid.sample_sequences - INFO - Removing BFI-0009155 specimen M418-S040 because it did not have enough clones. Clone count by isotype: {'IGHG': 2458, 'IGHA': 49, 'IGHD-M': 9087, 'TCRB': 9379}


2023-01-14 03:39:12,794 - malid.sample_sequences - INFO - Participant BFI-0003774 specimen M132-S055 is empty


2023-01-14 03:39:18,987 - malid.sample_sequences - INFO - Removing BFI-0010040 specimen M454-S046 because it did not have enough clones. Clone count by isotype: {'IGHG': 56, 'IGHA': 327, 'IGHD-M': 2021, 'TCRB': 25048}


2023-01-14 03:39:21,542 - malid.sample_sequences - INFO - Removing BFI-0010041 specimen M454-S048 because it did not have enough clones. Clone count by isotype: {'IGHG': 5, 'IGHA': 6, 'IGHD-M': 15, 'TCRB': 9716}


2023-01-14 03:39:40,342 - malid.sample_sequences - INFO - Removing BFI-0009031 specimen M418-S225 because it did not have enough clones. Clone count by isotype: {'IGHG': 182, 'IGHA': 28, 'IGHD-M': 10670, 'TCRB': 17170}


2023-01-14 03:39:46,984 - malid.sample_sequences - INFO - Removing BFI-0010003 specimen M454-S004 because it did not have enough clones. Clone count by isotype: {'IGHG': 14, 'IGHA': 86, 'IGHD-M': 116, 'TCRB': 16896}


2023-01-14 03:40:17,540 - malid.sample_sequences - INFO - Removing BFI-0010004 specimen M454-S005 because it did not have enough clones. Clone count by isotype: {'IGHG': 44, 'IGHA': 70, 'IGHD-M': 419, 'TCRB': 10817}


2023-01-14 03:40:18,223 - malid.sample_sequences - INFO - Removing BFI-0009165 specimen M418-S050 because it did not have enough clones. Clone count by isotype: {'IGHG': 1.0, 'IGHA': 2.0, 'IGHD-M': 0.0, 'TCRB': 6.0}


2023-01-14 03:40:20,409 - malid.sample_sequences - INFO - Removing BFI-0010006 specimen M454-S007 because it did not have enough clones. Clone count by isotype: {'IGHG': 126, 'IGHA': 94, 'IGHD-M': 1867, 'TCRB': 6124}


2023-01-14 03:40:23,636 - malid.sample_sequences - INFO - Removing BFI-0010008 specimen M454-S009 because it did not have enough clones. Clone count by isotype: {'IGHG': 30, 'IGHA': 116, 'IGHD-M': 1329, 'TCRB': 20634}


2023-01-14 03:40:53,368 - malid.sample_sequences - INFO - Removing BFI-0010005 specimen M454-S006 because it did not have enough clones. Clone count by isotype: {'IGHG': 3, 'IGHA': 1, 'IGHD-M': 14, 'TCRB': 5529}


2023-01-14 03:40:58,737 - malid.sample_sequences - INFO - Removing BFI-0010007 specimen M454-S008 because it did not have enough clones. Clone count by isotype: {'IGHG': 62, 'IGHA': 145, 'IGHD-M': 1223, 'TCRB': 9414}


2023-01-14 03:41:03,274 - malid.sample_sequences - INFO - Removing BFI-0010009 specimen M454-S010 because it did not have enough clones. Clone count by isotype: {'IGHG': 56, 'IGHA': 867, 'IGHD-M': 626, 'TCRB': 8218}


2023-01-14 03:41:07,726 - malid.sample_sequences - INFO - Removing BFI-0003060 specimen M64-011 because it did not have enough clones. Clone count by isotype: {'IGHG': 98, 'IGHA': 98, 'IGHD-M': 12493, 'TCRB': 13412}


2023-01-14 03:41:10,630 - malid.sample_sequences - INFO - Removing BFI-0010043 specimen M454-S051 because it did not have enough clones. Clone count by isotype: {'IGHG': 79, 'IGHA': 215, 'IGHD-M': 15305, 'TCRB': 22638}


2023-01-14 03:41:13,865 - malid.sample_sequences - INFO - Removing BFI-0010030 specimen M454-S034 because it did not have enough clones. Clone count by isotype: {'IGHG': 1, 'IGHA': 7, 'IGHD-M': 68, 'TCRB': 6689}


2023-01-14 03:41:19,850 - malid.sample_sequences - INFO - Removing BFI-0010034 specimen M454-S047 because it did not have enough clones. Clone count by isotype: {'IGHG': 168, 'IGHA': 647, 'IGHD-M': 206, 'TCRB': 29165}


2023-01-14 03:41:21,248 - malid.sample_sequences - INFO - Removing BFI-0010036 specimen M454-S042 because it did not have enough clones. Clone count by isotype: {'IGHG': 78, 'IGHA': 5, 'IGHD-M': 14985, 'TCRB': 95832}


2023-01-14 03:41:36,881 - malid.sample_sequences - INFO - Removing BFI-0010033 specimen M454-S038 because it did not have enough clones. Clone count by isotype: {'IGHG': 0.0, 'IGHA': 0.0, 'IGHD-M': 0.0, 'TCRB': 44348.0}


2023-01-14 03:41:45,023 - malid.sample_sequences - INFO - Removing BFI-0010033 specimen M454-S039 because it did not have enough clones. Clone count by isotype: {'IGHG': 0.0, 'IGHA': 0.0, 'IGHD-M': 0.0, 'TCRB': 5696.0}


2023-01-14 03:42:00,894 - malid.sample_sequences - INFO - Removing BFI-0010032 specimen M454-S036 because it did not have enough clones. Clone count by isotype: {'IGHG': 23, 'IGHA': 43, 'IGHD-M': 436, 'TCRB': 6785}


2023-01-14 03:42:47,356 - malid.sample_sequences - INFO - Removing BFI-0010046 specimen M454-S055 because it did not have enough clones. Clone count by isotype: {'IGHG': 19, 'IGHA': 40, 'IGHD-M': 603, 'TCRB': 9232}


2023-01-14 03:42:57,365 - malid.sample_sequences - INFO - Removing BFI-0010202 specimen M464-S003 because it did not have enough clones. Clone count by isotype: {'IGHG': 83, 'IGHA': 413, 'IGHD-M': 6309, 'TCRB': 7646}


2023-01-14 03:42:59,964 - malid.sample_sequences - INFO - Removing BFI-0010033 specimen M454-S037 because it did not have enough clones. Clone count by isotype: {'IGHG': 0.0, 'IGHA': 0.0, 'IGHD-M': 0.0, 'TCRB': 9008.0}


2023-01-14 03:43:00,786 - malid.sample_sequences - INFO - Participant BFI-0000256 specimen M114-S038 is empty


2023-01-14 03:43:01,458 - malid.sample_sequences - INFO - Removing BFI-0010034 specimen M454-S040 because it did not have enough clones. Clone count by isotype: {'IGHG': 0.0, 'IGHA': 0.0, 'IGHD-M': 0.0, 'TCRB': 39287.0}


2023-01-14 03:43:15,262 - malid.sample_sequences - INFO - Removing BFI-0003064 specimen M64-015 because it did not have enough clones. Clone count by isotype: {'IGHG': 138, 'IGHA': 86, 'IGHD-M': 955, 'TCRB': 9642}


2023-01-14 03:43:56,624 - malid.sample_sequences - INFO - Removing BFI-0010242 specimen M464-S043 because it did not have enough clones. Clone count by isotype: {'IGHG': 15, 'IGHA': 15, 'IGHD-M': 13, 'TCRB': 42127}


2023-01-14 03:44:02,102 - malid.sample_sequences - INFO - Removing BFI-0010237 specimen M464-S038 because it did not have enough clones. Clone count by isotype: {'IGHG': 14, 'IGHA': 44, 'IGHD-M': 95, 'TCRB': 1194}


2023-01-14 03:44:06,138 - malid.sample_sequences - INFO - Removing BFI-0003072 specimen M64-023 because it did not have enough clones. Clone count by isotype: {'IGHG': 135, 'IGHA': 177, 'IGHD-M': 235, 'TCRB': 8555}


2023-01-14 03:44:26,583 - malid.sample_sequences - INFO - Removing BFI-0010035 specimen M454-S041 because it did not have enough clones. Clone count by isotype: {'IGHG': 0.0, 'IGHA': 0.0, 'IGHD-M': 0.0, 'TCRB': 48478.0}


2023-01-14 03:45:20,821 - malid.sample_sequences - INFO - Removing BFI-0003069 specimen M64-020 because it did not have enough clones. Clone count by isotype: {'IGHG': 42, 'IGHA': 1419, 'IGHD-M': 24388, 'TCRB': 102040}


2023-01-14 03:45:23,044 - malid.sample_sequences - INFO - Removing BFI-0003073 specimen M64-024 because it did not have enough clones. Clone count by isotype: {'IGHG': 24, 'IGHA': 24, 'IGHD-M': 53, 'TCRB': 1406}
2023-01-14 03:45:23,049 - malid.sample_sequences - INFO - Removing BFI-0003074 specimen M64-025 because it did not have enough clones. Clone count by isotype: {'IGHG': 33, 'IGHA': 17, 'IGHD-M': 65, 'TCRB': 1003}


2023-01-14 03:45:32,435 - malid.sample_sequences - INFO - Removing BFI-0003094 specimen M64-045 because it did not have enough clones. Clone count by isotype: {'IGHG': 3808, 'IGHA': 25, 'IGHD-M': 99002, 'TCRB': 77262}


2023-01-14 03:45:54,488 - malid.sample_sequences - INFO - Participant BFI-0000258 specimen M114-S055 is empty


1149.2107574939728


In [13]:
# Display the per-column dtypes of df_sampled (bare last expression renders as the cell output).
df_sampled.dtypes

specimen_label               category
participant_label            category
v_gene                       category
j_gene                       category
disease                      category
disease_subtype              category
cdr1_seq_aa_q_trim             object
cdr2_seq_aa_q_trim             object
cdr3_seq_aa_q_trim             object
cdr3_aa_sequence_trim_len       int64
extracted_isotype            category
isotype_supergroup           category
v_mut                         float64
num_reads                       int64
igh_or_tcrb_clone_id            int64
total_clone_num_reads           int64
num_clone_members               int64
dtype: object

In [14]:
# Read the parquet dataset at config.paths.sequences_sampled back in as a Dask dataframe (pyarrow engine).
# NOTE(review): presumably this is the output written by the sampling step earlier in the notebook — confirm.
df_sampled2 = dd.read_parquet(config.paths.sequences_sampled, engine="pyarrow")
# Bare expression: render the reloaded dataframe's structure as the cell output.
df_sampled2

Unnamed: 0_level_0,v_gene,j_gene,disease,disease_subtype,cdr1_seq_aa_q_trim,cdr2_seq_aa_q_trim,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len,extracted_isotype,isotype_supergroup,v_mut,num_reads,igh_or_tcrb_clone_id,total_clone_num_reads,num_clone_members,participant_label,specimen_label
npartitions=522,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
,category[unknown],category[unknown],category[unknown],category[unknown],object,object,object,int64,category[unknown],category[unknown],float64,int64,int64,int64,int64,category[known],category[known]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [15]:
# Check the dtypes of the reloaded dataframe, for comparison with df_sampled.dtypes.
# In the recorded outputs the dtypes match, but participant_label and specimen_label
# appear last here instead of first.
# NOTE(review): presumably because they are restored as partition keys on read — confirm.
df_sampled2.dtypes

v_gene                       category
j_gene                       category
disease                      category
disease_subtype              category
cdr1_seq_aa_q_trim             object
cdr2_seq_aa_q_trim             object
cdr3_seq_aa_q_trim             object
cdr3_aa_sequence_trim_len       int64
extracted_isotype            category
isotype_supergroup           category
v_mut                         float64
num_reads                       int64
igh_or_tcrb_clone_id            int64
total_clone_num_reads           int64
num_clone_members               int64
participant_label            category
specimen_label               category
dtype: object

In [16]:
# Compare partition counts of df, df_sampled, and the reloaded df_sampled2.
# The reloaded count is expected to be lower because some specimens are lost along the way:
# the sampling log reports specimens that are empty.
# NOTE(review): the log also reports specimens removed for too few clones; presumably
# those also leave no partition on disk — confirm.
df.npartitions, df_sampled.npartitions, df_sampled2.npartitions

(717, 717, 522)

In [17]:
# Shut down the Dask client created at the top of the notebook now that the checks are done.
# NOTE(review): per the distributed docs this also stops the connected scheduler and workers — confirm.
client.shutdown()