# Get all V genes and decide on an order for them. Also get all J genes.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import genetools
import seaborn as sns

# Apply seaborn's "dark" plotting style for the rest of the notebook.
sns.set_style("dark")

In [2]:
import pandas as pd

In [3]:
import dask
import dask.dataframe as dd

In [4]:
import os

In [5]:
from malid import config, helpers

Raise worker connection timeouts (see other dask notebooks):

In [6]:
import distributed

# In-process dask configuration: allow more retries and longer timeouts on
# worker connections. These only seem to be picked up by the scheduler, not by
# individual workers.
dask.config.set(
    {
        # number of times a failed comm is retried
        "distributed.comm.retry.count": 5,
        # connect / tcp timeouts
        "distributed.comm.timeouts.connect": "120s",
        "distributed.comm.timeouts.tcp": "120s",
    }
)

<dask.config.set at 0x7fccd3a1d8e0>

In [7]:
# These will be picked up by individual workers: the same comm retry/timeout
# settings as above are written to the user-level dask config file.
# NOTE: this overwrites any existing $HOME/.config/dask/distributed.yaml.
dask_worker_config_path = os.path.expandvars("$HOME/.config/dask/distributed.yaml")

# The config directory may not exist yet (e.g. on a fresh machine), in which
# case open(..., "w") would raise FileNotFoundError. Create it first.
os.makedirs(os.path.dirname(dask_worker_config_path), exist_ok=True)

with open(dask_worker_config_path, "w") as w:
    w.write(
        """distributed:
  comm:
    retry:
      count: 5
    timeouts:
      connect: 120s          # time before connecting fails
      tcp: 120s              # time before calling an unresponsive connection dead
    """
    )

In [8]:
from dask.distributed import Client

# Start a local dask cluster using the multi-processing backend.
# The dashboard is available at http://127.0.0.1:61083
# If a cluster was already opened from another notebook, see
# https://stackoverflow.com/questions/60115736/dask-how-to-connect-to-running-cluster-scheduler-and-access-total-occupancy
client = Client(
    # worker layout: 7 processes x 8 threads each
    n_workers=7,
    threads_per_worker=8,
    processes=True,
    memory_limit="auto",
    # fixed ports for the scheduler and its dashboard
    scheduler_port=61084,
    dashboard_address=":61083",
    # start worker dashboards on random ports
    worker_dashboard_address=":0",
)
display(client)
# for debugging: client.restart()

  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:61083/status,

0,1
Dashboard: http://127.0.0.1:61083/status,Workers: 7
Total threads: 56,Total memory: 1.15 TiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61084,Workers: 7
Dashboard: http://127.0.0.1:61083/status,Total threads: 56
Started: Just now,Total memory: 1.15 TiB

0,1
Comm: tcp://127.0.0.1:33505,Total threads: 8
Dashboard: http://127.0.0.1:34379/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:45879,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-qf0rgxyd,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-qf0rgxyd
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:37027,Total threads: 8
Dashboard: http://127.0.0.1:45251/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:32939,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-r87ns2_s,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-r87ns2_s
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:40061,Total threads: 8
Dashboard: http://127.0.0.1:43227/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:39829,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-foumrrjc,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-foumrrjc
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:34305,Total threads: 8
Dashboard: http://127.0.0.1:41467/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:36745,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-834uwnqz,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-834uwnqz
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:34465,Total threads: 8
Dashboard: http://127.0.0.1:41031/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:46229,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-vq1lofz8,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-vq1lofz8
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:40375,Total threads: 8
Dashboard: http://127.0.0.1:45573/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:46083,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-38_a3yfk,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-38_a3yfk
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB

0,1
Comm: tcp://127.0.0.1:37287,Total threads: 8
Dashboard: http://127.0.0.1:42161/status,Memory: 167.92 GiB
Nanny: tcp://127.0.0.1:34377,
Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-9ltd48ea,Local directory: /users/maximz/code/boyd-immune-repertoire-classification/notebooks/dask-worker-space/worker-9ltd48ea
GPU: NVIDIA A100 80GB PCIe,GPU memory: 80.00 GiB


In [9]:
# Only these columns are requested when reading the sequences parquet dataset.
desired_cols = ["v_gene", "j_gene", "isotype_supergroup"]

In [10]:
# Optional row filters forwarded to dd.read_parquet(filters=...).
# None means no filter is passed. For a quick debugging run, switch to the
# commented-out line below to restrict loading to a single participant_label.
debug_filters = None
# debug_filters = [("participant_label", "==", "BFI-0007450")]

In [11]:
# Read with the pyarrow engine rather than fastparquet: fastparquet coerces
# partition names to numbers or dates, e.g. it changes specimen labels like
# M54-049 to 2049-01-01 00:00:54.
df = dd.read_parquet(
    config.paths.sequences,
    engine="pyarrow",
    filters=debug_filters,
    columns=desired_cols,
)

In [12]:
# Display the dask dataframe's structure (column dtypes and partition count).
df

Unnamed: 0_level_0,v_gene,j_gene,isotype_supergroup
npartitions=717,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,category[unknown],category[unknown],category[unknown]
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [13]:
def _unique_genes_per_partition(gene_column, isotype_groups):
    """Build a function suitable for ``df.map_partitions``.

    The returned function takes one partition (a pandas DataFrame) and returns
    the set of distinct ``gene_column`` values among the rows whose
    ``isotype_supergroup`` is in ``isotype_groups``.
    """

    def collect(partdf):
        keep = partdf["isotype_supergroup"].isin(isotype_groups)
        return set(partdf[keep][gene_column].unique())

    return collect


def _merge_and_sort(per_partition_sets, name):
    """Union the per-partition sets into a single sorted Series called ``name``."""
    # set().union(*...) rather than set.union(*...): identical when there is at
    # least one set, but returns an empty set instead of raising TypeError when
    # there are no per-partition results at all.
    merged = set().union(*per_partition_sets.values)
    return pd.Series(list(merged), name=name).sort_values()


# For each gene locus, collect the V genes and J genes observed in the isotype
# groups kept for that locus, sort them, print them, and write one file per
# locus and gene type under config.paths.dataset_specific_metadata.
for gene_locus, isotype_groups in helpers.isotype_groups_kept.items():
    # Not sure why this doesn't work:
    # v_genes = df.loc[df["isotype_supergroup"].compute().isin(isotype_groups)]["v_gene"].unique().compute().sort_values()
    # NOTE(review): possibly because the mask is .compute()'d into a pandas
    # Series before being used to index the dask dataframe -- unverified.

    # Instead, here's a manual version using map_partitions (still lazy here):
    v_gene_unique_lists = df.map_partitions(
        _unique_genes_per_partition("v_gene", isotype_groups)
    )
    j_gene_unique_lists = df.map_partitions(
        _unique_genes_per_partition("j_gene", isotype_groups)
    )

    # compute both in a single dask.compute call
    v_gene_unique_lists, j_gene_unique_lists = dask.compute(
        v_gene_unique_lists, j_gene_unique_lists
    )

    # extract: merge the per-partition sets and sort
    v_genes = _merge_and_sort(v_gene_unique_lists, name="v_gene")
    j_genes = _merge_and_sort(j_gene_unique_lists, name="j_gene")
    print(gene_locus, v_genes)
    print(gene_locus, j_genes)

    # index=False: write only the gene names (plus the header), not the index.
    v_genes.to_csv(
        config.paths.dataset_specific_metadata
        / f"all_v_genes.in_order.{gene_locus.name}.txt",
        index=False,
    )
    j_genes.to_csv(
        config.paths.dataset_specific_metadata
        / f"all_j_genes.in_order.{gene_locus.name}.txt",
        index=False,
    )

GeneLocus.BCR 11    IGHV1-18
68     IGHV1-2
34    IGHV1-24
42     IGHV1-3
16    IGHV1-45
        ...   
38     VH1-67P
25     VH3-41P
49     VH3-60P
20     VH3-65P
75     VH7-27P
Name: v_gene, Length: 82, dtype: object
GeneLocus.BCR 4    IGHJ1
0    IGHJ2
1    IGHJ3
3    IGHJ4
2    IGHJ5
5    IGHJ6
Name: j_gene, dtype: object


GeneLocus.TCR 11        TRBV10-1
35        TRBV10-2
44        TRBV10-3
24        TRBV11-1
48        TRBV11-2
4         TRBV11-3
1         TRBV12-2
37        TRBV12-3
56        TRBV12-4
25        TRBV12-5
8           TRBV13
49          TRBV14
5           TRBV15
27          TRBV16
36          TRBV17
18          TRBV18
53          TRBV19
15           TRBV2
6         TRBV20-1
45    TRBV20/OR9-2
22        TRBV24-1
54        TRBV25-1
26          TRBV26
33          TRBV27
19          TRBV28
30        TRBV29-1
43    TRBV29/OR9-2
13         TRBV3-1
47         TRBV3-2
42          TRBV30
40         TRBV4-1
9          TRBV4-2
41         TRBV4-3
34         TRBV5-1
46         TRBV5-3
39         TRBV5-4
17         TRBV5-5
32         TRBV5-6
10         TRBV5-7
38         TRBV5-8
50         TRBV6-1
29         TRBV6-2
3          TRBV6-4
14         TRBV6-5
51         TRBV6-6
0          TRBV6-7
2          TRBV6-8
31         TRBV6-9
12         TRBV7-1
55         TRBV7-2
52         TRBV7-3
28         TRBV7-

In [14]:
# Done with the gene lists: shut down the dask client created earlier in this notebook.
client.shutdown()