## Notebook to create an anndata object with the demuxlet identified samples mappings per sample pool via papermill

In [1]:
!date

Tue Oct  5 17:27:11 EDT 2021


#### import libraries and set notebook variables

In [2]:
import os
import papermill as pm
import ray

  from pyarrow import HadoopFileSystem


In [3]:
# naming
cohort = 'aging'

# directories
home_dir = f'/labshare/raph/notebooks/expression/adrd_neuro/{cohort}'

# base notebook to be run per tuple iteration
base_notebook = f'{home_dir}/glmmtmb_diffexp.ipynb'

# output path for the generated notebooks
out_nb_dir = f'{home_dir}/pm_gend_nbs'

# setup parameters to iterate
brain_regions = ['Entorhinal cortex', 'Putamen', 'Subventricular zone', 
                 'Middle temporal gyrus']
cell_types = ['Oligodendrocyte-1', 'SPN D1', 'SPN D2', 'Oligodendrocyte-2', 
              'Astrocyte', 'ExN CUX2 LAMP5', 'InN ADARB2 VIP', 'ExN FEZF2', 
              'OPC', 'ExN RORB THEMIS', 'InN LHX6 PVALB', 'Radial Glia', 
              'Microglia', 'InN ADARB2 LAMP5', 'ExN CUX2 ADARB2', 
              'InN LHX6 SST', 'SPN D1-2', 'Endothelial', 'ExN RORB', 
              'ExN LAMP5', 'Astrocyte-GFAP-Hi', 'SPN D2-2', 'ExN THEMIS']
groupings = {'Brain_region': brain_regions, 'new_anno': cell_types}

#### utility functions

In [4]:
@ray.remote
def run_pm_notebook(base_notebook: str, out_notebook: str, params: dict) -> str:
    ret_val = f'notebook: {out_notebook}\nparams: {params}'
    pm.execute_notebook(input_path=base_notebook, output_path=out_notebook, 
                        parameters=params, progress_bar=False)
    #running via notebook keep blowing up, even though actually runs
#     param_str = f'-p region_celltype "{params.get("region_celltype")}" -p obs_type {params.get("obs_type")} -p testing {params.get("testing")}'
#     this_cmd = f'papermill --no-progress-bar {base_notebook} {out_notebook} -p {param_str}'
# #     !{this_cmd}
    return ret_val

#### startup ray for parallelizing calls

In [5]:
ray.shutdown()
ray.init(log_to_driver=False)



{'node_ip_address': '156.40.49.26',
 'raylet_ip_address': '156.40.49.26',
 'redis_address': '156.40.49.26:6379',
 'object_store_address': '/tmp/ray/session_2021-10-05_17-27-11_996809_55844/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-10-05_17-27-11_996809_55844/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-10-05_17-27-11_996809_55844',
 'metrics_export_port': 50486,
 'node_id': '4a8b1c923cd0cf703ab9a59692b24b4483f7ac7b28f65f5678f34db4'}

#### check the notebook template

In [6]:
pm.inspect_notebook(base_notebook)

{'region_celltype': {'name': 'region_celltype',
  'inferred_type_name': 'None',
  'default': "''",
  'help': ''},
 'obs_type': {'name': 'obs_type',
  'inferred_type_name': 'None',
  'default': "''",
  'help': ''},
 'testing': {'name': 'testing',
  'inferred_type_name': 'None',
  'default': 'False',
  'help': ''}}

#### iterate over the list running the notebook per sample pool

In [7]:
%%time
# make sure the notebook output dir exists
os.makedirs(out_nb_dir, exist_ok=True)

futures = []
for g_type, groups in groupings.items():
    for grouping in groups:
        param_dict = {'region_celltype': grouping, 'obs_type': g_type, 'testing': True}
        out_notebook = f'{out_nb_dir}/{grouping.replace(" ", "_")}.glmmtmb_diffexp.ipynb'
        futures.append(run_pm_notebook.remote(base_notebook, out_notebook, param_dict))
#         futures.append(run_pm_notebook(base_notebook, out_notebook, param_dict))
results = ray.get(futures)        

2021-10-05 17:28:24,239	ERROR serialization.py:256 -- __init__() missing 5 required positional arguments: 'exec_count', 'source', 'ename', 'evalue', and 'traceback'
Traceback (most recent call last):
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 254, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 213, in _deserialize_object
    return RayError.from_bytes(obj)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ray/exceptions.py", line 28, in from_bytes
    return pickle.loads(ray_exception.serialized_exception)
TypeError: __init__() missing 5 required positional arguments: 'exec_count', 'source', 'ename', 'evalue', and 'traceback'


RaySystemError: System error: __init__() missing 5 required positional arguments: 'exec_count', 'source', 'ename', 'evalue', and 'traceback'
traceback: Traceback (most recent call last):
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 254, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ray/serialization.py", line 213, in _deserialize_object
    return RayError.from_bytes(obj)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ray/exceptions.py", line 28, in from_bytes
    return pickle.loads(ray_exception.serialized_exception)
TypeError: __init__() missing 5 required positional arguments: 'exec_count', 'source', 'ename', 'evalue', and 'traceback'


In [8]:
print(results)

NameError: name 'results' is not defined

#### shutdown ray

In [None]:
ray.shutdown()