Notebook to run the GLM pseudo-bulk analysis per brain region and cell type with papermill.
Originally coded to run in parallel out of a single notebook, but there was always a problem passing an anndata object to ray or concurrent futures.

So here it is run via papermill, where the notebook for each grouping is run in parallel using the threading library. I have also previously tried ray and concurrent futures; with all of them the papermill managing notebook blows up, but the individual notebooks keep running in parallel through successful completion.

In [1]:
# print the system date/time before any work is done (start-of-run timestamp)
!date

Mon Nov  8 12:28:07 EST 2021


#### import libraries and set notebook variables

In [2]:
import os
import papermill as pm
import threading

import warnings
# ignore every warning from this point on in the kernel session
# NOTE(review): this runs after the papermill import above, so warnings raised
# while importing are presumably still shown — confirm whether that matters
warnings.simplefilter('ignore')

  from pyarrow import HadoopFileSystem


In [3]:
# ---- run configuration --------------------------------------------------

# cohort name; it is the last component of the project directory below
cohort = 'aging'

# project directory that holds the template notebook
home_dir = f'/labshare/raph/notebooks/expression/adrd_neuro/{cohort}'

# template notebook that is executed once per grouping value
base_notebook = f'{home_dir}/glm_pb_diffexp.ipynb'

# directory that receives the papermill-generated notebooks
out_nb_dir = f'{home_dir}/pm_gend_nbs'

# values to iterate over; list order is the order the runs are launched in
brain_regions = [
    'Entorhinal cortex',
    'Putamen',
    'Subventricular zone',
    'Middle temporal gyrus',
]
cell_types = [
    'Oligodendrocyte-1',
    'SPN D1',
    'SPN D2',
    'Oligodendrocyte-2',
    'Astrocyte',
    'ExN CUX2 LAMP5',
    'InN ADARB2 VIP',
    'ExN FEZF2',
    'OPC',
    'ExN RORB THEMIS',
    'InN LHX6 PVALB',
    'Radial Glia',
    'Microglia',
    'InN ADARB2 LAMP5',
    'ExN CUX2 ADARB2',
    'InN LHX6 SST',
    'SPN D1-2',
    'Endothelial',
    'ExN RORB',
    'ExN LAMP5',
    'SPN D2-2',
    'ExN THEMIS',
]

# grouping name -> values; the name is passed to the template as 'tissue_type'
# and each value as 'tissue'
groupings = dict(Brain_region=brain_regions, new_anno=cell_types)

# forwarded unchanged to the template notebook's 'testing' parameter
testing = False

#### utility functions

In [4]:
def run_pm_notebook(base_notebook: str, out_notebook: str, params: dict) -> str:
    """Execute the template notebook with papermill and describe the run.

    Args:
        base_notebook: path given to papermill as the input notebook.
        out_notebook: path given to papermill for the output notebook.
        params: parameters handed to papermill for this run.

    Returns:
        A two-line string naming the output notebook and the parameters.
    """
    # the description is built from the arguments before the run starts
    summary = 'notebook: {}\nparams: {}'.format(out_notebook, params)
    pm_kwargs = {
        'input_path': base_notebook,
        'output_path': out_notebook,
        'parameters': params,
        'progress_bar': False,
    }
    pm.execute_notebook(**pm_kwargs)
    return summary

#### check the notebook template

In [5]:
# show the parameters papermill finds in the template notebook, with their
# defaults, so they can be compared with the keys passed in the run loop
pm.inspect_notebook(base_notebook)

{'tissue': {'name': 'tissue',
  'inferred_type_name': 'None',
  'default': "''",
  'help': ''},
 'tissue_type': {'name': 'tissue_type',
  'inferred_type_name': 'None',
  'default': "''",
  'help': ''},
 'testing': {'name': 'testing',
  'inferred_type_name': 'None',
  'default': 'False',
  'help': ''}}

#### iterate over the groupings, running the notebook per brain region and cell type

In [6]:
%%time
# Launch one papermill run of the template notebook per brain region and per
# cell type, each in its own thread, then wait for all of them and report any
# run that raised.

# make sure the notebook output dir exists
os.makedirs(out_nb_dir, exist_ok=True)

# (output notebook, exception) for every run that raised; filled by the workers
failed_runs = []
failed_runs_lock = threading.Lock()


def _run_and_record(in_nb: str, out_nb: str, nb_params: dict) -> None:
    """Run one notebook; note a failure for the summary below, then re-raise."""
    try:
        run_pm_notebook(in_nb, out_nb, nb_params)
    except Exception as error:
        with failed_runs_lock:
            failed_runs.append((out_nb, error))
        # re-raise so the thread still reports its traceback as it did before
        raise


# NOTE(review): every run is started at once. The saved output of this cell
# shows a kernel failing in ipykernel's init_sockets/_bind_socket during
# startup, which may be related to launching them all together — confirm
# before deciding whether to cap or stagger the launches.
job_threads = []
for g_type, groups in groupings.items():
    for grouping in groups:
        param_dict = {'tissue': grouping, 'tissue_type': g_type, 'testing': testing}
        out_notebook = f'{out_nb_dir}/{grouping.replace(" ", "_")}.glm_pb_diffexp.ipynb'
        print(f'notebook: {out_notebook}\nparams: {param_dict}')
        this_thread = threading.Thread(target=_run_and_record,
                                       args=(base_notebook, out_notebook, param_dict,))
        job_threads.append(this_thread)
        this_thread.start()

for job_thread in job_threads:
    job_thread.join()

# a worker's exception does not propagate to this cell, so without this summary
# a failed run is easy to miss among the interleaved output
if failed_runs:
    print(f'{len(failed_runs)} of {len(job_threads)} notebook runs failed:')
    for failed_nb, error in failed_runs:
        print(f'  {failed_nb}: {type(error).__name__}')
else:
    print(f'all {len(job_threads)} notebook runs finished without raising')

notebook: /labshare/raph/notebooks/expression/adrd_neuro/aging/pm_gend_nbs/Entorhinal_cortex.glm_pb_diffexp.ipynb
params: {'tissue': 'Entorhinal cortex', 'tissue_type': 'Brain_region', 'testing': False}
notebook: /labshare/raph/notebooks/expression/adrd_neuro/aging/pm_gend_nbs/Putamen.glm_pb_diffexp.ipynb
params: {'tissue': 'Putamen', 'tissue_type': 'Brain_region', 'testing': False}
notebook: /labshare/raph/notebooks/expression/adrd_neuro/aging/pm_gend_nbs/Subventricular_zone.glm_pb_diffexp.ipynb
params: {'tissue': 'Subventricular zone', 'tissue_type': 'Brain_region', 'testing': False}
notebook: /labshare/raph/notebooks/expression/adrd_neuro/aging/pm_gend_nbs/Middle_temporal_gyrus.glm_pb_diffexp.ipynb
params: {'tissue': 'Middle temporal gyrus', 'tissue_type': 'Brain_region', 'testing': False}
notebook: /labshare/raph/notebooks/expression/adrd_neuro/aging/pm_gend_nbs/Oligodendrocyte-1.glm_pb_diffexp.ipynb
params: {'tissue': 'Oligodendrocyte-1', 'tissue_type': 'new_anno', 'testing': Fals

Traceback (most recent call last):
  File "/home/gibbsr/anaconda3/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/gibbsr/anaconda3/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
    app.initialize(argv)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/traitlets/config/application.py", line 88, in inner
    return method(app, *args, **kwargs)
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 632, in initialize
    self.init_sockets()
  File "/home/gibbsr/anaconda3/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 287, in init_sockets
    self.stdin_port = self._bind_socket(self.stdi

CPU times: user 1min 9s, sys: 8.36 s, total: 1min 18s
Wall time: 24min 34s


In [7]:
# print the system date/time after the run loop has finished (end-of-run timestamp)
!date

Mon Nov  8 12:52:42 EST 2021
