## Notebook to run CellBender using WDL

based on [CellBender WDL](https://github.com/broadinstitute/CellBender/tree/e2fb5977cb187cb4b12172c9f77ed556bca92cb0/wdl)

[Broad wdl runner](https://github.com/broadinstitute/wdl-runner)

In [None]:
# print the current date/time via the shell so the run is timestamped
!date

#### import libraries

In [None]:
import json
from os import makedirs
from os.path import exists

from pandas import DataFrame, read_csv

#### set notebook variables

In [None]:
# naming: project label (used in the sample info file name and job labels),
# plus the GCP project id and user passed along when jobs are submitted
proj_name = 'aging'
gcp_proj_id = 'adrd-neuro'
gcp_user = 'gibbsr'

# directories: local working tree and the GCS locations used by this notebook
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
cellbender_dir = f'{wrk_dir}/cellbender'
public_dir = f'{wrk_dir}/public'
info_dir = f'{wrk_dir}/sample_info'
analysis_bucket = 'gs://nihnialng-aging-brain/phase1/cellbender'
src_10x_bucket = 'gs://nihnialng-aging-brain/nisc'

# input files: the workflow definition (bucket copy and local copy) and the
# sample info table
cellbender_wdl = 'gs://nihnialng-aging-brain/phase1/public/cellbender_remove_background.wdl'
lcl_cellbender_wdl = f'{public_dir}/cellbender_remove_background.wdl'
info_file = f'{info_dir}/{proj_name}.pool_patient_sample_info.csv'

# out file
# (none are defined in this cell)

# variables
# DEBUG: when True, shell commands are echoed and intermediate objects displayed
DEBUG = False
# lanes 1 through 8 inclusive; combined with each pool name to form sample names
lane_range = range(1, 9)

#### utility functions

In [29]:
def run_bash_command(cmd_line: str, verbose: bool=False):
    """Run a shell command via IPython's ``!`` syntax and return what it captures.

    Args:
        cmd_line: the complete command line to execute.
        verbose: when True, print the command line before running it.

    Returns:
        The value IPython assigns for ``!`` output capture; the saved outputs
        in this notebook show it printing as a list of lines, e.g. ``['225']``.

    NOTE(review): the command's exit status is not inspected here, so a failing
    command is not reported as an error by this function.
    """
    if verbose:
        print(cmd_line)
    # IPython-only syntax: interpolates cmd_line and captures the command output
    ret_value = !{cmd_line}
    return ret_value

### load the sample info data

In [None]:
# read the pool/patient/sample info table and report its dimensions
info_df = read_csv(info_file)
info_shape = info_df.shape
print(f'shape of info {info_shape}')

# in debug mode preview the first rows and the number of rows per pool
if DEBUG:
    display(info_df.head(), info_df.pool_name.value_counts())

In [None]:
# every pool is expected once per lane; sample names follow Aging_<pool>_SCRN_<lane>
sample_names = [
    f'Aging_{pool}_SCRN_{lane}'
    for pool in info_df.pool_name.unique()
    for lane in lane_range
]

# map each sample name to its raw feature-barcode matrix in the source bucket
pool_files = {
    sample_name: f'{src_10x_bucket}/{sample_name}/outs/raw_feature_bc_matrix.h5'
    for sample_name in sample_names
}

if DEBUG:
    display(pool_files)

### setup tooling for submitting GCP life-sci job

#### pull down the broad tooling

In [None]:
# pull down the Broad wdl-runner tooling; skip the clone when an earlier run
# already fetched it, because git refuses to clone into an existing non-empty
# directory and the cell would otherwise fail on every re-run
wdl_runner_dir = f'{public_dir}/wdl-runner'
if exists(wdl_runner_dir):
    print(f'{wdl_runner_dir} already exists, skipping git clone')
else:
    this_cmd = f'git clone https://github.com/broadinstitute/wdl-runner.git {wdl_runner_dir}'
    print(run_bash_command(this_cmd, DEBUG))

In [None]:
# copy the CellBender workflow definition from the bucket to its local path
copy_wdl_cmd = f'gsutil -mq cp {cellbender_wdl} {lcl_cellbender_wdl}'
run_bash_command(copy_wdl_cmd, DEBUG)

### format argument jsons

In [None]:
# make sure the local output directory exists; open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError otherwise
makedirs(cellbender_dir, exist_ok=True)

# format per pool json: one workflow-inputs file per sample, keyed by the
# fully qualified WDL input names
wdl_task = 'cellbender_remove_background.run_cellbender_remove_background_gpu'
for pool, pool_file in pool_files.items():
    json_pool_outfile_name = f'{cellbender_dir}/{pool}.cellbender.json'
    pool_data = {
        f'{wdl_task}.sample_name': pool,
        f'{wdl_task}.input_file_unfiltered': pool_file,
    }
    with open(json_pool_outfile_name, 'w') as json_outfile:
        json.dump(pool_data, json_outfile, sort_keys=False, indent=4)

# format the generic options json shared by every pool submission
options_outfile_name = f'{cellbender_dir}/generic.options.json'
options_data = {
    'default_runtime_attributes': {
        'zones': 'us-central1-a us-central1-b us-central1-c us-central1-f',
    },
    'read_from_cache': True,
    'write_to_cache': True,
    'workflow_failure_mode': 'ContinueWhilePossible',
    'system.input-read-limits.lines': 640000,
}
with open(options_outfile_name, 'w') as json_outfile:
    json.dump(options_data, json_outfile, sort_keys=False, indent=4)

### run the wdl on GCP using life sciences

In [None]:
# function to format the gcp life-sciences wdl job
def frmt_glsp_wdl_cmd(pool_name: str, bucket: str, proj_id: str, work_dir: str,
                      tool_dir: str, my_user: str, cellbender_wdl: str,
                      cohort: str = None) -> str:
    """Build the single-line gcloud command that submits one pool's WDL job.

    Args:
        pool_name: sample name; selects the per-pool inputs json and log path.
        bucket: GCS path under which logs, workspace and temp results are placed.
        proj_id: GCP project the pipeline is run in.
        work_dir: local directory holding ``<pool_name>.cellbender.json`` and
            ``generic.options.json``.
        tool_dir: local directory that contains the ``wdl-runner`` checkout.
        my_user: value for the ``user`` label on the job.
        cellbender_wdl: path of the WDL file passed as the ``WDL`` input.
        cohort: value for the ``cohort`` label; when omitted the notebook-level
            ``proj_name`` variable is used, as before.

    Returns:
        The complete command line as one string (it is not executed here).
    """
    if cohort is None:
        # backward-compatible default: read the notebook-level project name
        cohort = proj_name

    # comma-joined KEY=VALUE lists; gcloud expects them without spaces
    inputs_from_file = ','.join([
        f'WDL={cellbender_wdl}',
        f'WORKFLOW_INPUTS={work_dir}/{pool_name}.cellbender.json',
        f'WORKFLOW_OPTIONS={work_dir}/generic.options.json',
    ])
    env_vars = ','.join([
        f'WORKSPACE={bucket}/workspace',
        f'OUTPUTS={bucket}/cellbender_temp_results',
    ])
    labels = ','.join([
        f'cohort={cohort}',
        f'user={my_user}',
        'workflow=cellbender',
    ])

    cmd_parts = [
        'gcloud beta lifesciences pipelines run',
        f'--project {proj_id}',
        # use the tool_dir argument rather than a notebook-level variable
        f'--pipeline-file {tool_dir}/wdl-runner/wdl_runner/wdl_pipeline.yaml',
        '--location us-central1',
        '--regions us-central1',
        f'--logging {bucket}/logs/wdls/{pool_name}',
        f'--inputs-from-file {inputs_from_file}',
        f'--env-vars {env_vars}',
        f'--labels={labels}',
    ]
    return ' '.join(cmd_parts)

In [None]:
# submit one life-sciences pipeline per pool and keep what gcloud printed for
# each submission, so the results are still available after the loop finishes
pool_operations = {}
for pool in pool_files:
    # create command
    gcp_cmd = frmt_glsp_wdl_cmd(pool, analysis_bucket, gcp_proj_id,
                                cellbender_dir, public_dir, gcp_user, lcl_cellbender_wdl)

    # run command; run_bash_command already echoes it when DEBUG is set, so no
    # second debug print is needed here
    op_id = run_bash_command(gcp_cmd, DEBUG)
    pool_operations[pool] = op_id
    print(f'{pool}: {op_id}')

In [None]:
# operation to inspect; paste the id of the job of interest here
op_id = 'projects/740413734628/locations/us-central1/operations/14006257249538057223'

# one-off status check alternative (left disabled):
# !gcloud beta lifesciences operations describe {op_id} \
# --location=us-central1 \
# --format='yaml(done, error, metadata.events)'

# print the command line for the monitoring script in the wdl-runner checkout
monitor_script = f'{public_dir}/wdl-runner/monitoring_tools/monitor_wdl_pipeline.sh'
print('to check job, with polling, run this at cmd line:')
this_cmd = f'{monitor_script} {op_id} us-central1'
print(this_cmd)

#### check if operations still running

In [30]:
# count lines of the operations listing that contain 'True' (reported as
# completed) and 'False' (reported as incomplete)
for done_flag, status in (('True', 'completed'), ('False', 'incomplete')):
    gcp_cmd = f'gcloud beta lifesciences operations list --project {gcp_proj_id} | grep {done_flag} | wc -l'
    ret_value = run_bash_command(gcp_cmd)
    print(f'number of {status} operations is {ret_value}')

number of completed operations is ['225']
number of incomplete operations is ['0']


#### if the jobs succeeded, then move the results from the temp output to the final output

In [None]:
# move each pool's '<pool>_out*' objects out of the temporary results prefix
# and into the analysis bucket path itself
temp_results = f'{analysis_bucket}/cellbender_temp_results'
for pool in pool_files:
    move_cmd = f'gsutil -mq mv {temp_results}/{pool}_out* {analysis_bucket}/'
    run_bash_command(move_cmd, DEBUG)

#### verify expected file counts

In [31]:
# one filtered h5 output is expected for every entry in pool_files
expected_count = len(pool_files)
print(f'Expected number Cellbender output samples: {expected_count}')

# count the filtered outputs actually present in the analysis bucket
filtered_pattern = f'{analysis_bucket}/Aging_P00*_SCRN_*_out_filtered.h5'
ret_value = run_bash_command(f'gsutil ls {filtered_pattern} | wc -l')
print(f'Number of Cellbender output samples found: {ret_value}')

Expected number Cellbender output samples: 48
Number of Cellbender output samples found: ['48']


#### now clean up the temp cromwell workspace

In [None]:
# recursively remove the temporary results, workspace and logs prefixes
# that the workflow runs left under the analysis bucket path
for temp_prefix in ('cellbender_temp_results', 'workspace', 'logs'):
    this_cmd = f'gsutil -mq rm -r {analysis_bucket}/{temp_prefix}'
    run_bash_command(this_cmd, DEBUG)

In [None]:
# print the current date/time again once the preceding cells have run
!date