## Notebook to run demuxlet using Cumulus Demuxlet wdl

Here the workflow is run directly on GCP; TODO: figure out how to run it via Terra.

Based on [Cumulus/Demuxlet](https://cumulus-doc.readthedocs.io/en/0.12.0/demuxlet.html), but have to use Snapshot 1

[Broad wdl runner](https://github.com/broadinstitute/wdl-runner)

[Statgen Popscle includes demuxlet](https://github.com/statgen/popscle)

In [1]:
# print the current date and time so the run is timestamped in the cell output
!date

Sat Jul 17 13:36:33 EDT 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd
import json

In [3]:
# parameters: identifiers for the data set being processed and the GCP account
project = 'adrd'
cohort = 'aging'
bank = 'nhbcc'
gcp_proj_id = 'adrd-neuro'
gcp_user = 'gibbsr'
pool_name = f'{project}_{cohort}_{bank}'
# one name per pool (1-6) and lane (1-8), ordered pool by pool
pool_names = [f'Aging_P00{pool_num}_SCRN_{lane_num}'
              for pool_num in range(1, 7)
              for lane_num in range(1, 9)]

# directories: local working/tool locations and the GCS prefixes used below
wrk_dir = f'/labshare/raph/datasets/adrd_neuro/{cohort}/demux'
tools_dir = '/labshare/raph/datasets/adrd_neuro/tools'
analysis_bucket = 'gs://nihnialng-aging-brain/analysis/demuxlet'
src_10x_bucket = 'gs://nihnialng-aging-brain/nisc'
genos_bucket = 'gs://nihnialng-aging-brain/genotypes'

# input files: the wdl on GCS, its local copy, and the genotypes vcf
demuxlet_wdl = 'gs://nihnialng-aging-pooled-pilot/analysis/demuxlet_test/demuxlet.1.wdl'
lcl_demuxlet_wdl = f'{tools_dir}/demuxlet.1.wdl'
genos_vcf_file = f'{genos_bucket}/{pool_name}.hg38.demuxlet.vcf.gz'

# out file: the sample sheet handed to the wdl
wdl_sample_sheet = f'{wrk_dir}/{pool_name}.demuxlet.sheet.tsv'

#### the snapshot 1 demuxlet wdl will do a scatter based on the input sample sheet, so format that sheet first

In [4]:
def frmt_tenx_file_name(sample_name, src_10x_bucket, file_type='bam'):
    """Return the path of one 10x output file for a sample.

    Args:
        sample_name: name of the sample directory under the bucket prefix.
        src_10x_bucket: prefix that holds the per-sample output directories.
        file_type: 'bam' selects `possorted_genome_bam.bam`; any other value
            selects the filtered `barcodes.tsv.gz` file.

    Returns:
        The formatted path as a string.
    """
    outs_dir = f'{src_10x_bucket}/{sample_name}/outs'
    # anything that is not explicitly 'bam' is treated as a barcodes request
    if file_type != 'bam':
        return f'{outs_dir}/filtered_feature_bc_matrix/barcodes.tsv.gz'
    return f'{outs_dir}/possorted_genome_bam.bam'

In [5]:
# build the sample table: one row per pool lane with its bam, barcodes and vcf paths
bams = list(map(lambda name: frmt_tenx_file_name(name, src_10x_bucket), pool_names))
barcodes = list(map(lambda name: frmt_tenx_file_name(name, src_10x_bucket, 'barcodes'),
                    pool_names))

# every row shares the same genotypes vcf, so it is assigned as a scalar column
samples_df = pd.DataFrame({'sample': pool_names}).assign(
    bams=bams,
    barcodes=barcodes,
    vcf=genos_vcf_file,
)
print(samples_df.shape)
display(samples_df.head())

(48, 4)


Unnamed: 0,sample,bams,barcodes,vcf
0,Aging_P001_SCRN_1,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/genotypes/adrd_agin...
1,Aging_P001_SCRN_2,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/genotypes/adrd_agin...
2,Aging_P001_SCRN_3,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/genotypes/adrd_agin...
3,Aging_P001_SCRN_4,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/genotypes/adrd_agin...
4,Aging_P001_SCRN_5,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/nisc/Aging_P001_SCR...,gs://nihnialng-aging-brain/genotypes/adrd_agin...


#### save the sample sheet and push to GCS

In [6]:
# write the sheet as a bare tab-separated table: no header row, no index column
samples_df.to_csv(wdl_sample_sheet, sep='\t', header=False, index=False)

In [7]:
this_cmd = f'gsutil -mq cp {wdl_sample_sheet} {src_10x_bucket}/'
print(this_cmd)
!{this_cmd}

gsutil -mq cp /labshare/raph/datasets/adrd_neuro/aging/demux/adrd_aging_nhbcc.demuxlet.sheet.tsv gs://nihnialng-aging-brain/nisc/


#### pull down the broad tooling

In [15]:
#pull down the correct recent Broad tooling
!git clone https://github.com/broadinstitute/wdl-runner.git {tools_dir}/wdl-runner

Cloning into '/labshare/raph/datasets/adrd_neuro/tools/wdl-runner'...
remote: Enumerating objects: 122, done.[K
remote: Total 122 (delta 0), reused 0 (delta 0), pack-reused 122[K
[KReceiving objects: 100% (122/122), 38.10 KiB | 6.35 MiB/s, done.
[KResolving deltas: 100% (62/62), done.


In [16]:
# pull the wdl
this_cmd = f'gsutil -mq cp {demuxlet_wdl} {lcl_demuxlet_wdl}'
print(this_cmd)
!{this_cmd}

gsutil -mq cp gs://nihnialng-aging-pooled-pilot/analysis/demuxlet_test/demuxlet.1.wdl /labshare/raph/datasets/adrd_neuro/tools/demuxlet.1.wdl


#### format argument jsons

In [8]:
# workflow inputs json: tells the demuxlet wdl where the sample sheet lives on GCS
json_demux_outfile_name = f'{wrk_dir}/{pool_name}.demuxlet.json'
demux_data = {
    'demuxlet.tsv_file': f'{src_10x_bucket}/{pool_name}.demuxlet.sheet.tsv',
}

# generic workflow options json
options_outfile_name = f'{wrk_dir}/generic.options.json'
zones_dict = {'zones': 'us-central1-a us-central1-b us-central1-c us-central1-f'}
options_data = {
    'default_runtime_attributes': zones_dict,
    'read_from_cache': True,
    'write_to_cache': True,
    'workflow_failure_mode': 'ContinueWhilePossible',
    'system.input-read-limits.lines': 640000,
}

# write both documents with the same formatting, inputs first and options second
for out_name, payload in ((json_demux_outfile_name, demux_data),
                          (options_outfile_name, options_data)):
    with open(out_name, 'w') as json_outfile:
        json.dump(payload, json_outfile, sort_keys=False, indent=4)

#### run the wdl on GCP using life sciences

In [9]:
# function to format the gcp life-sciences wdl job
def frmt_glsp_wdl_cmd(cohort, pool_name, bucket, proj_id, work_dir,
                      tool_dir, my_user, demuxlet_wdl):
    """Format the `gcloud beta lifesciences pipelines run` command for the wdl job.

    Args:
        cohort: value for the `cohort` label.
        pool_name: used in the logging path and the workflow inputs file name.
        bucket: prefix for the logging, workspace and outputs locations.
        proj_id: value passed to `--project`.
        work_dir: directory holding the workflow inputs and options json files.
        tool_dir: directory that contains the `wdl-runner` checkout.
        my_user: value for the `user` label.
        demuxlet_wdl: path of the wdl file passed as `WDL`.

    Returns:
        The full command as a single-line string.
    """
    # comma-separated groups that each form one argument value
    input_files = ','.join((
        f'WDL={demuxlet_wdl}',
        f'WORKFLOW_INPUTS={work_dir}/{pool_name}.demuxlet.json',
        f'WORKFLOW_OPTIONS={work_dir}/generic.options.json',
    ))
    env_vars = ','.join((
        f'WORKSPACE={bucket}/workspace',
        f'OUTPUTS={bucket}/demuxlet_temp_results',
    ))
    labels = ','.join((f'cohort={cohort}', f'user={my_user}', 'workflow=demuxlet'))

    # the individual options are separated by single spaces
    cmd_parts = (
        'gcloud beta lifesciences pipelines run',
        f'--project {proj_id}',
        f'--pipeline-file {tool_dir}/wdl-runner/wdl_runner/wdl_pipeline.yaml',
        '--location us-central1',
        '--regions us-central1',
        f'--logging {bucket}/logs/wdls/{pool_name}',
        f'--inputs-from-file {input_files}',
        f'--env-vars {env_vars}',
        f'--labels={labels}',
    )
    return ' '.join(cmd_parts)

In [10]:

# create command
gcp_cmd = frmt_glsp_wdl_cmd(cohort, pool_name, analysis_bucket, gcp_proj_id,
                            wrk_dir, tools_dir, gcp_user, lcl_demuxlet_wdl)

# run command
print(gcp_cmd)
op_id = !{gcp_cmd}
print(op_id)

gcloud beta lifesciences pipelines run --project adrd-neuro --pipeline-file /labshare/raph/datasets/adrd_neuro/tools/wdl-runner/wdl_runner/wdl_pipeline.yaml --location us-central1 --regions us-central1 --logging gs://nihnialng-aging-brain/analysis/demuxlet/logs/wdls/adrd_aging_nhbcc --inputs-from-file WDL=/labshare/raph/datasets/adrd_neuro/tools/demuxlet.1.wdl,WORKFLOW_INPUTS=/labshare/raph/datasets/adrd_neuro/aging/demux/adrd_aging_nhbcc.demuxlet.json,WORKFLOW_OPTIONS=/labshare/raph/datasets/adrd_neuro/aging/demux/generic.options.json --env-vars WORKSPACE=gs://nihnialng-aging-brain/analysis/demuxlet/workspace,OUTPUTS=gs://nihnialng-aging-brain/analysis/demuxlet/demuxlet_temp_results --labels=cohort=aging,user=gibbsr,workflow=demuxlet
['Running [projects/740413734628/locations/us-central1/operations/8061285490362817354].']


In [11]:
# take the operation id from the gcloud output captured when the job was
# submitted (e.g. 'Running [projects/<n>/locations/<loc>/operations/<id>].')
# instead of a value pasted from an earlier run, so the monitor command below
# always refers to the job launched by this run; a plain string is accepted as
# well, which keeps this cell safe to re-run
launch_text = op_id if isinstance(op_id, str) else '\n'.join(op_id)
id_start = launch_text.find('projects/')
if id_start < 0 or '/operations/' not in launch_text[id_start:]:
    raise ValueError(f'no operation id found in gcloud output: {launch_text!r}')
# the id ends at the closing bracket when present, otherwise at end of text
id_end = launch_text.find(']', id_start)
op_id = launch_text[id_start:id_end if id_end >= 0 else None].strip()

# !gcloud beta lifesciences operations describe {op_id} \
# --location=us-central1 \
# --format='yaml(done, error, metadata.events)'

print('to check job, with polling, run this at cmd lien:')
this_cmd = f'{tools_dir}/wdl-runner/monitoring_tools/monitor_wdl_pipeline.sh {op_id} us-central1'
print(this_cmd)

to check job, with polling, run this at cmd lien:
/labshare/raph/datasets/adrd_neuro/tools/wdl-runner/monitoring_tools/monitor_wdl_pipeline.sh projects/740413734628/locations/us-central1/operations/8061285490362817354 us-central1


#### if the job succeeded, then move the results from the temp output to the final output

In [12]:
this_cmd = f'gsutil -mq mv {analysis_bucket}/demuxlet_temp_results/* {analysis_bucket}/demuxlet_results/'
print(this_cmd)
!{this_cmd}

gsutil -mq mv gs://nihnialng-aging-brain/analysis/demuxlet/demuxlet_temp_results/* gs://nihnialng-aging-brain/analysis/demuxlet/demuxlet_results/


#### now clean up the temp cromwell workspace

In [13]:
this_cmd = f'gsutil -mq rm -r {analysis_bucket}/workspace/demuxlet'
print(this_cmd)
!{this_cmd}

gsutil -mq rm -r gs://nihnialng-aging-brain/analysis/demuxlet/workspace/demuxlet
