## Notebook to run demuxlet using Cumulus Demuxlet wdl

Here the workflow is run directly via the GCP Life Sciences API; consider moving to the GCP Batch API.

Based on [Cumulus/Demuxlet](https://cumulus-doc.readthedocs.io/en/0.12.0/demuxlet.html), but have to use Snapshot 1

[Broad wdl runner](https://github.com/broadinstitute/wdl-runner)

[Statgen Popscle includes demuxlet](https://github.com/statgen/popscle)

In [1]:
# print the current date and time at the start of the notebook
!date

Wed Aug  9 16:44:34 EDT 2023


#### import libraries

In [2]:
from pandas import DataFrame, read_csv
from os.path import exists
import json

#### set notebook variables

In [3]:
# parameters: project/cohort name, GCP project ID, and the user name used to label the job
proj_name = 'aging_phase2'
gcp_proj_id = 'adrd-neuro'
gcp_user = 'gibbsr'

# directories: local working tree plus the GCS buckets for analysis, 10x source data, and genotypes
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
demux_dir = f'{wrk_dir}/demux'
public_dir = f'{wrk_dir}/public'
info_dir = f'{wrk_dir}/sample_info'
src_dir = f'{wrk_dir}/src_data'
analysis_bucket = 'gs://nihnialng-aging-brain/phase2/demux'
src_10x_bucket = 'gs://nihnialng-aging-brain/phase2/src_data'
genos_bucket = 'gs://nihnialng-aging-brain/phase2/genotypes'

# input files: the wdl is copied from its bucket location to the local path before submission
demuxlet_wdl = 'gs://nihnialng-aging-pooled-pilot/analysis/demuxlet_test/demuxlet.1.wdl'
lcl_demuxlet_wdl = f'{public_dir}/demuxlet.1.wdl'
genos_vcf_file = f'{genos_bucket}/{proj_name}.hg38.demuxlet.vcf.gz'
info_file = f'{info_dir}/{proj_name}.sample_info.csv'

# output file: the sample sheet written locally and then pushed to the analysis bucket
wdl_sample_sheet = f'{demux_dir}/{proj_name}.demuxlet.sheet.tsv'

# variables
# DEBUG turns on the extra display() calls and echoes shell commands before they run
DEBUG = False
# lane numbers (1 through 8) checked for every pool when building the sample sheet
lane_range = range(1, 9)

### load the sample info data

In [4]:
# read the per-sample info table
info_df = read_csv(info_file)
print(f'shape of info {info_df.shape}')
if DEBUG:
    # peek at the first rows and at the per-pool sample counts
    for debug_view in (info_df.head(),
                       info_df.gex_pool.value_counts(),
                       info_df.atac_pool.value_counts()):
        display(debug_view)

shape of info (36, 13)


#### drop the non-pooled samples and make sure pool nums are ints

In [5]:
# keep only the samples that have both a gex pool and an atac pool assigned
in_both_pools = info_df.gex_pool.notna() & info_df.atac_pool.notna()
info_df = info_df.loc[in_both_pools]
print(f'shape of info {info_df.shape}')
# make sure pool nums are ints and not floats
info_df = info_df.astype({'gex_pool': 'int', 'atac_pool': 'int'})
if DEBUG:
    for debug_view in (info_df.head(),
                       info_df.gex_pool.value_counts(),
                       info_df.atac_pool.value_counts()):
        display(debug_view)

shape of info (33, 13)


### the snapshot 1 demuxlet wdl will do a scatter based on input sample sheet, so format that

In [6]:
def run_bash_command(cmd_line: str, verbose: bool=False):
    """
    Run a shell command from the notebook using the IPython `!` shell escape.

    Args:
        cmd_line: the full command line to execute
        verbose: when True, print the command line before running it
    """
    if verbose:
        print(cmd_line)
    !{cmd_line}

def frmt_tenx_file_name(sample_name: str, src_10x_bucket: str, modality: str='gex', file_type: str='bam'):
    """
    Build the path of a 10x output file for one sample.

    Args:
        sample_name: name of the 10x sample directory
        src_10x_bucket: path prefix that holds the per-modality sample directories
        modality: 'gex' or 'atac'
        file_type: 'bam' selects the position-sorted bam; any other value
            selects the filtered barcodes file
    Returns:
        the full path to the requested file
    Raises:
        ValueError: when modality is neither 'gex' nor 'atac'
    """
    # per modality: (bam file, barcodes file), both relative to the outs directory
    outs_files = {
        'gex': ('possorted_genome_bam.bam',
                'filtered_feature_bc_matrix/barcodes.tsv.gz'),
        'atac': ('possorted_bam.bam',
                 'filtered_peak_bc_matrix/barcodes.tsv'),
    }
    if modality not in outs_files:
        raise ValueError('Invalid value, modality must be gex or atac')
    bam_name, barcodes_name = outs_files[modality]
    outs_file = bam_name if file_type == 'bam' else barcodes_name
    return f'{src_10x_bucket}/{modality}/{sample_name}/outs/{outs_file}'

In [7]:
# (10x modality directory, 10x sample tag, sheet name prefix); atac is listed
# first so that, for each pool and lane, the atac row precedes the gex row
modality_specs = (('atac', 'ATAC', 'SCAT'), ('gex', 'GEX', 'SCRN'))
pool_names, bams, barcodes = [], [], []
for pool in info_df.gex_pool.unique():
    for lane in lane_range:
        for modality, tenx_tag, sheet_prefix in modality_specs:
            tenx_sample = f'sample_ec_{tenx_tag}_P{pool}_{lane}'
            # only include a pool/lane whose 10x sample path exists under src_dir
            if not exists(f'{src_dir}/{modality}/{tenx_sample}'):
                continue
            pool_names.append(f'{sheet_prefix}_P{pool}_{lane}')
            bams.append(frmt_tenx_file_name(tenx_sample, src_10x_bucket,
                                            modality, 'bam'))
            barcodes.append(frmt_tenx_file_name(tenx_sample, src_10x_bucket,
                                                modality, 'barcodes'))
this_data = {'sample': pool_names, 'bams': bams, 'barcodes': barcodes}
samples_df = DataFrame(data=this_data)
# every row is demultiplexed against the same genotypes vcf
samples_df['vcf'] = genos_vcf_file
print(f'shape of demux sample sheet {samples_df.shape}')
if DEBUG:
    display(samples_df.head())

shape of demux sample sheet (41, 4)


#### save the sample sheet and push to GCS

In [8]:
# write the sheet as a tab-separated file without header row or index column
samples_df.to_csv(wdl_sample_sheet, sep='\t', header=False, index=False)

In [9]:
# copy the local sample sheet into the analysis bucket
this_cmd = ' '.join(['gsutil -mq cp', wdl_sample_sheet, f'{analysis_bucket}/'])
run_bash_command(this_cmd, DEBUG)

### setup tooling for submitting GCP life-sci job

#### pull down the broad tooling

In [10]:
# pull down the Broad wdl-runner tooling; the clone is skipped when the target
# directory is already present, since git will not clone into an existing
# non-empty directory and re-running this cell would otherwise just fail
wdl_runner_dir = f'{public_dir}/wdl-runner'
this_cmd = f'git clone https://github.com/broadinstitute/wdl-runner.git {wdl_runner_dir}'
if exists(wdl_runner_dir):
    print(f'{wdl_runner_dir} already exists, skipping clone')
else:
    run_bash_command(this_cmd, DEBUG)

Cloning into '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/wdl-runner'...
remote: Enumerating objects: 153, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 153 (delta 48), reused 46 (delta 46), pack-reused 96[K
Receiving objects: 100% (153/153), 45.69 KiB | 5.71 MiB/s, done.
Resolving deltas: 100% (83/83), done.


In [11]:
# copy the demuxlet wdl from its bucket location to the local public directory
this_cmd = ' '.join(['gsutil -mq cp', demuxlet_wdl, lcl_demuxlet_wdl])
run_bash_command(this_cmd, DEBUG)

### format argument jsons

In [12]:
# demuxlet workflow inputs: the only input is the sample sheet in the analysis bucket
json_demux_outfile_name = f'{demux_dir}/{proj_name}.demuxlet.json'
demux_data = {
    'demuxlet.tsv_file': f'{analysis_bucket}/{proj_name}.demuxlet.sheet.tsv',
}

# generic workflow options
options_outfile_name = f'{demux_dir}/generic.options.json'
zones_dict = {'zones': 'us-central1-a us-central1-b us-central1-c us-central1-f'}
options_data = {
    'default_runtime_attributes': zones_dict,
    'read_from_cache': True,
    'write_to_cache': True,
    'workflow_failure_mode': 'ContinueWhilePossible',
    'system.input-read-limits.lines': 640000,
}

# write both argument files, keeping the key order given above
for json_path, json_payload in ((json_demux_outfile_name, demux_data),
                                (options_outfile_name, options_data)):
    with open(json_path, 'w') as json_outfile:
        json.dump(json_payload, json_outfile, sort_keys=False, indent=4)

### run the wdl on GCP using life sciences

In [15]:
# function to format the gcp life-sciences wdl job
def frmt_glsp_wdl_cmd(proj_name, bucket, proj_id, work_dir,
                      tool_dir, my_user, demuxlet_wdl):
    """
    Format the gcloud life sciences command line that runs the demuxlet wdl.

    Args:
        proj_name: project/cohort name; used for the log path, the workflow
            inputs json file name, and the cohort label
        bucket: bucket path that receives the logs, workspace, and temp results
        proj_id: GCP project ID passed to --project
        work_dir: directory holding {proj_name}.demuxlet.json and
            generic.options.json
        tool_dir: directory containing the wdl-runner checkout
        my_user: value of the user label
        demuxlet_wdl: path of the wdl file to run
    Returns:
        the command line as a single string
    """
    # the pipeline definition is taken from tool_dir rather than from a
    # notebook-level global, so the function depends only on its arguments
    pipeline_file = f'{tool_dir}/wdl-runner/wdl_runner/wdl_pipeline.yaml'
    # comma-separated KEY=VALUE lists must not contain spaces
    input_files = ','.join([
        f'WDL={demuxlet_wdl}',
        f'WORKFLOW_INPUTS={work_dir}/{proj_name}.demuxlet.json',
        f'WORKFLOW_OPTIONS={work_dir}/generic.options.json',
    ])
    env_vars = ','.join([
        f'WORKSPACE={bucket}/workspace',
        f'OUTPUTS={bucket}/demuxlet_temp_results',
    ])
    labels = f'cohort={proj_name},user={my_user},workflow=demuxlet'
    cmd_parts = [
        'gcloud beta lifesciences pipelines run',
        f'--project {proj_id}',
        f'--pipeline-file {pipeline_file}',
        '--location us-central1',
        '--regions us-central1',
        f'--logging {bucket}/logs/wdls/{proj_name}',
        f'--inputs-from-file {input_files}',
        f'--env-vars {env_vars}',
        f'--labels={labels}',
    ]
    this_cmd = ' '.join(cmd_parts)
    return this_cmd

In [16]:

# create command; public_dir is passed as the tool directory and the local
# copy of the wdl is passed as the workflow to run
gcp_cmd = frmt_glsp_wdl_cmd(proj_name, analysis_bucket, gcp_proj_id,
                            demux_dir, public_dir, gcp_user, lcl_demuxlet_wdl)

# run command; the captured gcloud output lines are kept in op_id
print(gcp_cmd)
op_id = !{gcp_cmd}
print(op_id)

gcloud beta lifesciences pipelines run --project adrd-neuro --pipeline-file /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/wdl-runner/wdl_runner/wdl_pipeline.yaml --location us-central1 --regions us-central1 --logging gs://nihnialng-aging-brain/phase2/demux/logs/wdls/aging_phase2 --inputs-from-file WDL=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/demuxlet.1.wdl,WORKFLOW_INPUTS=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/aging_phase2.demuxlet.json,WORKFLOW_OPTIONS=/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/generic.options.json --env-vars WORKSPACE=gs://nihnialng-aging-brain/phase2/demux/workspace,OUTPUTS=gs://nihnialng-aging-brain/phase2/demux/demuxlet_temp_results --labels=cohort=aging_phase2,user=gibbsr,workflow=demuxlet
['Running [projects/740413734628/locations/us-central1/operations/9591435616534778205].']


In [17]:
# operation ID of the submitted job; hard-coded, so update it after each new submission
op_id = 'projects/740413734628/locations/us-central1/operations/9591435616534778205'

# one-off status check alternative to the polling script:
# !gcloud beta lifesciences operations describe {op_id} \
# --location=us-central1 \
# --format='yaml(done, error, metadata.events)'

monitor_script = f'{public_dir}/wdl-runner/monitoring_tools/monitor_wdl_pipeline.sh'
print('to check job, with polling, run this at cmd line:')
this_cmd = f'{monitor_script} {op_id} us-central1'
print(this_cmd)

to check job, with polling, run this at cmd line:
/labshare/raph/datasets/adrd_neuro/brain_aging/phase2/public/wdl-runner/monitoring_tools/monitor_wdl_pipeline.sh projects/740413734628/locations/us-central1/operations/9591435616534778205 us-central1


#### if the job succeeded, then move the results from the temp output to the final output

In [18]:
# move everything under the temp results prefix up to the top level of the analysis bucket
temp_results = f'{analysis_bucket}/demuxlet_temp_results/*'
this_cmd = f'gsutil -mq mv {temp_results} {analysis_bucket}/'
run_bash_command(this_cmd, DEBUG)

#### verify expected file counts

In [22]:
# for each sheet prefix, print the number of sample sheet rows (expected)
# followed by the number of .best result files in the bucket (found)
for sheet_prefix in ('SCAT', 'SCRN'):
    print(f'{sheet_prefix} pool counts expected and found:')
    this_cmd = (f'gsutil cat {analysis_bucket}/{proj_name}.demuxlet.sheet.tsv '
                f'| grep {sheet_prefix}_P | wc -l')
    run_bash_command(this_cmd, DEBUG)
    this_cmd = f'gsutil ls {analysis_bucket}/{sheet_prefix}_P*.best | wc -l'
    run_bash_command(this_cmd, DEBUG)

SCAT pool counts expected and found:
20
20
SCRN pool counts expected and found:
21
21


#### now clean up the temp cromwell workspace

In [19]:
# remove the cromwell workspace and the job logs from the analysis bucket
for temp_prefix in ('workspace', 'logs'):
    this_cmd = f'gsutil -mq rm -r {analysis_bucket}/{temp_prefix}'
    run_bash_command(this_cmd, DEBUG)

In [20]:
# print the current date and time at the end of the notebook
!date

Thu Aug 10 09:56:58 EDT 2023
