## NINDS EOPD WGS

### July 25, 2019

notebook for processing the NINDS EOPD WGS data

#### global varibles and libraries

In [2]:
#global notebook variables for both python and bash majic (by stdin arguments)
WRKDIR = '/labseq/projects/ninds_eopd'
PRJ_BUCKET = 'gs://nihnialng-pd-wgs'
PROJECT_ID = 'pd-genome'
MYUSER = 'gibbsr'
AUTOSOMES=[str(x) for x in list(range(1,23))]
SEXOMES=['X','Y']
CHROMOSOMES=AUTOSOMES + SEXOMES
COHORT='ninds_eopd'
COHORTBUILD='{}.july2019'.format(COHORT)
COHORT_BUCKET='{}/{}'.format(PRJ_BUCKET,COHORT)

In [3]:
#import libraries
import pandas as pd
import time
import json
import os

#### see what fastqs are present in the stagin bucket

In [23]:
%%bash
#staging bucket that USUHS uploaded fastqs to
gsutil ls gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00 | wc -l
gsutil ls gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.01 | wc -l
    
gsutil ls gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00 | head
gsutil ls gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.01 | head

592
592
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L001_R1_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L001_R2_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L002_R1_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L002_R2_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L003_R1_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L003_R2_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L004_R1_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35863_S21_L004_R2_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35864_S22_L001_R1_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/A3-35864_S22_L001_R2_001.fastq.gz
gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.01/A3-35940_S23_L001_R1_001.fastq.gz
gs://nihnialng-staging-f

In [4]:
dest_bucket_path = '{}/{}/fastqs'.format(PRJ_BUCKET,COHORT)
gcs_fastq_mv_cmd = 'gsutil -mq cp gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.0*/*.fastq.gz \
{}/'.format(dest_bucket_path)

print('#run these commands at terminal:\n')
print('#WE\'VE ALREADY RUN THIS SO DO NOT NEED TO COPY AGAIN\n')
print(gcs_fastq_mv_cmd)

#run these commands at terminal:

#WE'VE ALREADY RUN THIS SO DO NOT NEED TO COPY AGAIN

gsutil -mq cp gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.0*/*.fastq.gz gs://nihnialng-pd-wgs/ninds_eopd/fastqs/


In [25]:
%%bash -s "$dest_bucket_path"
#check all the fastqs have been moved should be 592 pairs
DEST_BUCKET=${1}

gsutil ls ${DEST_BUCKET}/*_R1_001.fastq.gz | wc -l
gsutil ls ${DEST_BUCKET}/*_R2_001.fastq.gz | wc -l

592
592


#### get fastqs listing

In [26]:
%%bash -s "$WRKDIR" "$dest_bucket_path" "$COHORT"
#get fastq listing and tokenize
WRKDIR=${1}
DEST_BUCKET=${2}
COHORT=${3}

FASTQ_LISTING=${WRKDIR}/${COHORT}.fastq.listing.txt

gsutil ls ${DEST_BUCKET}/*.fastq.gz > ${FASTQ_LISTING}

sed -i s"/gs:\/\/nihnialng-pd-wgs\/${COHORT}\/fastqs\///"g ${FASTQ_LISTING}
sed -i s"/\.fastq\.gz//"g ${FASTQ_LISTING}
sed -i s"/_/\\t/"g ${FASTQ_LISTING}

less ${FASTQ_LISTING} | wc -l
head ${FASTQ_LISTING}
tail ${FASTQ_LISTING}

1184
A3-35863	S21	L001	R1	001
A3-35863	S21	L001	R2	001
A3-35863	S21	L002	R1	001
A3-35863	S21	L002	R2	001
A3-35863	S21	L003	R1	001
A3-35863	S21	L003	R2	001
A3-35863	S21	L004	R1	001
A3-35863	S21	L004	R2	001
A3-35864	S22	L001	R1	001
A3-35864	S22	L001	R2	001
A3-36014	S23	L004	R1	001
A3-36014	S23	L004	R2	001
A3-36015	S24	L001	R1	001
A3-36015	S24	L001	R2	001
A3-36015	S24	L002	R1	001
A3-36015	S24	L002	R2	001
A3-36015	S24	L003	R1	001
A3-36015	S24	L003	R2	001
A3-36015	S24	L004	R1	001
A3-36015	S24	L004	R2	001


#### load fastq listing and USUHS sample info and ID mappings

In [3]:
#ok now see what we have
#load the USUHS quality reports
usuhs_qa_file = WRKDIR + '/pA3.QA.june.26.2019.xlsx'
usuhs_qa = pd.read_excel(usuhs_qa_file)
print(usuhs_qa.shape)

#load the fastq info
fastqs_file = '{}/{}.fastq.listing.txt'.format(WRKDIR,COHORT)
fastqs_df = pd.read_csv(fastqs_file,header=None,sep='\t')
fastqs_df.columns = ['usuhsID','S','LANE','READ','NUM']
print(fastqs_df.shape)

#load usuhs key map
usuhs_key_file = WRKDIR + '/pA3.hernandez.export1.keyTable.june.27.2019.csv'
usuhs_keys_df = pd.read_csv(usuhs_key_file,sep=',')
print(usuhs_keys_df.shape)


(148, 97)
(1184, 5)
(148, 3)


In [4]:
print(usuhs_keys_df.shape)
usuhs_keys_df.head()

(148, 3)


Unnamed: 0,SampleID,Flowcell,Description
0,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01
1,A3-35864,190419_N03_0073_AHJF53DSXX,A3-35864-PennEOPD002-B01
2,A3-35865,190419_N03_0073_AHJF53DSXX,A3-35865-PennEOPD004-C01
3,A3-35866,190419_N03_0073_AHJF53DSXX,A3-35866-PennEOPD006-D01
4,A3-35893,190419_N04_0056_BHK37HDSXX,A3-35893-PennEOPD046-G04


#### unfortunately of 'real' sampleID is packed in the middle of the 'Description' column values, extract it

In [5]:
usuhs_keys_df['grbg1'],usuhs_keys_df['grbg2'],usuhs_keys_df['oriID'],usuhs_keys_df['grbg3'] = \
usuhs_keys_df['Description'].str.split('-').str
usuhs_keys_df.drop(['grbg1','grbg2','grbg3'],axis='columns',inplace=True)

In [6]:
usuhs_keys_df.head()

Unnamed: 0,SampleID,Flowcell,Description,oriID
0,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
1,A3-35864,190419_N03_0073_AHJF53DSXX,A3-35864-PennEOPD002-B01,PennEOPD002
2,A3-35865,190419_N03_0073_AHJF53DSXX,A3-35865-PennEOPD004-C01,PennEOPD004
3,A3-35866,190419_N03_0073_AHJF53DSXX,A3-35866-PennEOPD006-D01,PennEOPD006
4,A3-35893,190419_N04_0056_BHK37HDSXX,A3-35893-PennEOPD046-G04,PennEOPD046


In [7]:
#take a look at the USUHS QA table, if desired
usuhs_qa.head()

Unnamed: 0,x.X,x.runFolder,x.Sample_ID,x.LaneNumbers,x.fastqCount,x.sampleSheet,x.seqInSS,x.seqInSSpre,x.seqOutSS,x.ssDiffFlag,...,x.meanCoverageP,x.PCT_20X,x.PCT_30X,x.NA.,x.NA..1,x.NA..2,x.NA..3,x.NA..4,x.NA..5,x.valTime
0,9341,190419_N03_0073_AHJF53DSXX,A3-35863,1~2~3~4,8,/data/seq2/seqIn//NovaSeq/190419_N03_0073_AHJF...,2019-04-24_16:00:36.7968139030_-0400,2019-04-22_11:23:26.0000000000_-0400,2019-04-24_16:00:41.3045690060_-0400,0,...,45.61428,0.963868,0.943429,,,,,,,2019-06-12 12:06:40
1,9342,190419_N03_0073_AHJF53DSXX,A3-35864,1~2~3~4,8,/data/seq2/seqIn//NovaSeq/190419_N03_0073_AHJF...,2019-04-24_16:00:36.7968139030_-0400,2019-04-22_11:23:26.0000000000_-0400,2019-04-24_16:00:41.3045690060_-0400,0,...,43.295565,0.962888,0.934077,,,,,,,2019-06-12 12:06:40
2,9343,190419_N03_0073_AHJF53DSXX,A3-35865,1~2~3~4,8,/data/seq2/seqIn//NovaSeq/190419_N03_0073_AHJF...,2019-04-24_16:00:36.7968139030_-0400,2019-04-22_11:23:26.0000000000_-0400,2019-04-24_16:00:41.3045690060_-0400,0,...,40.469096,0.960691,0.913079,,,,,,,2019-06-12 12:06:40
3,9344,190419_N03_0073_AHJF53DSXX,A3-35866,1~2~3~4,8,/data/seq2/seqIn//NovaSeq/190419_N03_0073_AHJF...,2019-04-24_16:00:36.7968139030_-0400,2019-04-22_11:23:26.0000000000_-0400,2019-04-24_16:00:41.3045690060_-0400,0,...,42.829397,0.961585,0.930298,,,,,,,2019-06-12 12:06:40
4,9387,190419_N04_0056_BHK37HDSXX,A3-35893,1~2~3~4,8,/data/seq2/seqIn//NovaSeq/190419_N04_0056_BHK3...,2019-04-24_15:58:27.9914912360_-0400,2019-04-19_15:53:06.0000000000_-0400,2019-04-24_15:58:32.6838125650_-0400,0,...,35.554067,0.954387,0.815699,,,,,,,2019-06-12 12:06:40


In [8]:
#counts look right but check to see if any not found id
print(set(fastqs_df['usuhsID']) - set(usuhs_keys_df['SampleID']))
print(set(usuhs_keys_df['SampleID']) - set(fastqs_df['usuhsID']))

set()
set()


In [9]:
#merge the fastqs and the key maps
named_fastqs_df = pd.merge(fastqs_df,usuhs_keys_df,how='left',left_on='usuhsID',\
                           right_on='SampleID')

In [10]:
print(fastqs_df.shape)
print(named_fastqs_df.shape)
named_fastqs_df.head()

(1184, 5)
(1184, 9)


Unnamed: 0,usuhsID,S,LANE,READ,NUM,SampleID,Flowcell,Description,oriID
0,A3-35863,S21,L001,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
1,A3-35863,S21,L001,R2,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
2,A3-35863,S21,L002,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
3,A3-35863,S21,L002,R2,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
4,A3-35863,S21,L003,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001


In [11]:
#the NUM bit of the file name if always '001' so format the field
print(named_fastqs_df['NUM'].value_counts())
named_fastqs_df['NUM'] = '001'

1    1184
Name: NUM, dtype: int64


In [12]:
#check exprect count R1 == R2
named_fastqs_df['READ'].value_counts()

R2    592
R1    592
Name: READ, dtype: int64

In [13]:
#only need one row for each sample, not both fastq pairs, ie R1 and R2
cohort_df = named_fastqs_df.loc[named_fastqs_df['READ'] == 'R1']
print(cohort_df.shape)
cohort_df.head()

(592, 9)


Unnamed: 0,usuhsID,S,LANE,READ,NUM,SampleID,Flowcell,Description,oriID
0,A3-35863,S21,L001,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
2,A3-35863,S21,L002,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
4,A3-35863,S21,L003,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
6,A3-35863,S21,L004,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
8,A3-35864,S22,L001,R1,1,A3-35864,190419_N03_0073_AHJF53DSXX,A3-35864-PennEOPD002-B01,PennEOPD002


#### format per sample jsons files for the fastq to ubam jobs

In [14]:
def checkfortemplatefile(this_template_file):
    if not os.path.isfile(this_template_file):
        print('need ' + this_template_file)

In [15]:
#create the jsons directory for the fastq to bam if it doesn't exist
#also check to see if the blank jsons are present or you need to retrieve
fastq_to_bam_json_dir = '{}/jsons'.format(WRKDIR)

if os.path.isdir(fastq_to_bam_json_dir):
    os.makedirs(fastq_to_bam_json_dir + '/fastqtoubam', exist_ok=True)    
    
checkfortemplatefile(fastq_to_bam_json_dir + '/blank.fastqtoubam.json')
checkfortemplatefile(fastq_to_bam_json_dir + '/blank.broadbam.hg38.json')
checkfortemplatefile(fastq_to_bam_json_dir + '/blank.align.label.json')

#### here we are going to subet to just a 3rd or the sample, so each of you have a subset to process

so you have to save you subset list to the file path below

In [16]:
your_subset_file = '{}/your_subset_of_full_cohort.list'.format(WRKDIR)
your_subset = pd.read_csv(your_subset_file,header=None)
print(your_subset.shape)

print(cohort_df.shape)
cohort_df = cohort_df.loc[cohort_df['oriID'].isin(your_subset[0])]
print(cohort_df.shape)
cohort_df.head()

(48, 1)
(592, 9)
(192, 9)


Unnamed: 0,usuhsID,S,LANE,READ,NUM,SampleID,Flowcell,Description,oriID
0,A3-35863,S21,L001,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
2,A3-35863,S21,L002,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
4,A3-35863,S21,L003,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
6,A3-35863,S21,L004,R1,1,A3-35863,190419_N03_0073_AHJF53DSXX,A3-35863-PennEOPD001-A01,PennEOPD001
8,A3-35864,S22,L001,R1,1,A3-35864,190419_N03_0073_AHJF53DSXX,A3-35864-PennEOPD002-B01,PennEOPD002


In [34]:
#format the WF input jsons for fastq to ubams
from datetime import datetime

json_template = '{}/jsons/blank.fastqtoubam.json'.format(WRKDIR)

fastq_format = '{}/{}/fastqs/{}_{}_{}_{}_001.fastq.gz'

DEFAULTATTEMPTS = 3
DEFAULTDISK = 500
DEFAULTMEM = '30 GB'

sample_ids = cohort_df['oriID'].unique()
for sample_id in sample_ids:
    with open(json_template) as json_file:  
        data = json.load(json_file)
        data['ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.gatk_path'] = '/gatk/gatk'
        data['ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.docker'] = 'broadinstitute/gatk:4.0.1.2'
        data['ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.mem_size'] = DEFAULTMEM
        data['ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.disk_size'] = DEFAULTDISK
        data['ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.preemptible_tries'] = DEFAULTATTEMPTS

        sample_data = data.copy()
        sample_data['ConvertPairedFastQsToUnmappedBamWf.readgroup_list'] = []
        sample_data['ConvertPairedFastQsToUnmappedBamWf.metadata'] = {}
        sample_data['ConvertPairedFastQsToUnmappedBamWf.fastq_pairs'] = {}
        sample_df = cohort_df.loc[cohort_df['oriID'] == sample_id]
        json_outfile_name = '{}/jsons/fastqtoubam/{}.fastqtoubam.json'.format(WRKDIR,sample_id)

        for index, row in sample_df.iterrows():
            lib_date, lib_machine, lib_index, flowcell = row['Flowcell'].split('_')

            read_group_name = '{}_{}_{}_{}'.format(row['oriID'], row['usuhsID'], row['LANE'], lib_date)

            this_date_str = datetime.strptime(lib_date, '%y%m%d').strftime('%Y-%m-%d')
            read_group_info = [row['oriID'], row['oriID'], row['Flowcell'], this_date_str, 'ILLUMINA', 'USUHS']

            read_group_fastqs = [fastq_format.format(PRJ_BUCKET,COHORT,row['usuhsID'],row['S'],row['LANE'],'R1'), \
                                fastq_format.format(PRJ_BUCKET,COHORT,row['usuhsID'],row['S'],row['LANE'],'R2')]

            sample_data['ConvertPairedFastQsToUnmappedBamWf.readgroup_list'].append(read_group_name)
            sample_data['ConvertPairedFastQsToUnmappedBamWf.metadata'].update({read_group_name : read_group_info})
            sample_data['ConvertPairedFastQsToUnmappedBamWf.fastq_pairs'].update({read_group_name : read_group_fastqs})
        
        with open(json_outfile_name,'w') as json_outfile:
            json.dump(sample_data,json_outfile,sort_keys=True,indent=4)

cohort_file_list = '{}/{}.samples.list'.format(WRKDIR,COHORT)
pd.DataFrame(data=sample_ids).to_csv(cohort_file_list,header=False,index=False)

In [115]:
#define format function the ggp cmd
def formatgcpcmd(this_sample,chrt_bucket):
    this_cmd = 'echo -n {SAMPLE} OPID=\n\
gcloud alpha genomics pipelines run \
--project {PROJECT_ID} \
--pipeline-file {PRJ_BUCKET}/resources/tools/wdl_pipeline.yaml \
--zones us-central1-f \
--memory 7 \
--logging {COHORT_BUCKET}/logs/ubams/{SAMPLE} \
--inputs-from-file WDL={WRKDIR}/tools/broad/paired-fastq-to-unmapped-bam.wdl \
--inputs-from-file WORKFLOW_INPUTS={WRKDIR}/jsons/{COHORT}/fastqtoubam/{SAMPLE}.fastqtoubam.json \
--inputs-from-file WORKFLOW_OPTIONS={WRKDIR}/jsons/generic.google-papi.options.json \
--inputs WORKSPACE={COHORT_BUCKET}/workspace/{SAMPLE} \
--inputs OUTPUTS={COHORT_BUCKET}/ubams/{SAMPLE} \
--labels=pipe=fastq_to_ubam,sample={LABELNAME},cohort={LCCOHORT},user={MYUSER}'
    return(this_cmd.format(SAMPLE=this_sample,PROJECT_ID=PROJECT_ID,PRJ_BUCKET=PRJ_BUCKET,\
                         COHORT_BUCKET=chrt_bucket,WRKDIR=WRKDIR,COHORT=COHORT,\
                          LABELNAME=this_sample.lower(),LCCOHORT=COHORT.lower(),MYUSER=MYUSER))

#iterate over samples formatting the cmds
cohort_bucket = '{}/{}'.format(PRJ_BUCKET,COHORT)
cmds = [formatgcpcmd(sample_id,cohort_bucket) for sample_id in sample_ids]

temp_script_file = '{}/{}.run_fastqs_to_ubams.sh'.format(WRKDIR,COHORT.lower())

with open(temp_script_file, 'w') as file_handler:
        for this_cmd in cmds:
            file_handler.write("{}\n".format(this_cmd))
            
print('#run these commands at terminal:\n')
print('chmod +x ' + temp_script_file)
print('nohup ' + temp_script_file + ' > {}/{}.run_fastqs_to_ubams.log &'.format(WRKDIR,COHORT.lower()))

#run these commands at terminal:

chmod +x /labseq/projects/ninds_eopd/ninds_eopd.run_fastqs_to_ubams.sh
nohup /labseq/projects/ninds_eopd/ninds_eopd.run_fastqs_to_ubams.sh > /labseq/projects/ninds_eopd/ninds_eopd.run_fastqs_to_ubams.log &


In [17]:
%%bash -s "$PROJECT_ID" "$MYUSER" "$COHORT"
#see if there are instances running the job
PROJECT_ID=${1}
MYUSER=${2}
COHORT=${3}

PIPELABEL=fastq_to_ubam

echo "full job worker node count"
gcloud compute instances list --project ${PROJECT_ID} \
    --filter "labels.pipe=${PIPELABEL} labels.cohort=${COHORT} labels.user=${MYUSER}" | grep RUNNING | wc -l
#echo "job managers"
#gcloud compute instances list --project ${PROJECT_ID} \
#    --filter "labels.pipe=${PIPELABEL} labels.cohort=${COHORT} labels.user=${MYUSER}"

full job worker node count
0


Listed 0 items.


In [18]:
%%bash

OPID=EI6syK2uLRjvoZHbvPnIuRgglfil2O0VKg9wcm9kdWN0aW9uUXVldWU
gcloud alpha genomics operations describe ${OPID} \
--format='yaml(done, error, metadata.events)'

done: true
metadata:
  events:
  - description: start
    startTime: '2019-05-23T17:17:17.141792399Z'
  - description: pulling-image
    startTime: '2019-05-23T17:17:17.141868185Z'
  - description: localizing-files
    startTime: '2019-05-23T17:17:53.776297803Z'
  - description: running-docker
    startTime: '2019-05-23T17:17:53.776333263Z'
  - description: delocalizing-files
    startTime: '2019-05-23T22:29:25.124150479Z'
  - description: ok
    startTime: '2019-05-23T22:29:26.216592166Z'


In [19]:
%%bash -s "$WRKDIR" "$PRJ_BUCKET" "$COHORT"
#get a list of files that were successfully created
WRKDIR=${1}
PRJ_BUCKET=${2}
COHORT=${3}

COHORT_BUCKET=${PRJ_BUCKET}/${COHORT}

gsutil -mq ls ${COHORT_BUCKET}/ubams > ${WRKDIR}/${COHORT}.found.files

sed -i s"/gs:\/\/nihnialng-pd-wgs\/${COHORT}\/ubams\///"g ${WRKDIR}/${COHORT}.found.files
sed -i s"/\///"g ${WRKDIR}/${COHORT}.found.files

less ${WRKDIR}/${COHORT}.found.files | wc -l

148


In [20]:
#check for any missing expected bams
expected_file = '{}/{}.samples.list'.format(WRKDIR, COHORT)
observed_file = '{}/{}.found.files'.format(WRKDIR, COHORT) 
missing_file = '{}/{}.missing.samples.list'.format(WRKDIR, COHORT)

expected = pd.read_csv(expected_file,header=None)
observed = pd.read_csv(observed_file,header=None)

print(expected.shape)
print(observed.shape)

print(len(set(expected[0]) - set(observed[0])))

missing = expected.loc[~expected[0].isin(observed[0])]
print(missing.shape)
missing.head()

#save the missing list
missing.to_csv(missing_file,header=None,index=None)

(148, 1)
(148, 1)
0
(0, 1)


## THIS IS WHERE WE STOPPED AFTER FASTQ TO uBAM coversion

#### clean up the temp workspace and logs from the fastq to ubam conversions

In [39]:
print('#run these commands at terminal:\n')
print('#WE\'VE ALREADY RUN THIS SO DO NOT NEED TO CLEANUP AGAIN\n')

print('gsutil -mq rm -r {}/logs/ubams'.format(COHORT_BUCKET))
print('gsutil -mq rm -r {}/workspace'.format(COHORT_BUCKET))

#run these commands at terminal:

#WE'VE ALREADY RUN THIS SO DO NOT NEED TO CLEANUP AGAIN

gsutil -mq rm -r gs://nihnialng-pd-wgs/ninds_eopd/logs/ubams
gsutil -mq rm -r gs://nihnialng-pd-wgs/ninds_eopd/workspace


#### make sure the json dir is there for the next steps

In [29]:
#create the jsons directory for the ubam to cram if it doesn't exist
#also check to see if the blank jsons are present or you need to retrieve
fastq_to_bam_json_dir = '{}/jsons'.format(WRKDIR)

if os.path.isdir(fastq_to_bam_json_dir):
    os.makedirs(fastq_to_bam_json_dir + '/broadbams', exist_ok=True)    
    
checkfortemplatefile(fastq_to_bam_json_dir + '/blank.align.label.json')
checkfortemplatefile(fastq_to_bam_json_dir + '/PairedEndSingleSampleWf.gatk4.0.options.json')
checkfortemplatefile(fastq_to_bam_json_dir + '/template.broadbam.hg38.json')
checkfortemplatefile(fastq_to_bam_json_dir + '/cromwell_client.py')
checkfortemplatefile(fastq_to_bam_json_dir + '/PairedEndSingleSampleWf.gatk4.0.wdl')

In [30]:
print('#run these commands at terminal:\n')

print('gsutil cp gs://nihnialng-pd-wgs/tools/broad/blank.align.label.json {}/jsons/'.format(WRKDIR))
print('gsutil cp gs://nihnialng-pd-wgs/tools/broad/PairedEndSingleSampleWf.gatk4.0.options.json \
{}/jsons/'.format(WRKDIR))
print('gsutil cp gs://nihnialng-pd-wgs/tools/broad/template.broadbam.hg38.json {}/jsons/'.format(WRKDIR))
print('gsutil cp gs://nihnialng-pd-wgs/tools/broad/cromwell_client.py {}/jsons/'.format(WRKDIR))
print('gsutil cp gs://nihnialng-pd-wgs/tools/broad/PairedEndSingleSampleWf.gatk4.0.wdl {}/jsons/'.format(WRKDIR))

#run these commands at terminal:

gsutil cp gs://nihnialng-pd-wgs/tools/broad/blank.align.label.json /labseq/projects/ninds_eopd/jsons/
gsutil cp gs://nihnialng-pd-wgs/tools/broad/PairedEndSingleSampleWf.gatk4.0.options.json /labseq/projects/ninds_eopd/jsons/
gsutil cp gs://nihnialng-pd-wgs/tools/broad/template.broadbam.hg38.json /labseq/projects/ninds_eopd/jsons/
gsutil cp gs://nihnialng-pd-wgs/tools/broad/json.substitute.py /labseq/projects/ninds_eopd/jsons/
gsutil cp gs://nihnialng-pd-wgs/tools/broad/cromwell_client.py /labseq/projects/ninds_eopd/jsons/
gsutil cp gs://nihnialng-pd-wgs/tools/broad/PairedEndSingleSampleWf.gatk4.0.wdl /labseq/projects/ninds_eopd/jsons/


#### create the ubam to cram per sample json files

In [92]:
#generate the extra json files for ubams to crams; ie labels and options for alignment wf
from datetime import datetime

json_label_template = '{}/jsons/blank.align.label.json'.format(WRKDIR)
json_options_template = '{}/jsons/PairedEndSingleSampleWf.gatk4.0.options.json'.format(WRKDIR)
json_broad_template = '{}/jsons/template.broadbam.hg38.json'.format(WRKDIR)

sample_ids = cohort_df['oriID'].unique()
for sample_id in sample_ids:
    json_labels_outfile_name = '{}/jsons/broadbams/{}.labels.json'.format(WRKDIR,sample_id)
    json_options_outfile_name = '{}/jsons/broadbams/{}.options.json'.format(WRKDIR,sample_id)
    json_broad_outfile_name = '{}/jsons/broadbams/{}.broadbam.hg38.json'.format(WRKDIR,sample_id)    

    #format and write the label json
    with open(json_label_template) as json_file:  
        label_data = json.load(json_file)
        
        label_data['cohort'] = COHORT.lower()
        label_data['sample'] = sample_id.lower()
        label_data['user'] = MYUSER.lower()
        
        with open(json_labels_outfile_name,'w') as json_outfile:
            json.dump(label_data,json_outfile,sort_keys=True,indent=4)   
    
    #format and write the options json
    with open(json_options_template) as json_file:  
        options_data = json.load(json_file)
        
        options_data['final_workflow_outputs_dir'] = '{}/hg38/align-wf/{}'.format(COHORT_BUCKET, sample_id)
        options_data['final_workflow_log_dir'] = '{}/logs/{}'.format(COHORT_BUCKET, sample_id)
        options_data['final_call_logs_dir'] = '{}/logs/{}'.format(COHORT_BUCKET, sample_id)
        
        with open(json_options_outfile_name,'w') as json_outfile:
            json.dump(options_data,json_outfile,sort_keys=True,indent=4)   

    #format and write the broad json
    get_ubams_cmd = 'gsutil ls {}/ubams/{}/*.unmapped.bam'.format(COHORT_BUCKET,sample_id)
    ubams = !{get_ubams_cmd}

    with open(json_broad_template) as json_file:  
        broad_data = json.load(json_file)
        
        broad_data['PairedEndSingleSampleWorkflow.sample_name'] = sample_id
        broad_data['PairedEndSingleSampleWorkflow.base_file_name'] = sample_id
        broad_data['PairedEndSingleSampleWorkflow.flowcell_unmapped_bams'] = ubams
        broad_data['PairedEndSingleSampleWorkflow.final_gvcf_base_name'] = sample_id
        
        with open(json_broad_outfile_name,'w') as json_outfile:
            json.dump(broad_data,json_outfile,sort_keys=False,indent=4)   

#### setup the cromwell server

In [36]:
print('#setup a cromwell server for running the alignment wdl jobs\n\
#get Matt\'s cromwell stuff\n\
#already pulled for other projects so copy over from dementia_wgs\n')

print('#run these commands at terminal:\n')
print('#I\'VE ALREADY RUN THIS, SO DO NOT NEED TO CREATE SERVER AGAIN\n')
print('#cp -r ../dementia_wgs/tools/verily-amp-pd-source {}/tools/'.format(WRKDIR))

print('\n#fire up the cromwell instance')
print('chmod +x {}/tools/verily-amp-pd-source/setup_cromwell_vm/*.sh'.format(WRKDIR))
print('cd {}/tools/verily-amp-pd-source/setup_cromwell_vm/'.format(WRKDIR))
print('./create_cromwell_server.sh {}-cromwell {} n1-highmem-8'\
      .format(COHORT,PROJECT_ID))
print('./configure.sh {}-cromwell {} nihnialng-pd-wgs/{}'\
      .format(COHORT,PROJECT_ID,COHORT.replace('_','-')))

print('\n#When that is up, ssh to the instance:')
print('gcloud --project {} compute ssh {}-cromwell'.format(PROJECT_ID,COHORT.replace('_','-')))

print('\n#And in that SSH session, run:')
print('cd /install')
print('docker-compose -f /install/workspace/config/docker-compose.yml up')


#setup a cromwell server for running the alignment wdl jobs
#get Matt's cromwell stuff
#already pulled for other projects so copy over from dementia_wgs

#run these commands at terminal:

#I'VE ALREADY RUN THIS, SO DO NOT NEED TO CREATE SERVER AGAIN

#cp -r ../dementia_wgs/tools/verily-amp-pd-source /labseq/projects/ninds_eopd/tools/

#fire up the cromwell instance
chmod +x /labseq/projects/ninds_eopd/tools/verily-amp-pd-source/setup_cromwell_vm/*.sh
cd /labseq/projects/ninds_eopd/tools/verily-amp-pd-source/setup_cromwell_vm/
./create_cromwell_server.sh ninds_eopd-cromwell pd-genome n1-highmem-8
./configure.sh ninds_eopd-cromwell pd-genome nihnialng-pd-wgs/ninds-eopd

#When that is up, ssh to the instance:
gcloud --project pd-genome compute ssh ninds-eopd-cromwell

#And in that SSH session, run:
cd /install
docker-compose -f /install/workspace/config/docker-compose.yml up


#### create the ssh tunnel to the cromwell server so you can submit jobs

In [43]:
print('#When cromwell is up, create an SSH tunnel from your workstation:')
print('#run these commands at terminal:\n')
print('gcloud --project {} compute ssh {}-cromwell -- -L 8000:localhost:8000'\
      .format(PROJECT_ID,COHORT.replace('_','-')))

print('\n#after this runs you well actually be logged into the cromwell server, ' \
      'so you will need to open another termincal session on your machine to submit your jobs\n')

#When cromwell is up, create an SSH tunnel from your workstation:
#run these commands at terminal:

gcloud --project pd-genome compute ssh ninds-eopd-cromwell -- -L 8000:localhost:8000

#after this runs you well actually be logged into the cromwell server, so you will need to open another termincal session on your machine to submit your jobs



In [34]:
#define format function the ggp cmd
def formatgcpcmd(this_sample,chrt_bucket):
    this_cmd = 'echo -n {SAMPLE} OPID=\n\
python {WRKDIR}/jsons/cromwell_client.py \
--wdl {WRKDIR}/jsons/PairedEndSingleSampleWf.gatk4.0.wdl \
--workflow-inputs {WRKDIR}/jsons/broadbams/{SAMPLE}.broadbam.hg38.json \
--workflow-options {WRKDIR}/jsons/broadbams/{SAMPLE}.options.json \
--workflow-labels {WRKDIR}/jsons/broadbams/{SAMPLE}.labels.json'
    return(this_cmd.format(WRKDIR=WRKDIR,COHORT=COHORT,SAMPLE=this_sample))

#iterate over samples formatting the cmds
cohort_bucket = '{}/{}'.format(PRJ_BUCKET,COHORT)
cmds = [formatgcpcmd(sample_id,cohort_bucket) for sample_id in sample_ids]

temp_script_file = '{}/{}.run_ubams_to_crams.sh'.format(WRKDIR,COHORT.lower())

with open(temp_script_file, 'w') as file_handler:
        for this_cmd in cmds:
            file_handler.write("{}\n".format(this_cmd))
            
print('#run these commands at terminal:\n')
print('chmod +x ' + temp_script_file)
print('nohup ' + temp_script_file + ' > {}/{}.run_ubams_to_crams.log &'.format(WRKDIR,COHORT.lower()))

#run these commands at terminal:

chmod +x /labseq/projects/ninds_eopd/ninds_eopd.run_ubams_to_crams.sh
nohup /labseq/projects/ninds_eopd/ninds_eopd.run_ubams_to_crams.sh > /labseq/projects/ninds_eopd/ninds_eopd.run_ubams_to_crams.log &


In [100]:
#see how many GCE (Google Compute Engine) instances are running your jobs
PIPELABEL='pairedendsinglesamplewf'

print('#full job worker node count')
!gcloud compute instances list --project {PROJECT_ID} \
--filter "labels:({PIPELABEL} {COHORT} {MYUSER})" | grep RUNNING | wc -l

print('#number of all running instances in project')
!gcloud compute instances list --project {PROJECT_ID} | grep RUNNING | wc -l

#full job worker node count
Listed 0 items.
0
#number of all running instances in project
676


#### commands for checking statuses using Cromwell REST cmds

In [50]:
print('#run these commands at terminal:\n')

print('#These actually won\'t be that helpful because you are all using the same server.\n')

print('#When cromwell is up, create an SSH tunnel from your workstation, if not already connected:\n')
print('gcloud --project {} compute ssh {}-cromwell -- -L 8000:localhost:8000'.\
      format(PROJECT_ID,COHORT.replace('_','-')))

print('\n#if tunnel established can check cromwell status\n')
print('curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Running"')
print('curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Submitted"')
print('curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Failed"')
print('curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Succeeded"')

#run these commands at terminal:

#These actually won't be that helpful because you are all using the same server.

#When cromwell is up, create an SSH tunnel from your workstation, if not already connected:

gcloud --project pd-genome compute ssh ninds-eopd-cromwell -- -L 8000:localhost:8000

#if tunnel established can check cromwell status

curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Running"
curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Submitted"
curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Failed"
curl -X GET "http://localhost:8000/api/workflows/v1/query?status=Succeeded"


#### commands to check progess by counting expected output files

In [103]:
#check bam, cram, and gvcf counts
print('#These actually won\'t be that helpful because you are all using the same bucket path.\n')
print('#crams')
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-ConvertToCram/**.cram | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-ConvertToCram/**.crai | wc -l
print('#bams')
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBamFiles/**.bam | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBamFiles/**.bai | wc -l
print('#gvcfs')
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-MergeVCFs/**.g.vcf.gz | wc -l

#These actually won't be that helpful because you are all using the same bucket path.

#crams
118
118
#bams
118
118
#gvcfs
118


#### commands for copying the primary output files to a final dest path

In [57]:
print('#run these commands at terminal:\n')

print('#gvcfs')
print('gsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-MergeVCFs/**.g.vcf.gz* \
{}/hg38/gvcfs/'.format(COHORT_BUCKET,COHORT_BUCKET))
print('\n#crams')
print('gsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-ConvertToCram/**.cram* \
{}/hg38/crams/'.format(COHORT_BUCKET,COHORT_BUCKET))
print('\n#bams')
print('gsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBamFiles/**.bam \
{}/hg38/bams/'.format(COHORT_BUCKET,COHORT_BUCKET))
print('gsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBamFiles/**.bai \
{}/hg38/bams/'.format(COHORT_BUCKET,COHORT_BUCKET))

#run these commands at terminal:

#gvcfs
gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-MergeVCFs/**.g.vcf.gz* gs://nihnialng-pd-wgs/ninds_eopd/hg38/gvcfs/

#crams
gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-ConvertToCram/**.cram* gs://nihnialng-pd-wgs/ninds_eopd/hg38/crams/

#bams
gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBamFiles/**.bam gs://nihnialng-pd-wgs/ninds_eopd/hg38/bams/
gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBamFiles/**.bai gs://nihnialng-pd-wgs/ninds_eopd/hg38/bams/


#### confirm counts of the primary output files at final dest path

In [None]:
#check bam, cram, and gvcf counts at final path
print('#crams')
!gsutil ls {COHORT_BUCKET}/hg38/crams/*.cram | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/crams/*.crai | wc -l
print('#bams')
!gsutil ls {COHORT_BUCKET}/hg38/bams/*.bam | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/bams/*.bai | wc -l
print('#gvcfs')
!gsutil ls {COHORT_BUCKET}/hg38/gvcfs/*.g.vcf.gz | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/gvcfs/*.g.vcf.gz.tbi | wc -l

#### check that all expected files are there by looking at sample IDs

In [None]:
%%bash -s "$WRKDIR" "$COHORT_BUCKET" "$COHORT"
#get a list of files that were successfully created
WRKDIR=${1}
COHORT_BUCKET=${2}
COHORT=${3}

gsutil -mq ls ${COHORT_BUCKET}/hg38/crams/*.cram > ${WRKDIR}/${COHORT}.found.files

sed -i s"/gs:\/\/nihnialng-pd-wgs\/${COHORT}\/hg38\/crams\///"g ${WRKDIR}/${COHORT}.found.files
sed -i s"/\///"g ${WRKDIR}/${COHORT}.found.files
sed -i s"/\.cram//"g ${WRKDIR}/${COHORT}.found.files

less ${WRKDIR}/${COHORT}.found.files | wc -l

In [None]:
#check for any missing expected bams
expected_file = '{}/{}.samples.list'.format(WRKDIR, COHORT)
observed_file = '{}/{}.found.files'.format(WRKDIR, COHORT) 
missing_file = '{}/{}.missing.samples.list'.format(WRKDIR, COHORT)

expected = pd.read_csv(expected_file,header=None)
observed = pd.read_csv(observed_file,header=None)

print(expected.shape)
print(observed.shape)

print(len(set(expected[0]) - set(observed[0])))

missing = expected.loc[~expected[0].isin(observed[0])]
print(missing.shape)
missing.head()

#save the missing list
missing.to_csv(missing_file,header=None,index=None)

#### if all the expected primary output files are present then go ahead and copy of the other output files

In [68]:
#move the other metrics reports that may be of interest to keep
#need to move all the other summary metric files over to final output
print('#run these commands at terminal:\n')
print('gsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-ValidateCram/**.cram.validation_report ' \
'{}/hg38/align-wf/call-ValidateCram/'.format(COHORT_BUCKET,COHORT_BUCKET))
print('\ngsutil -mq cp ${}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-CheckContamination/**.preBqsr.selfSM ' \
'{}/hg38/align-wf/call-CheckContamination/'.format(COHORT_BUCKET,COHORT_BUCKET))
print('\ngsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBqsrReports/**.recal_data.csv ' \
'{}/hg38/align-wf/call-GatherBqsrReports/'.format(COHORT_BUCKET,COHORT_BUCKET))
print('\ngsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/**_metrics {}/hg38/align-wf/metrics/'.\
      format(COHORT_BUCKET,COHORT_BUCKET))
print('\ngsutil -mq cp {}/hg38/align-wf/*/PairedEndSingleSampleWorkflow/**.pdf {}/hg38/align-wf/metrics/'.\
      format(COHORT_BUCKET,COHORT_BUCKET))

#run these commands at terminal:

gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-ValidateCram/**.cram.validation_report gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/call-ValidateCram/

gsutil -mq cp $gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-CheckContamination/**.preBqsr.selfSM gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/call-CheckContamination/

gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/*/call-GatherBqsrReports/**.recal_data.csv gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/call-GatherBqsrReports/

gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/**_metrics gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/metrics/

gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/*/PairedEndSingleSampleWorkflow/**.pdf gs://nihnialng-pd-wgs/ninds_eopd/hg38/align-wf/metrics/


#### check file counts at destination for primary and other metrics output files

In [5]:
#check file counts
!gsutil ls {COHORT_BUCKET}/hg38/gvcfs/*.vcf.gz | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/gvcfs/*.vcf.gz.tbi | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/crams/*.cram | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/crams/*.crai | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/bams/*.bam | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/bams/*.bai | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/call-ValidateCram/*.cram.validation_report | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/call-CheckContamination/*.preBqsr.selfSM | wc -l
!gsutil ls {COHORT_BUCKET}/hg38/align-wf/call-GatherBqsrReports/*.recal_data.csv | wc -l

148
148
148
148
148
148
148
148
148


#### check the cram validation reports

In [6]:
#pull down the validation reports and check for erros
!mkdir -p {WRKDIR}/temp

!gsutil -mq cp {COHORT_BUCKET}/hg38/align-wf/call-ValidateCram/*.cram.validation_report {WRKDIR}/temp/

!ls {WRKDIR}/temp/*.validation_report | wc -l
!less {WRKDIR}/temp/*.validation_report | grep "No errors found" | wc -l

#clean up the temp validation reports
!rm {WRKDIR}/temp/*.validation_report


148
148


# This is where we stopped after alignments

#### do some additional cleanup, I've already run these deletes

including the deletion of the ubams

In [8]:
#if successfully, clean up log files and workspace path
!echo gsutil -mq rm -r {COHORT_BUCKET}/logs
!echo gsutil -mq rm -r {COHORT_BUCKET}/cromwell-execution/PairedEndSingleSampleWorkflow

gsutil -mq rm -r gs://nihnialng-pd-wgs/ninds_eopd/logs
gsutil -mq rm -r gs://nihnialng-pd-wgs/ninds_eopd/cromwell-execution/PairedEndSingleSampleWorkflow


In [52]:
#define format function the gcp cmd to delete additional per sample un-needed files
def formatgcpcmd(this_sample,chrt_bucket):
    this_cmd = 'gsutil -mq rm -r {COHORT_BUCKET}/hg38/align-wf/{SAMPLE}/PairedEndSingleSampleWorkflow'
    return(this_cmd.format(COHORT_BUCKET=chrt_bucket,SAMPLE=this_sample))

#here reloading sample_ids
cohort_file_list = '{}/{}.samples.list'.format(WRKDIR,COHORT)
sample_ids = pd.read_csv(cohort_file_list,header=None).to_numpy()
sample_ids = sample_ids.reshape(len(sample_ids))

#iterate over samples formatting the cmds
cmds = [formatgcpcmd(sample_id,COHORT_BUCKET) for sample_id in sample_ids]

temp_script_file = '{}/{}.delete_other_files.sh'.format(WRKDIR,COHORT.lower())

with open(temp_script_file, 'w') as file_handler:
        for this_cmd in cmds:
            file_handler.write("{}\n".format(this_cmd))
            
print('#run these commands at terminal:\n')
print('#I\'VE ALREADY RUN THIS SO DO NOT NEED TO DELETE AGAIN\n')
print('chmod +x ' + temp_script_file)
print('nohup ' + temp_script_file + ' > {}/{}.delete_other_files.log &'.format(WRKDIR,COHORT.lower()))

#run these commands at terminal:

#I'VE ALREADY RUN THIS SO DO NOT NEED TO DELETE AGAIN

chmod +x /labseq/projects/ninds_eopd/ninds_eopd.delete_other_files.sh
nohup /labseq/projects/ninds_eopd/ninds_eopd.delete_other_files.sh > /labseq/projects/ninds_eopd/ninds_eopd.delete_other_files.log &


In [126]:
#define format function the ggp cmd to delete ubams
#deleting all the ubams (were just intermediate format)
def formatgcpcmd(this_sample,chrt_bucket):
    this_cmd = 'gsutil -mq rm -r {COHORT_BUCKET}/ubams/{SAMPLE}'
    return(this_cmd.format(COHORT_BUCKET=chrt_bucket,SAMPLE=this_sample))

#iterate over samples formatting the cmds
cmds = [formatgcpcmd(sample_id,COHORT_BUCKET) for sample_id in sample_ids]

temp_script_file = '{}/{}.delete_ubams.sh'.format(WRKDIR,COHORT.lower())

with open(temp_script_file, 'w') as file_handler:
        for this_cmd in cmds:
            file_handler.write("{}\n".format(this_cmd))
            
print('#run these commands at terminal:\n')
print('#I\'VE ALREADY RUN THIS SO DO NOT NEED TO DELETE AGAIN\n')
print('chmod +x ' + temp_script_file)
print('nohup ' + temp_script_file + ' > {}/{}.delete_ubams.log &'.format(WRKDIR,COHORT.lower()))

#run these commands at terminal:

#I'VE ALREADY RUN THIS SO DO NOT NEED TO DELETE AGAIN

chmod +x /labseq/projects/ninds_eopd/ninds_eopd.delete_ubams.sh
nohup /labseq/projects/ninds_eopd/ninds_eopd.delete_ubams.sh > /labseq/projects/ninds_eopd/ninds_eopd.delete_ubams.log &


#### archive the fastqs to nearline storage for some periond of time

In [81]:
#move fastqs to archvie bucket (nearline)
#nearline, coldline -> father away storage bucket, cheaper 
fastq_bucket_path = '{}/{}/fastqs/*.fastq.gz'.format(PRJ_BUCKET,COHORT)
archive_bucket_path = 'gs://nihnialng-pd-archive/fastqs/usuhs/{}/'.\
format(COHORTBUILD.replace('.', '_'))

print('#run these commands at terminal:\n')
print('#I\'VE ALREADY RUN THIS SO DO NOT NEED TO MOVE AGAIN\n')
print('nohup gsutil -mq cp {} {} &'.format(fastq_bucket_path, archive_bucket_path))

#run these commands at terminal:

#I'VE ALREADY RUN THIS SO DO NOT NEED TO MOVE AGAIN

nohup gsutil -mq cp gs://nihnialng-pd-wgs/ninds_eopd/fastqs/*.fastq.gz gs://nihnialng-pd-archive/fastqs/usuhs/ninds_eopd_july2019/ &


In [130]:
#check that all the files were moved, although should be successful if no error was reported
#should match the original counts from early fastq counting cell
!gsutil ls {archive_bucket_path} | wc -l

#get storage size
!gsutil -mq du -hs {archive_bucket_path}

1184
10.13 TiB    gs://nihnialng-pd-archive/fastqs/usuhs/ninds_eopd_july2019


#### if everything is ok with archiving of fastqs, delete original staging bucket fastqs

In [82]:
#delete the staging bucket fastqs that USUHS uploaded
print('#run these commands at terminal:\n')
print('#I\'VE ALREADY RUN THIS SO DO NOT NEED TO DELETE AGAIN\n')
print('gsutil -mq rm gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/*.fastq.gz')
print('gsutil -mq rm gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.01/*.fastq.gz')
print('gsutil -mq rm {}/fastqs/*.fastq.gz'.format(COHORT_BUCKET))

#run these commands at terminal:

#I'VE ALREADY RUN THIS SO DO NOT NEED TO DELETE AGAIN

gsutil -mq rm gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.00/*.fastq.gz
gsutil -mq rm gs://nihnialng-staging-f745d15a/pA3.hernandez.N148.01/*.fastq.gz
gsutil -mq rm gs://nihnialng-pd-wgs/ninds_eopd/fastqs/*.fastq.gz


#### check for any contaminated samples
pull down seq contamination check info

In [10]:
#pull down the verifybamid check contamination files 
!mkdir -p {WRKDIR}/seqqc

!gsutil -mq cp {COHORT_BUCKET}/hg38/align-wf/call-CheckContamination/*.preBqsr.selfSM {WRKDIR}/seqqc/
!ls {WRKDIR}/seqqc/*.preBqsr.selfSM | wc -l

148


In [11]:
%%bash -s "$WRKDIR" "$COHORTBUILD" "$COHORT"
#combine the contamination files into single table

WRKDIR=${1}
COHORTBUILD=${2}
COHORT=${3}

SAMPLESLISTFILE="${WRKDIR}/${COHORT}.samples.list"
METRICSFILE="${WRKDIR}/${COHORTBUILD}.contam.metrics.txt"

if [ -s ${METRICSFILE} ]; then
rm ${METRICSFILE}
fi
echo "id avgdp freemix" > ${METRICSFILE}

ls ${WRKDIR}/seqqc/*.preBqsr.selfSM | xargs -l basename > ${WRKDIR}/selfsm.file.list
sed -i s"/\.preBqsr\.selfSM//"g ${WRKDIR}/selfsm.file.list

while read SAMPLELINE
do
SAMPLE=$(echo ${SAMPLELINE} | awk '{print $1}')
awk -v SAMPLE=${SAMPLE} '$1 == SAMPLE {print $1,$6,$7}' \
${WRKDIR}/seqqc/${SAMPLE}.preBqsr.selfSM >> ${METRICSFILE}
done < ${WRKDIR}/selfsm.file.list

rm ${WRKDIR}/selfsm.file.list

#### check the contamination reports for problems

In [13]:
#check the contamination and seq coverage rates
#reading report persample, lops over file loops into single table
#lower than expected coverage, 
#2% and 4.9% = still useable?
contam_metrics_file = '{}/{}.contam.metrics.txt'.format(WRKDIR,COHORTBUILD)

WARN_FREEMIX = 0.02
MAX_FREEMIX = 0.049
MIN_DEPTH = 27

metrics_df = pd.read_csv(contam_metrics_file,sep='\s+')
print(metrics_df.shape)
metrics_df.head()

maybe_contamin_df = metrics_df.loc[(metrics_df['freemix'] <= MAX_FREEMIX) & \
                                   (metrics_df['freemix'] > WARN_FREEMIX)]
print('number of samples maybe contaminated with freemix > {} and < {} is {}'\
      .format(WARN_FREEMIX,MAX_FREEMIX,maybe_contamin_df.shape[0]))
contam_maybe_file = '{}/{}.contamination.possible.samples.txt'.format(WRKDIR,COHORTBUILD)
maybe_contamin_df.to_csv(contam_maybe_file,index=False,sep='\t')

contaminated_df = metrics_df.loc[metrics_df['freemix'] > MAX_FREEMIX]
print('number of samples with freemix > {} is {}'.format(MAX_FREEMIX,contaminated_df.shape[0]))
contam_problems_file = '{}/{}.contaminated.samples.txt'.format(WRKDIR,COHORTBUILD)
contaminated_df.to_csv(contam_problems_file,index=False,sep='\t')

low_cov_df = metrics_df.loc[metrics_df['avgdp'] < MIN_DEPTH]
print('number of samples with avgdp < {} is {}'.format(MIN_DEPTH,low_cov_df.shape[0]))
low_coverage_file = '{}/{}.low_coverage.samples.txt'.format(WRKDIR,COHORTBUILD)
low_cov_df.to_csv(low_coverage_file,index=False,sep='\t')


(148, 3)
number of samples maybe contaminated with freemix > 0.02 and < 0.049 is 0
number of samples with freemix > 0.049 is 0
number of samples with avgdp < 27 is 0


#### good news there aren't any samples with contamination or low coverage

if there had been, you would want to just exclude the ones with freemix > 5% and decide on excluding the possibly contaminated or low coverage

#### now prep for running the joint genotyping

#### pull down the broad tooling

In [94]:
#pull down the correct recent Broad tooling
!mkdir -p {WRKDIR}/tools

#don't think wget is easy to install for Mac, maybe use curl instead
# !wget https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/master/joint-discovery-gatk4.wdl \
#     -O {WRKDIR}/tools/joint-discovery-gatk4.wdl
# !wget https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/master/joint-discovery-gatk4.hg38.wgs.inputs.json \
#     -O {WRKDIR}/tools/joint-discovery-gatk4.hg38.wgs.inputs.json

!curl https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/master/joint-discovery-gatk4.wdl \
    -o {WRKDIR}/tools/joint-discovery-gatk4.wdl
!curl https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/master/joint-discovery-gatk4.hg38.wgs.inputs.json \
    -o {WRKDIR}/tools/joint-discovery-gatk4.hg38.wgs.inputs.json
    
!curl https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/master/generic.google-papi.options.json \
    -o {WRKDIR}/tools/generic.google-papi.options.json    

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 33879  100 33879    0     0   827k      0 --:--:-- --:--:-- --:--:--  827k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5493  100  5493    0     0   137k      0 --:--:-- --:--:-- --:--:--  137k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   134  100   134    0     0   1353      0 --:--:-- --:--:-- --:--:--  1353


#### update joint calling jsons

In [114]:
#update some of the json configs for broad joint calling
#generate the extra json files for ubams to crams; ie labels and options for alignment wf
#formats set of json files that formats files were running

generic_options_template = '{}/tools/generic.google-papi.options.json'.format(WRKDIR)
wdl_input_template = '{}/tools/joint-discovery-gatk4.hg38.wgs.inputs.json'.format(WRKDIR)

json_labels_outfile_name = '{}/jsons/{}.jdgatk4.labels.json'.format(WRKDIR,COHORT)
json_options_outfile_name = '{}/jsons/{}.jdgatk4.options.json'.format(WRKDIR,COHORT)
json_broad_outfile_name = '{}/jsons/{}.jdgatk4.hg38.wgs.inputs.json'.format(WRKDIR,COHORT)    

#format and write the label json
label_data = {}
label_data['workflow'] = 'jdgatk4'
label_data['cohort'] = COHORT.lower()
label_data['user'] = MYUSER.lower()

with open(json_labels_outfile_name,'w') as json_outfile:
    json.dump(label_data,json_outfile,sort_keys=True,indent=4)   

#format and write the options json
with open(generic_options_template) as json_file:  
    options_data = json.load(json_file)
    
    options_data['read_from_cache'] = True
    options_data['write_to_cache'] = True
    options_data['workflow_failure_mode'] = 'ContinueWhilePossible'
    options_data['system.input-read-limits.lines'] = 640000    
    options_data['final_workflow_outputs_dir'] = '{}/hg38/joint_calling'.format(COHORT_BUCKET)
    options_data['final_workflow_log_dir'] = '{}/logs/joint_calling'.format(COHORT_BUCKET)
    options_data['final_call_logs_dir'] = '{}/logs/joint_calling'.format(COHORT_BUCKET)

    with open(json_options_outfile_name,'w') as json_outfile:
        json.dump(options_data,json_outfile,sort_keys=True,indent=4)   

#format and write the broad json
with open(wdl_input_template) as json_file:  
    broad_data = json.load(json_file)
    
    sample_gvcfs_path = '{}/{}.sample.gvcfs.map.txt'.format(COHORT_BUCKET,COHORTBUILD)
    broad_data['JointGenotyping.callset_name'] = COHORTBUILD
    broad_data['JointGenotyping.sample_name_map'] = sample_gvcfs_path

    with open(json_broad_outfile_name,'w') as json_outfile:
        json.dump(broad_data,json_outfile,sort_keys=False,indent=4)   


In [116]:
%%bash -s "$WRKDIR" "$COHORT_BUCKET" "$COHORT"
#get a list of files that were successfully created
WRKDIR=${1}
COHORT_BUCKET=${2}
COHORT=${3}

gsutil -mq ls ${COHORT_BUCKET}/hg38/crams/*.cram > ${WRKDIR}/${COHORT}.found.files

sed -i s"/gs:\/\/nihnialng-pd-wgs\/${COHORT}\/hg38\/crams\///"g ${WRKDIR}/${COHORT}.found.files
sed -i s"/\///"g ${WRKDIR}/${COHORT}.found.files
sed -i s"/\.cram//"g ${WRKDIR}/${COHORT}.found.files

less ${WRKDIR}/${COHORT}.found.files | wc -l

148


In [121]:
#need to create the gvcfs sample map file used as json argument above JointGenotyping.sample_name_map
#instead of just looping expected ids do so from found list in previous cell, allow to move/remove fails

found_file_list = '{}/{}.found.files'.format(WRKDIR,COHORT)
found_ids = pd.read_csv(found_file_list,header=None).to_numpy()
found_ids = sample_ids.reshape(len(found_ids))

gvcfs_map_file = '{}/{}.sample.gvcfs.map.txt'.format(WRKDIR,COHORTBUILD)

with open(gvcfs_map_file, 'w') as file_handler:
    for sample_id in found_ids:
        file_handler.write("{}\t{}/hg38/gvcfs/{}.g.vcf.gz\n".\
                           format(sample_id,COHORT_BUCKET,sample_id))

#now copy the map file up to cloud bucket
!gsutil -mq cp {gvcfs_map_file} {COHORT_BUCKET}/


#### now run the joint calling job

In [43]:
print('#When cromwell is up, create an SSH tunnel from your workstation:')
print('#run these commands at terminal:\n')
print('gcloud --project {} compute ssh {}-cromwell -- -L 8000:localhost:8000'\
      .format(PROJECT_ID,COHORT.replace('_','-')))

print('\n#after this runs you well actually be logged into the cromwell server, ' \
      'so you will need to open another termincal session on your machine to submit your jobs\n')

#When cromwell is up, create an SSH tunnel from your workstation:
#run these commands at terminal:

gcloud --project pd-genome compute ssh ninds-eopd-cromwell -- -L 8000:localhost:8000

#after this runs you well actually be logged into the cromwell server, so you will need to open another termincal session on your machine to submit your jobs



In [125]:
#now run the joint calling job, submit job

run_cmd = 'python {wrk_dir}/jsons/cromwell_client.py \
--wdl {wrk_dir}/tools/oint-discovery-gatk4.wdl \
--workflow-inputs {wrk_dir}/jsons/{this_cohort}.jdgatk4.hg38.wgs.inputs.json \
--workflow-options {wrk_dir}/jsons/{this_cohort}.jdgatk4.options.json \
--workflow-labels {wrk_dir}/jsons/{this_cohort}.jdgatk4.labels.json'.\
format(wrk_dir=WRKDIR, this_cohort=COHORT)

print('#run these commands at terminal:\n')

print('{} > {}/{}.jdgatk4.jobid'.format(run_cmd, WRKDIR, COHORTBUILD))

#run these commands at terminal:

python /labseq/projects/ninds_eopd/jsons/cromwell_client.py --wdl /labseq/projects/ninds_eopd/tools/oint-discovery-gatk4.wdl --workflow-inputs /labseq/projects/ninds_eopd/jsons/ninds_eopd.jdgatk4.hg38.wgs.inputs.json --workflow-options /labseq/projects/ninds_eopd/jsons/ninds_eopd.jdgatk4.options.json --workflow-labels /labseq/projects/ninds_eopd/jsons/ninds_eopd.jdgatk4.labels.json > /labseq/projects/ninds_eopd/ninds_eopd.july2019.jdgatk4.jobid


# We are stopping here until next meeting

In [99]:
#if done with the cromwell server delete it

!echo gcloud compute instances delete {COHORT.replace('_','-')}-cromwell \
--project {PROJECT_ID} --zone us-central1-f

gcloud compute instances delete ninds-eopd-cromwell --project pd-genome --zone us-central1-f
