## Notebook for setting up resources to quantify RNA with Salmon on GCP

https://salmon.readthedocs.io/en/latest/salmon.html


#### Jan 07, 2020

In [83]:
# Notebook-wide settings shared by the cells below.

# GCP: project the pipeline jobs are submitted to, and the project bucket
# under which references, logs and results are stored.
project_id = "nih-nia-lng-cbg"
prj_bucket = "gs://nihnialngcbg-eqtl"

# user name attached as a label to submitted jobs
my_user = "gibbsr"


#### pull Gencode reference transcripts for indexing

In [68]:
# pull Gencode reference transcripts for indexing
#setup the local reference files on labseq for gencode_37 from GRCh38/hg38
gencode_lastest = '32'
gencode_src_path = f'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human\
/release_{gencode_lastest}'
gencode_gtf = f'gencode.v{gencode_lastest}.annotation.gtf.gz'
gencode_fa = f'gencode.v{gencode_lastest}.transcripts.fa.gz'
gencode_local_dir = f'/labseq/Genomes/GRCh38_hg38/gencode_{gencode_lastest}'

#make sure local path is created
!mkdir -p {gencode_local_dir}

#pull the gencode reference
!curl --silent -L {gencode_src_path}/{gencode_gtf} \
--output {gencode_local_dir}/{gencode_gtf}

!curl --silent -L {gencode_src_path}/{gencode_fa} \
--output {gencode_local_dir}/{gencode_fa}


#### prep reference transcriptome as needed

In [94]:
# for Gencode
# exclude transcripts with low level support, ie only keep 'goodish' \
# exclude Ensembl TSL levels 4 or 5 and Gencode level 3

# need to use salmon decoy prep script (?)

!gunzip {gencode_local_dir}/{gencode_fa}

In [70]:
# for CHESS
# need to create transcript fasta from GTF and reference genome
# need to modify the fasta entry labels to include transcript ID, gene ID and gene name

# need to use salmon decoy prep script (?)


In [97]:
#### push prepped transcriptome(s) to cloud
local_transcriptome_fasta = f'{gencode_local_dir}/{gencode_fa}'.replace('.gz','')
gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/{gencode_fa}'.replace('.gz','')

gcloud_cmd = f'gsutil -mq cp {local_transcriptome_fasta} {gcs_transcriptome_fasta}'

print(gcloud_cmd)

!{gcloud_cmd}

gsutil -mq cp /labseq/Genomes/GRCh38_hg38/gencode_32/gencode.v32.transcripts.fa gs://nihnialngcbg-eqtl/resources/references/gencode.v32.transcripts.fa


#### create salmon index for the reference transcriptome

In [98]:
# use a GCP lifesciences job to generate the salmon index(?)
docker_img = 'us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1'

# gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/salmon/gencode_r32/\
# nohup.filtered.gencode.v32.transcripts.fa'

gcp_lsci_pipe_cmd = 'mkdir -p /gcloud-shared/reference; \
salmon --no-version-check index -t ${REF_TX_FASTA} -i /gcloud-shared/reference \
-k 31 --gencode --threads $(nproc); \
gsutil -mq cp -r /gcloud-shared/reference ${GS_SALMON_INDEX_OUT}; \
ls -lhR /gcloud-shared'

gcp_lsci_pipe_vars = f'GS_SALMON_INDEX_OUT={prj_bucket}/resources/references/\
salmon/gencode_{gencode_lastest}'

cloud_cmd = f'gcloud beta lifesciences pipelines run \
--project {project_id} \
--docker-image {docker_img} \
--machine-type n1-standard-4 \
--logging {prj_bucket}/resources/references/logs/salmon_index_gencode_{gencode_lastest}.log \
--command-line \'{gcp_lsci_pipe_cmd}\' \
--inputs REF_TX_FASTA={gcs_transcriptome_fasta} \
--env-vars {gcp_lsci_pipe_vars} \
--labels=pipe=salmonindex,reference=gencode,user={my_user} \
--preemptible'

print(f'{cloud_cmd}\n')

!{cloud_cmd}

gcloud beta lifesciences pipelines run --project nih-nia-lng-cbg --docker-image us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1 --machine-type n1-standard-4 --logging gs://nihnialngcbg-eqtl/resources/references/logs/salmon_index_gencode_32.log --command-line 'mkdir -p /gcloud-shared/reference; salmon --no-version-check index -t ${REF_TX_FASTA} -i /gcloud-shared/reference -k 31 --gencode --threads $(nproc); gsutil -mq cp -r /gcloud-shared/reference ${GS_SALMON_INDEX_OUT}; ls -lhR /gcloud-shared' --inputs REF_TX_FASTA=gs://nihnialngcbg-eqtl/resources/references/gencode.v32.transcripts.fa --env-vars GS_SALMON_INDEX_OUT=gs://nihnialngcbg-eqtl/resources/references/salmon/gencode_32 --labels=pipe=salmonindex,reference=gencode,user=gibbsr --preemptible

Running [projects/463418702553/locations/us-central1/operations/7476798936617483009].


In [100]:
#check job status, id OPID above
op_id = 7476798936617483009

!gcloud beta lifesciences operations describe {op_id}

done: true
metadata:
  '@type': type.googleapis.com/google.cloud.lifesciences.v2beta.Metadata
  createTime: '2020-01-09T21:39:49.740754Z'
  endTime: '2020-01-09T21:52:40.707318167Z'
  events:
  - description: Worker released
    timestamp: '2020-01-09T21:52:40.707318167Z'
    workerReleased:
      instance: google-pipelines-worker-b117596df4631df34df9ffb03442457f
      zone: us-central1-f
  - containerStopped:
      actionId: 3
    description: Stopped running "/bin/sh -c gsutil -m -q cp /google/logs/output gs://nihnialngcbg-eqtl/resources/references/logs/salmon_index_gencode_32.log"
    timestamp: '2020-01-09T21:52:39.596493069Z'
  - containerStarted:
      actionId: 3
    description: Started running "/bin/sh -c gsutil -m -q cp /google/logs/output gs://nihnialngcbg-eqtl/resources/references/logs/salmon_index_gencode_32.log"
    timestamp: '2020-01-09T21:52:32.275574630Z'
  - containerStopped:
      actionId: 2
      stderr: |
[Building BooPHF]  99.8 %   elapsed:   0 min 14 sec   rema

#### run a test of quantifying an RNA sample

In [104]:
# run a test of quantifying an RNA sample
in_bucket = 'gs://nihnialngcbg-eqtl/rosmap'
out_bucket = 'gs://nihnialngcbg-eqtl/rosmap_test'
sample = '102_120418'

gcp_lsci_pipe_cmd = 'mkdir -p /gcloud-shared/reference; \
gsutil -mq cp ${SALMONTRANSCRIPTMODELS}/* /gcloud-shared/reference; \
salmon --no-version-check quant -i /gcloud-shared/reference -l A -1 ${FASTQR1} -2 ${FASTQR2} \
--validateMappings --seqBias --gcBias --posBias --threads $(nproc) \
-o /gcloud-shared/sample; \
gsutil -mq cp -r /gcloud-shared/sample ${GS_OUTPATH}; \
ls -lhR /gcloud-shared'

gcp_lsci_pipe_vars = f'SALMONTRANSCRIPTMODELS={prj_bucket}/resources/references/\
salmon/gencode_{gencode_lastest},\
GS_OUTPATH={out_bucket}/quants/gencode_r32/{sample},\
FASTQR1=/gcloud-shared/inputR1.gz,\
FASTQR2=/gcloud-shared/inputR2.gz'

cloud_cmd = f'gcloud beta lifesciences pipelines run \
--project {project_id} \
--docker-image {docker_img} \
--machine-type n1-standard-4 \
--logging {out_bucket}/logs/salmon/{sample} \
--command-line \'{gcp_lsci_pipe_cmd}\' \
--inputs FASTQR1={in_bucket}/fastqs/{sample}_R1.fastq.gz \
--inputs FASTQR2={in_bucket}/fastqs/{sample}_R2.fastq.gz \
--env-vars {gcp_lsci_pipe_vars} \
--labels=pipe=salmonquant,sample={sample},user={my_user} '

# ' \
# --preemptible'

print(f'{cloud_cmd}\n')

!{cloud_cmd}

gcloud beta lifesciences pipelines run --project nih-nia-lng-cbg --docker-image us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1 --machine-type n1-standard-4 --logging gs://nihnialngcbg-eqtl/rosmap_test/logs/salmon/102_120418 --command-line 'mkdir -p /gcloud-shared/reference; gsutil -mq cp ${SALMONTRANSCRIPTMODELS}/* /gcloud-shared/reference; salmon --no-version-check quant -i /gcloud-shared/reference -l A -1 ${FASTQR1} -2 ${FASTQR2} --validateMappings --seqBias --gcBias --posBias --threads $(nproc) -o /gcloud-shared/sample; gsutil -mq cp -r /gcloud-shared/sample ${GS_OUTPATH}; ls -lhR /gcloud-shared' --inputs FASTQR1=gs://nihnialngcbg-eqtl/rosmap/fastqs/102_120418_R1.fastq.gz --inputs FASTQR2=gs://nihnialngcbg-eqtl/rosmap/fastqs/102_120418_R2.fastq.gz --env-vars SALMONTRANSCRIPTMODELS=gs://nihnialngcbg-eqtl/resources/references/salmon/gencode_32,GS_OUTPATH=gs://nihnialngcbg-eqtl/rosmap_test/quants/gencode_r32/102_120418,FASTQR1=/gcloud-shared/inputR1.gz,FASTQR2=/gcloud-shared/inpu

In [109]:
#check job status, id OPID above
op_id = 8359902396683357875

!gcloud beta lifesciences operations describe {op_id}

done: true
metadata:
  '@type': type.googleapis.com/google.cloud.lifesciences.v2beta.Metadata
  createTime: '2020-01-09T22:54:33.648924Z'
  endTime: '2020-01-09T23:46:42.441497100Z'
  events:
  - description: Worker released
    timestamp: '2020-01-09T23:46:42.441497100Z'
    workerReleased:
      instance: google-pipelines-worker-ef96c2fdee115da0ee33424069362bfa
      zone: us-central1-f
  - containerStopped:
      actionId: 4
    description: Stopped running "/bin/sh -c gsutil -m -q cp /google/logs/output gs://nihnialngcbg-eqtl/rosmap_test/logs/salmon/102_120418"
    timestamp: '2020-01-09T23:46:41.624120594Z'
  - containerStarted:
      actionId: 4
    description: Started running "/bin/sh -c gsutil -m -q cp /google/logs/output gs://nihnialngcbg-eqtl/rosmap_test/logs/salmon/102_120418"
    timestamp: '2020-01-09T23:46:34.628396745Z'
  - containerStopped:
      actionId: 3
      stderr: |+
        rel diff. = 0.163479
        [2020-01-09 23:43:26.470] [jointLog] [info] iteration = 70