# Setting up resources to quantify transcripts with Salmon on GCP

### PART1: generate salmon index
https://salmon.readthedocs.io/en/latest/salmon.html


#### Jan 07, 2020

In [2]:
# Notebook-wide settings shared by the cells below.
# GCP project the jobs run in, and the bucket holding references and logs.
project_id = "nih-nia-lng-cbg"
prj_bucket = "gs://nihnialngcbg-eqtl"
# User name attached as a label to submitted GCP jobs.
my_user = "mooreank"


#### pull chess reference transcripts for indexing

In [66]:
# CHESS reference files to pull for indexing: the annotation GTF and
# (presumably) the genome FASTA -- TODO confirm what hg38_p8.fa.gz contains.
# NOTE(review): `gencode_lastest` is only assigned in the GENCODE cell further
# down, so on a fresh kernel this cell raises NameError unless that cell has
# already been run.
# NOTE(review): the backslash continuation joins the two pieces into
# 'http://ccb.jhu.edu/chess/' + '/release_<N>' (a doubled slash), and the number
# substituted is the GENCODE release -- TODO confirm the real CHESS download URL.
chess_src_path = f'http://ccb.jhu.edu/chess/\
/release_{gencode_lastest}'
chess_gtf = f'chess2.2_and_refseq.gtf.gz'
chess_fa = f'hg38_p8.fa.gz'
#gencode_local_dir = f'/labseq/Genomes/GRCh38_hg38/gencode_{gencode_lastest}'
# Local directory the CHESS files are downloaded into.
chess_local_dir = f'/Users/mooreank/Desktop/Raph/requant/chess'


#### pull Gencode reference transcripts for indexing

In [66]:
# GENCODE reference files to pull for indexing.
# The release number below drives the download URL and both file names.
gencode_lastest = '32'
gencode_src_path = (
    'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human'
    + '/release_' + gencode_lastest
)
gencode_gtf = 'gencode.v{}.annotation.gtf.gz'.format(gencode_lastest)
gencode_fa = 'gencode.v{}.transcripts.fa.gz'.format(gencode_lastest)
# Alternative location on labseq, kept for reference:
#gencode_local_dir = f'/labseq/Genomes/GRCh38_hg38/gencode_{gencode_lastest}'
# Local directory the GENCODE files are downloaded into.
gencode_local_dir = '/Users/mooreank/Desktop/Raph/requant/gencode'


In [None]:
#make sure local path is created
!mkdir -p {chess_local_dir}

#pull the gencode reference
!curl --silent -L {chess_src_path}/{chess_gtf} \
--output {chess_local_dir}/{chess_gtf}

!curl --silent -L {chess_src_path}/{chess_fa} \
--output {chess_local_dir}/{chess_fa}

In [None]:
#make sure local path is created
!mkdir -p {gencode_local_dir}

#pull the gencode reference
!curl --silent -L {gencode_src_path}/{gencode_gtf} \
--output {gencode_local_dir}/{gencode_gtf}

!curl --silent -L {gencode_src_path}/{gencode_fa} \
--output {gencode_local_dir}/{gencode_fa}

#### prep reference transcriptome as needed

In [94]:
# need to use salmon decoy prep script (?)

!gunzip {chess_local_dir}/{chess_fa}

In [70]:
# for CHESS
# The transcript FASTA itself is created outside this cell from the GTF and the
# reference genome. This cell relabels its records with
# transcript_id|gene_id|gene_name|start|end|type taken from the GTF.
import pandas as pd
import numpy as np
from Bio import SeqIO

WRKDIR = '/Users/mooreank/Desktop/Raph/requant/chess/'


def _gtf_attribute(attributes, key):
    """Return the quoted value of `key` from a GTF attribute column.

    attributes: pandas Series holding the raw attribute strings (GTF column 9).
    key: attribute name, e.g. 'gene_id'.
    Returns a Series aligned with `attributes`; rows without the attribute
    yield NaN.
    """
    # Anchor on the start of the field or on ';' so that 'gene_id' cannot match
    # inside a longer key, and capture only the text between the quotes.
    # str.lstrip('gene_id "') is not a substitute: it strips a character *set*,
    # so it would also remove leading characters of the value itself.
    pattern = r'(?:^|;)\s*' + key + r' "([^"]*)"'
    return attributes.str.extract(pattern, expand=False)


# Give all nine GTF columns distinct names (three of them used to share '').
chess = pd.read_csv(
    '{}chess2.2_and_refseq.gtf'.format(WRKDIR),
    sep='\t',
    header=None,
    names=['Chrom', 'Source', 'Type', 'Start', 'End',
           'Score', 'Strand', 'Frame', 'Split'],
)

# Look attributes up by name rather than by position, so rows with a missing,
# extra or reordered attribute no longer shift values into the wrong column.
chess['Transcript_id'] = _gtf_attribute(chess['Split'], 'transcript_id')
chess['Gene_id'] = _gtf_attribute(chess['Split'], 'gene_id')
# Rows without a gene_name get the literal placeholder 'None'.
chess['Gene_name'] = _gtf_attribute(chess['Split'], 'gene_name').fillna('None')
del chess['Split']

chess['Fasta_id'] = (
    chess['Transcript_id'].map(str) + "|"
    + chess['Gene_id'].map(str) + "|"
    + chess['Gene_name'].map(str) + "|"
    + chess['Start'].map(str) + "|"
    + chess['End'].map(str) + '|'
    + chess['Type']
)

chess_transc = chess[chess['Type'] == 'transcript']

## map each transcript_id to its new FASTA id
keys = chess_transc['Transcript_id'].tolist()
values = chess_transc['Fasta_id'].tolist()
headers = dict(zip(keys, values))

original_file = '{}chess.refseq.transcriptome.fa'.format(WRKDIR)
corrected_file = '{}chess.refseq.transcriptome.corrected.fa'.format(WRKDIR)

with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle that was just opened instead of reopening the path.
    for record in SeqIO.parse(original, 'fasta'):
        # Direct dict lookup replaces the scan over every header per record.
        new_id = headers.get(record.id)
        if new_id is None:
            # No transcript row in the GTF for this record: it is left out of
            # the corrected FASTA, as before.
            continue
        record.id = new_id
        SeqIO.write(record, corrected, 'fasta')


# need to use salmon decoy prep script (?)


#### filter GENCODE transcripts based on transcript support level (run as a Python file)

#### get list of bad transcript record ids

In [5]:
# FOR GENCODE
# Build the list of poorly supported transcripts to remove from the
# transcriptome FASTA, i.e. only keep 'goodish' ones:
# exclude Ensembl TSL levels 4 or 5 AND Gencode level 3.
# Writes one transcript id per line to a text file.

import pandas as pd
import gtfparse as gtf

pd.set_option('display.max_columns', None)
gencode_lastest = '32'

WRKDIR = '/Users/mooreank/Desktop/Raph/requant/gencode/'

# Uncompressed annotation GTF for the chosen release.
gencode_gtf = f'{WRKDIR}gencode.v{gencode_lastest}.annotation.gtf'
annot_df = gtf.read_gtf(gencode_gtf)

# Only transcript rows are considered.
annot_trans = annot_df[annot_df['feature'] == 'transcript']

# Values are compared as strings -- assumes read_gtf leaves these attributes
# un-cast; TODO confirm for the installed gtfparse version.
is_level3 = annot_trans['level'] == '3'

filter1 = annot_trans[(annot_trans['transcript_support_level'] == '4') & is_level3]
filter1_list = filter1['transcript_id'].tolist()
print(len(filter1_list))

filter2 = annot_trans[(annot_trans['transcript_support_level'] == '5') & is_level3]
filter2_list = filter2['transcript_id'].tolist()
print(len(filter2_list))

all_bad = set(filter1_list + filter2_list)
# Printed explicitly: a bare `len(all_bad)` in the middle of a cell shows nothing.
print(len(all_bad))

# NOTE(review): this path is outside WRKDIR -- confirm it is the intended location.
with open('/Users/mooreank/Desktop/Raph/salmon/gencode/bad.gencode.v32.transcript_ids.updated.txt', 'w') as f:
    # Sorted so the file is identical from run to run (set order is arbitrary).
    f.writelines("%s\n" % item for item in sorted(all_bad))

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,level,hgnc_id,havana_gene,transcript_id,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,11869,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,,,,,,,,,,,
1,chr1,HAVANA,transcript,11869,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,,,,,
2,chr1,HAVANA,exon,11869,12227,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,1.0,ENSE00002234944.1,,,
3,chr1,HAVANA,exon,12613,12721,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,2.0,ENSE00003582793.1,,,
4,chr1,HAVANA,exon,13221,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,3.0,ENSE00002312635.1,,,


### removing bad transcripts from reference transcriptome fasta

In [None]:
# FOR GENCODE
# Write a copy of the GENCODE transcript FASTA that leaves out every transcript
# listed in the "bad transcript ids" text file (one id per line).
from Bio import SeqIO

# NOTE(review): gencode_fa is opened relative to the current directory while
# the id list and the output live under WRKDIR -- confirm this is intended.
gencode_fa = 'gencode.v32.transcripts.fa'
WRKDIR = '/home/mooreank/salmon/'
ids = '{}bad.gencode.v32.transcript_ids.txt'.format(WRKDIR)

## transcript ids to remove
# Kept in a set for constant-time membership tests. Blank lines are skipped:
# with the earlier substring test an empty id matched every record and the
# output came out empty.
identifiers = set()
with open(ids, 'r') as fi:
    for line in fi:
        transcript_id = line.strip().replace(">", "")
        if transcript_id:
            identifiers.add(transcript_id)

## single pass over the original fasta, keeping records whose id is not listed
with open(gencode_fa) as original_fasta, open('{}/nohup.filtered.gencode.v32.transcripts.fa'.format(WRKDIR), 'w') as corrected_fasta:
    for record in SeqIO.parse(original_fasta, 'fasta'):
        # Compare the transcript id exactly rather than by substring, so that
        # e.g. 'X.2' no longer also removes 'X.20'.
        # Assumes '|'-delimited headers with the transcript id as the first
        # field (TODO confirm); a header without '|' is compared as a whole.
        transcript_id = record.id.split('|', 1)[0]
        if transcript_id in identifiers:
            continue
        SeqIO.write(record, corrected_fasta, 'fasta')

#### push prepped transcriptomes

In [63]:
#### push CHESS
local_transcriptome_fasta = f'{chess_local_dir}/{chess_fa}'.replace('.gz','')
gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/{gencode_fa}'.replace('.gz','')
#gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/salmon/chess/chess.refseq.transcriptome.corrected.fa'

gcloud_cmd = f'gsutil -mq cp {local_transcriptome_fasta} {gcs_transcriptome_fasta}'

print(gcloud_cmd)

!{gcloud_cmd}

gsutil -mq cp /labseq/Genomes/GRCh38_hg38/gencode_32/gencode.v32.transcripts.fa gs://nihnialngcbg-eqtl/resources/references/gencode.v32.transcripts.fa
CommandException: No URLs matched: /labseq/Genomes/GRCh38_hg38/gencode_32/gencode.v32.transcripts.fa
CommandException: 1 file/object could not be transferred.


In [None]:
#### push GENCODE
gencode_fa = 'filtered.gencode.v32.transcripts.updated.fa'
local_transcriptome_fasta = f'{gencode_local_dir}/{gencode_fa}'.replace('.gz','')
#gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/{gencode_fa}'.replace('.gz','')
gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/salmon/gencode_v32_filtered/{gencode_fa}'.replace('.gz','')

gcloud_cmd = f'gsutil -mq cp {local_transcriptome_fasta} {gcs_transcriptome_fasta}'

print(gcloud_cmd)

!{gcloud_cmd}

In [76]:
# Show the source and destination currently set for the transfer, one per line.
print(local_transcriptome_fasta, gcs_transcriptome_fasta, sep='\n')

/labseq/Genomes/GRCh38_hg38/gencode_32/gencode.v32.transcripts.fa
gs://nihnialngcbg-eqtl/resources/references/salmon/chess/chess.refseq.transcriptome.corrected.fa


#### create salmon index for the reference transcriptome (chess)

In [3]:
##CHESS VERSION

gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/salmon/chess/chess.refseq.transcriptome.corrected.fa'


# use a GCP lifesciences job to generate the salmon index(?)
docker_img = 'us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1'

# gcs_transcriptome_fasta = f'{prj_bucket}/resources/references/salmon/gencode_r32/\
# nohup.filtered.gencode.v32.transcripts.fa'

gcp_lsci_pipe_cmd = 'mkdir -p /gcloud-shared/reference; \
salmon --no-version-check index -t ${REF_TX_FASTA} -i /gcloud-shared/reference \
-k 31 --gencode --threads $(nproc); \
gsutil -mq cp -r /gcloud-shared/reference ${GS_SALMON_INDEX_OUT}; \
ls -lhR /gcloud-shared'

gcp_lsci_pipe_vars = f'GS_SALMON_INDEX_OUT={prj_bucket}/resources/references/\
salmon/chess_test'

cloud_cmd = f'gcloud beta lifesciences pipelines run \
--project {project_id} \
--docker-image {docker_img} \
--machine-type n1-standard-4 \
--zones us-central1-f \
--logging {prj_bucket}/resources/references/logs/salmon_index_chess.log \
--command-line \'{gcp_lsci_pipe_cmd}\' \
--inputs REF_TX_FASTA={gcs_transcriptome_fasta} \
--env-vars {gcp_lsci_pipe_vars} \
--labels=pipe=salmonindex,reference=gencode,user={my_user} \
--preemptible'

print(f'{cloud_cmd}\n')

!{cloud_cmd}

gcloud beta lifesciences pipelines run --project nih-nia-lng-cbg --docker-image us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1 --machine-type n1-standard-4 --zones us-central1-f --logging gs://nihnialngcbg-eqtl/resources/references/logs/salmon_index_chess.log --command-line 'mkdir -p /gcloud-shared/reference; salmon --no-version-check index -t ${REF_TX_FASTA} -i /gcloud-shared/reference -k 31 --gencode --threads $(nproc); gsutil -mq cp -r /gcloud-shared/reference ${GS_SALMON_INDEX_OUT}; ls -lhR /gcloud-shared' --inputs REF_TX_FASTA=gs://nihnialngcbg-eqtl/resources/references/salmon/chess/chess.refseq.transcriptome.corrected.fa --env-vars GS_SALMON_INDEX_OUT=gs://nihnialngcbg-eqtl/resources/references/salmon/chess_test --labels=pipe=salmonindex,reference=gencode,user=mooreank --preemptible

Running [projects/463418702553/locations/us-central1/operations/4827236342629327656].


In [None]:
# Check the status of a submitted Life Sciences operation by its id.
# NOTE(review): this id differs from the operation number in the "Running [...]"
# line recorded in this notebook for the index job (...4827236342629327656) --
# confirm which job this is meant to describe before relying on the result.
op_id = 448973225553939675

!gcloud beta lifesciences operations describe {op_id}

#### run a test of quantifying an RNA sample

In [None]:
%%bash
# Get the full list of sample ids to run.
# (%%bash has to be the first line of the cell, so this description sits below it.)

# List the R2 fastqs that made it up to the bucket and reduce each path to its
# sample id: drop the _R2.fastq.gz suffix and the bucket prefix.
# sed runs as a filter in the pipeline instead of `sed -i -e`, which BSD/macOS
# sed reads as "edit in place and keep a backup with suffix -e".
# The wildcard is quoted so gsutil, not the local shell, expands it.
gsutil ls 'gs://nihnialngcbg-eqtl/rosmap/fastqs/*_R2.fastq.gz' \
  | sed -e 's/_R2\.fastq\.gz//g' \
        -e 's|gs://nihnialngcbg-eqtl/rosmap/fastqs/||g' \
  > bucket.fastqs.list

# Small subset for a trial run.
head -3 bucket.fastqs.list > test.bucket.fastqs.list


In [22]:
# TODO: finish building this list in Python; hard-coded sample ids for now.
samples = [
    '03_120405',
    '05_120405',
    '08_120410',
]

In [69]:
###for chess reference

# run a test of quantifying an RNA sample
# in_bucket = 'gs://nihnialngcbg-eqtl/rosmap'
# out_bucket = 'gs://nihnialngcbg-eqtl/rosmap_test'
# sample = '03_120405'
# project_id = 'nih-nia-lng-cbg'
# prj_bucket = 'gs://nihnialngcbg-eqtl'
# docker_img = 'us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1'
# my_user = 'mooreank'



# gcp_lsci_pipe_cmd = 'mkdir -p /gcloud-shared/reference; \
# gsutil -mq cp ${SALMONTRANSCRIPTMODELS}/* /gcloud-shared/reference; \
# salmon --no-version-check quant -i /gcloud-shared/reference -l A -1 ${FASTQR1} -2 ${FASTQR2} \
# --validateMappings --seqBias --gcBias --posBias --threads $(nproc) \
# -o /gcloud-shared/sample; \
# gsutil -mq cp -r /gcloud-shared/sample ${GS_OUTPATH}; \
# ls -lhR /gcloud-shared'

# gcp_lsci_pipe_vars = f'SALMONTRANSCRIPTMODELS={prj_bucket}/resources/references/\
# salmon/chess/reference,\
# GS_OUTPATH={out_bucket}/quants/chess/{sample},\
# FASTQR1=/gcloud-shared/inputR1.gz,\
# FASTQR2=/gcloud-shared/inputR2.gz'

# cloud_cmd = f'gcloud beta lifesciences pipelines run \
# --project {project_id} \
# --docker-image {docker_img} \
# --machine-type n1-standard-4 \
# --logging {out_bucket}/logs/salmon/{sample} \
# --command-line \'{gcp_lsci_pipe_cmd}\' \
# --inputs FASTQR1={in_bucket}/fastqs/{sample}_R1.fastq.gz \
# --inputs FASTQR2={in_bucket}/fastqs/{sample}_R2.fastq.gz \
# --env-vars {gcp_lsci_pipe_vars} \
# --labels=pipe=salmonquant,sample={sample},user={my_user} \
# --preemptible ' 

# print(f'{cloud_cmd}\n')

# #!{cloud_cmd}






gcloud beta lifesciences pipelines run --project nih-nia-lng-cbg --docker-image us.gcr.io/nih-nia-lng-cbg/nihnialngcbg-salmon:2020-1 --machine-type n1-standard-4 --logging gs://nihnialngcbg-eqtl/rosmap_test/logs/salmon/03_120405 --command-line 'mkdir -p /gcloud-shared/reference; gsutil -mq cp ${SALMONTRANSCRIPTMODELS}/* /gcloud-shared/reference; salmon --no-version-check quant -i /gcloud-shared/reference -l A -1 ${FASTQR1} -2 ${FASTQR2} --validateMappings --seqBias --gcBias --posBias --threads $(nproc) -o /gcloud-shared/sample; gsutil -mq cp -r /gcloud-shared/sample ${GS_OUTPATH}; ls -lhR /gcloud-shared' --inputs FASTQR1=gs://nihnialngcbg-eqtl/rosmap/fastqs/${sample}_R1.fastq.gz --inputs FASTQR2=gs://nihnialngcbg-eqtl/rosmap/fastqs/${sample}R2.fastq.gz --env-vars SALMONTRANSCRIPTMODELS=gs://nihnialngcbg-eqtl/resources/references/salmon/chess/reference,GS_OUTPATH=gs://nihnialngcbg-eqtl/rosmap_test/quants/chess/03_120405,FASTQR1=/gcloud-shared/inputR1.gz,FASTQR2=/gcloud-shared/inputR2.gz