# Setup for TEtranscripts pipeline on google cloud
https://github.com/mhammell-laboratory/TEtranscripts

In [None]:
# --- Local paths -----------------------------------------------------------
# Working directory; the build context, downloads, templates, and generated
# JSON/script files used by later cells all live underneath it.
WRKDIR = "path/to/telocal/dir"
DOCKERFILEDIR = WRKDIR + "/build"
DOWNLOADDIR = WRKDIR + "/pipelinedownloads"

# --- Google Cloud ----------------------------------------------------------
PROJECT_ID = "pd-genome"
# Bucket prefix under which logs/, workspace/, and output/ are written.
GCOUTPUT = "gs://path/to/output"

# --- Labels attached to each submitted pipeline run -------------------------
USER = "grennfp"
COHORT = "ppmi"

In [None]:
!mkdir {DOWNLOADDIR}


In [None]:
from pathlib import Path
import pandas as pd
import os

# (1) Get the Gene and TE Annotation Files
The prebuilt TElocal indices can be found [here](http://labshare.cshl.edu/shares/mhammelllab/www-data/TElocal/prebuilt_indices/).

## a) TE Annotation

In [None]:
!curl http://labshare.cshl.edu/shares/mhammelllab/www-data/TElocal/prebuilt_indices/hg38_rmsk_wHSAT2_TElocus.ind.gz -o {DOWNLOADDIR}/hg38_rmsk_wHSAT2_TElocus.ind.gz
!gunzip {DOWNLOADDIR}/hg38_rmsk_wHSAT2_TElocus.ind.gz


In [None]:
!curl http://labshare.cshl.edu/shares/mhammelllab/www-data/TElocal/GTF/hg38_rmsk_TEinst.gtf.gz -o {DOWNLOADDIR}/hg38_rmsk_TEinst.gtf.gz
!gunzip {DOWNLOADDIR}/hg38_rmsk_TEinst.gtf.gz

In [None]:
!curl http://labshare.cshl.edu/shares/mhammelllab/www-data/TEtranscripts/TE_GTF/hg38_rmsk_TE.gtf.gz -o {DOWNLOADDIR}/hg38_rmsk_TEinst2.gtf.gz
!gunzip {DOWNLOADDIR}/hg38_rmsk_TEinst2.gtf.gz


## b) Gene Annotation
Download the hg38 gene annotation file from the UCSC Table Browser: http://genome.ucsc.edu/cgi-bin/hgTables

Enter an output file name, select GTF as the output format, and click "get output".

# (2) Docker Setup
docker must be installed locally (https://docs.docker.com/docker-for-mac/install/)

## a) make the dockerfile and build the image
Run the build cell below; it uses `DOCKERFILEDIR` (the directory containing the Dockerfile) as the build context.  
Make sure the annotation files (downloaded above into `DOWNLOADDIR`) are in the same directory as the Dockerfile before building.

In [None]:
# Build the image from the Dockerfile in DOCKERFILEDIR and tag it locally as
# telocal-image (the name the tag/push cells below refer to).
!docker build -t telocal-image {DOCKERFILEDIR} 


In [None]:
# List local images to confirm that telocal-image was built.
!docker images

## b) Push the Image to Google Cloud

tag it and push it to the google cloud project we will be using

In [None]:
# Give the local image a registry name under the us.gcr.io/{PROJECT_ID}
# repository so it can be pushed to that project.
!docker tag telocal-image:latest us.gcr.io/{PROJECT_ID}/telocal-image
    

In [None]:
# List local images again; the us.gcr.io/... tag should now appear.
!docker images

In [None]:
# Upload the tagged image to the project's registry.
# NOTE(review): presumably requires docker to be authenticated against gcr.io
# beforehand — confirm for your environment.
!docker push us.gcr.io/{PROJECT_ID}/telocal-image

#### once pushed find the container on google cloud in the container registry and copy its full path
something like:

```us.gcr.io/projectid/test-image@sha256:04bc2af3cccd8618e6eafadc7d46e7fb24b2dc89e0e62ea0bdb26865d081f632```

paste the full path name into the input.json file wherever a docker variable is set

like

```"TEtranscriptsWorkflow.dockerimg": "us.gcr.io/projectid/test-image@sha256:04bc2af3cccd8618e6eafadc7d46e7fb24b2dc89e0e62ea0bdb26865d081f632"```

In [None]:
# Digest-pinned path of the pushed image, copied from the container registry.
# The cells visible in this notebook do not read this variable; the path still
# has to be pasted into the input JSON wherever a docker variable is set.
IMAGEPATH = 'us.gcr.io/pd-genome/telocal-image@sha256:6a4ede078b704d1f361eb471f14a9f2bc12811c08718887069f188d47795ed27'

# (3) Get List of Samples and Paths to BAM Files

In [None]:
# Placeholder inputs: one BAM path per sample name.
# NOTE(review): the two lists are presumably paired by position — keep them
# in the same order.
bams = [
    "gs://pathtosample1.bam",
    "gs://pathtosample2.bam",
]
samples = [
    "sample1",
    "sample2",
]

# (4) Setup Google Genomics Pipeline Run For TElocal


In [None]:
import json

# Write one Cromwell input JSON per sample, starting from the blank template.
# Settings below are for paired-end cohorts such as NABEC and PPMI.
json_input_template = f'{WRKDIR}/templates/blank.telocal.input.json'

# `samples` and `bams` are the lists defined in the sample-list cell; they are
# consumed pairwise, so a length mismatch is reported up front instead of
# surfacing as an IndexError (or silently dropping samples) mid-loop.
if len(samples) != len(bams):
    raise ValueError(
        f'samples ({len(samples)}) and bams ({len(bams)}) must have the same length'
    )

# The template is identical for every sample, so parse it once.
with open(json_input_template) as json_file:
    template_data = json.load(json_file)

# open(..., 'w') does not create parent directories, so make sure the output
# directory exists before the first write.
os.makedirs(f'{WRKDIR}/jsons', exist_ok=True)

for sample_name, bam_path in zip(samples, bams):
    # Normalized ID: lower-case with hyphens removed. The submission commands
    # built later look up '<sample_id>.input.json' using the same normalization.
    sample_id = sample_name.lower().replace("-", "")

    json_input_outfile_name = f'{WRKDIR}/jsons/{sample_id}.input.json'

    # Shallow copy is enough: only top-level keys are overwritten.
    input_data = dict(template_data)

    input_data['TElocalWorkflow.sample_name'] = sample_id
    input_data['TElocalWorkflow.sample_bam_file'] = bam_path

    input_data['TElocalWorkflow.TElocal.mem_size'] = "256 GB"

    input_data['TElocalWorkflow.TElocal.cpus'] = "32"
    # STRANDED OPTION (forward, reverse or no)
    input_data['TElocalWorkflow.TElocal.stranded'] = "no"

    with open(json_input_outfile_name, 'w') as json_outfile:
        json.dump(input_data, json_outfile, sort_keys=True, indent=4)

In [None]:
def formatgcpcmd(this_sample):
    """Build the shell text that submits one sample's TElocal pipeline run.

    The returned text has two lines: an ``echo -n`` of the sample name, then a
    single ``gcloud alpha genomics pipelines run`` invocation. Project, paths,
    and labels come from the notebook-level PROJECT_ID, GCOUTPUT, WRKDIR,
    COHORT, and USER variables.

    Parameters
    ----------
    this_sample : str
        Sample identifier; it names the log/workspace/output locations, the
        per-sample input JSON file, and the ``sample`` label.

    Returns
    -------
    str
        The two-line command text, without a trailing newline.
    """
    echo_line = f'echo -n {this_sample} '
    # Each entry is one option of the gcloud call; they are joined with single
    # spaces so the whole invocation stays on one shell line.
    gcloud_parts = [
        'gcloud alpha genomics pipelines run',
        f'--project {PROJECT_ID}',
        '--pipeline-file gs://path/to/wdl_pipeline_preemptible.yaml',
        '--zones us-central1-a',
        '--memory 104',
        f'--logging {GCOUTPUT}/logs/{this_sample}',
        f'--inputs-from-file WDL={WRKDIR}/TElocal_pipeline.wdl',
        f'--inputs-from-file WORKFLOW_INPUTS={WRKDIR}/jsons/{this_sample}.input.json',
        f'--inputs-from-file WORKFLOW_OPTIONS={WRKDIR}/generic.google-papi.options.json',
        f'--inputs WORKSPACE={GCOUTPUT}/workspace/{this_sample}',
        f'--inputs OUTPUTS={GCOUTPUT}/output/{this_sample}',
        '--preemptible',
        f'--labels=pipe=telocal,sample={this_sample},cohort={COHORT.lower()},user={USER}',
    ]
    return echo_line + '\n' + ' '.join(gcloud_parts)



In [None]:
# Build one submission command per sample from the `samples` list, using the
# same normalized ID (hyphens removed, lower-case) that names the per-sample
# input JSON files.
cmds = [formatgcpcmd(sample.replace('-','').lower()) for sample in samples]

# Path of the generated shell script; the next section prints how to run it.
temp_script_file = f'{WRKDIR}/run_TElocal_ggp.sh'

# One command block per sample, each terminated by a newline.
with open(temp_script_file, 'w') as file_handler:
    for this_cmd in cmds:
        file_handler.write(f"{this_cmd}\n")
        


# (5) Run TElocal 

In [None]:
# Show the terminal commands that make the generated script executable and
# launch it in the background, with output redirected to a log under WRKDIR.
launch_lines = [
    '#run these commands at terminal:\n',
    f'chmod +x {temp_script_file}',
    f'nohup {temp_script_file} > {WRKDIR}/run_TElocal.log &',
]
for launch_line in launch_lines:
    print(launch_line)

In [None]:
# Check the status of a submitted job.
# The operation ID below is hard-coded from an earlier run; replace it with
# the ID of the operation you want to inspect.
!gcloud alpha genomics operations describe EK3H8YGoLhjDq9HN3NGvQyCkgNbfpAYqD3Byb2R1Y3Rpb25RdWV1ZQ

In [None]:
# Cancel a submitted job.
# The operation ID below is hard-coded from an earlier run; replace it with
# the ID of the operation you want to cancel.
!gcloud alpha genomics operations cancel EIWBuP-nLhi-2Ir6koey1voBIKSA1t-kBioPcHJvZHVjdGlvblF1ZXVl