## Trinity assembly

In [None]:
%run ../config/init.py

### Loading data from {{ cookiecutter.dataset_name }}/sample_table.csv accession list

The file `{{ cookiecutter.dataset_name }}/sample_table.csv` should contain a single column with all SRA IDs to be processed.

In [None]:
# Dataset input directory and the directory that will hold the Trinity results.
# working_dir() comes from the init script — presumably it prepares the directory; confirm there.
data_dir = os.path.join(DATA, DATASET)
result_dir = working_dir(os.path.join(RESULTS, DATASET, 'trinity_assembly'))

# The sample table is read without a header row, so its column is labelled 0.
sample_table_path = os.path.join(data_dir, 'sample_table.csv')
sra_df = pandas.read_csv(sample_table_path, header=None)
sra_df

## Testing gcloud configuration

### Requirements

#### [Cloud SDK](https://cloud.google.com/sdk)


Run *gcloud init* to initialize the gcloud environment and follow its instructions:

 `$ gcloud init`

In [None]:
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')

### Defining variables

Edit the GCP zone and region variables according to your geographical location.

In [None]:
# Machine settings for the GCP job
MACHINE_TYPE = 'n2-standard-64'
PREEMPTIBLE = False    # whether to use a preemptible machine
LOCAL_SSD_SIZE = 3000  # 8 disks of size 375

# Use zones close to your location. Listing several zones gives access to more resources.
ZONES = [
    'us-east1-c',
    'us-east1-b',
    'us-east1-d',
    'us-east4-a',
    'us-east4-b',
    'us-east4-c',
]


### Retrieve GCP storage bucket

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://{}-vector-'.format(DATASET.lower())
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://{}-vector-'.format(DATASET.lower()),'').replace('/','')
        break

vector_bucket  = '{}-vector-{}'.format(DATASET.lower(),bucket)
print('vector bucket: {0}'.format(vector_bucket))

out_bucket = '{}-trinity-{}'.format(DATASET.lower(), bucket)
bucket_list = !gsutil ls gs://{out_bucket}
if ''.join(bucket_list).startswith('BucketNotFoundException'):
    !gsutil mb gs://{out_bucket}

## Submitting job

In [None]:
operations = {'operations':None}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-trinity.json')

os.chdir(result_dir)
       
if os.path.exists('operations-trinity.tsv'):
    operations['logs'] = {}
    with open('operations-trinity.tsv') as fin:
        operations['operations'] = fin.readline().strip()
else:
    trinity_yml = {
        "max_memory" : "500G",
        "CPU": 128,
        "output": "trinity",
        "seqType": "fq",
{% if cookiecutter.sequencing_technology == 'paired-end' %}
        "left": [],
        "right": []
{% else %}
        "single": []
{% endif %}
    }
    for s in sra_df[0].unique():
{% if cookiecutter.sequencing_technology == 'paired-end' %}
        trinity_yml['left'].append(
            {'class': 'File', 'path': '/data/{}_clean_foreign_1.fastq.gz'.format(s)}
        )
        trinity_yml['right'].append(
            {'class': 'File', 'path': '/data/{}_clean_foreign_2.fastq.gz'.format(s)}
        )
{% else %}
        trinity_yml['single'].append(
            {'class': 'File', 'path': '/data/{}_clean_foreign.fastq.gz'.format(s)}
        )
{% endif %}
    write_to_yaml(trinity_yml, "trinity.yml")  
    !gsutil cp trinity.yml gs://{vector_bucket}
    a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={out_bucket},INBUCKET={vector_bucket}
    if len(a) == 1 and a[0].startswith('Running'):
        a = a[0].replace('].','').split('/')[5]
        operations['operations'] = a
        with open('operations-trinity.tsv', 'w') as fout:
            fout.write('{}\n'.format(a))
    else:
        print("ERROR:\n" + str(a))
print('Operation: ' +  operations['operations'])        

In [None]:
id = operations['operations']
a = !gcloud beta lifesciences operations describe --format=json {id}
l = json.loads(''.join(a))
if 'done' in l:
    if  'error' not in l:
        operations['logs'] = l
        with gzip.GzipFile('{}_trinity.json.gz'.format(id), 'w') as fout:   # 4. gzip
            fout.write(json.dumps(l, indent=2).encode('utf-8'))
        ts = get_gpc_starttimestamp(l)
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(l['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = (te - ts)/pandas.Timedelta('1 minute')
        MACHINE_PRICE = 0.85 # n2d-standard-64 1.308544 Preemptible
        print('Computig cost: $ {:.2f}'.format(elapsed * MACHINE_PRICE/60))
    else:
        print("ERROR")
        print(l['error'])
else:
    ts = get_gpc_starttimestamp(l)
    ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
    te = datetime.strptime(datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S"), "%Y-%m-%dT%H:%M:%S")
    elapsed = (te - ts)/pandas.Timedelta('1 minute')
    print('Still running. Elapsed time: {:.2f}, latest even:'.format(elapsed))
    print(json.dumps(l['metadata']['events'][0], indent=4))

## Downloading assembly results from GCP

In [None]:
os.chdir(result_dir)
!gsutil -m cp gs://{out_bucket}/Trinity.fasta ./
!gzip Trinity.fasta