# Vector detection using Blast and the NCBI UniVec database

In [None]:
# Load the shared notebook configuration. The cells below use names that are
# not defined in this notebook (os, json, gzip, datetime, pandas, RESULTS,
# DATASET, BIN, get_gpc_starttimestamp) - presumably provided by init.py;
# verify there if any of them is reported as undefined.
%run ../config/init.py

## Testing gcloud configuration

### Requirements

#### [Cloud SDK](https://cloud.google.com/sdk)


Run *gcloud init* to initialize the gcloud environment and follow its instructions:

 `$ gcloud init`
 

In [None]:
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')

### Defining variables

Edit the GCP zone and region variables according to your geographical location.

In [None]:
# NOTE(review): the two values below look swapped - 'us-east4' has the form of
# a GCP region and 'us-east4-c' the form of a zone. Neither variable is used in
# the cells visible in this notebook; confirm against any other users before
# changing them.
ZONE = 'us-east4'
REGION = 'us-east4-c'
# Passed to the pipeline as the QUERY_SIZE environment variable.
QUERY_SIZE = 10000
# Prices from 06/09/2021
# Used by the last cell as a cost per hour: elapsed minutes / 60 * PRICE.
PRICE = 0.64
# Not referenced in the visible cells; presumably documents the machine type
# that PRICE refers to - TODO confirm against the pipeline JSON.
MACHINE_TYPE = "n1-standard-64"
# File name of the transcriptome; also passed to the pipeline as TRANSCRIPTOME.
TRANSCRIPTOME_NAME = 'Trinity.fasta.gz'
# Local path of the transcriptome file that is copied to the data bucket.
TRANSCRIPTOME_FILE = os.path.join(RESULTS, DATASET, 'trinity_assembly', TRANSCRIPTOME_NAME)

### Creating working directory

In [None]:
# Create <RESULTS>/<DATASET>/fasta and make it the working directory.
# The path is made absolute first so it does not depend on the current
# working directory, and os.makedirs creates any missing parent directories
# and does not fail when the directory already exists.
result_dir = os.path.abspath(os.path.join(RESULTS, DATASET, 'fasta'))
os.makedirs(result_dir, exist_ok=True)
os.chdir(result_dir)

### Create or retrieve GCP storage bucket

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://{}-vector-'.format(DATASET.lower())
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://{}-vector-'.format(DATASET.lower()),'').replace('/','')
        break

vector_bucket  = '{}-vector-{}'.format(DATASET.lower(),bucket)
print('vector bucket: {0}'.format(vector_bucket))

out_bucket = '{}-data-{}'.format(DATASET.lower(), bucket)
bucket_list = !gsutil ls gs://{out_bucket}
if ''.join(bucket_list).startswith('BucketNotFoundException'):
    !gsutil mb gs://{out_bucket}
    !gsutil cp {TRANSCRIPTOME_FILE} gs://{out_bucket}/


### Submitting jobs
After running this cell, go to the [Google Cloud Console](https://console.cloud.google.com/compute) to see all running instances.

In [None]:
if not os.path.exists('operations.tsv'):
    PIPELINE_JSON = os.path.join(BIN, 'gcp', 'pipeline-transcriptome-cleanup.json')
    a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE_JSON} --env-vars=QUERY_SIZE={QUERY_SIZE},EVALUE=700,INBUCKET={out_bucket},TRANSCRIPTOME={TRANSCRIPTOME_NAME}
    if len(a) == 1 and a[0].startswith('Running'):
        a = a[0].replace('].','').split('/')[5]
        print('Job submitted with ID: {}'.format(a))
        with open('operations.tsv', 'w') as fout:
            fout.write('{}\n'.format(a))
    else:
        print('Error!!')
        print(a)
else:
    with open('operations.tsv') as fin:
        a = int(fin.read().strip())
        print('Job ID: {}'.format(a))

In [None]:
if os.path.exists('{}_log.json.gz'.format(a)):
    with gzip.GzipFile('{}_log.json.gz'.format(a), 'r') as fin:
        log = json.loads(fin.read().decode('utf-8'))
else:
    log = !gcloud beta lifesciences operations describe --format=json {a}
    log = json.loads(''.join(log))
    if log['done'] == True and 'error' not in log:
        with gzip.GzipFile('{}_log.json.gz'.format(a), 'w') as fout:   # 4. gzip
            fout.write(json.dumps(log, indent=2).encode('utf-8'))
    else:
        log['done'] = False
        print('Workflow still running or finished with error')
        print(json.dumps(log, indent=4))

if log['done'] == True:
    ts = get_gpc_starttimestamp(log)
    ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
    te = te = datetime.strptime(log['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
    elapsed = (te - ts)/pandas.Timedelta('1 minute')
    print('Workflow finished correctly in {} minutes with cost $ {:.2f}'.format(elapsed, elapsed/60 * PRICE))