## Vector removal

In [None]:
%run ../config/init.py

### Loading data from {{ cookiecutter.dataset_name }}/sample_table.csv accession list

The file `{{ cookiecutter.dataset_name }}/sample_table.csv` should contain a single column with all SRA IDs to be processed.

In [None]:
# Input folder of the dataset and output folder of this step
# (working_dir is expected to come from the init script run above).
data_dir = os.path.join(DATA, DATASET)
result_dir = working_dir(os.path.join(RESULTS, DATASET, 'vector_cleanup'))

# The accession list has no header row, so its single column is labelled 0.
sample_table_path = os.path.join(data_dir, 'sample_table.csv')
sra_df = pandas.read_csv(sample_table_path, header=None)
sra_df

## Testing gcloud configuration

### Requirements

#### [Cloud SDK](https://cloud.google.com/sdk)


Run *gcloud init* to initialize the gcloud environment and follow its instructions:

 `$ gcloud init`

In [None]:
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')

### Defining variables

Edit the GCP zone and region variables according to your geographical location.

In [None]:
# GCP region to use for this workflow; edit to match your location.
REGION = 'us-east4'
# GCP zone to use for this workflow; keep it inside REGION when editing.
ZONE = 'us-east4-c'

### Retrieve GCP storage bucket

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://transcriptome-trimming-'
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://transcriptome-trimming-','').replace('/','')
        break

trimming_bucket = 'transcriptome-trimming-' + bucket
vector_bucket  = 'transcriptome-vector-' + bucket
        
bucket_list = !gsutil ls gs://{vector_bucket}
if ''.join(bucket_list).startswith('BucketNotFoundException'):
    !gsutil mb -l {ZONE} gs://{vector_bucket}
        
        
print('trimming bucket: {0}'.format(trimming_bucket))
print('vector bucket: {0}'.format(vector_bucket))

### Submitting jobs for vector cleanup to GCP

In [None]:
operations = {}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-transcriptome-fastq-cleanup.json')

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir) 
os.chdir(op_dir)
       
if os.path.exists('operations-vector-fastq-pe.tsv'):
    operations['logs'] = {}
    operations['operations'] = pandas.read_csv('operations-vector-fastq-pe.tsv', sep='\t')
else:
    d = []
    for f in sra_df[0].unique():
        a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={vector_bucket},INBUCKET={trimming_bucket},SAMPLE={f}
        if len(a) == 1 and a[0].startswith('Running'):
            a = a[0].replace('].','').split('/')[5]
            d.append([f, a, 'running'])
        else:
            d.append([f, None, a])
    operations['logs'] = {}
    operations['operations'] = pandas.DataFrame(d, columns=['sample', 'id', 'status'])
    operations['operations'].to_csv('operations-vector-fastq-pe.tsv', sep='\t', index=None)

display(operations['operations'])

### GCP log retrieval for plotting
This cell will download the GCP logs for completed operations (jobs), creating a `[sample].json.gz` file for each sample.

You should execute it multiple times until all operations are completed.

In [None]:
os.chdir(op_dir)
    
df = operations['operations'].dropna()
data = []                
for i, r in df.iterrows():
    id = r['id']
    if os.path.exists('{}.json.gz'.format(r['sample'])):
        with gzip.GzipFile('{}.json.gz'.format(r['sample']), 'r') as fin:  
            operations['logs'][r['sample']] = json.loads(fin.read().decode('utf-8'))
    else:
        if r['sample'] not in operations['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                operations['logs'][r['sample']] = l
                with gzip.GzipFile('{}.json.gz'.format(r['sample']), 'w') as fout:   # 4. gzip
                    fout.write(json.dumps(l, indent=2).encode('utf-8'))  
    if r['sample'] in operations['logs']:
        ts = get_gpc_starttimestamp(operations['logs'][r['sample']])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations['logs'][r['sample']]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([r['sample'], elapsed])
operations['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
display(operations['gcp'])


# Price assumed for the machine type noted below (presumably USD per hour,
# given the division by 60 -- confirm against current GCP pricing).
MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
# Total elapsed time summed over all samples in operations['gcp'].
total_time = operations['gcp']['Time'].sum()
print('Computig cost: $ {:.2f}'.format(total_time * MACHINE_PRICE/60))