## Vector removal

In [None]:
# Execute the shared project configuration script inside this kernel's namespace.
# NOTE(review): later cells use os, pandas, json, gzip, datetime, display, DATA, DATASET,
# RESULTS, BIN, working_dir and get_gpc_starttimestamp without defining them, so they are
# presumably provided by this script -- confirm against ../config/init.py.
%run ../config/init.py

### Loading data from {{ cookiecutter.dataset_name }}/sample_table.csv accession list

The file `{{ cookiecutter.dataset_name }}/sample_table.csv` should contain a single column with all SRA IDs to be processed.

In [None]:
# Kingdom of the organism under study; the value is filled in by the cookiecutter template.
KINGDOM = '{{ cookiecutter.kingdom }}'

# Input folder of the dataset and output folder for every result of this notebook.
data_dir = os.path.join(DATA, DATASET)
result_dir = working_dir(os.path.join(RESULTS, DATASET, 'vector_cleanup'))

# The sample table has no header row, so the accessions end up in column 0.
sample_table = os.path.join(data_dir, 'sample_table.csv')
sra_df = pandas.read_csv(sample_table, header=None)
sra_df

## Testing gcloud configuration

### Requirements

#### [Cloud SDK](https://cloud.google.com/sdk)


Run *gcloud init* to initialize the gcloud environment and follow its instructions:

 `$ gcloud init`

In [None]:
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')

### Defining variables

Edit the GCP region and zone variables according to your geographical location.

In [None]:
# GCP region and zone chosen by the user.
# NOTE(review): neither name is referenced by the other cells visible in this notebook;
# presumably they are consumed elsewhere (e.g. by the pipeline definitions) -- confirm.
REGION = 'us-east4'
ZONE = 'us-east4-c'

### Retrieve GCP storage bucket

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://{}-trimming-'.format(DATASET.lower())
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://{}-trimming-'.format(DATASET.lower()),'').replace('/','')
        break

trimming_bucket = '{}-trimming-{}'.format(DATASET.lower(), bucket)
print('in bucket: {}'.format(trimming_bucket))

vector_bucket = '{}-vector-{}'.format(DATASET.lower(), bucket)

bucket_list = !gsutil ls gs://{vector_bucket}
if ''.join(bucket_list).startswith('BucketNotFoundException'):
    !gsutil mb gs://{vector_bucket}

print('vector bucket: {}'.format(vector_bucket))

for s in sra_df[0].unique():
    out_bucket = '{}-{}'.format(s.lower(),bucket)

    bucket_list = !gsutil ls gs://{out_bucket}
    if ''.join(bucket_list).startswith('BucketNotFoundException'):
        !gsutil mb gs://{out_bucket}


### Submitting jobs for vector cleanup  to GCP

In [None]:
operations = {
    'logs':{},
    'operations': pandas.DataFrame(columns=['sample', 'id', 'status'])}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-transcriptome-fastq-vector-cleanup.json')

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir)
os.chdir(op_dir)

if os.path.exists('operations-vector-fastq.tsv'):
    operations['operations'] = pandas.read_csv('operations-vector-fastq.tsv', sep='\t')

d = []
for f in sra_df[0].unique():
    if f not in operations['operations']['sample'].unique():
        print('Submitting sample: ' + f)
        a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={vector_bucket},INBUCKET={trimming_bucket},SAMPLE={f}
        if len(a) == 1 and a[0].startswith('Running'):
            a = a[0].replace('].','').split('/')[5]
            d.append([f, a, 'running'])
        else:
            d.append([f, None, a])

if d:
    operations['operations'] = pandas.concat([operations['operations'], pandas.DataFrame(d, columns=['sample', 'id', 'status'])])
    operations['operations'].to_csv('operations-vector-fastq.tsv', sep='\t', index=None)

display(operations['operations'])

### Checking GCP runs
This cell will download the GCP logs for completed operations (jobs).

In [None]:
os.chdir(op_dir)

df = operations['operations'].dropna()
data = []
with_error = []
count_running = 0
for i, r in df.iterrows():
    id = r['id']
    if os.path.exists('{}.json.gz'.format(r['sample'])):
        with gzip.GzipFile('{}.json.gz'.format(r['sample']), 'r') as fin:
            operations['logs'][r['sample']] = json.loads(fin.read().decode('utf-8'))
    else:
        if r['sample'] not in operations['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                if 'error' not in l:
                    operations['logs'][r['sample']] = l
                    with gzip.GzipFile('{}.json.gz'.format(r['sample']), 'w') as fout:   # 4. gzip
                        fout.write(json.dumps(l, indent=2).encode('utf-8'))
                else:
                    with_error.append(id)
            else:
                count_running += 1
    if r['sample'] in operations['logs']:
        ts = get_gpc_starttimestamp(operations['logs'][r['sample']])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations['logs'][r['sample']]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([r['sample'], elapsed])

if count_running != 0:
    print('Still running {}'.format(count_running))

if with_error:
    operations['operations'] = df[~df['id'].isin(with_error)]
    operations['operations'].to_csv('operations-vector-fastq.tsv', sep='\t', index=None)
    print('{} runs with errors. Please rerun previous cell'.format(len(with_error)))
elif count_running == 0:
    operations['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
    operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
    display(operations['gcp'])

    MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
    print('Computig cost: $ {:.2f}'.format(operations['gcp']['Time'].sum() * MACHINE_PRICE/60))

## Submitting jobs for contamination screening on fastq files

Edit the variable TOTAL_PER_FILE according to the number of reads in your fastq files. This value is used to run
multiple BLAST searches in parallel.

In [None]:
operations = {
    'logs':{},
    'operations': pandas.DataFrame(columns=['sample', 'id', 'status'])}
TOTAL_PER_FILE = 1000000
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-transcriptome-fastq-contamination-cleanup.json')

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir)
os.chdir(op_dir)

if os.path.exists('operations-fastq-contamination-cleanup.tsv'):
    operations['operations'] = pandas.read_csv('operations-fastq-contamination-cleanup.tsv', sep='\t')

d = []
for f in sra_df[0].unique():
    if f not in operations['operations']['sample'].unique():
        out_bucket = '{}-{}'.format(f.lower(),bucket)
        a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={out_bucket},INBUCKET={vector_bucket},SAMPLE={f},TOTAL_PER_FILE={TOTAL_PER_FILE},PERCENT_IDENTITY={PERCENT_IDENTITY}
        if len(a) == 1 and a[0].startswith('Running'):
            a = a[0].replace('].','').split('/')[5]
            d.append([f, a, 'running'])
        else:
            d.append([f, None, a])
if d:
    operations['operations'] = pandas.concat([operations['operations'], pandas.DataFrame(d, columns=['sample', 'id', 'status'])])
    operations['operations'].to_csv('operations-fastq-contamination-cleanup.tsv', sep='\t', index=None)

display(operations['operations'])

### Checking GCP runs
This cell will download the GCP logs for completed operations (jobs).

In [None]:
os.chdir(op_dir)

df = operations['operations'].dropna()
data = []
with_error = []
count_running = 0
for i, r in df.iterrows():
    id = r['id']
    if os.path.exists('{}_cont.json.gz'.format(r['sample'])):
        with gzip.GzipFile('{}_cont.json.gz'.format(r['sample']), 'r') as fin:
            operations['logs'][r['sample']] = json.loads(fin.read().decode('utf-8'))
    else:
        if r['sample'] not in operations['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                if 'error' not in l:
                    operations['logs'][r['sample']] = l
                    with gzip.GzipFile('{}_cont.json.gz'.format(r['sample']), 'w') as fout:   # 4. gzip
                        fout.write(json.dumps(l, indent=2).encode('utf-8'))
                else:
                    with_error.append(id)
            else:
                count_running += 1
    if r['sample'] in operations['logs']:
        ts = get_gpc_starttimestamp(operations['logs'][r['sample']])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations['logs'][r['sample']]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([r['sample'], elapsed])

if count_running != 0:
    print('Still running {}'.format(count_running))

if with_error:
    operations['operations'] = df[~df['id'].isin(with_error)]
    operations['operations'].to_csv('operations-fastq-contamination-cleanup.tsv', sep='\t', index=None)
    print('{} runs with errors. Please rerun previous cell'.format(len(with_error)))
elif count_running == 0:
    operations['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
    operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
    display(operations['gcp'])
    MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
    print('Computig cost: $ {:.2f}'.format(operations['gcp']['Time'].sum() * MACHINE_PRICE/60))

## Screening to identify foreign chromosomes contamination

In [None]:
foreign_databases = ['archaea','other_eukaryota', 'arthropoda', 'fungi', 'chordata',
                     'other_metazoa', 'viruses_and_viroids', 'viridiplantae', 'bacteria']

PERCENT_IDENTITY = 95.0
operations = {
    'logs':{},
    'operations': pandas.DataFrame(columns=['sample', 'db', 'file', 'id', 'status'])}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-transcriptome-fastq-foreign-contamination-blastn.json')
BLASTBUCKET = 'contamination-foreign-screen'

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir)
os.chdir(op_dir)

if os.path.exists('operations-fastq-foreign-contamination-blastn.tsv'):
    operations['operations'] = pandas.read_csv('operations-fastq-foreign-contamination-blastn.tsv', sep='\t')

d = []
df = operations['operations']
for f in sra_df[0].unique():
    out_bucket = '{}-{}'.format(f.lower(),bucket)
    files = !gsutil ls gs://{out_bucket}/{TOTAL_PER_FILE}_*.fsa.gz
    for file in files:
        file = os.path.basename(file).replace('.fsa.gz', '')
        for db in foreign_databases:
            if df[(df['sample'] == f) & (df['db'] == db) & (df['file'] == file)].empty:
                print('Submitting sample: {} {} {}'.format(f, db, file))
                a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={out_bucket},INBUCKET={out_bucket},SAMPLE={file},DB={db},BLASTBUCKET={BLASTBUCKET},PERCENT_IDENTITY={PERCENT_IDENTITY}
                if len(a) == 1 and a[0].startswith('Running'):
                    a = a[0].replace('].','').split('/')[5]
                    d.append([f, db, file, a, 'running'])
                else:
                    d.append([f, db, file, None, a])
if d:
    operations['operations'] = pandas.concat([operations['operations'], pandas.DataFrame(d, columns=['sample', 'db', 'file', 'id', 'status'])])
    operations['operations'].to_csv('operations-fastq-foreign-contamination-blastn.tsv', sep='\t', index=None)

display(operations['operations'])


### Checking GCP runs
This cell will download the GCP logs for completed operations (jobs).

In [None]:
os.chdir(op_dir)

df = operations['operations'].dropna()
data = []
with_error = []
count_running = 0
for i, r in df.iterrows():
    id = r['id']
    key = '{}_{}_{}'.format(r['sample'], r['db'], r['file'])
    if os.path.exists('{}_foreign_screen.json.gz'.format(key)):
        with gzip.GzipFile('{}_foreign_screen.json.gz'.format(key), 'r') as fin:
            operations['logs'][key] = json.loads(fin.read().decode('utf-8'))
    else:
        if key not in operations['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                if 'error' not in l:
                    operations['logs'][key] = l
                    with gzip.GzipFile('{}_foreign_screen.json.gz'.format(key), 'w') as fout:   # 4. gzip
                        fout.write(json.dumps(l, indent=2).encode('utf-8'))
                else:
                    with_error.append(id)
            else:
                count_running += 1
    if key in operations['logs']:
        ts = get_gpc_starttimestamp(operations['logs'][key])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations['logs'][key]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([key, elapsed])

if count_running != 0:
    print('Still running {}'.format(count_running))

if with_error:
    operations['operations'] = df[~df['id'].isin(with_error)]
    operations['operations'].to_csv('operations-fastq-foreign-contamination-blastn.tsv', sep='\t', index=None)
    print('{} runs with errors. Please rerun previous cell'.format(len(with_error)))
elif count_running == 0:
    operations['gcp'] = pandas.DataFrame(data, columns=['Run', 'Time'])
    operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
    display(operations['gcp'])

    MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
    print('Computig cost: $ {:.2f}'.format(operations['gcp']['Time'].sum() * MACHINE_PRICE/60))

## Removing contaminated reads from fastq files

In [None]:
operations = {
    'logs':{},
    'operations': pandas.DataFrame(columns=['sample', 'id', 'status'])}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-transcriptome-fastq-foreign-contamination-cleanup.json')

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir)
os.chdir(op_dir)

if os.path.exists('operations-fastq-foreign-contamination-cleanup.tsv'):
    operations['operations'] = pandas.read_csv('operations-fastq-foreign-contamination-cleanup.tsv', sep='\t')

d = []
df = operations['operations']
for f in sra_df[0].unique():
    blast_bucket = '{}-{}'.format(f.lower(),bucket)
    if f not in operations['operations']['sample'].unique():
        print('Submitting sample: {}'.format(f))
        a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={vector_bucket},BLAST_RESULTS={blast_bucket},SAMPLE={f},FASTQ_BUCKET={vector_bucket},KINGDOM={KINGDOM},TOTAL_PER_FILE={TOTAL_PER_FILE}
        if len(a) == 1 and a[0].startswith('Running'):
            a = a[0].replace('].','').split('/')[5]
            d.append([f, a, 'running'])
        else:
            d.append([f, None, a])
if d:
    operations['operations'] = pandas.concat([operations['operations'], pandas.DataFrame(d, columns=['sample', 'id', 'status'])])
    operations['operations'].to_csv('operations-fastq-foreign-contamination-cleanup.tsv', sep='\t', index=None)

display(operations['operations'])

### Checking GCP runs
This cell will download the GCP logs for completed operations (jobs).

In [None]:
os.chdir(op_dir)

df = operations['operations'].dropna()
data = []
with_error = []
count_running = 0
for i, r in df.iterrows():
    id = r['id']
    if os.path.exists('{}_foreign_cleanup.json.gz'.format(r['sample'])):
        with gzip.GzipFile('{}_foreign_cleanup.json.gz'.format(r['sample']), 'r') as fin:
            operations['logs'][r['sample']] = json.loads(fin.read().decode('utf-8'))
    else:
        if r['sample'] not in operations['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                if 'error' not in l:
                    operations['logs'][r['sample']] = l
                    with gzip.GzipFile('{}_foreign_cleanup.json.gz'.format(r['sample']), 'w') as fout:   # 4. gzip
                        fout.write(json.dumps(l, indent=2).encode('utf-8'))
                else:
                    with_error.append(id)
            else:
                count_running += 1
    if r['sample'] in operations['logs']:
        ts = get_gpc_starttimestamp(operations['logs'][r['sample']])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations['logs'][r['sample']]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([r['sample'], elapsed])

if count_running != 0:
    print('Still running {}'.format(count_running))

if with_error:
    operations['operations'] = df[~df['id'].isin(with_error)]
    operations['operations'].to_csv('operations-fastq-foreign-contamination-cleanup.tsv', sep='\t', index=None)
    print('{} runs with errors. Please rerun previous cell'.format(len(with_error)))
elif count_running == 0:
    operations['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
    operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
    display(operations['gcp'])
    MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
    print('Computig cost: $ {:.2f}'.format(operations['gcp']['Time'].sum() * MACHINE_PRICE/60))