### Detecting contamination in raw data

We use Blastn to detect reads with Blast hits outside the *Viridiplantae* kingdom.
This contamination should be removed before assembling the transcripts.

In [None]:
%run ../config/init.py
import configparser

### Loading data from {{ cookiecutter.dataset_name }}/sample_table.csv accession list

The file `{{ cookiecutter.dataset_name }}/sample_table.csv` should contain a single column with all SRA IDs to be processed.

In [None]:
# Input folder of the dataset and working folder for this cleanup step.
# NOTE(review): working_dir is a project helper (loaded by init.py); it
# presumably creates/enters the directory -- confirm against its definition.
data_dir = os.path.join(DATA, DATASET)
result_dir = working_dir(os.path.join(RESULTS, DATASET, 'contamination_cleanup'))

# The sample table is read without a header row, so its column is labelled 0.
sample_table = os.path.join(data_dir, 'sample_table.csv')
sra_df = pandas.read_csv(sample_table, header=None)
sra_df

## Testing gcloud configuration

### Requirements

#### [Cloud SDK](https://cloud.google.com/sdk)


Run *gcloud init* to initialize the gcloud environment and follow its instructions:

 `$ gcloud init`

In [None]:
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')

### Defining variables

Edit the GCP zone and region variables according to your geographical location.

In [None]:
# GCP region and zone written into the Elastic-Blast configuration files
# generated later in this notebook. Edit both to match your location; the
# zone is expected to belong to the chosen region.
REGION = 'us-east4'
ZONE = 'us-east4-b'

### Retrieve GCP storage bucket

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://transcriptome-vector-'
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://transcriptome-vector-','').replace('/','')
        break

vector_bucket  = 'transcriptome-vector-' + bucket
print('vector bucket: {0}'.format(vector_bucket))

for s in sra_df[0].unique():
    out_bucket = '{}-{}'.format(s.lower(),bucket)
        
    bucket_list = !gsutil ls gs://{out_bucket}
    if ''.join(bucket_list).startswith('BucketNotFoundException'):
        !gsutil mb gs://{out_bucket}

### Creating Elastic-Blast config files

Elastic-Blast will create a Kubernetes cluster for processing all sample FASTA files.
The **NUMBER_OF_INSTANCES** to use should be compatible with the GCP quotas defined in the project.
The cluster will use **n1-standard-32** machines; therefore, the total number of CPUs used in the ZONE
is **NUMBER_OF_INSTANCES * 32**. Additionally, the Local SSD (GB) quota should also be adjusted.
Each instance will use a 1500 GB local disk; therefore, the total local SSD available should
be **NUMBER_OF_INSTANCES * 1500**.


In [None]:
# Number of VMs in the Elastic-Blast cluster
NUMBER_OF_INSTANCES = 30

# Write one 'elastic-blast-<sample>.cfg' file per unique sample ID into the
# current working directory.
for sample_id in sra_df[0].unique():
    # Per-sample result bucket and the query FASTA stored in the vector bucket.
    results_url = 'gs://{}-{}'.format(sample_id.lower(),bucket)
    queries_url = 'gs://{}/{}_clean_1.fastq.fsa'.format(vector_bucket, sample_id)

    config = configparser.ConfigParser()
    config.read_dict({
        'cloud-provider': {
            'gcp-project': project,
            'gcp-region': REGION,
            'gcp-zone': ZONE,
        },
        'cluster': {
            'num-nodes': str(NUMBER_OF_INSTANCES),
            'use-preemptible': 'yes',
            'machine-type': 'n1-standard-32',
        },
        'blast': {
            'program': 'blastn',
            'db': 'nt',
            'results': results_url,
            'options': '-task megablast -evalue 1e-5 -max_target_seqs 5 -outfmt "6 qseqid sgi saccver length pident evalue bitscore score staxid"',
            'queries': queries_url,
        },
    })

    cfg_name = 'elastic-blast-{}.cfg'.format(sample_id)
    with open(cfg_name, 'w') as fout:
        config.write(fout)

### Run elastic-blast in a terminal using these commands

This process may take a long time depending on the size of the samples.

In [None]:
# Print the shell commands to paste into a terminal: enter the results
# folder, put it on PATH, then submit one Elastic-Blast job per sample.
commands = [
    'cd {}'.format(result_dir),
    'export PATH=`pwd`:$PATH',
]
commands.extend(
    'elastic-blast submit --cfg elastic-blast-{}.cfg'.format(sample_id)
    for sample_id in sra_df[0].unique()
)
print('\n'.join(commands))

### Creating taxonomy graph

In [None]:
TAXDUMP_FILE = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
TAX_DIR = os.path.join(DATA, 'taxonomy')
TAX_PICKLE = os.path.join(TAX_DIR, 'taxonomy_networkx.pickle')
if not os.path.exists(TAX_DIR):
    os.mkdir(TAX_DIR)
    
os.chdir(TAX_DIR)
if not os.path.exists(TAX_PICKLE):
    !wget {TAXDUMP_FILE}
    !tar xzf taxdump.tar.gz
    node_file = 'nodes.dmp'
    name_file = 'names.dmp'
    tax_id = parse_tax_name_file(name_file)    
    print('Taxonomies: {}'.format(len(tax_id)))
    tax = nx.DiGraph()
    entries = parse_nodes_file(node_file, tax_id)
    nodes, edges = zip(*entries)
    print('{} nodes created'.format(len(nodes)))
    tax.add_nodes_from(nodes)
    for e in edges:
        if e: 
            tax.add_edge(*e)
    print('Printing pickle file')
    pickle.dump(tax, open(TAX_PICKLE, "wb"))    
    !ls -1 | grep -v `basename {TAX_PICKLE}` | xargs rm -v
    vector_bucket  = 'transcriptome-vector-' + bucket
    !gsutil -m cp taxonomy_networkx.pickle gs://{vector_bucket}/

## Removing contaminated reads 

In [None]:
operations = {}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-contamination-cleanup.json')
TAXID = 33090

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir) 
os.chdir(op_dir)
       
if os.path.exists('operations-contamination-cleanup.tsv'):
    operations['logs'] = {}
    operations['operations'] = pandas.read_csv('operations-contamination-cleanup.tsv', sep='\t')
else:
    d = []
    for f in sra_df[0].unique():
        blast_bucket = '{}-{}'.format(f.lower(),bucket)
        a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=TAXID={TAXID},BLAST_BUCKET={blast_bucket},OUTBUCKET={vector_bucket},INBUCKET={vector_bucket},SAMPLE={f}
        if len(a) == 1 and a[0].startswith('Running'):
            a = a[0].replace('].','').split('/')[5]
            d.append([f, a, 'running'])
        else:
            d.append([f, None, a])
    operations['logs'] = {}
    operations['operations'] = pandas.DataFrame(d, columns=['sample', 'id', 'status'])
    operations['operations'].to_csv('operations-contamination-cleanup.tsv', sep='\t', index=None)

display(operations['operations'])

In [None]:
os.chdir(op_dir)
    
df = operations['operations'].dropna()
data = []                
for i, r in df.iterrows():
    id = r['id']
    if os.path.exists('{}.json.gz'.format(r['sample'])):
        with gzip.GzipFile('{}.json.gz'.format(r['sample']), 'r') as fin:  
            operations['logs'][r['sample']] = json.loads(fin.read().decode('utf-8'))
    else:
        if r['sample'] not in operations['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                operations['logs'][r['sample']] = l
                with gzip.GzipFile('{}.json.gz'.format(r['sample']), 'w') as fout:   # 4. gzip
                    fout.write(json.dumps(l, indent=2).encode('utf-8'))  
    if r['sample'] in operations['logs']:
        ts = get_gpc_starttimestamp(operations['logs'][r['sample']])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations['logs'][r['sample']]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([r['sample'], elapsed])
operations['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
display(operations['gcp'])


MACHINE_PRICE = 0.02 # n1-standard-2 preemptible
print('Computing cost: $ {:.2f}'.format(operations['gcp']['Time'].sum() * MACHINE_PRICE/60))

## Downloading FastQC results from GCP

In [None]:
# Synchronise the vector bucket into the local results folder.
# The -x regex excludes every object whose path contains '.fastq', so the
# (large) read files stay in the bucket and only the remaining outputs are
# copied. The -o flags override gsutil settings for this invocation only.
os.chdir(result_dir)
!gsutil -m -o 'GSUtil:parallel_composite_upload_threshold=150M' -o 'GSUtil:parallel_process_count=4' -o 'GSUtil:parallel_thread_count=4' rsync -x '.*\.fastq\.*' gs://{vector_bucket}/ ./ 


In [None]:
# Render a Markdown table summarising the FastQC results of both read files
# (_1 and _2) of every decontaminated sample.
display(Markdown('#### FastQC report\n'))
# Relative paths below are computed from the notebooks folder.
os.chdir(NOTEBOOKS)

base_url = 'https://storage.cloud.google.com/{}/'.format(vector_bucket)

# Table fragments, concatenated once at the end.
table = [
    '| Sample | FastQC<br>Report | No of Reads<br>in fastq | Seq<br> Len | %GC ',
    '| Poor<br>Quality | Fail<br>Tests |\n',
    '| --- | --- |--- | --- | --- | --- | --- |\n',
]
for sample in sra_df[0].unique():
    for read_no in (1, 2):
        name = '{}_clean_noCont_{}'.format(sample, read_no)
        table.append('| <a href="{0}{1}.fastq.gz" target="_blank">{1}</a>'.format(base_url, name))
        table.append('| ')
        # NOTE(review): find_file_print_link_size / parse_fastqc_zip are
        # project helpers loaded by init.py -- confirm their contracts there.
        table.append(find_file_print_link_size(result_dir, name, '.html', 'MB', ' --- '))
        table.append(' |')

        zip_path = os.path.relpath(os.path.join(result_dir, name + '_fastqc.zip'))
        has_report = os.path.exists(zip_path) and os.path.getsize(zip_path) != 0
        if not has_report:
            # Missing or empty FastQC archive: fill the row with placeholders.
            table.append(' --- | --- | --- | --- | --- |\n')
            continue

        tests, tot_seq, poor_quality, seq_len, gc_content = parse_fastqc_zip(zip_path)
        failed = [t for t in tests if tests[t] == 'FAIL']
        table.extend([
            "{:,}".format(tot_seq), '|',
            seq_len, '|',
            gc_content, '|',
            str(poor_quality), '|',
            '<br>'.join(failed), '|\n',
        ])

str_msg = ''.join(table)
display(Markdown(str_msg))
del str_msg

### Delete temporary buckets

In [None]:
for s in sra_df[0].unique():
    out_bucket = '{}-{}'.format(s.lower(),bucket)
    !gsutil -m rm -r gs://{out_bucket}