## Trimming samples with Trimmomatic

In [None]:
# Execute the shared project initialisation script inside this kernel's namespace.
# NOTE(review): the cells below use names that are never defined in this notebook
# (e.g. os, pandas, DATA, DATASET, RESULTS, working_dir) - presumably they are
# provided by init.py; confirm before editing that script.
%run ../config/init.py

### Loading data from {{ cookiecutter.dataset_name }}/sample_table.csv accession list

The file `{{ cookiecutter.dataset_name }}/sample_table.csv` should contain a single column with all SRA IDs to be processed.

In [None]:
# Input folder of the dataset and the working folder for the Trimmomatic results.
data_dir = os.path.join(DATA, DATASET)
trimmomatic_dir = os.path.join(RESULTS, DATASET, 'trimmomatic')
result_dir = working_dir(trimmomatic_dir)

# The sample table has no header row, so its single column is addressed as sra_df[0].
sample_table = os.path.join(data_dir, 'sample_table.csv')
sra_df = pandas.read_csv(sample_table, header=None)
sra_df

## Testing gcloud configuration

### Requirements

#### [Cloud SDK](https://cloud.google.com/sdk)


Run *gcloud init* to initialize the gcloud environment and follow its instructions:

 `$ gcloud init`

In [None]:
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')

### Defining variables

Edit the GCP zone and region variables according to your geographical location.

In [None]:
# Settings handed to update_pipeline() in the job-submission cell below.
# MACHINE_PRICE in the log-retrieval cell is hard-coded for this machine type
# (preemptible), so keep the two in sync when changing it.
MACHINE_TYPE = 'n1-standard-16'
# Request preemptible VMs.
PREEMPTIBLE = True
LOCAL_SSD_SIZE = 375 # 1 disk is size 375, 2 disks are 750 ...

# Use zones close to your location. Multiple zones allow more access to resources
ZONES = ['us-east1-c', 'us-east1-b','us-east1-d', 'us-east4-a', 'us-east4-b', 'us-east4-c']

### Retrieve GCP storage bucket

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://{}-sra-'.format(DATASET.lower())
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://{}-sra-'.format(DATASET.lower()),'').replace('/','')
        break

inbucket_name = '{}-sra-{}'.format(DATASET.lower(), bucket)
print('in bucket: {}'.format(inbucket_name))

outbucket_name = '{}-trimming-{}'.format(DATASET.lower(), bucket)

bucket_list = !gsutil ls gs://{outbucket_name}
if ''.join(bucket_list).startswith('BucketNotFoundException'):
    !gsutil mb gs://{outbucket_name}

print('out bucket: {}'.format(outbucket_name))

### Creating Trimmomatic workflow input file

Trimmomatic options should be modified according to the FastQC report, taking into account:

{% if cookiecutter.sequencing_technology == 'paired-end' %}
 * IlluminaClip: TruSeq3-PE.fa:2:30:10
{% else %}
 * IlluminaClip: TruSeq3-SE.fa:2:30:10
{% endif %}
 * Minlen: 25
 * Avgqual: 30
 * Leading: 30
 * Trailing: 30
 
For more info about [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)

### More options from the Trimmomatic CWL workflow
```
       [--avgqual AVGQUAL] 
       [--crop CROP]       
       [--headcrop HEADCROP]
       [--illuminaClip ILLUMINACLIP]
       [--leading LEADING]
       [--maxinfo MAXINFO]
       [--minlen MINLEN]
       [--phred PHRED]
       [--tophred33]
       [--tophred64]
       [--trailing TRAILING]
```

In [None]:
for s in sra_df[0]:
    MINLEN = 50
    AVGQUAL = 30
    LEADING = 30
    TRAILING = 30
    HEADCROP = 10

{% if cookiecutter.sequencing_technology == 'paired-end' %}
    TRIMMOMATIC_ADAPTER = '/_conda/envs/__trimmomatic@0.39/share/trimmomatic/adapters/TruSeq3-PE.fa:2:30:10'
{% else %}
    TRIMMOMATIC_ADAPTER = '/_conda/envs/__trimmomatic@0.39/share/trimmomatic/adapters/TruSeq3-SE.fa:2:30:10'
{% endif %}

    trimming_yml = {
        'threads': 16,
        'illuminaClip': TRIMMOMATIC_ADAPTER,
        'minlen': MINLEN,
        'avgqual': AVGQUAL,
        'leading': LEADING,
        'trailing': TRAILING,
        'headcrop': HEADCROP,
        'input_files': [
{% if cookiecutter.sequencing_technology == 'paired-end' %}
            [{'class': 'File', 'path': '/data/{}_1.fastq.gz'.format(s)},
            {'class': 'File', 'path': '/data/{}_2.fastq.gz'.format(s)}]
{% else %}
            {'class': 'File', 'path': '/data/{}.fastq.gz'.format(s)}
{% endif %}
        ]
    }
    trimming_file = '{}_trimming.yml'.format(s)
    write_to_yaml(trimming_yml, trimming_file)  
    !gsutil cp {trimming_file} gs://{inbucket_name}

### Submitting jobs for trimming the FASTQ files on GCP

In [None]:
# Bookkeeping for the submitted jobs: downloaded logs per sample and a table
# with one row per sample (operation id and submission status).
operations = {
    'logs':{},
    'operations': pandas.DataFrame(columns=['sample', 'id', 'status'])}
{% if cookiecutter.sequencing_technology == 'paired-end' %}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-trimming-fastq-pe.json')
{% else %}
PIPELINE = os.path.join(BIN, 'gcp', 'pipeline-trimming-fastq-se.json')
{% endif %}
update_pipeline(PIPELINE, ZONES, MACHINE_TYPE, LOCAL_SSD_SIZE, PREEMPTIBLE)

os.chdir(result_dir)
op_dir = os.path.join(result_dir, 'gcp')
os.makedirs(op_dir, exist_ok=True)
os.chdir(op_dir)

# Resume from a previous run of this cell when its table was saved.
if os.path.exists('operations-trimming-fastq.tsv'):
    operations['operations'] = pandas.read_csv('operations-trimming-fastq.tsv', sep='\t')

# Only rows holding an operation id were really submitted. Rows without one
# record a failed submission and must be retried instead of being skipped.
previous = operations['operations']
submitted = set(previous.loc[previous['id'].notna(), 'sample'])

new_rows = []
for sample_id in sra_df[0].unique():
    if sample_id in submitted:
        continue
    print('Submitting sample: ' + sample_id)
    a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE} --env-vars=OUTBUCKET={outbucket_name},INBUCKET={inbucket_name},SAMPLE={sample_id}
    if len(a) == 1 and a[0].startswith('Running'):
        # On success gcloud prints a single 'Running [<path>].' line; the
        # sixth '/'-separated field of that path is kept as the operation id.
        op_id = a[0].replace('].','').split('/')[5]
        new_rows.append([sample_id, op_id, 'running'])
    else:
        # Keep the gcloud output as plain text so it survives the TSV round trip.
        new_rows.append([sample_id, None, ' '.join(a)])

if new_rows:
    new_df = pandas.DataFrame(new_rows, columns=['sample', 'id', 'status'])
    # Replace stale failure rows of the samples that were just re-submitted.
    kept = previous[~previous['sample'].isin(new_df['sample'])]
    operations['operations'] = pandas.concat([kept, new_df], ignore_index=True)
    operations['operations'].to_csv('operations-trimming-fastq.tsv', sep='\t', index=False)

display(operations['operations'])

### GCP log retrieval for plotting
This cell will download the GCP logs for completed operations (jobs) creating the `sample.json` files for each sample.

In [None]:
os.chdir(op_dir)

TIME_FORMAT = "%Y-%m-%dT%H:%M:%S"

# Rows with missing values (e.g. a failed submission without an operation id)
# cannot be queried, so they are left out.
df = operations['operations'].dropna()
data = []
with_error = []
count_running = 0
for i, r in df.iterrows():
    sample_id = r['sample']
    op_id = r['id']
    log_file = '{}.json.gz'.format(sample_id)
    if os.path.exists(log_file):
        # The log was already downloaded: load the local copy.
        with gzip.open(log_file, 'rt', encoding='utf-8') as fin:
            operations['logs'][sample_id] = json.load(fin)
    elif sample_id not in operations['logs']:
        a = !gcloud beta lifesciences operations describe --format=json {op_id}
        l = json.loads(''.join(a))
        if 'done' not in l:
            count_running += 1
        elif 'error' in l:
            with_error.append(op_id)
        else:
            # Finished without error: keep the log in memory and on disk.
            operations['logs'][sample_id] = l
            with gzip.open(log_file, 'wt', encoding='utf-8') as fout:
                json.dump(l, fout, indent=2)
    if sample_id in operations['logs']:
        log = operations['logs'][sample_id]
        # Fractional seconds are cut off before parsing the timestamps.
        ts = datetime.strptime(get_gpc_starttimestamp(log).split('.')[0], TIME_FORMAT)
        te = datetime.strptime(log['metadata']['endTime'].split('.')[0], TIME_FORMAT)
        elapsed = te - ts
        data.append([sample_id, elapsed])

if count_running != 0:
    print('Still running {}'.format(count_running))

if with_error:
    # Forget the failed operations so the submission cell runs them again.
    operations['operations'] = df[~df['id'].isin(with_error)]
    operations['operations'].to_csv('operations-trimming-fastq.tsv', sep='\t', index=None)
    print('{} runs with errors. Please rerun previous cell'.format(len(with_error)))
elif count_running == 0:
    operations['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
    # Express the elapsed time in minutes.
    operations['gcp']['Time'] = operations['gcp']['Time']/pandas.Timedelta('1 minute')
    display(operations['gcp'])
    MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
    # Time is in minutes; dividing by 60 applies MACHINE_PRICE per hour.
    print('Computing cost: $ {:.2f}'.format(operations['gcp']['Time'].sum() * MACHINE_PRICE/60))


## Downloading FastQC results from GCP

In [None]:
# Synchronise the output bucket into result_dir. The -x option excludes every
# object matching the regular expression, so the *.fastq.gz files stay in the
# bucket and are not downloaded.
os.chdir(result_dir)
!gsutil -q -m -o 'GSUtil:parallel_composite_upload_threshold=150M' -o 'GSUtil:parallel_process_count=4' -o 'GSUtil:parallel_thread_count=4' rsync -x '.*\.fastq\.gz' gs://{outbucket_name}/ ./


In [None]:
def fastqc_table_row(name, base_url, result_dir):
    """Build the Markdown table row describing one FASTQ file.

    name is the FASTQ base name without the '.fastq.gz' extension, base_url is
    the URL prefix of the output bucket and result_dir is the folder holding
    the downloaded FastQC files. The row links the FASTQ file and the FastQC
    HTML report. When '<name>_fastqc.zip' exists and is not empty, the values
    returned by parse_fastqc_zip and the names of the tests marked 'FAIL'
    fill the remaining columns; otherwise they are filled with ' --- '.
    """
    row = '| <a href="{0}{1}.fastq.gz" target="_blank">{1}</a>'.format(base_url, name)
    row += '| '
    row += find_file_print_link_size(result_dir, name, '.html', 'MB', ' --- ')
    row += ' |'
    # The path is made relative to the current working directory.
    f = os.path.relpath(os.path.join(result_dir, name + '_fastqc.zip'))
    if os.path.exists(f) and os.path.getsize(f) != 0:
        tests, tot_seq, poor_quality, seq_len, gc_content = parse_fastqc_zip(f)
        row += "{:,}".format(tot_seq) + '|'
        row += seq_len + '|'
        row += gc_content + '|'
        row += str(poor_quality) + '|'
        row += '<br>'.join(t for t in tests if tests[t] == 'FAIL') + '|\n'
    else:
        row += ' --- | --- | --- | --- | --- |\n'
    return row


str_msg = '#### FastQC report\n'
display(Markdown(str_msg))
os.chdir(NOTEBOOKS)

base_url = 'https://storage.cloud.google.com/{}/'.format(outbucket_name)

str_msg = '| Sample | FastQC<br>Report | No of Reads<br>in fastq | Seq<br> Len | %GC '
str_msg += '| Poor<br>Quality | Fail<br>Tests |\n'
str_msg += '| --- | --- |--- | --- | --- | --- | --- |\n'
for sample in sra_df[0].unique():
{% if cookiecutter.sequencing_technology == 'paired-end' %}
    # Paired-end samples have one FASTQ file per mate: <sample>_1 and <sample>_2.
    for r in range(1,3):
        str_msg += fastqc_table_row('{}_{}'.format(sample, r), base_url, result_dir)
{% else %}
    # The row is built from the current sample; the previous code looked up
    # the FastQC zip through `s`, a name this cell never defines.
    str_msg += fastqc_table_row(sample, base_url, result_dir)
{% endif %}
display(Markdown(str_msg))
del str_msg