## Aligning raw reads to the transcriptome

In [None]:
%run ../config/init.py

In [None]:
# Working directories for this notebook. RESULTS, DATA and DATASET are not
# defined in this cell; presumably they are set by ../config/init.py.
result_dir = os.path.join(RESULTS, DATASET, 'alignments')
submission_dir = os.path.join(RESULTS, DATASET, 'submission')
fasta_dir = os.path.join(result_dir, 'fasta_genbank_ids')

# makedirs also creates result_dir (and any missing parent such as
# RESULTS/DATASET) on the way down, and exist_ok makes re-running harmless.
os.makedirs(fasta_dir, exist_ok=True)

# Sample table without a header row; column 0 is used as the SRA run id
# by the submission and statistics cells.
sra_df = pandas.read_csv(os.path.join(DATA, DATASET, 'sample_table.csv'), header=None)
sra_df

### Collecting submission file names

In [None]:
# Base names (extension removed) of every '.fsa' file found under submission_dir.
# Only the trailing extension is stripped: str.replace would also remove a
# '.fsa' occurring in the middle of a name, so appending '.fsa' again later
# would no longer point at the real file.
splitted_files = [
    name[:-len('.fsa')]
    for _root, _dirs, names in os.walk(submission_dir)
    for name in names
    if name.endswith('.fsa')
]

### Changing FASTA IDs to use the new GenBank IDs

Download the accession list from the TSA submission into the submission directory,
then set the `ACCESSION_FILE` variable to the name of that file.

In [None]:
# File name of the accession list inside submission_dir.
ACCESSION_FILE = 'GISG03_accs'

# Tab-separated table without a header row, so columns are addressed by
# position (0 and 1 are the ones used when rewriting the FASTA ids).
accession_path = os.path.join(submission_dir, ACCESSION_FILE)
accs = pandas.read_csv(accession_path, header=None, sep='\t')
accs.head()

### Creating a new Fasta file with submitted transcripts

A new file named *transcriptome.fsa* will be created with the submitted transcripts.

In [None]:
os.chdir(fasta_dir)
bar_length = 100
if not os.path.exists('transcriptome.fsa'):
    with open('transcriptome.fsa', "w") as output_handle:
        for s in splitted_files:
            sname = os.path.join(submission_dir, s + '.fsa')
            if os.path.exists(sname):
                total = !grep -c "^>" {sname}
                total = int(''.join(total))
                print('Processing sample: {} with {} transcripts'.format(s,total))
                count = 0
                for record in SeqIO.parse(sname, 'fasta'): 
                    record.id = accs[accs[0] == record.id][1].iloc[0]
                    record.description = ''
                    count += 1
                    percent = count * 100/total
                    SeqIO.write(record, output_handle, "fasta")
                    print('{:6d} [{}] {:3.1f}%'.format(count, 
                                                       "#" * int(percent) + "-" * (bar_length - int(percent)), 
                                                       percent), 
                          end='\r')
                print()

### Testing gcloud configuration

In [None]:
ZONE = 'us-east4'
REGION = 'us-east4-c'
!gcloud --version
account = !gcloud config get-value account
account = ''.join(account)
project = !gcloud config get-value project
project = ''.join(project)
if account != '(unset)' and project != '(unset)':
    print('Using account: {} with project: {}'.format(account, project))
else:
    print('Please, configure Cloud SDK before running this notebook')
    print('Open a Terminal and run: gcloud init')


### Create GCP bucket for SRA files

In [None]:
os.chdir(fasta_dir)
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://{}-vector-'.format(DATASET.lower())
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://{}-vector-'.format(DATASET.lower()),'').replace('/','')
        break

inbucket = '{}-vector-{}'.format(DATASET.lower(), bucket)
outbucket = '{}-align-{}'.format(DATASET.lower(), bucket)

bucket_list = !gsutil ls gs://{outbucket}
if ''.join(bucket_list).startswith('BucketNotFoundException'):
    !gsutil mb gs://{outbucket}
    !gsutil -m cp -R transcriptome.fsa gs://{inbucket}/

print('input bucket: {}'.format(inbucket))
print('output bucket: {}'.format(outbucket))

### Submitting alignments

In [None]:
PIPELINE_JSON = os.path.join(BIN, 'gcp', 'pipeline-read-assignment.json')

operations_alignment = {}

op_dir = os.path.join(result_dir, 'gcp')
if not os.path.exists(op_dir):
    os.mkdir(op_dir) 
os.chdir(op_dir)
                
if os.path.exists('operations-alignment.tsv'):
    operations_alignment['logs'] = {}
    operations_alignment['operations'] = pandas.read_csv('operations-alignment.tsv', sep='\t')
else:
    d = []    
    for f in sra_df[0].unique():
        a = !gcloud beta lifesciences pipelines run --pipeline-file={PIPELINE_JSON} --env-vars=INBUCKET={inbucket},OUTBUCKET={outbucket},SRA={f}
        if len(a) == 1 and a[0].startswith('Running'):
            a = a[0].replace('].','').split('/')[5]
            d.append([f, a, 'running'])
        else:
            d.append([f, None, a])
    operations_alignment['logs'] = {}
    operations_alignment['operations'] = pandas.DataFrame(d, columns=['sra', 'id', 'status'])
    operations_alignment['operations'].to_csv('operations-alignment.tsv', sep='\t', index=None)

display(operations_alignment['operations'])


In [None]:
os.chdir(op_dir)
    
df = operations_alignment['operations'].dropna()
data = []                
for i, r in df.iterrows():
    id = r['id']
    sample = '{}_{}'.format(r['sample'], r['sra'])
    if os.path.exists('{}.json.gz'.format(sample)):
        with gzip.GzipFile('{}.json.gz'.format(sample), 'r') as fin:  
            operations_alignment['logs'][sample] = json.loads(fin.read().decode('utf-8'))
    else:
        if sample not in operations_alignment['logs']:
            a = !gcloud beta lifesciences operations describe --format=json {id}
            l = json.loads(''.join(a))
            if 'done' in l:
                operations_alignment['logs'][sample] = l
                with gzip.GzipFile('{}.json.gz'.format(sample), 'w') as fout:   # 4. gzip
                    fout.write(json.dumps(l, indent=2).encode('utf-8'))  
    if sample in operations_alignment['logs']:
        ts = get_gpc_starttimestamp(operations_alignment['logs'][sample])
        ts = datetime.strptime(ts.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        te = datetime.strptime(operations_alignment['logs'][sample]['metadata']['endTime'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
        elapsed = te - ts
        data.append([sample, elapsed])
operations_alignment['gcp'] = pandas.DataFrame(data, columns=['Sample', 'Time'])
operations_alignment['gcp']['Time'] = operations_alignment['gcp']['Time']/pandas.Timedelta('1 minute')

MACHINE_PRICE = 0.16 # n1-standard-16 preemptible
print('Alignment cost: $ {:.2f}'.format(operations_alignment['gcp']['Time'].sum() * MACHINE_PRICE/60))

display(operations_alignment['gcp'])

### Download results from GCP

In [None]:
# Work from result_dir so the bucket contents are copied underneath it.
os.chdir(result_dir)
# Recursive, multi-threaded copy of the whole output bucket into the current
# directory.
# NOTE(review): the statistics cell reads from os.path.join(outbucket, <sra>)
# relative to result_dir, so this copy is expected to create a local directory
# named after the bucket - confirm.
!gsutil -m -o 'GSUtil:parallel_composite_upload_threshold=150M' -o 'GSUtil:parallel_process_count=4' -o 'GSUtil:parallel_thread_count=4' cp -R gs://{outbucket}/ ./ 
    

### Reads Stats

In [None]:
# Render a Markdown table with one row of alignment statistics per SRA id,
# read from the single non-empty '<sra>*.stats' file in each downloaded folder.
os.chdir(result_dir)

header = [
    'Sample',
    'Total<br>reads',
    'Mapped<br>reads',
    'Mapped<br>reads (%)',
    'Un-Mapped<br>reads',
    'Un-Mapped<br>reads (%)',
    'Properly<br>paired<br>reads (%)',
    'Error<br>rate',
    'Average<br>length',
    'Average<br>quality',
]

# Header row followed by the Markdown separator row (one '---' per column).
table_lines = [
    '| ' + ' | '.join(header) + ' |\n',
    '| ' + ' | '.join(['---'] * len(header)) + ' |\n',
]

for s in sra_df[0].unique():
    alignment_path = os.path.join(outbucket, s)
    # Keep the directory each file was found in: joining the bare file name to
    # alignment_path breaks for files that os.walk finds in a subdirectory.
    files = [os.path.join(root, name)
             for root, _dirs, names in os.walk(alignment_path)
             for name in names
             if name.startswith(s) and name.endswith('.stats')
             and os.path.getsize(os.path.join(root, name)) != 0]
    if len(files) == 1:
        f = os.path.relpath(files[0])
        stats = load_content_dict_line(f, ':', 'SN', '\t', True, 'SN\t', '')
        total = float(stats['raw total sequences'])

        def _percent_of_total(value):
            # A stats file reporting zero reads must not abort the whole table.
            return float(value) * 100 / total if total else 0.0

        cells = [
            "{:,}".format(int(stats['raw total sequences'])),
            "{:,}".format(int(stats['reads mapped'])),
            "{:.2f}".format(_percent_of_total(stats['reads mapped'])),
            "{:,}".format(int(stats['reads unmapped'])),
            "{:.2f}".format(_percent_of_total(stats['reads unmapped'])),
            "{:,}".format(float(stats['percentage of properly paired reads (%)'])),
            "{:.2e}".format(float(stats['error rate'].replace('%', ''))),
            "{:,}".format(int(stats['average length'].replace('%', ''))),
            "{:.1f}".format(float(stats['average quality'].replace('%', ''))),
        ]
    else:
        # No usable stats file (none, several, or empty): emit one placeholder
        # per value column so the row has as many cells as the header.
        cells = ['---'] * (len(header) - 1)
    table_lines.append('| ' + s + '| ' + ' |'.join(cells) + ' |\n')

str_msg = ''.join(table_lines)
display(Markdown(str_msg))
del str_msg
