In [18]:
import numpy as np
from Bio import AlignIO
from Bio import SeqIO

from bokeh.plotting import figure, show, output_notebook
# from bokeh.charts import Scatter, output_file, show
from bkcharts import Scatter, output_file, show
from bokeh.palettes import Spectral4
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, Band

import urllib
import time
import re

# re.compile("([/])").split(alignment.name)[0]

### Overview

- [Load MSA file](#Load-MSA-file)
- [Access UniProt Database](#Access-UniProt-DB-for-each-sequence-in-MSA)
- [Write to file](#Write-names-and-descriptions-to-file)

## Load MSA file

In [30]:
# Number of alignments considered and the suffixes of the filtered variants.
num_msa = 4
filter_length = {0:'f10', 1:'f20', 2:'f30',3:'f40', 4:'f50'}
folder = './actin-sequences/pfam-MSA/pfam_31/'

# Store file *paths* instead of open handles: AlignIO.read accepts a filename,
# opens and closes the file itself, and can therefore be re-run any number of
# times (an already-consumed handle cannot be parsed a second time).
msa_input_1 = folder + 'PF00022_full_pfam31.afa'
msa_input_2 = folder + 'PF00022_full_pfam31.afa_f10'
msa_input_3 = folder + 'PF00022_full_pfam31.afa_f20'
msa_input_4 = folder + 'PF00022_full_pfam31.afa_f30'

# The alignment parsed by the following cell. It used to be np.zeros(num_msa),
# which is not something AlignIO.read can parse.
# NOTE(review): assumes the unfiltered alignment is the one to analyse; point
# this at msa_input_2..4 to analyse a filtered variant instead.
msa_input = msa_input_1
# print msa_input


In [3]:
# Length of the actin sequence used as the reference for coverage.
length_sequence = 375

# Parse the FASTA alignment and record its dimensions.
alignment = AlignIO.read(msa_input, 'fasta')
total_sequences = len(alignment)
length_alignment = alignment.get_alignment_length()

# Sequences per reference residue; note the report below truncates it to an integer.
coverage = total_sequences / float(length_sequence)

print('\n'.join([
    'Number of sequences: %d' % total_sequences,
    "Alignment length %d" % length_alignment,
    "Actin sequence length: %d" % length_sequence,
    "Seq/Len: %d" % coverage,
]))

Number of sequences: 14516
Alignment length 407
Actin sequence length: 375
Seq/Len: 38


In [11]:
def calculateCoverage(alignment, total_sequences, length_sequence, length_alignment):
    """
    Compute a per-site coverage profile and a per-sequence gap fraction.

    Only the first ``length_sequence`` columns of each record are inspected.

    Parameters
    ----------
    alignment : iterable
        Records exposing a ``seq`` attribute that can be indexed by column
        and yields ``'-'`` at gap positions.
    total_sequences : int
        Number of records in ``alignment``; sizes the output arrays.
    length_sequence : int
        Number of leading alignment columns to scan.
    length_alignment : int
        Denominator used for the per-sequence gap fraction.

    Returns
    -------
    avg_site_coverage : numpy.ndarray, shape (length_sequence,)
        Column-wise sum of the per-record site values divided by
        ``total_sequences``.
    percent_gaps : numpy.ndarray, shape (total_sequences,)
        Gaps found in the scanned columns divided by ``length_alignment``.
    """
    #modify this to include multiple alignments
    site_coverage = np.zeros((total_sequences, length_sequence))
    percent_gaps = np.zeros(total_sequences)
    # range (not xrange) keeps the function usable on both Python 2 and 3.
    columns = range(length_sequence)

    for sequence_index, record in enumerate(alignment):
        current_sequence = record.seq
        is_gap = np.array(
            [current_sequence[column] == '-' for column in columns], dtype=bool)

        # NOTE(review): the value written at non-gap sites grows with the
        # record's position in the file (record 0 always contributes 0).
        # Preserved as-is; confirm this weighting is intended and not a
        # stand-in for a constant per-sequence contribution.
        site_coverage[sequence_index, ~is_gap] = (
            float(sequence_index) / float(length_sequence))
        #put code here to write counted sequences to new file

        # NOTE(review): gaps are counted over length_sequence columns but
        # normalised by length_alignment; preserved as-is, confirm intent.
        percent_gaps[sequence_index] = float(is_gap.sum()) / float(length_alignment)

    avg_site_coverage = site_coverage.sum(axis=0) / float(total_sequences)

    return avg_site_coverage, percent_gaps

In [12]:
def plotHist(array, num_bins=50, 
             plot_title='Sequence length Coverage', xlabel='Seq/Len', ylabel='Counts'):
    """
    Bin ``array`` with ``np.histogram`` and show the result as a Bokeh quad plot.

    The total of all bin counts is printed before the figure is displayed
    inline in the notebook.

    Parameters
    ----------
    array : array-like
        Values to histogram.
    num_bins : int
        Passed to ``np.histogram`` as ``bins``.
    plot_title, xlabel, ylabel : str
        Figure title and axis labels.
    """
    counts, bin_edges = np.histogram(array, bins=num_bins)
    print('Counts add to: %d' % np.sum(counts))

    toolbar = "crosshair,pan,wheel_zoom,reset,save,box_select"
    hover_tool = HoverTool(tooltips=[('x-axis, Counts', '$x{0.00}, $y{0.00}')])
    hist_fig = figure(title=plot_title,
                      width=800, height=300, tools=[toolbar, hover_tool],
                      background_fill_color='beige')

    hist_fig.xaxis.axis_label = xlabel
    hist_fig.yaxis.axis_label = ylabel

    # One rectangle per bin, spanning consecutive bin edges.
    left_edges, right_edges = bin_edges[:-1], bin_edges[1:]
    hist_fig.quad(top=counts, bottom=0, left=left_edges, right=right_edges,
                  fill_color="teal", line_color="#033649", alpha=0.5)

    hist_fig.ygrid.band_fill_color = "navy"
    hist_fig.ygrid.band_fill_alpha = 0.1
    hist_fig.legend.click_policy="hide"

    output_notebook()
    show(hist_fig)

In [13]:
def plotSiteCoverage(array, x_range, coverage):
    """
    Plot a per-site coverage profile together with a reference level.

    Parameters
    ----------
    array : sequence of float, length ``x_range``
        Per-site values, drawn as the solid navy line.
    x_range : int
        Number of sites; the x axis spans ``[0, x_range]``.
    coverage : float or sequence of float
        Reference level, drawn as the dashed firebrick line. A scalar is
        expanded to a horizontal line across all sites.
    """
    # list(range(...)) works on Python 2 and 3 (xrange does not exist on 3).
    sites = list(range(x_range))
    # Expand a scalar reference to one value per site so x and y have equal length.
    reference = np.ones(x_range) * coverage

    hover = HoverTool(tooltips=[('Site, Coverage', '$x{0}, $y{0.00}')])
    TOOLS = "crosshair, pan, wheel_zoom, reset, save, box_select"
    p = figure(title='Single-Site Coverage', width=800, height=300, 
              tools=[TOOLS, hover], toolbar_location="above",
              x_axis_label='Amino Acid Site', x_range=[0,x_range],
              y_axis_label='Fully-Covered Seqs')
    
    p.line(sites, reference, line_dash='dashed', line_width=1.2, color='firebrick')
    p.line(sites, array, color='navy',line_width=1)
    p.background_fill_color='beige'
    # NOTE(review): neither line is given a legend label, so the legend
    # settings in this function presumably have no visible effect; kept unchanged.
    p.legend.location='top_left'
    p.xgrid.grid_line_color = 'navy'
#     p.xgrid.grid_line_dash = 'dashed'
    p.xgrid.grid_line_alpha = 0.1
    p.ygrid.band_fill_alpha = 0.1
    p.ygrid.band_fill_color = "navy"
    p.legend.orientation='horizontal'
    p.legend.click_policy='hide'
    output_notebook()
    show(p)

In [14]:
# Per-site coverage profile and per-sequence gap fractions for the loaded alignment.
a_site_coverage, p_gaps = calculateCoverage(
    alignment=alignment,
    total_sequences=total_sequences,
    length_sequence=length_sequence,
    length_alignment=length_alignment,
)

In [15]:
# Site profile against the overall Seq/Len reference level.
plotSiteCoverage(array=a_site_coverage, x_range=length_sequence, coverage=coverage)

In [16]:
# Distribution of the per-sequence gap fraction.
plotHist(
    array=p_gaps,
    num_bins=50,
    plot_title='Fraction of Sequence Gaps',
    xlabel='Fraction Gaps',
)

Counts add to: 14516


In [None]:
# Read the Stockholm-format alignment. Passing the path lets AlignIO open and
# close the file itself instead of leaving a handle open.
alignment_stk = AlignIO.read(folder + 'PF00022_full_pfam31.stk', format='stockholm')
l_a = alignment_stk.get_alignment_length()
print("Alignment length: %d" % l_a)
r = alignment_stk[2]
# Non-gap columns of the third record as a fraction. The subtraction is
# parenthesised so it happens before the division; without the parentheses the
# expression subtracted a fraction from a length.
# NOTE(review): 3445 is presumably the Stockholm alignment length (l_a) - confirm.
(l_a - r.seq.count('-')) / float(3445)

## Access UniProt DB for each sequence in MSA

[Back to overview](#Overview)

In [None]:
# count = 0
# t0 = time.time()
# for record in alignment[:2]:
#     #print(record.id)
#     #handle = urllib.urlopen("http://www.uniprot.org/uniprot/"+ record.id +".xml")
# #     try:
#     record_id = re.compile("([/])").split(record.id)[0]
#     uni_record = SeqIO.read(urllib.urlopen("http://www.uniprot.org/uniprot/"+ record_id +".xml"), "uniprot-xml")
#     names.append(record.id)
#     descriptions.append(uni_record.description)
# #     except:
# #         pass
#     count += 1
# t1 = time.time()
# total = t1-t0
# print "Parsed %d sequences." % count
# print "Time elapsed: %f seconds.\n" % total

## Write names and descriptions to file

[Back to overview](#Overview)

In [None]:
# output = open("./actin-sequences/jackhmmer/actin-mreb/test.txt", "w")
# for i in xrange(count):
#     #if unknown description dont print (implement in the future)
#     output.write(names[i] + "\n" + 'Description: ' + descriptions[i] + "\n\n")
# output.close()
# print "Writing names and descriptions to file."

In [None]:
# msa2 = "./actin-sequences/jackhmmer/actin-actin_domain/actin-actin_cyt.afa_filtered50"
# alignment2 = AlignIO.read(open(msa2), "fasta")
# for record2 in alignment[:5]:
#     print record2.id
    

# #N2 = len(alignment2)
# #for j in np.arange(N2):
#     #print j