# Interactive Circos-like plots with Bokeh

<u>Prototyping for proteomic and phosphoproteomic datasets.</u> <br>
<b>Goals:</b> Clickable links to protein descriptions, ability to show multiple (large) datasets<br>
<b>Open questions:</b> How to represent proteins with multiple peptides/phosphorylations? How to categorize the proteins (which ontology?)

In [1]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import TapTool, OpenURL, ColumnDataSource

import numpy as np
import pandas as pd

In [2]:
# Direct Bokeh output to the notebook; this line is necessary to view the
# figure inline in the notebook.
output_notebook() 

In [3]:
# acquiring the data
# ID, chromosome, location, intensity
# index by chromosome, location in pandas?

file = "pro_test_data.csv"
data = pd.read_csv(file)

# Keep only the single most intense peptide for each protein group.
# Is this the best way? Will likely change.
# idxmax() returns index *labels*, so the rows have to be fetched with .loc;
# .iloc only gave the right rows because read_csv's default index happens to
# make labels and positions coincide.
top_peptide_rows = data.groupby('Protein Group Accessions')['Intensity'].idxmax()
# Renumber the surviving rows 0..n-1 so code that derives a position from
# data.index sees one consecutive slot per protein group.
data = data.loc[top_peptide_rows].reset_index(drop=True)

# Displayed only: the result is not assigned, so `data` itself keeps
# 'Protein Group Accessions' as an ordinary column.
data.set_index('Protein Group Accessions')

Unnamed: 0_level_0,Confidence Level,Search ID,Processing Node No,Sequence,Unique Sequence ID,PSM Ambiguity,Protein Descriptions,# Proteins,# Protein Groups,Modifications,...,Delta Mass [PPM],RT [min],First Scan,Last Scan,MS Order,Ions Matched,Matched Ions,Total Ions,Spectrum File,Annotation
Protein Group Accessions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2BNC7,High,A,2,LLSEIVNK,3242,Unambiguous,DNA polymerase III subunit beta OS=Prochloroco...,1,1,,...,0.08,63.24,5511,5511,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_6.raw,
A2BNC8,High,A,2,YYLVSR,631533,Unambiguous,Uncharacterized protein OS=Prochlorococcus mar...,1,1,,...,0.20,55.97,4469,4469,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_7.raw,
A2BNC9,High,A,2,SLIEAcLDAFK,4670,Unambiguous,Phosphoribosylformylglycinamidine synthase sub...,1,1,C6(Carbamidomethyl),...,0.31,104.89,20886,20886,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_8.raw,
A2BND0,High,A,2,VFFTSAAPPVR,21801,Unambiguous,Amidophosphoribosyltransferase OS=Prochlorococ...,1,1,,...,0.32,79.55,9953,9953,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_10.raw,
A2BND1,High,A,2,LIGITNTGR,23345,Unambiguous,DNA topoisomerase (ATP-hydrolyzing) OS=Prochlo...,1,1,,...,-0.18,58.75,4894,4894,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_7.raw,
A2BND1;A2BRP1,High,A,2,SYLEYAMSVIVGR,9812,Unambiguous,DNA topoisomerase (ATP-hydrolyzing) OS=Prochlo...,2,2,,...,0.62,132.01,26775,26775,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_11.raw,
A2BND2,High,A,2,LQASTEILLQNEQLKK,51887,Unambiguous,"Flp pilus assembly protein TadD, contains TPR ...",1,1,,...,0.54,74.68,9198,9198,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_7.raw,
A2BND3,High,A,2,AIVEPFVIQSDLcIAYHTIESR,321109,Unambiguous,Uncharacterized Fe-S protein OS=Prochlorococcu...,1,1,C13(Carbamidomethyl),...,1.28,99.63,20750,20750,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_10.raw,
A2BND4,High,A,2,VVLVEYPR,16293,Unambiguous,Uncharacterized conserved protein OS=Prochloro...,1,1,,...,0.21,70.43,7133,7133,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_6.raw,
A2BND5,High,A,2,LIISSGQIDIR,475582,Unambiguous,N utilization substance protein B homolog OS=P...,1,1,,...,0.40,85.47,13113,13113,MS2,0/0,0,0,150925_Noelle_pro_log_A_exclusion_8.raw,


The following code is based on the Burtin antibiotics example in the Bokeh gallery: http://bokeh.pydata.org/en/latest/docs/gallery/burtin.html

In [4]:
# Figure size in pixels, and the radii that bound the ring of wedges.
width, height = 800, 800
inner_radius, outer_radius = 150, 350

# Linear map from sqrt(log(value)) to a radius: `a` is the slope and `b` the
# offset, chosen so that minr lands on outer_radius and maxr on inner_radius
# (larger values are drawn closer to the centre).
minr = np.sqrt(np.log(.001 * 1E4))
maxr = np.sqrt(np.log(1000000 * 1E4))
a = (outer_radius - inner_radius) / (minr - maxr)
b = inner_radius - a * maxr


def rad(mic):
    """Return the plot radius for the value(s) `mic`.

    Works element-wise on scalars, NumPy arrays and pandas Series.

    NOTE(review): the input is scaled by 1000000 here, whereas minr/maxr
    above are derived with a factor of 1E4. As written, the result equals
    outer_radius at mic = 1e-5 and inner_radius at mic = 1e4 -- confirm
    this is the intended range.
    """
    root_log = np.sqrt(np.log(mic * 1000000))
    return a * root_log + b

# Divide the circle into len(data) + 1 equal slots, one per row of `data`
# plus one spare slot.
big_angle = 2.0 * np.pi / (len(data) + 1)
small_angle = big_angle / 7
# Use each row's position (0..n-1) rather than its index label: after the
# per-protein filtering the labels are the original CSV row numbers, which
# are sparse and can exceed len(data), so label-based angles would wrap
# around the circle and overlap. The Series keeps data's index so it still
# aligns with the other per-row Series.
slot = pd.Series(np.arange(len(data)), index=data.index)
angles = np.pi/2 - big_angle/2 - slot*big_angle

In [5]:
# plotting the figure
# All per-protein sequences live in the ColumnDataSource and are referenced
# by column name in the glyph call. Recent Bokeh releases reject a glyph call
# that combines a user-supplied `source` with literal sequence arguments.
data_s = ColumnDataSource(data=dict(
    intensity=data.Intensity,
    ID=data['Protein Group Accessions'],
    inner_radius=rad(data.Intensity/100000000),
    start_angle=-big_angle+angles+1*small_angle,
    end_angle=-big_angle+angles+2*small_angle,
    ))

p = figure(plot_width=width, plot_height=height, title="",
    x_axis_type=None, y_axis_type=None,
    x_range=(-420, 420), y_range=(-420, 420),
    min_border=0, outline_line_color="black", background_fill_color="white",
    border_fill_color="white", tools = "wheel_zoom, crosshair, tap, pan, reset")

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

# One wedge per protein: it starts at a radius derived from the intensity and
# extends outwards to the fixed outer_radius.
p.annular_wedge(x=0,
                y=0,
                outer_radius=outer_radius,
                inner_radius='inner_radius',
                start_angle='start_angle',
                end_angle='end_angle',
                color='#4daf4a',
                source=data_s
                )

# configure the tap tool: clicking a wedge opens the UniProt URL with @ID
# replaced by the value of that wedge's ID column
url = "http://www.uniprot.org/uniprot/@ID"
taptool = p.select(dict(type=TapTool))
taptool.callback = OpenURL(url=url)

NameError: name 'ColumnDataSource' is not defined

In [None]:
# Render the figure built above.
show(p)

Seems to work! But the dataset is pretty large, and it's hard to click on the right glyph. 