In [36]:
from math import pi
import numpy as np
import pandas as pd
from bokeh.io import show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    LogColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar
)
from bokeh.palettes import all_palettes, Spectral
from bokeh.plotting import figure, show, output_notebook

## Overview

- [Information Entropy plots](#Information-Entropy-plots)
    - [Actin](#Actin-Pfam-Information-Entropy)
    - [Hsp70](#HSP70-Information-Entropy)
- [Function Definitions](#Function-Definitions)
    - [Output Data Function](#Output-Data-Function)
    - [Format DataFrame Function](#Format-DataFrame-Function)
    - [Reshape DataFrame Function](#Reshape-DataFrame-Function)
    - [Single-Site Probability Calculation Function](#Single-Site-Probability-Calculation-Function)
    - [Generate Plot Function](#Generate-Plot-Function)


### Functions:

**genRelativeEntropyPlot**(data_frame, skip_format=*False*, skip_plot=*False*, single_site_prob=*False*, slicex=*375*)

> **Parameters**
-  **data_frame**: *DataFrame*; must contain the 25 columns shown below (`Residues`, the 20 amino-acid letters, and the four trailing columns):

Residues|A|C|D|E|...|T|V|W|Y|Expected_Insert_Length|Insert_Probability|Delete_Probability|Model_Mask
--------|-|-|-|-|---|-|-|-|----------------------|------------------|------------------|----------
1|*S(A)*|*S(C)*|*S(D)*|*S(E)*|...|*S(T)*|*S(V)*|*S(W)*|*S(Y)*| Numbers | More Numbers| Even More | 0 

>  **Flags**: 
- **skip_format**: *bool*, *default False*; 
Skips utilizing the dataFrameFormat function
- **skip_plot**: *bool*, *default False*; 
If True, does not output Bokeh plot and returns 3 DataFrames
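- **single_site_prob**: *bool*, *default False*;
If True, returns the single-site probability DataFrame as a third value.
- **slicex**: *int*, *default 375*;
Number of leading residues used for the plot's x-axis range.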


[Back to Overview](#Overview)

## Information Entropy plots

#### Actin Pfam Information Entropy

[Back to Overview](#Overview)

In [44]:
# NOTE: genRelativeEntropyPlot is defined in the "Function Definitions" section
# further down; those cells must be run before this one.

# Actin, Pfam MSA. Lines starting with '#' in the input are skipped and the
# first remaining line supplies the column names.
file_input = "actin-sequences/pfam-MSA/hmm_logo_info_all.txt"
f1 = pd.read_table(file_input, comment='#', header=0)
# Returns (wide table, long table); the x-axis is limited to the first 400 residues.
rel_ent_pfam, rel_ent_pfam_2 = genRelativeEntropyPlot(f1, slicex=400, skip_plot=False)
# print() with a single argument emits the same text on Python 2 and Python 3.
print("from Actin pfam")

# Actin, jackhmmer run; uses the default slicex.
file_input2 = "actin-sequences/jackhmmer/actin-actin_domain/hmm_logo_info_all.txt"
f2 = pd.read_table(file_input2, comment='#', header=0)
rel_ent_actinx2, rel_ent_actinx2_2 = genRelativeEntropyPlot(f2, skip_plot=False)
print("from Actin x2")

from Actin pfam


from Actin x2


#### Output Dataframe to CSV

In [42]:
# Write the long-form Pfam actin table (the second value returned by
# genRelativeEntropyPlot in the actin cell) to disk as tab-separated text.
# The commented-out path below is the alternative destination for the
# jackhmmer result.
# df_outfile = '/home/kmm5/actin/actin-sequences/jackhmmer/actin-actin_domain/relative_entropy_ssp.txt'
df_outfile = '/home/kmm5/actin/actin-sequences/pfam-MSA/relative_entropy_ssp.txt'
outputData(rel_ent_pfam_2, df_outfile)

#### HSP70 Information Entropy

[Back to Overview](#Overview)

In [43]:
# HSP70 input: '#' lines are skipped and the first remaining line is the header.
# NOTE(review): `file_input2` and `rel_ent_actinx2` are the same names used in
# the actin cell. After this cell runs, `rel_ent_actinx2` holds the HSP70
# result -- the (df1, df2) tuple returned by genRelativeEntropyPlot -- instead
# of the actin DataFrame. Consider HSP70-specific names.
file_input2 = "/home/kmm5/hsp70/hmmer/hsp70_rel_entropy.txt"
f3 = pd.read_table(file_input2, comment='#', header=0)
rel_ent_actinx2 = genRelativeEntropyPlot(f3, slicex=400)

[Back to Overview](#Overview)

## Function Definitions

#### Output Data Function

[Back to Overview](#Overview)

In [3]:
def outputData(dataframe, output):
    """Serialize ``dataframe`` to ``output`` as tab-separated text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to write; the index is written along with the columns.
    output : str or file-like
        Destination passed unchanged to ``DataFrame.to_csv``.
    """
    field_separator = '\t'
    dataframe.to_csv(output, sep=field_separator)

#### Format DataFrame Function

[Back to Overview](#Overview)

In [37]:
def dataFrameFormat(df):
    """Return the letter-height columns of ``df`` indexed by residue label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Residues`` column and one column per amino-acid
        letter. Any other columns (e.g. ``Expected_Insert_Length``,
        ``Insert_Probability``, ``Delete_Probability``, ``Model_Mask``) are
        ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame whose index is ``Residues`` converted to strings and whose
        columns are the 20 letters in the order listed below, with the column
        axis named ``Letters``. The input frame is left untouched.

    Raises
    ------
    KeyError
        If ``Residues`` or any of the letter columns is missing.
    """
    # Reordering Amino Acid Letters
    amino_acids = ['A', 'C', 'F', 'I', 'L', 'M', 'V', 'W', 'Y', 'P',
                   'H', 'K', 'R', 'D', 'E', 'N', 'Q', 'S', 'T', 'G']

    # Take an explicit copy of just the needed columns so the assignment below
    # operates on an independent frame (no chained-assignment warning, no
    # inplace drop of columns that were only selected to be discarded).
    formatted = df[['Residues'] + amino_acids].copy()
    formatted['Residues'] = formatted['Residues'].astype(str)
    formatted = formatted.set_index('Residues')
    formatted.columns.name = 'Letters'

    return formatted
 

#### Reshape DataFrame Function

[Back to Overview](#Overview)

In [38]:
def reshapeArray(df2, col_name=None):
    """Convert a wide residue-by-letter table to long form.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Wide table with one row per residue and one column per letter.
    col_name : str, optional
        Name for the value column of the long table; ``'Height'`` when omitted.

    Returns
    -------
    tuple
        ``(long_table, residue_num, residue_letter)`` where ``long_table`` has
        one row per (row label, column label) pair with the index levels turned
        into ordinary columns, ``residue_num`` is the list of row labels of the
        input and ``residue_letter`` the list of its column labels.
    """
    residue_num = list(df2.index)
    residue_letter = list(df2.columns)

    # Reshape to a 1D array of heights with a residue and letter for each row.
    value_column = 'Height' if col_name is None else col_name
    long_table = pd.DataFrame(df2.stack(), columns=[value_column]).reset_index()

    return long_table, residue_num, residue_letter

#### Single Site Probability Calculation Function

[Back to Overview](#Overview)

In [39]:
def calcSingleSiteProb(relative_entropy):
    """Normalize each row of ``relative_entropy`` by that row's total.

    Parameters
    ----------
    relative_entropy : pandas.DataFrame
        Numeric table with one row per residue and one column per letter.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns in which every value has
        been divided by the sum of its row. The input is not modified. A row
        whose total is zero yields the usual floating-point division results
        (NaN/inf) rather than an exception.
    """
    row_totals = relative_entropy.sum(axis=1)
    # axis=0 matches the totals to rows by index label, so the whole table is
    # normalized in one vectorized operation instead of a per-row .iloc loop.
    return relative_entropy.div(row_totals, axis=0)

#### Generate Plot Function

[Back to Overview](#Overview)

In [40]:
def genRelativeEntropyPlot(data_frame, skip_format=False, skip_plot=False, single_site_prob=False, slicex=375):
    """Draw a Bokeh heat map of per-residue letter heights and return the tables behind it.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Raw table accepted by ``dataFrameFormat``. When ``skip_format`` is True
        it must already be in that function's output layout: residues as the
        index (named ``Residues``) and letters as the columns (axis named
        ``Letters``), because the plot reads columns with those names.
    skip_format : bool, default False
        If True, ``data_frame`` is used as-is instead of being passed through
        ``dataFrameFormat``.
    skip_plot : bool, default False
        If True, no plot is produced and three tables are returned (see below).
    single_site_prob : bool, default False
        If True, the wide single-site probability table is returned as a third
        value after plotting.
    slicex : int, default 375
        Number of leading residues used for the x-axis range.

    Returns
    -------
    tuple
        ``(df1, df2)`` by default, where ``df1`` is the wide residue-by-letter
        table and ``df2`` the long table with ``Height`` and ``P_ss`` columns.
        ``(df1, df2, ss_prob_df)`` when ``single_site_prob`` is True, with
        ``ss_prob_df`` the wide probability table.
        ``(df1, df2, ss_prob)`` when ``skip_plot`` is True, with ``ss_prob``
        the long-form probability table.
    """
    # File input processing
    if (skip_format is True):
        # An already-formatted frame is the wide table itself.
        df1 = data_frame
    else:
        df1 = dataFrameFormat(data_frame)

    # These steps run for both branches so that `P_ss` (needed by the hover
    # tooltip) and the probability tables (needed by the return statements)
    # always exist. reshapeArray returns exactly three values.
    df2, residue_num, residue_letter = reshapeArray(df1)
    ss_prob_df = calcSingleSiteProb(df1)
    ss_prob, _, _ = reshapeArray(ss_prob_df, col_name='P_ss')
    # Both long tables come out of reset_index(), so this join lines the rows
    # up by their positional index.
    df2 = df2.join(ss_prob['P_ss'])

    if (skip_plot is True):
        # returns dataframes and probability dataframe
        return df1, df2, ss_prob

    source = ColumnDataSource(df2)
    colors = ["#75968f", "#e2e2e2", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
    # NOTE(review): a logarithmic mapper presumably needs positive bounds --
    # confirm that Height is never <= 0, or switch to LinearColorMapper.
    mapper = LogColorMapper(palette=colors, low=df2.Height.min(), high=df2.Height.max())

    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    # The y-range is reversed so the first letter column ends up at the top.
    p = figure(title="Amino Acid Information Entropy",
               x_range=residue_num[:slicex], y_range=list(reversed(residue_letter)),
               x_axis_location='above', plot_width=12000, plot_height=300,
               tools=TOOLS, toolbar_location='below')

    p.border_fill_color = "whitesmoke"
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "9pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = pi / 3

    # One unit cell per (residue, letter) pair, coloured by its height.
    p.rect(x='Residues', y='Letters', width=1, height=1,
           source=source,
           fill_color={'field': 'Height', 'transform': mapper},
           line_color='black', line_width=0.3)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="9pt",
                         ticker=BasicTicker(desired_num_ticks=len(colors)),
                         formatter=PrintfTickFormatter(format="%.2f"), height=200,
                         major_label_text_color='black', major_tick_line_color=None,
                         label_standoff=0, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    hover = p.select_one(HoverTool)
    hover.point_policy = "follow_mouse"
    hover.tooltips = [
         ('Residue', '@Letters @Residues'),
         ('Probability', '@P_ss{1.111}'),
         ('Information (bits)', '@Height{1.11}'),
    ]
    output_notebook()
    show(p)

    if (single_site_prob is True):
        return df1, df2, ss_prob_df
    else:
        return df1, df2

[Back to Overview](#Overview)