In [3]:
import json
import pandas as pd
import numpy as np
from pprint import pprint
import re

In [5]:
with open('build/dengue_denv1_frequencies.json') as data_file:  # Open JSON, load as dictionary
    frequencies = json.load(data_file)

In [6]:
def make_tidy(label, frequencies):
    ''' E.g., input (u'global_nuc:9G': [0.9972, 0.9943]) from dict.iteritems()
    returns { 'region': 'global', 'gene': 'nuc', 'site': 9, 'frequencies': [0.9972, 0.9943]}'''
    try:
        region, geneAllele = label.replace('other', 'X').split('_') # Recode 'other' with ambiguity code 'X'
        gene, siteAllele = geneAllele.split(':')
        site, allele = int(siteAllele[:-1]), siteAllele[-1]
        tidy_entry = {'region': region, 'gene': gene, 'site': site, 'allele': allele, 'frequencies': np.array(frequencies)}
        return tidy_entry
    except:
        print label

pivots = frequencies.pop(u'pivots')
counts = frequencies.pop(u'counts')
tidy_frequencies = sorted([ make_tidy(k,v) for k,v in frequencies.iteritems() ], key = lambda x: x['gene'])

In [7]:
frequencies_df = pd.DataFrame(tidy_frequencies, # Cast to data frame
                              columns=['region', 'site', 'gene', 'allele', 'frequencies']) 
frequencies_df.head()

Unnamed: 0,region,site,gene,allele,frequencies
0,global,17,2K,I,"[0.0353, 0.0353, 0.0353, 0.0353, 0.0353, 0.035..."
1,global,17,2K,M,"[0.939, 0.939, 0.939, 0.939, 0.939, 0.939, 0.9..."
2,global,17,2K,X,"[0.0257, 0.0257, 0.0257, 0.0257, 0.0257, 0.025..."
3,global,104,C,M,"[0.9691, 0.9691, 0.9691, 0.9691, 0.9691, 0.969..."
4,global,104,C,V,"[0.0309, 0.0309, 0.0309, 0.0309, 0.0309, 0.030..."


In [76]:
E_freqs = frequencies_df.loc[frequencies_df['gene'] == 'E'].copy() # Pull sites in E
E_freqs['std'] = E_freqs['frequencies'].apply(lambda x: np.std(x)) # Make a new column with the standard deviation of the frequency values for each site
E_freqs.sort_values('site', inplace=True) # Sort for most variable sites first
E_freqs.head()

Unnamed: 0,region,site,gene,allele,frequencies,std
180,global,8,E,N,"[0.9753, 0.9753, 0.9753, 0.9753, 0.9753, 0.975...",0.135333
181,global,8,E,S,"[0.0247, 0.0247, 0.0247, 0.0247, 0.0247, 0.024...",0.135333
168,global,37,E,G,"[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.000...",0.000104
167,global,37,E,D,"[0.3373, 0.3373, 0.3373, 0.3373, 0.3373, 0.337...",0.181927
166,global,37,E,N,"[0.6626, 0.6626, 0.6626, 0.6626, 0.6626, 0.662...",0.181932


In [123]:
colors = ['#547BD3','#83BA70', '#781C86', '#DF4327', '#D3AE4E']

plot_data = E_freqs[['site', 'allele', 'frequencies']].copy() # Pull relevant columns to plot
site_groups = plot_data.groupby('site') # Store which indices correspond to each site

plot_data['x'] = [pivots for i in range(len(plot_data))] # pivots: X coordinates (same for all)
plot_data.rename(columns = {'frequencies': 'y'}, inplace=True) # freqs: Y coordinates
plot_data['text'] = plot_data.apply(lambda i: str(i['site']).strip() + i['allele'], axis=1) # Display text for mouse over
plot_data['name'] = plot_data['text'] # Display text for legend
del plot_data['allele'] # Cleanup; stored elsewhere
plot_data['visible'] = False # Initially all traces are not visible
plot_data = plot_data.to_dict(orient="index") # Ordered list of observations


for i, values in plot_data.items(): # Add unique color for each allele found at a site
    color_index = list(site_groups.get_group(values['site']).index.values).index(i)
    color = colors[color_index]
    values['line'] = {'color': color, 'width': 6}
    del values['site'] # We don't need the site number anymore

plot_data = plot_data.values() # We don't need the index anymore, just the records
plot_data = sorted(plot_data, key = lambda x: x['name']) # Put back in order by site and allele
print plot_data[0] # Sanity check

{'name': u'114I', 'text': u'114I', 'visible': False, 'y': array([ 0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,
        0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,
        0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,
        0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,
        0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8928,  0.8927,
        0.8927,  0.8928,  0.8928,  0.8928,  0.8928,  0.8825,  0.8614,
        0.8387,  0.8143,  0.7885,  0.7614,  0.733 ,  0.7034,  0.6727,
        0.6412,  0.609 ,  0.5761,  0.5426,  0.563 ,  0.5832,  0.6034,
        0.6233,  0.6055,  0.5875,  0.5696,  0.5652,  0.5609,  0.5565,
        0.5522,  0.5478,  0.5434,  0.539 ,  0.5113,  0.5129,  0.5145,
        0.5079,  0.4606,  0.4595,  0.4585,  0.4575,  0.4564,  0.5453,
        0.5433,  0.4962,  0.4982,  0.5002,  0.5022,  0.4546,  0.398 ,
        0.4023,  0.4389,  0.465 ,  0.4664,  0.3789,  0.3952,  0.4842,
        0.5165,  0.5288,  0.3949

In [1]:
import plotly.plotly as py # Magic!

In [124]:
def get_visible(site):
    '''Return a vector of booleans indicating which TRACE INDICES are from that site'''
    v = [ True if int(record['name'][:-1]) == site else False for record in plot_data ]
    assert v.count(True) == pd.value_counts(E_freqs['site'])[site]
    return v

def make_visible(site):
    '''Set traces from that site to visible; all others to false'''
    for record in plot_data:
        if int(record['name'][:-1]) == site:
            record['visible'] = True
        else:
            record['visible'] = False
make_visible(8)

In [128]:
sites = sorted(list(set(E_freqs['site']))) # Pull sites in order

sites_config = []
for site in sites: # For each site....
    params = { 
        'label': site,
        'method':'restyle', # Action: change which traces are visible
        'args': ['visible', get_visible(site)]} # Determined by vector of booleans from get_visible 
                                                #(which plot_data entries are from that site)
    sites_config.append(params)
        
site_slider = [{
    'active': 0, # Initial setting
    'currentvalue': {"prefix": "Frequency at site: "}, # Label
    'pad' : {"t": 50}, # Layout padding between slider and graph
    'steps': sites_config}]

layout = dict(sliders=site_slider)
fig = {'data': plot_data, 'layout': layout}
py.iplot(fig, filename='DENV1 Mutation Frequencies')