In [1]:
import pandas as pd
import numpy as np

import bokeh.io
import bokeh.plotting
import bokeh.palettes
from bokeh.transform import jitter
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Label, LabelSet, HoverTool, Range1d
from bokeh.layouts import row, column

# Send Bokeh output to the notebook so that bokeh.io.show() renders figures inline.
bokeh.io.output_notebook()

In [2]:
def plotClassDistribution(metadata, strain, y_range_max):
    """Bar chart of the `_rlnClassDistribution` value of every class of one strain.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must contain the columns 'Strain', 'Class_nb' and
        '_rlnClassDistribution'.
    strain : str
        Value of the 'Strain' column to plot; it is also appended to the title.
    y_range_max : float
        Upper limit of the y axis (the lower limit is 0).

    Returns
    -------
    The Bokeh figure holding the bar chart.
    """
    # The x extent is derived from the distinct class numbers of the whole
    # table (all strains), not only from the strain being plotted.
    n_classes = len(metadata['Class_nb'].unique())
    strain_rows = metadata['Strain'] == strain

    figure_options = dict(
        width=500,
        height=300,
        x_axis_type='linear',
        y_axis_type='linear',
        x_axis_label='classes',
        y_axis_label='distribution',
        title="Classes distribution for "+strain,
        x_range=Range1d(0, n_classes+1),
        y_range=Range1d(0, y_range_max),
    )
    fig = bokeh.plotting.figure(**figure_options)

    fig.vbar(
        x=metadata.loc[strain_rows, 'Class_nb'],
        top=metadata.loc[strain_rows, '_rlnClassDistribution'],
        width=0.9,
    )

    # Cosmetics: WebGL backend, horizontal grid lines only, no minor y ticks.
    fig.output_backend = 'webgl'
    fig.xgrid.visible = False
    fig.ygrid.visible = True
    fig.yaxis.minor_tick_line_color = None
    return fig

def plotFeatures(df, strain, parameter, x_range_max=31):
    """Plot the per-class mean of `parameter` for one strain with +/- std bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Strain', 'Class_nb', `parameter`+'_mean' and
        `parameter`+'_std'.
    strain : str
        Value of the 'Strain' column to plot; also used in the title.
    parameter : str
        Column prefix (e.g. 'Pitch'); also used as the y-axis label.
    x_range_max : float, optional
        Upper limit of the x axis. Defaults to 31, the value that used to be
        hard-coded.

    Returns
    -------
    The Bokeh figure.
    """
    mean_col = parameter + '_mean'
    std_col = parameter + '_std'

    # Filter once: rows of this strain whose mean is > 0 (rows with a mean of
    # 0 or less are left out of the plot and of the y-range computation).
    rows = df.loc[(df['Strain'] == strain) & (df[mean_col] > 0),
                  ['Class_nb', mean_col, std_col]]
    classes = rows['Class_nb'].to_numpy()
    means = rows[mean_col].to_numpy()
    stds = rows[std_col].to_numpy()

    p = bokeh.plotting.figure(
        width=500,
        height=300,
        x_axis_type='linear',
        y_axis_type='linear',
        x_axis_label='classes',
        y_axis_label=parameter,
        title=parameter + " distribution for " + strain,
        x_range=Range1d(0, x_range_max),
        # The y range is padded by 10 % around the extreme means (the std bars
        # are not taken into account). Series.min()/max() are used so that an
        # empty selection yields NaN instead of raising.
        y_range=Range1d(rows[mean_col].min()*0.9, rows[mean_col].max()*1.1)
    )
    p.circle(
        x=classes,
        y=means,
        line_color='black',
        line_width=2,
        fill_color='white',
        size=2,
        alpha=1
    )
    # All error bars in a single glyph: a vertical segment from mean - std to
    # mean + std at each class position.
    p.segment(
        x0=classes,
        y0=means - stds,
        x1=classes,
        y1=means + stds,
        line_color='black',
        line_width=2,
        alpha=1
    )
    p.output_backend = 'webgl'
    p.xgrid.visible = False
    p.ygrid.visible = True
    p.yaxis.minor_tick_line_color = None
    return p

def getParamRange(df, parameter, n_bins):
    """Return the range of the positive `parameter`+'_mean' values and a bin width.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column `parameter`+'_mean'.
    parameter : str
        Column prefix, e.g. 'Pitch'.
    n_bins : int
        Number of bins the range is divided into.

    Returns
    -------
    my_range : tuple
        (min, max) of the values that are > 0.
    my_step : float
        (max - min) / n_bins.
    """
    # Only the frame passed in is used; the function no longer depends on a
    # notebook-level variable being defined.
    values = df.loc[df[parameter+'_mean'] > 0, parameter+'_mean']
    my_range = (values.min(), values.max())
    my_step = (my_range[1] - my_range[0])/n_bins
    return my_range, my_step

def plotHistDistribution(X, Y, step, strain, param, y_range_max):
    """Draw a bar chart from precomputed bar positions and heights.

    Parameters
    ----------
    X : sequence
        x position of each bar.
    Y : sequence
        Height (top) of each bar.
    step : float
        Width of every bar.
    strain : str
        Appended to the figure title.
    param : str
        Used as the x-axis label and as the start of the title.
    y_range_max : float
        Upper limit of the y axis (the lower limit is 0).

    Returns
    -------
    The Bokeh figure.
    """
    figure_options = dict(
        width=500,
        height=300,
        x_axis_type='linear',
        y_axis_type='linear',
        x_axis_label=param,
        y_axis_label='distribution',
        title=param + " distribution for "+strain,
        y_range=Range1d(0, y_range_max),
    )
    hist = bokeh.plotting.figure(**figure_options)

    hist.vbar(x=X, top=Y, width=step)

    # Cosmetics: WebGL backend, horizontal grid lines only, no minor y ticks.
    hist.output_backend = 'webgl'
    hist.xgrid.visible = False
    hist.ygrid.visible = True
    hist.yaxis.minor_tick_line_color = None
    return hist

def plotHistDistributionSource(source, step, strain, param, y_range_max):
    """Draw a bar chart from a data source, with a hover tool listing bin contents.

    Parameters
    ----------
    source : bokeh ColumnDataSource
        Must provide the columns 'x_values' (bar positions), 'y_values'
        (bar heights) and 'names' (shown in the tooltip as 'Classes').
    step : float
        Width of every bar.
    strain : str
        Appended to the figure title.
    param : str
        Start of the title; the x-axis label is `param` followed by ' (Å)'.
    y_range_max : float
        Upper limit of the y axis (the lower limit is 0).

    Returns
    -------
    The Bokeh figure.
    """
    hist = bokeh.plotting.figure(
        title=param + " distribution for "+strain,
        x_axis_label=param+' (Å)',
        y_axis_label='distribution',
        x_axis_type='linear',
        y_axis_type='linear',
        width=500,
        height=300,
        y_range=Range1d(0, y_range_max),
    )

    # Tooltip rows: bar position, bar height and the content of 'names'.
    hover_rows = [
        ('BinMean', '@{x_values}'),
        ('BinDist', '@{y_values}'),
        ('Classes', '@{names}'),
    ]
    hist.add_tools(HoverTool(tooltips=hover_rows))

    hist.vbar(x='x_values', top='y_values', width=step, source=source)

    # Cosmetics: WebGL backend, horizontal grid lines only, no minor y ticks.
    hist.output_backend = 'webgl'
    hist.xgrid.visible = False
    hist.ygrid.visible = True
    hist.yaxis.minor_tick_line_color = None
    return hist

def plotNormHist(df, strain, parameter, n_bins, y_range_max):
    """Histogram of `parameter`+'_mean' for one strain, weighted by class distribution.

    Each class of `strain` with a mean > 0 contributes its
    `_rlnClassDistribution`, normalised so that the contributions of the
    strain sum to 1, to the bin its mean falls into.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Strain', 'Class_nb', '_rlnClassDistribution' and
        `parameter`+'_mean'.
    strain : str
        Value of the 'Strain' column to plot.
    parameter : str
        Column prefix, e.g. 'Pitch'.
    n_bins : int
        Number of bins.
    y_range_max : float
        Upper limit of the y axis.

    Returns
    -------
    p : Bokeh figure
        Built by plotHistDistributionSource.
    class_names : list of list
        For each of the `n_bins` bins, the 'Class_nb' values that fell into it.
    """
    mean_col = parameter + '_mean'
    selected = (df['Strain'] == strain) & (df[mean_col] > 0)
    # Work on a copy so that adding the normalised column never writes into df.
    a = df.loc[selected, ['Strain', 'Class_nb', mean_col, '_rlnClassDistribution']].copy()
    a['NormClassDistribution'] = a['_rlnClassDistribution'] / a['_rlnClassDistribution'].sum()

    # The range is taken from the whole frame (every strain), so histograms of
    # different strains are binned identically.
    param_range, param_step = getParamRange(df, parameter, n_bins)

    # n_bins bins need n_bins + 1 edges; the spacing then equals param_step,
    # which is also the bar width used for drawing.
    edges = np.linspace(param_range[0], param_range[1], n_bins + 1)
    hist_X_val = (edges[:-1] + edges[1:]) / 2
    dist_sum = np.zeros(n_bins)
    class_names = [None] * n_bins
    for i in range(n_bins):
        in_bin = a[mean_col] >= edges[i]
        if i == n_bins - 1:
            # The last bin is closed on the right so that the class holding
            # the maximum value is counted.
            in_bin &= a[mean_col] <= edges[i+1]
        else:
            in_bin &= a[mean_col] < edges[i+1]
        dist_sum[i] = a.loc[in_bin, 'NormClassDistribution'].sum()
        class_names[i] = list(a.loc[in_bin, 'Class_nb'])

    source = ColumnDataSource(data=dict(x_values=hist_X_val,
                                        y_values=dist_sum,
                                        names=class_names
                                       ))
    p = plotHistDistributionSource(source, param_step, strain, parameter, y_range_max)
    return p, class_names

def getMetadata(df, param1, param2):
    """Summarise two per-class parameters for every strain.

    For each distinct value of df['Strain'] and for each of the two
    parameters, only the rows whose `<param>_mean` is > 0 are used to compute:

    * `<param>_weighed_mean`: mean of `<param>_mean` weighted by
      `_rlnClassDistribution`;
    * `<param>_mean`: plain (unweighted) mean of `<param>_mean`.

    Two derived columns are added:

    * 'Volume_weighed' = param2_weighed_mean ** 2 * param1_weighed_mean
    * 'Volume'         = param2_mean ** 2 * param1_mean

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Strain', '_rlnClassDistribution', `param1`+'_mean' and
        `param2`+'_mean'.
    param1, param2 : str
        Column prefixes, e.g. 'Pitch' and 'Width'.

    Returns
    -------
    pandas.DataFrame
        One row per strain, in order of first appearance, with the columns
        'Strain', the four means, 'Volume_weighed' and 'Volume'. A strain
        without any usable class (or whose weights sum to 0) gets NaN.
    """
    def _weighted_and_plain_mean(strain_rows, param):
        # Returns (distribution-weighted mean, plain mean) over rows with a mean > 0.
        col = param + '_mean'
        valid = strain_rows.loc[strain_rows[col] > 0]
        weights = valid['_rlnClassDistribution']
        total = weights.sum()
        if len(valid) == 0 or total == 0:
            weighted = np.nan
        else:
            weighted = (valid[col] * weights).sum() / total
        return float(weighted), float(valid[col].mean())

    records = []
    for strain in df['Strain'].unique():
        # Only the frame passed in is read; no notebook-level variable is used.
        strain_rows = df.loc[df['Strain'] == strain]
        p1_weighted, p1_mean = _weighted_and_plain_mean(strain_rows, param1)
        p2_weighted, p2_mean = _weighted_and_plain_mean(strain_rows, param2)
        records.append({
            'Strain': strain,
            param1+'_weighed_mean': p1_weighted,
            param1+'_mean': p1_mean,
            param2+'_weighed_mean': p2_weighted,
            param2+'_mean': p2_mean,
            'Volume_weighed': p2_weighted**2 * p1_weighted,
            'Volume': p2_mean**2 * p1_mean,
        })

    columns = [
        'Strain',
        param1+'_weighed_mean', param1+'_mean',
        param2+'_weighed_mean', param2+'_mean',
        'Volume_weighed', 'Volume',
    ]
    # dict.fromkeys removes the duplicates that appear when param1 == param2,
    # and passing the columns keeps the schema even when df has no rows.
    return pd.DataFrame(records, columns=list(dict.fromkeys(columns)))

In [3]:
# Read the class metadata summary CSV into df_met and preview its first rows.
df_met = pd.read_csv('ClassesMetadataSummary.csv')
df_met.head()

Unnamed: 0.1,Unnamed: 0,index,_rlnReferenceImage,_rlnClassDistribution,_rlnAccuracyRotations,_rlnAccuracyTranslationsAngst,_rlnEstimatedResolution,_rlnOverallFourierCompleteness,_rlnClassPriorOffsetX,_rlnClassPriorOffsetY,_rlnHelicalRise,_rlnHelicalTwist,Strain,Class_nb,Pitch_mean,Pitch_std,Width_mean,Width_std
0,0,0,000001@Class2D/job016/run_it025_classes.mrcs,0.063675,999.0,999.0,5.773617,0.952124,-0.02029,-0.02308,6.785,0.0,WT,1,40.824286,1.886919,58.8575,0.989524
1,1,1,000002@Class2D/job016/run_it025_classes.mrcs,0.0,999.0,999.0,54.272,0.66571,0.0,0.0,6.785,0.0,WT,2,0.0,0.0,0.0,0.0
2,2,2,000003@Class2D/job016/run_it025_classes.mrcs,0.047198,0.05,0.1166,4.678621,0.995874,-0.02285,0.054086,6.785,0.0,WT,3,40.516667,1.701243,58.215,0.33946
3,3,3,000004@Class2D/job016/run_it025_classes.mrcs,0.0,999.0,999.0,54.272,0.780593,0.0,0.0,6.785,0.0,WT,4,0.0,0.0,0.0,0.0
4,4,4,000005@Class2D/job016/run_it025_classes.mrcs,0.044689,0.05,0.1166,4.599322,0.986455,-0.02476,-0.004,6.785,0.0,WT,5,40.691667,1.455107,60.685,0.780961


In [4]:
# Build one figure per (plot type, strain) pair; the left column of the final
# layout is 'WT', the right column is 't8v'.

# Class distribution bar charts (y axis capped at 0.15).
p = plotClassDistribution(df_met, 'WT', 0.15)
p2 = plotClassDistribution(df_met, 't8v', 0.15)
# Per-class Pitch mean with std bars.
p3 = plotFeatures(df_met, 'WT', 'Pitch')
p4 = plotFeatures(df_met, 't8v', 'Pitch')
# Distribution-weighted Pitch histograms (n_bins=10, y axis capped at 0.35).
p5, class_list_pitch = plotNormHist(df_met, 'WT', 'Pitch', 10, 0.35)
# NOTE: `class_list` is reassigned by each of the next three plotNormHist
# calls, so only the result of the last one ('t8v', 'Width') is kept.
p6, class_list = plotNormHist(df_met, 't8v', 'Pitch', 10, 0.35)
# Per-class Width mean with std bars.
p7 = plotFeatures(df_met, 'WT', 'Width')
p8 = plotFeatures(df_met, 't8v', 'Width')
# Distribution-weighted Width histograms (n_bins=10, y axis capped at 0.35).
p9, class_list = plotNormHist(df_met, 'WT', 'Width', 10, 0.35)
p10, class_list = plotNormHist(df_met, 't8v', 'Width', 10, 0.35)
# Show everything as a 5-row x 2-column grid.
bokeh.io.show(column(row(p, p2), row(p3, p4), row(p5, p6), row(p7, p8), row(p9, p10)))

In [5]:
# Per-strain summary of Pitch (param1) and Width (param2): weighted and plain
# means plus the derived 'Volume' columns computed by getMetadata.
df_features=getMetadata(df_met, 'Pitch', 'Width')
df_features.head()

Unnamed: 0,Strain,Pitch_weighed_mean,Pitch_mean,Width_weighed_mean,Width_mean,Volume_weighed,Volume
0,WT,40.367465,40.518497,59.464007,59.454583,142738.069338,143226.707071
1,t8v,40.738944,40.763406,59.62466,59.686563,144831.021575,145219.051991


In [4]:
# Scatter of per-class mean width (x) against mean pitch (y), one colour per
# strain; the marker size is the class's _rlnClassDistribution times 200.
p = bokeh.plotting.figure(
    width=500, 
    height=300,  
    x_axis_type='linear',
    y_axis_type='linear',
    x_axis_label = 'width',
    y_axis_label = 'pitch',
    title= "Pitch vs width",
    x_range=Range1d(57.5, 62),
    y_range=Range1d(38.5, 42)
)

colors = ['navy', 'orange']
Strains = ['WT', 't8v']

for strain, color in zip(Strains, colors):
    strain_rows = df_met.loc[df_met['Strain'] == strain]
    X = strain_rows['Width_mean'].to_numpy()
    Y = strain_rows['Pitch_mean'].to_numpy()
    Size = strain_rows['_rlnClassDistribution'].to_numpy()*200
    # One vectorised glyph per strain instead of one glyph renderer per
    # point: the arrays are passed as per-marker x, y and size.
    p.circle(
        x=X,
        y=Y,
        line_color = None,
        line_width = 2,
        fill_color = color,
        size = Size,
        alpha=1
    )
p.output_backend = 'webgl'
bokeh.io.show(p)