In [1]:
import pandas as pd
import numpy as np

import bokeh.io
import bokeh.plotting
import bokeh.palettes
from bokeh.transform import jitter
from bokeh.plotting import figure
from bokeh.models import HoverTool, Range1d
from bokeh.layouts import row

bokeh.io.output_notebook()

In [2]:
def plotClassDistribution(metadata, strain):
    Classes = metadata['Class_nb'].unique()
    p = bokeh.plotting.figure(
        width=500, 
        height=300,  
        x_axis_type='linear',
        y_axis_type='linear',
        x_axis_label = 'classes',
        y_axis_label = 'distribution',
        title="Classes distribution for "+strain,
        x_range=Range1d(0, len(Classes)+1),
        y_range=Range1d(0, np.array(metadata.loc[(metadata['Strain'] == strain), '_rlnClassDistribution']).max()*1.1)
    )
    
    X=metadata.loc[(metadata['Strain'] == strain), 'Class_nb']
    Y=metadata.loc[(metadata['Strain'] == strain), '_rlnClassDistribution']
    
    p.vbar(x=X, top=Y, width=0.9)
#    p.circle(
 #       x=df_met.loc[(df_met['Strain'] == strain), 'Class_nb'],
  #      y=df_met.loc[(df_met['Strain'] == strain), '_rlnClassDistribution'], 
   #     line_color = 'black',
    #    line_width = 2,
     #   fill_color = 'white',
      #  size = 2,
       # alpha=1
    #)
#    p.line(
#        x=df_met.loc[(df_met['Strain'] == strain), 'Class_nb'],
#        y=df_met.loc[(df_met['Strain'] == strain), '_rlnClassDistribution'], 
#        line_color = 'black',
#        line_width = 2,
#        alpha=1
#    )
    p.output_backend = 'webgl'
    p.xgrid.visible = True
    p.ygrid.visible = True
    #p.xaxis.minor_tick_line_color = None
    p.yaxis.minor_tick_line_color = None
    return p

def plotFeatures(means, std, strain, parameter):
    Classes=means.loc[(means['Strain'] == strain),'Class_nb'].unique()
    p = bokeh.plotting.figure(
    width=500, 
    height=300,  
    x_axis_type='linear',
    y_axis_type='linear',
    x_axis_label = 'classes',
    y_axis_label = parameter,
    title=parameter + " distribution for " + strain,
    x_range=Range1d(0, 31),
    y_range=Range1d(0, means.loc[(means['Strain'] == strain),parameter].max()*1.1)
    )
    p.circle(
        x=means.loc[(means['Strain'] == strain), 'Class_nb'],
        y=means.loc[(means['Strain'] == strain), parameter], 
        line_color = 'black',
        line_width = 2,
        fill_color = 'white',
        size = 2,
        alpha=1
    )
    for c in Classes:
        p.line(
            x=[means.loc[(means['Strain'] == strain) & (means['Class_nb'] == c), 'Class_nb'], means.loc[(means['Strain'] == strain) & (means['Class_nb'] == c), 'Class_nb']],
            y=[means.loc[(means['Strain'] == strain) & (means['Class_nb'] == c), parameter] - std.loc[(std['Strain'] == strain) & (std['Class_nb'] == c), parameter], means.loc[(means['Strain'] == strain) & (means['Class_nb'] == c), parameter] + std.loc[(std['Strain'] == strain) & (std['Class_nb'] == c), parameter]], 
            line_color = 'black',
            line_width = 2,
            alpha=1
        )
    p.output_backend = 'webgl'
    p.xgrid.visible = True
    p.ygrid.visible = True
    #p.xaxis.minor_tick_line_color = None
    p.yaxis.minor_tick_line_color = None
    return p

In [3]:
df_met_raw = pd.read_csv('combined_metadata.csv')
df_met=df_met_raw.loc[df_met_raw['_rlnClassDistribution']>0]
df_pitch = pd.read_csv('combined_pitch.csv').reset_index()
df_width = pd.read_csv('combined_width.csv').reset_index()
df_met.head()

Unnamed: 0.1,Unnamed: 0,index,_rlnReferenceImage,_rlnClassDistribution,_rlnAccuracyRotations,_rlnAccuracyTranslationsAngst,_rlnEstimatedResolution,_rlnOverallFourierCompleteness,_rlnClassPriorOffsetX,_rlnClassPriorOffsetY,_rlnHelicalRise,_rlnHelicalTwist,Strain,Class_nb
0,0,0,000001@Class2D/job016/run_it025_classes.mrcs,0.063675,999.0,999.0,5.773617,0.952124,-0.02029,-0.02308,6.785,0.0,WT,1
2,2,2,000003@Class2D/job016/run_it025_classes.mrcs,0.047198,0.05,0.1166,4.678621,0.995874,-0.02285,0.054086,6.785,0.0,WT,3
4,4,4,000005@Class2D/job016/run_it025_classes.mrcs,0.044689,0.05,0.1166,4.599322,0.986455,-0.02476,-0.004,6.785,0.0,WT,5
6,6,6,000007@Class2D/job016/run_it025_classes.mrcs,0.054274,0.05,0.1166,4.307302,0.995772,-0.02492,0.065279,6.785,0.0,WT,7
7,7,7,000008@Class2D/job016/run_it025_classes.mrcs,0.091596,999.0,999.0,5.320784,0.98306,-0.04377,-0.06836,6.785,0.0,WT,8


In [4]:
df_pitch.head()

Unnamed: 0,level_0,index,Unnamed: 3,Label,Angle,Median,Class_nb,Pitch,Strain
0,0,0,1,run_it025_classes.mrcs:z:1/30 - run_it025_clas...,-7.319019,0.027197,1,38.83,WT
1,1,1,2,run_it025_classes.mrcs:z:1/30 - run_it025_clas...,-8.481606,0.000692,1,40.73,WT
2,2,2,3,run_it025_classes.mrcs:z:1/30 - run_it025_clas...,-11.309932,0.037064,1,39.64,WT
3,3,3,4,run_it025_classes.mrcs:z:1/30 - run_it025_clas...,-10.83008,-0.068289,1,41.37,WT
4,4,4,5,run_it025_classes.mrcs:z:1/30 - run_it025_clas...,-10.474235,0.026279,1,42.76,WT


In [5]:
df_width.head()

Unnamed: 0,level_0,index,Unnamed: 3,Label,Angle,Median,Class_nb,Width,Strain
0,0,0,1,run_it025_classes-1.mrcs:z:1/30 - run_it025_cl...,-98.082633,0.109167,1,60.31,WT
1,1,1,2,run_it025_classes-1.mrcs:z:1/30 - run_it025_cl...,80.193907,0.106647,1,58.09,WT
2,2,2,3,run_it025_classes-1.mrcs:z:1/30 - run_it025_cl...,-98.32565,0.074528,1,58.56,WT
3,3,3,4,run_it025_classes-1.mrcs:z:1/30 - run_it025_cl...,82.359593,0.06498,1,58.47,WT
4,4,4,5,run_it025_classes-1.mrcs:z:3/30 - run_it025_cl...,-89.65064,0.120031,3,57.95,WT


In [6]:
p = plotClassDistribution(df_met, 'WT')
p2 = plotClassDistribution(df_met, 't8v')
bokeh.io.show(row(p, p2))

In [7]:
pitch_means = df_pitch.groupby(['Strain', 'Class_nb'])['Pitch'].mean().reset_index()
pitch_std = df_pitch.groupby(['Strain', 'Class_nb'])['Pitch'].std().reset_index()

p3 = plotFeatures(pitch_means, pitch_std, 'WT', 'Pitch')
p4 = plotFeatures(pitch_means, pitch_std, 't8v', 'Pitch')

bokeh.io.show(row(p3, p4))

In [8]:
print('Number of WT classes ='+str(len(pitch_means.Pitch.loc[pitch_means['Strain']=='WT'])))

print('Number of bins ='+str(np.int(np.floor(len(pitch_means.Pitch.loc[pitch_means['Strain']=='WT'])*0.5))))
hist_p, edges_p = np.histogram(pitch_means.Pitch.loc[pitch_means['Strain']=='WT'], bins=np.int(np.floor(len(pitch_means.Pitch.loc[pitch_means['Strain']=='WT'])*0.5)))
title = 'Pitch histogram WT'
p = figure(title=title)
p.quad(top=hist_p, bottom=0, left=edges_p[:-1], right=edges_p[1:],
           fill_color="navy", line_color="white", alpha=0.5)
p.xgrid.visible = False
p.ygrid.visible = True


print('Number of t8v classes ='+str(len(pitch_means.Pitch.loc[pitch_means['Strain']=='t8v'])))

print('Number of bins ='+str(np.int(np.floor(len(pitch_means.Pitch.loc[pitch_means['Strain']=='t8v'])*0.5))))
hist_p2, edges_p2 = np.histogram(pitch_means.Pitch.loc[pitch_means['Strain']=='t8v'], bins=np.int(np.floor(len(pitch_means.Pitch.loc[pitch_means['Strain']=='t8v'])*0.5)))
title = 'Pitch histogram t8v'
p_t8v = figure(title=title)
p_t8v.quad(top=hist_p2, bottom=0, left=edges_p2[:-1], right=edges_p2[1:],
           fill_color="navy", line_color="white", alpha=0.5)
p_t8v.xgrid.visible = False
p_t8v.ygrid.visible = True
bokeh.io.show(row(p, p_t8v))

Number of WT classes =16
Number of bins =8
Number of t8v classes =23
Number of bins =11


In [9]:
width_means = df_width.groupby(['Strain', 'Class_nb'])['Width'].mean().reset_index()
width_std = df_width.groupby(['Strain', 'Class_nb'])['Width'].std().reset_index()

p5 = plotFeatures(width_means, width_std, 'WT', 'Width')
p6 = plotFeatures(width_means, width_std, 't8v', 'Width')

bokeh.io.show(row(p5, p6))

In [14]:
print('Number of WT classes ='+str(len(width_means.Width.loc[width_means['Strain']=='WT'])))

print('Number of bins ='+str(np.int(np.floor(len(width_means.Width.loc[width_means['Strain']=='WT'])*0.5))))
hist_w, edges_w = np.histogram(width_means.Width.loc[width_means['Strain']=='WT'], bins=np.int(np.floor(len(width_means.Width.loc[width_means['Strain']=='WT'])*0.5)))
title = 'Pitch histogram WT'
p = figure(title=title)
p.quad(top=hist_w, bottom=0, left=edges_w[:-1], right=edges_w[1:],
           fill_color="navy", line_color="white", alpha=0.5)
p.xgrid.visible = False
p.ygrid.visible = True


print('Number of t8v classes ='+str(len(width_means.Width.loc[width_means['Strain']=='t8v'])))

print('Number of bins ='+str(np.int(np.floor(len(width_means.Width.loc[width_means['Strain']=='t8v'])*0.5))))
hist_w2, edges_w2 = np.histogram(width_means.Width.loc[width_means['Strain']=='t8v'], bins=np.int(np.floor(len(width_means.Width.loc[width_means['Strain']=='t8v'])*0.5)))
title = 'Pitch histogram t8v'
p_t8v = figure(title=title)
p_t8v.quad(top=hist_w2, bottom=0, left=edges_w2[:-1], right=edges_w2[1:],
           fill_color="navy", line_color="white", alpha=0.5)
p_t8v.xgrid.visible = False
p_t8v.ygrid.visible = True
bokeh.io.show(row(p, p_t8v))

Number of WT classes =18
Number of bins =9
Number of t8v classes =24
Number of bins =12


In [16]:
strain='WT'
Classes = pitch_means.loc[(pitch_means['Strain'] == strain),'Class_nb'].unique()
print(len(Classes))
pitch_mean_weighed = np.zeros(len(Classes))
for n, c in enumerate(Classes):
    pitch_mean_weighed[n] = float(pitch_means.loc[(pitch_means['Strain'] == strain) & (pitch_means['Class_nb'] == c),'Pitch']) * float(df_met.loc[(df_met['Strain'] == strain) & (df_met['Class_nb'] == c), '_rlnClassDistribution'])
Classes = width_means.loc[(width_means['Strain'] == strain),'Class_nb'].unique()
print(len(Classes))
width_mean_weighed = np.zeros(len(Classes))
for n, c in enumerate(Classes):
    width_mean_weighed[n] = float(width_means.loc[(width_means['Strain'] == strain) & (width_means['Class_nb'] == c),'Width']) * float(df_met.loc[(df_met['Strain'] == strain) & (df_met['Class_nb'] == c), '_rlnClassDistribution'])

print('Weighed pitch mean = '+ str(pitch_mean_weighed.sum()))
print('Pitch mean = ' + str(pitch_means.loc[(pitch_means['Strain'] == strain),'Pitch'].mean()))
print('Weighed width mean = '+ str(width_mean_weighed.sum()))
print('Width mean = ' + str(width_means.loc[(width_means['Strain'] == strain),'Width'].mean()))
print('Volume = ' + str(pitch_mean_weighed.sum()*width_mean_weighed.sum()*width_mean_weighed.sum()*np.pi/4))

16
18
Weighed pitch mean = 36.67917044285714
Pitch mean = 40.51849702380953
Weighed width mean = 59.464066407500006
Width mean = 59.45458333333334
Volume = 101863.50035097712


In [11]:
strain='t8v'
Classes = pitch_means.loc[(pitch_means['Strain'] == strain),'Class_nb'].unique()
pitch_mean_weighed = np.zeros(len(Classes))
for n, c in enumerate(Classes):
    pitch_mean_weighed[n] = float(pitch_means.loc[(pitch_means['Strain'] == strain) & (pitch_means['Class_nb'] == c),'Pitch']) * float(df_met.loc[(df_met['Strain'] == strain) & (df_met['Class_nb'] == c), '_rlnClassDistribution'])
Classes = width_means.loc[(width_means['Strain'] == strain),'Class_nb'].unique()
width_mean_weighed = np.zeros(len(Classes))
for n, c in enumerate(Classes):
    width_mean_weighed[n] = float(width_means.loc[(width_means['Strain'] == strain) & (width_means['Class_nb'] == c),'Width']) * float(df_met.loc[(df_met['Strain'] == strain) & (df_met['Class_nb'] == c), '_rlnClassDistribution'])

print('Weighed pitch mean = '+ str(pitch_mean_weighed.sum()))
print('Pitch mean = ' + str(pitch_means.loc[(pitch_means['Strain'] == strain),'Pitch'].mean()))
print('Weighed width mean = '+ str(width_mean_weighed.sum()))
print('Width mean = ' + str(width_means.loc[(width_means['Strain'] == strain),'Width'].mean()))
print('Volume = ' + str(pitch_mean_weighed.sum()*width_mean_weighed.sum()*width_mean_weighed.sum()*np.pi/4))

Weighed pitch mean = 39.324406180000004
Pitch mean = 40.763405797101456
Weighed width mean = 59.62460022250001
Width mean = 59.68656249999999
Volume = 109800.17061055684
