In [36]:
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import row, column
from bokeh.models import BoxSelectTool, LassoSelectTool, Spacer
from bokeh.plotting import figure
from sklearn.linear_model import LinearRegression
output_notebook()

In [3]:
# Load in data, form is ID, mass, size, WHIM sizes
d2048 = np.loadtxt('./2048z3/WHIM_data.txt')
hm2048 = d2048[:,1]
WHIM2048 = d2048[:,3::]

d4096 = np.loadtxt('./4096z05/WHIM_data.txt')
hm4096 = d4096[:,1]
WHIM4096 = d4096[:,3::]

# I also have the new dask set. For 2048 this is the same, but for 4096 it has no nans!
da_d4096 = np.loadtxt('./4096z05/da_WHIM_data.txt')
da_hm4096 = da_d4096[:,1]
da_WHIM4096 = da_d4096[:,3::]

In [4]:
# Define useful tools:
# Define useful tools:
TOOLS='hover,crosshair,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select'

# Tools I'm not using: 
#lasso_select - Doesn't work v well for hist.
#pan - No need for this
#poly_select - Overkill

# Set some std colors:
line = 'black'
bck = 'white'

In [82]:
# First make a basic scatter plot.
# It would be very cool to turn this into this, but time!
# https://github.com/bokeh/bokeh/blob/master/examples/app/selection_histogram.py
# Also want to include more advanced fitting (ie. test and training set), but time.
def scatter(hm,WHIM_sizes,fit=False):
    '''
    Takes in a halo masses with their corresponding WHIM_sizes. 
    Make sure the indices are the same between the sets.
    Produces a scatter plot, and will fit with linear regression if fit=True.
    '''
    # Manipulate data:
    x = np.transpose(np.log10([hm]*26)).flatten()
    y = WHIM_sizes.flatten()
    
    # Central figure:
    p = figure(tools=TOOLS+',lasso_select',background_fill_color=bck, min_border=10, min_border_left=50,
               toolbar_location="above", x_range=(min(x)-0.01,max(x)+0.01),y_range=(0,max(y[~np.isnan(y)])),
               x_axis_location=None, y_axis_location=None)
    r = p.scatter(x=x, y=y,size=1,color='steelblue')
    p.select(BoxSelectTool).select_every_mousemove = False
    p.select(LassoSelectTool).select_every_mousemove = False
    
    # Put in several args for line at once
    LINE_ARGS = dict(color=line, line_color=None)
    
    # Create the horizontal histogram
    hhist, hedges = np.histogram(x, bins=20,range=(min(x)-0.01,max(x)+0.01))
    hzeros = np.zeros(len(hedges)-1)
    hmax = max(hhist)*1.1
    
    ph = figure(toolbar_location=None, plot_width=p.plot_width, plot_height=200, x_range=p.x_range,
                y_range=(-hmax, hmax), min_border=10, min_border_left=50, y_axis_location="right")
    ph.xgrid.grid_line_color = None
    ph.yaxis.major_label_orientation = np.pi/4
    ph.background_fill_color = bck

    ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hhist, color='lightsteelblue', line_color=line)
    hh1 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
    hh2 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)

    # Create the vertical histogram
    vhist, vedges = np.histogram(y, bins=20,range=(min(y),max(y[~np.isnan(y)])))
    vzeros = np.zeros(len(vedges)-1)
    vmax = max(vhist)*1.1

    pv = figure(toolbar_location=None, plot_width=200, plot_height=p.plot_height, x_range=(-vmax, vmax),
                y_range=p.y_range, min_border=10, y_axis_location="right")
    pv.ygrid.grid_line_color = None
    pv.xaxis.major_label_orientation = np.pi/4
    pv.background_fill_color = bck

    pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vhist, color='lightsteelblue', line_color=line)
    vh1 = pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
    vh2 = pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)

    layout = column(row(p, pv), row(ph,Spacer(width=200, height=200)))
    
    # Fit with Linear Regression
    if fit:
        model = LinearRegression(fit_intercept=True)
        model.fit(x[:, np.newaxis], y)

        xfit = np.linspace(min(x)-0.01, max(x)+0.01, 1000)
        yfit = model.predict(xfit[:, np.newaxis])

        p.line(xfit, yfit,line_color='coral',line_width=2,line_alpha=0.5)
    
    # Axis labels
    ph.xaxis.axis_label='log10(Mass/Solar Mass)'
    ph.xaxis.axis_label_text_font_style='normal'
    pv.yaxis.axis_label='WHIM Size (Mpc/h)'
    pv.yaxis.axis_label_text_font_style='normal'
    
    show(layout)
    
    # Define the function that will happen when we select the data, ie. the side histograms will update.
    def update(attr, old, new):
        inds = np.array(new['1d']['indices'])
        if len(inds) == 0 or len(inds) == len(x):
            hhist1, hhist2 = hzeros, hzeros
            vhist1, vhist2 = vzeros, vzeros
        else:
            neg_inds = np.ones_like(x, dtype=np.bool)
            neg_inds[inds] = False
            hhist1, _ = np.histogram(x[inds], bins=hedges)
            vhist1, _ = np.histogram(y[inds], bins=vedges)
            hhist2, _ = np.histogram(x[neg_inds], bins=hedges)
            vhist2, _ = np.histogram(y[neg_inds], bins=vedges)

        hh1.data_source.data["top"]   =  hhist1
        hh2.data_source.data["top"]   = -hhist2
        vh1.data_source.data["right"] =  vhist1
        vh2.data_source.data["right"] = -vhist2

    r.data_source.on_change('selected', update)

In [6]:
# First plot histogram of mass distribution
def hm_hist(hm,nbins=10):
    '''
    Takes in a set of halo masses and makes the histogram.
    nbins = 10 by default.
    '''
    
    # Take log of halo masses:
    log_hm = np.log10(hm)

    # Get bin positions
    hist, edges = np.histogram(log_hm, density=False, bins=nbins)

    p = figure(title='',tools=TOOLS, background_fill_color=bck,x_range=(min(edges),max(edges)),y_range=(0,1.1*max(hist)))
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color=line, fill_color='steelblue')
    
    # Axis labels
    p.xaxis.axis_label='log10(Mass/Solar Mass)'
    p.xaxis.axis_label_text_font_style='normal'
    p.yaxis.axis_label='Count'
    p.yaxis.axis_label_text_font_style='normal'
    
    show(p)

In [7]:
# Now make a box plot function:
def boxplot(hm,WHIM_sizes,nbins=10):
    '''
    Takes in a halo masses with their corresponding WHIM_sizes. 
    Make sure the indices are the same between the sets.
    Will produce a boxplot with quartiles, median, and whiskers extending to 0.05 and 0.95.
    nbins is an optional argument that is by default set to 10.
    '''
    # Take log:
    log_hm = np.log10(hm)
    
    # Get the histogram data for log_hm:
    hist, edges = np.histogram(log_hm, density=False, bins=nbins)
    
    # List of indices for each bin
    ind_list = [np.argwhere((log_hm < edges[i+1]) & (log_hm >= edges[i])) for i in range(nbins)]

    # Get position for bins and lines (weighted bin position)
    pos_bin = [(edges[i+1]+edges[i])/2 for i in range(nbins)]
    pos_line = [log_hm[ind_list[i]].mean() if ind_list[i].size else pos_bin[i] for i in range(nbins)]

    # Add the WHIM_data from the correct spot.
    WHIMdata = [WHIM_sizes[ind_list[i]].flatten() for i in range(nbins)]
    
    # Also remove nans:
    for i in range(len(WHIMdata)):
        WHIMdata[i] = WHIMdata[i][~np.isnan(WHIMdata[i])]

    # Set up quartiles. If there is no data then set to zero.
    q1 = [(np.percentile(WHIM,25) if WHIM.size else 0) for WHIM in WHIMdata]
    q2 = [(np.percentile(WHIM,50) if WHIM.size else 0) for WHIM in WHIMdata]
    q3 = [(np.percentile(WHIM,75) if WHIM.size else 0) for WHIM in WHIMdata]
    upper = [(np.percentile(WHIM,95) if WHIM.size else 0) for WHIM in WHIMdata]
    lower = [(np.percentile(WHIM,5) if WHIM.size else 0) for WHIM in WHIMdata]

    p = figure(tools=TOOLS, background_fill_color=bck, title="", x_range=(min(edges),max(edges)), y_range=(0,1.1*max(upper)))

    # stems
    p.segment(pos_line, upper, pos_line, q3, line_color=line)
    p.segment(pos_line, lower, pos_line, q1, line_color=line)

    # boxes
    p.vbar(pos_bin, [edges[i+1]-edges[i] for i in range(nbins)], q2, q3, fill_color='steelblue', line_color=line)
    p.vbar(pos_bin, [edges[i+1]-edges[i] for i in range(nbins)], q1, q2, fill_color='coral', line_color=line)

    # whiskers (almost-0 height rects simpler than segments)
    p.rect(pos_line, lower, [(edges[i+1]-edges[i])/2 for i in range(nbins)], 0.001, fill_color=line,line_color=line)
    p.rect(pos_line, upper, [(edges[i+1]-edges[i])/2 for i in range(nbins)], 0.001, fill_color=line,line_color=line)
    
    # Axis labels
    p.xaxis.axis_label='log10(Mass/Solar Mass)'
    p.xaxis.axis_label_text_font_style='normal'
    p.yaxis.axis_label='WHIM Size (Mpc/h)'
    p.yaxis.axis_label_text_font_style='normal'

    show(p)

In [83]:
# Test all of my functions with the simple 2048 dataset first:
scatter(hm2048,WHIM2048)

In [76]:
hm_hist(hm2048)

In [74]:
boxplot(hm2048,WHIM2048)

In [84]:
# Since they are all working, try with 4096 dataset:
scatter(hm4096,WHIM4096)

  keep = (tmp_a >= mn)
  keep &= (tmp_a <= mx)


In [78]:
hm_hist(hm4096)

In [75]:
boxplot(hm4096,WHIM4096)

In [85]:
scatter(da_hm4096,da_WHIM4096)

In [10]:
boxplot(da_hm4096,da_WHIM4096)

In [29]:
# Now check with scaled y axis:
WHIM2048_scaled = np.copy(WHIM2048)
for i in range(len(d2048)):
    WHIM2048_scaled[i,:] /= d2048[i,2]
    
    
WHIM4096_scaled = np.copy(da_WHIM4096)
for i in range(len(d4096)):
    WHIM4096_scaled[i,:] /= da_d4096[i,2]

In [86]:
scatter(hm4096,WHIM4096_scaled,fit=True)

In [24]:
boxplot(hm4096,WHIM4096_scaled)