## Generate a timeseries from the data

First, crawl the instrument for all of the data for the past 30 days.


In [22]:
import requests, json
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import glob
import datetime as dt
import requests
from tqdm import tqdm
import numpy as np
import bokeh
# Plot a complex chart with interactive hover in a few lines of code

from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
from bokeh.sampledata.autompg import autompg_clean as df_car
from bokeh.transform import factor_cmap


In [2]:
from bokeh.io import output_notebook, show
output_notebook()

In [61]:
df_car.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name,mfr
0,18.0,8,307.0,130,3504,12.0,70,North America,chevrolet chevelle malibu,chevrolet
1,15.0,8,350.0,165,3693,11.5,70,North America,buick skylark 320,buick
2,18.0,8,318.0,150,3436,11.0,70,North America,plymouth satellite,plymouth
3,16.0,8,304.0,150,3433,12.0,70,North America,amc rebel sst,amc
4,17.0,8,302.0,140,3449,10.5,70,North America,ford torino,ford


In [93]:

df_car.cyl = df_car.cyl.astype(str)
df_car.yr = df_car.yr.astype(str)

group = df_car.groupby(by=['cyl', 'mfr'])
source = ColumnDataSource(group)

p = figure(plot_width=800, plot_height=300, title="Mean MPG by # Cylinders and Manufacturer",
           x_range=group, toolbar_location=None, tools="")

p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Manufacturer grouped by # Cylinders"
p.xaxis.major_label_orientation = 1.2

index_cmap = factor_cmap('cyl_mfr', palette=['#2b83ba', '#abdda4', '#ffffbf', '#fdae61', '#d7191c'], 
                         factors=sorted(df_car.cyl.unique()), end=1)

p.vbar(x='cyl_mfr', top='mpg_mean', width=1, source=source,
       line_color="white", fill_color=index_cmap, 
       hover_line_color="darkgrey", hover_fill_color=index_cmap)

# p.add_tools(HoverTool(tooltips=[("MPG", "@mpg_mean"), ("Cyl, Mfr", "@cyl_mfr")]))

show(p)

__Load data that from classifier]__

In [17]:
last_file = sorted(glob.glob("../data/classified_*_volume.csv"))[-1]
df = pd.read_csv(last_file)
df.index = pd.to_datetime(df['dateTime'])
df.head()

def this_week(ax):
    """set x-lim of an axes to the past week"""
    now = dt.datetime.now()
    week_ago = now - dt.timedelta(days=10)
    ax.set_xlim(week_ago, now)

In [18]:
not_species = ['dateTime',"inhibitTime","runTime","syringeSize",'fileName']
species = [cols for cols in df.columns if cols not in not_species]
df['nsamples'] = df[species].sum(axis=1)

daily = df[species].resample('1D').mean()
daily['n_totals'] = df['nsamples'].resample('1D').sum()


In [19]:
# daily.reset_index()

In [44]:
not_species = ['dateTime',"inhibitTime","runTime","syringeSize",'fileName','nsamples','ntotals','volume_analyzed']
habs = ["Pseudo-nitzschia","Alexandrium_singlet","Dinophysis","Lingulodinium","Cochlodinium","Prorocentrum","Gymnodinium","Karenia","Protoperidinium"]
species = [cols for cols in df.columns if cols not in not_species]
not_habs = [cols for cols in df.columns if (cols not in not_species) & (cols not in habs)]

top_cols = df.iloc[-1][not_habs].sort_values(ascending=False)[:15].index.values
top_cols = [y for x in [habs, top_cols] for y in x]
print(top_cols)

long = pd.melt(df[top_cols].reset_index(), id_vars=['dateTime'], value_vars=top_cols)
# long.apply(lambda x: str(x.value) in species, axis=1)
long['hab'] = long['variable'].isin(habs)
long['date'] = long['dateTime'].apply(lambda x: x.strftime("%D"))

daily_long = pd.melt(daily.reset_index(), id_vars=['dateTime','n_totals'], value_vars=top_cols)
daily_long['hab'] = daily_long['variable'].isin(habs)
daily_long['habs'] = daily_long['hab'].apply(lambda x: 'Hab' if x else 'Not Hab')
daily_long

['Pseudo-nitzschia', 'Alexandrium_singlet', 'Dinophysis', 'Lingulodinium', 'Cochlodinium', 'Prorocentrum', 'Gymnodinium', 'Karenia', 'Protoperidinium', 'Centric', 'NanoP_less10', 'Ciliates', 'Ceratium', 'Chaetoceros', 'Dictyocha', 'Scrip_Het', 'Entomoneis', 'Akashiwo', 'Phaeocystis', 'Coccolithophore', 'zooplankton_misc', 'Odontella', 'Cryptophyte', 'Oxyp_Oxyt']


Unnamed: 0,dateTime,n_totals,variable,value,hab,habs
0,2021-08-20,660.354397,Pseudo-nitzschia,0.005723,True,Hab
1,2021-08-21,1169.544174,Pseudo-nitzschia,0.004226,True,Hab
2,2021-08-22,744.316879,Pseudo-nitzschia,0.007885,True,Hab
3,2021-08-23,833.783598,Pseudo-nitzschia,0.012931,True,Hab
4,2021-08-24,977.269098,Pseudo-nitzschia,0.026827,True,Hab
...,...,...,...,...,...,...
475,2021-09-04,485.425764,Oxyp_Oxyt,0.002578,False,Not Hab
476,2021-09-05,618.368717,Oxyp_Oxyt,0.004090,False,Not Hab
477,2021-09-06,1445.619776,Oxyp_Oxyt,0.034502,False,Not Hab
478,2021-09-07,5146.488931,Oxyp_Oxyt,0.090660,False,Not Hab


In [45]:
daily_long.query("dateTime > @s_date")

Unnamed: 0,dateTime,n_totals,variable,value,hab,habs
18,2021-09-07,5146.488931,Pseudo-nitzschia,0.009964,True,Hab
19,2021-09-08,5975.626411,Pseudo-nitzschia,0.005863,True,Hab
38,2021-09-07,5146.488931,Alexandrium_singlet,0.204046,True,Hab
39,2021-09-08,5975.626411,Alexandrium_singlet,0.170058,True,Hab
58,2021-09-07,5146.488931,Dinophysis,0.703104,True,Hab
59,2021-09-08,5975.626411,Dinophysis,1.310528,True,Hab
78,2021-09-07,5146.488931,Lingulodinium,4.030921,True,Hab
79,2021-09-08,5975.626411,Lingulodinium,4.036579,True,Hab
98,2021-09-07,5146.488931,Cochlodinium,0.031539,True,Hab
99,2021-09-08,5975.626411,Cochlodinium,0.034144,True,Hab


In [47]:

# s_date = daily_long['dateTime'].iloc[0].strftime("%D")
e_date = daily_long['dateTime'].iloc[-1]
s_date = e_date - dt.timedelta(days=2)


group = daily_long.query("dateTime > @s_date").groupby(by=['habs','variable'])
source = ColumnDataSource(group)
source

p = figure(plot_width=800, plot_height=400, title="Daily Average Counts - {} to {}".format(s_date.strftime("%D"),e_date.strftime("%D")),
           x_range=group, toolbar_location="right", tools = ["pan,wheel_zoom,box_zoom,reset"],min_border_bottom=20,min_border_left=20)

p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Grouped by Hab"
p.xaxis.major_label_orientation = 1.2

index_cmap = factor_cmap('habs_variable', palette=['#2b83ba', '#abdda4', '#ffffbf', '#fdae61', '#d7191c'], 
                         factors=sorted(daily_long.habs.unique()), end=1)

p.vbar(x='habs_variable', top='value_mean', width=1, source=source,
       line_color="white",fill_color=index_cmap, 
       hover_line_color="darkgrey", hover_fill_color=index_cmap,)

p.add_tools(HoverTool(tooltips=[("Cells per ML ", "@value_mean"), ("Variable ","@habs_variable")]))

show(p)

In [8]:
# hab_daily = daily_long.query("hab == True")
# hab_daily['date'] = hab_daily['dateTime'].apply(lambda x: x.strftime("%D"))
# hab_daily.head()

habs_long = long.query('hab == True')
habs_long.head()
habs_long.groupby("date")[['value','variable']].mean()

NameError: name 'long' is not defined

In [9]:
source = ColumnDataSource(habs_long.groupby("date")[['value','variable']].mean())

# p = figure(x_range=sorted(hab_daily['date'].unique(),reverse=True), plot_height=250,  title="Daily Counts - Habs Species")
p = figure()

p.vbar_stack(variable, x='date', width=0.9, color=colors, source=hab_daily,
             legend_label=years)


# p.vbar_stack(stackers=['Protoperidinium','Pseudo-nitzschia'],
#              x='dateTime', source=source,
#              width=0.5)

# p.hbar_stack(hab_daily['variable'].unique(), y='fruits', height=0.9, color=GnBu3, source=ColumnDataSource(exports),
#              legend_label=["%s exports" % x for x in years])

# # p.hbar_stack(years, y='fruits', height=0.9, color=OrRd3, source=ColumnDataSource(imports),
# #              legend_label=["%s imports" % x for x in years])

# p.y_range.range_padding = 0.1
# p.ygrid.grid_line_color = None
# p.legend.location = "center_left"

show(p)

NameError: name 'habs_long' is not defined

In [10]:
week = data.loc['2010-10-01':'2010-10-08']

p = figure(x_axis_type="datetime", title="Glocose Range", plot_height=350, plot_width=800)
p.xgrid.grid_line_color=None
p.ygrid.grid_line_alpha=0.5
p.xaxis.axis_label = 'Time'
p.yaxis.axis_label = 'Value'

p.line(week.index, week.glucose)

show(p)

NameError: name 'data' is not defined

__Create list of columns of just the model classes (no metadata) for plotting__

In [11]:
critter_cols = [col for col in df.columns if col not in ['dateTime', 'inhibitTime', 'runTime', 'syringeSize','fileName']]
top_cols = df.iloc[-1][critter_cols].sort_values(ascending=False)[:15]
top_cols = list(top_cols.index)
top_cols

['nsamples',
 'Centric',
 'NanoP_less10',
 'Prorocentrum',
 'Ciliates',
 'Ceratium',
 'Chaetoceros',
 'Dictyocha',
 'Gymnodinium',
 'volume_analyzed',
 'Scrip_Het',
 'Dinophysis',
 'Entomoneis',
 'Akashiwo',
 'Phaeocystis']

In [12]:
long = pd.melt(df.reset_index(), id_vars='dateTime', value_vars=df.)
long.head()

SyntaxError: invalid syntax (<ipython-input-12-6a7bd40659c5>, line 1)

In [13]:
df['nsamples'] = df[critter_cols].sum(axis=1)
df_norm = df[critter_cols].divide(df['nsamples'],axis=0)

In [23]:
new_index = pd.date_range(start=df['dateTime'].iloc[0],end=df['dateTime'].iloc[-1],freq="23T44S")
df_reindexed = df_norm.reindex(new_index,method="nearest",tolerance='25min',limit=2)