# Compare Data Series

## Initial imports

In [34]:
import os
import datetime
from pprint import pprint

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from IPython.display import display
import ipywidgets as widgets
from bokeh import events
from bokeh.io import output_notebook, show
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, HoverTool, RangeTool, BoxAnnotation, CustomJS, Div
from bokeh.plotting import figure
from bokeh.transform import linear_cmap
from bokeh.palettes import Vibrant3 as colors

output_notebook()

## List files in a specified directory

In [35]:
# Directory that holds the input files and the extension used to select them.
srcdir = 'data/dst'
ext = '.parquet'

# os.listdir() returns entries in arbitrary order; sort them so the file
# dropdowns list the same options in the same order on every run.
parquet_files = sorted(file for file in os.listdir(srcdir) if file.endswith(ext))

## The Plot Function

In [36]:
def do_plot(df1, col1, df2, col2, label1=None, label2=None):
    """
    Plot two data series against time in a range view with a linked
    full-extent range selector below it, and show the result.

    The upper figure ("Timeseries range view") draws both series over a
    sub-range of the data. The lower figure ("Series range selector")
    draws both series in full and carries a RangeTool bound to the upper
    figure's x-range. A Div under the plots is updated on RangesUpdate
    events by the CustomJS returned from ``display_daterange``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose index holds the timestamps of the observations.
    col1, col2 : str
        Column of ``df1`` / ``df2`` to plot.
    label1, label2 : str, optional
        Legend labels. When omitted, a label derived from the column name
        is used, because bokeh expects ``legend_label`` to be a string.

    Raises
    ------
    ValueError
        If ``df1`` has no rows (the initial x-range is taken from it).
    """
    if len(df1.index) == 0:
        raise ValueError('df1 has no rows; nothing to plot')

    # legend_label must be a string, so substitute a readable default for None
    if label1 is None:
        label1 = f'series 1: {col1}'
    if label2 is None:
        label2 = f'series 2: {col2}'

    hover = HoverTool(
        tooltips=[('date', '@date{%F %T %Z}'), ('obs', '@obs')],
        formatters={'@date': 'datetime'},
        mode='vline',
        attachment='vertical')

    # create column data sources for the observations
    # NOTE(review): the recorded notebook output shows a warning pointing at
    # these conversions -- possibly caused by a tz-aware index; confirm.
    dates1 = np.array(df1.index, dtype=np.datetime64)
    source1 = ColumnDataSource(data=dict(date=dates1, obs=df1[col1]))

    dates2 = np.array(df2.index, dtype=np.datetime64)
    source2 = ColumnDataSource(data=dict(date=dates2, obs=df2[col2]))

    # Index of the last sample in the initial view (initialize_range defaults
    # to 10% of the series). Clamp it so that it is a valid position in dates1
    # and, where the series allows, the initial window is not zero-width.
    range_ix = min(max(initialize_range(df1), 1), len(dates1) - 1)

    # plot the selected range
    obs_range_view = figure(height=300, width=800,
                            tools="xpan,save",
                            x_axis_type="datetime", x_axis_location="above",
                            x_range=(dates1[0], dates1[range_ix]),
                            y_axis_label=f'{col1}, {col2}',
                            background_fill_color="#efefef",
                            title='Timeseries range view')
    obs_range_view.line('date', 'obs', source=source1, color=colors[0], legend_label=str(label1))
    obs_range_view.line('date', 'obs', source=source2, color=colors[1], legend_label=str(label2))
    obs_range_view.add_tools(hover)
    obs_range_view.legend.location = 'top_left'

    # create a range tool with range equal to the view
    range_tool = RangeTool(x_range=obs_range_view.x_range, start_gesture="pan")
    range_tool.overlay.fill_color = "navy"
    range_tool.overlay.fill_alpha = 0.2

    # plot the entire data and add the range selection tool
    obs_series_select = figure(height=130, width=800,
                               tools="save",
                               x_axis_type="datetime", x_axis_location='above',
                               y_axis_type=None, y_range=obs_range_view.y_range,
                               background_fill_color="#efefef",
                               title='Series range selector')
    obs_series_select.line('date', 'obs', source=source1, color=colors[0])
    obs_series_select.line('date', 'obs', source=source2, color=colors[1])
    obs_series_select.ygrid.grid_line_color = None
    obs_series_select.add_tools(range_tool)

    # text area that reports the currently visible date range
    div = Div(width=800)
    obs_range_view.js_on_event(events.RangesUpdate, display_daterange(div, attributes=['x0','x1']))

    show(column(obs_range_view, obs_series_select, row(div)))

In [37]:
def initialize_range(df, percent=10, period=None):
    """
    Return the number of samples to use for the initial range selection.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data whose length (and, when ``period`` is given, index frequency)
        determines the result.
    percent : number, default 10
        Used when ``period`` is not given: the result is ``percent`` percent
        of the number of rows, truncated to an int.
    period : timedelta-like, optional
        When given, the result is the number of index periods that fit in
        ``period``, capped at the number of rows.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If ``period`` is given but the index has no frequency set.
    """
    # len() works for both DataFrame and Series, unlike unpacking df.shape
    n_rows = len(df)

    if period:
        freq = getattr(df.index, 'freq', None)
        if freq is None:
            raise ValueError(
                'a period was given but the index has no frequency; '
                'set one (e.g. with asfreq) or use percent instead')
        samples = int(period / pd.to_timedelta(freq))
        return min(samples, n_rows)

    return int(n_rows * percent / 100)

## Define functions to handle UI events

In [38]:
# callbacks for the data selection UI
def load_df(file):
    """
    Read a parquet file from ``srcdir`` and try to give its index a frequency.

    If the loaded index has no frequency, one is inferred with
    ``pd.infer_freq`` and applied with ``asfreq``. When no frequency can be
    inferred the frame is returned as loaded instead of failing inside
    ``asfreq``.

    Parameters
    ----------
    file : str
        File name relative to ``srcdir``.

    Returns
    -------
    pandas.DataFrame
    """
    df = pd.read_parquet(srcdir + '/' + file)

    # FIXME (from the original author): THIS IS NOT GETTING FREQ CORRECT FOR MAYFLY
    if getattr(df.index, 'freq', None) is None:
        try:
            in_freq = pd.infer_freq(df.index)
        except (TypeError, ValueError):
            # infer_freq raises for non-datetime-like or too-short indexes
            in_freq = None

        if in_freq is None:
            # asfreq(None) would raise, so keep the data as loaded
            print('index frequency is None and could not be inferred, leaving index unchanged')
        else:
            print(f'index frequency is None, updating to {in_freq}')
            df = df.asfreq(in_freq)
    return df

def do_load(click):
    """
    Button callback: load the two selected files into the globals df1 and df2.

    Nothing is loaded unless both file dropdowns have a selection, and the
    globals are only replaced after both files were read successfully, so
    df1 and df2 always belong to the same selection.

    Parameters
    ----------
    click
        Argument supplied by the button's on_click mechanism; not used.
    """
    global df1, df2

    file1 = file1_dropdown.value
    file2 = file2_dropdown.value
    if file1 is None or file2 is None:
        print('Select both File 1 and File 2 before loading data')
        return

    # load into locals first so a failure on the second file leaves the
    # previously loaded pair untouched
    loaded1 = load_df(file1)
    loaded2 = load_df(file2)
    df1, df2 = loaded1, loaded2
    
def dropdown_handler1(change):
    """
    Observer for the first file dropdown: read the selected file's parquet
    schema into the global schema1 and offer its field names in
    col1_dropdown.

    Parameters
    ----------
    change
        Change notification; ``change.new`` is the newly selected file name.
    """
    global schema1
    # check the selection before building a path from it: an empty or None
    # selection must not reach read_schema
    if not change.new:
        return
    schema1 = pq.read_schema(srcdir+'/'+change.new)
    col1_dropdown.options = schema1.names
        
def dropdown_handler2(change):
    """
    Observer for the second file dropdown: read the selected file's parquet
    schema into the global schema2 and offer its field names in
    col2_dropdown.

    Parameters
    ----------
    change
        Change notification; ``change.new`` is the newly selected file name.
    """
    global schema2
    # check the selection before building a path from it: an empty or None
    # selection must not reach read_schema
    if not change.new:
        return
    schema2 = pq.read_schema(srcdir+'/'+change.new)
    col2_dropdown.options = schema2.names

In [39]:
def display_daterange(div: Div, attributes: list[str] = []) -> CustomJS:
    """
    Build a CustomJS callback that writes an event's date attributes into
    ``div``.

    For each name in ``attributes`` the callback reads that attribute from
    the event object, interprets it as a JavaScript Date and formats it as
    an ISO string. The div text becomes the event name followed by the
    formatted values.

    Parameters
    ----------
    div : Div
        Model whose ``text`` is replaced on every event.
    attributes : list of str
        Event attribute names to display (never modified here).

    Returns
    -------
    CustomJS
    """
    import json  # local import: json is not imported in the notebook's import cell

    style = 'float: left; clear: left; font-size: 13px'
    # json.dumps gives a valid JavaScript array literal for any attribute
    # names, which the Python list repr does not guarantee
    attrs_js = json.dumps(list(attributes))
    return CustomJS(args=dict(div=div), code=f"""
        const attrs = {attrs_js};
        const args = [];
        for (let i = 0; i < attrs.length; i++) {{
            const raw = cb_obj[attrs[i]];
            const dt = new Date(+(raw));
            // toISOString() throws on an invalid Date, so show the raw value instead
            const val = isNaN(dt.getTime()) ? String(raw) : dt.toISOString();
            args.push(val);
        }}
        const line = "<span style={style!r}><b>" + cb_obj.event_name + ":</b> (" + args.join(", ") + ")</span>\\n";
        div.text = line;
    """)

## Create the data selection UI

In [40]:
# Data selection UI: a file and a column dropdown per series, plus a load button.

# All four dropdowns start enabled with nothing selected; the column
# dropdowns stay empty until a file has been chosen.
file1_dropdown, col1_dropdown, file2_dropdown, col2_dropdown = (
    widgets.Dropdown(options=choices, value=None, disabled=False)
    for choices in (parquet_files, [], parquet_files, [])
)

# choosing a file fills the matching column dropdown
file1_dropdown.observe(dropdown_handler1, names='value')
file2_dropdown.observe(dropdown_handler2, names='value')

button = widgets.Button(description='Load data',
                        disabled=False,
                        button_style='primary',
                        tooltip='Load data')
button.on_click(do_load)

# layout shared by every row of the form
form_item_layout = widgets.Layout(display='flex',
                                  flex_flow='row',
                                  justify_content='space-between')

# one labelled row per dropdown, followed by the button row
labelled_widgets = (
    ('File 1:', file1_dropdown),
    ('Column 1:', col1_dropdown),
    ('File 2:', file2_dropdown),
    ('Column 2:', col2_dropdown),
)
form_items = [
    widgets.Box([widgets.Label(value=caption), control], layout=form_item_layout)
    for caption, control in labelled_widgets
]
form_items.append(widgets.Box([button], layout=form_item_layout))

form_layout = widgets.Layout(display='flex',
                             flex_flow='column',
                             border='solid 1px',
                             align_items='stretch',
                             width='50%',
                             padding='1%')
form = widgets.Box(form_items, layout=form_layout)

## Display the form

In [41]:
# Render the selection form. The cells below read df1 and df2, which are only
# defined once the "Load data" button callback (do_load) has run.
display(form)

Box(children=(Box(children=(Label(value='File 1:'), Dropdown(options=('21852517 2024-07-22 10_25_20 UTC (Data …

## Display the plot

In [42]:
# Plot the two loaded series, using the selected file names as legend labels.
do_plot(
    df1,
    col1_dropdown.value,
    df2,
    col2_dropdown.value,
    label1=file1_dropdown.value,
    label2=file2_dropdown.value,
)

  dates1 = np.array(df1.index, dtype=np.datetime64)
  dates2 = np.array(df2.index, dtype=np.datetime64)


## Difference

In [43]:
# Element-wise difference of the two selected columns (series 1 minus series 2).
series1 = df1[col1_dropdown.value]
series2 = df2[col2_dropdown.value]
diff = series1 - series2

In [44]:
# Column data source for the difference plot: timestamps and difference values.
diff_dates = np.array(diff.index, dtype=np.datetime64)
diff_source = ColumnDataSource(data={'date': diff_dates, 'obs': diff.values})

  diff_dates = np.array(diff.index, dtype=np.datetime64)


In [45]:
# Line plot of the difference series.
diff_view = figure(
    height=300,
    width=800,
    tools="xpan",
    toolbar_location=None,
    x_axis_type="datetime",
    x_axis_location="above",
    y_axis_label="series1 - series2",
    background_fill_color="#efefef",
    title='Series difference',
)
diff_view.line('date', 'obs', source=diff_source)

# shade the area above zero and the area below zero in the two series colors
pos_region = BoxAnnotation(bottom=0, fill_alpha=0.2, fill_color=colors[0])
neg_region = BoxAnnotation(top=0, fill_alpha=0.2, fill_color=colors[1])
for region in (pos_region, neg_region):
    diff_view.add_layout(region)

show(column(diff_view))

In [46]:
# Outer-join the two frames on their indexes; column names present in both
# frames get the suffixes _df1 / _df2.
xplot_df = pd.merge(
    df1,
    df2,
    how='outer',
    left_index=True,
    right_index=True,
    suffixes=('_df1', '_df2'),
)
xplot_df

Unnamed: 0_level_0,DateTimeEDT_df1,Temp_df1,DateTimeEDT_df2,Temp_df2
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-07-29 18:40:00+00:00,NaT,,NaT,
2024-07-29 18:50:00+00:00,NaT,,2024-07-29 14:50:56-04:00,33.871375
2024-07-29 19:00:00+00:00,NaT,,2024-07-29 15:00:56-04:00,27.693719
2024-07-29 19:10:00+00:00,NaT,,2024-07-29 15:10:56-04:00,25.068215
2024-07-29 19:20:00+00:00,NaT,,2024-07-29 15:20:56-04:00,24.822253
...,...,...,...,...
2024-09-14 16:50:00+00:00,2024-09-14 12:50:57-04:00,18.979219,NaT,
2024-09-14 17:00:00+00:00,2024-09-14 13:00:57-04:00,19.139381,NaT,
2024-09-14 17:10:00+00:00,2024-09-14 13:10:57-04:00,19.365323,NaT,
2024-09-14 17:20:00+00:00,2024-09-14 13:20:57-04:00,19.565525,NaT,


In [47]:
# Crossplot of the two selected series against each other.
#
# Resolve the selected columns in the joined frame instead of hard-coding
# them: the join only appends _df1 / _df2 to names that exist in both
# frames, so fall back to the plain name when no suffixed column exists.
xcol = f'{col1_dropdown.value}_df1'
if xcol not in xplot_df.columns:
    xcol = col1_dropdown.value
ycol = f'{col2_dropdown.value}_df2'
if ycol not in xplot_df.columns:
    ycol = col2_dropdown.value

# overall value range across both series, used for the 1:1 reference line
obsmin = xplot_df[[xcol, ycol]].min().min()
obsmax = xplot_df[[xcol, ycol]].max().max()

xplot_source = ColumnDataSource(data=dict(xobs=xplot_df[xcol], yobs=xplot_df[ycol]))
xplot_view = figure(height=500, width=500,
                    toolbar_location=None,
                    match_aspect=True,
                    x_axis_label=xcol,
                    y_axis_label=ycol,
                    background_fill_color="#efefef",
                    title='Series crossplot')
xplot_view.scatter('xobs', 'yobs',
                   source=xplot_source,
                   alpha=0.2)
# points on this line have equal values in both series
xplot_view.line([obsmin, obsmax], [obsmin, obsmax], color="red", line_width=2)

show(xplot_view)