# Inspect Timeseries in a Data File

## Initial imports

In [1]:
import os
import datetime
from pprint import pprint

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from IPython.display import display
import ipywidgets as widgets
from bokeh.io import output_notebook, show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, HoverTool, RangeTool
from bokeh.plotting import figure
from bokeh.transform import linear_cmap

# route Bokeh output to the notebook so later show() calls render inline
output_notebook()

## List data files in a specified directory

In [2]:
# directory that is scanned for data files, and the extension that marks them
srcdir = 'data/dst'
ext = '.parquet'

# keep only the directory entries whose name ends with the parquet extension
parquet_files = list(filter(lambda name: name.endswith(ext), os.listdir(srcdir)))

## Define functions to handle UI events

In [3]:
# define callbacks for the data selection UI
def load_df(file):
    """Read a parquet file from ``srcdir`` and give its index a frequency.

    If the loaded index already carries a frequency the frame is returned
    untouched. Otherwise the frequency is inferred and the frame is conformed
    to it with ``asfreq``, so missing timestamps show up as all-null rows.

    Args:
        file: name of a parquet file inside ``srcdir``.

    Returns:
        The loaded DataFrame; its index has a frequency whenever one could be
        determined.
    """
    df = pd.read_parquet(os.path.join(srcdir, file))

    if df.index.freq is not None:
        return df

    try:
        in_freq = pd.infer_freq(df.index)
    except ValueError:
        # infer_freq refuses indexes with fewer than 3 timestamps
        in_freq = None

    if in_freq is not None:
        print(f'index frequency is None, updating to {in_freq}')
        return df.asfreq(in_freq)

    # infer_freq gives up (returns None) when the timestamps are not perfectly
    # regular, e.g. when samples are missing. Handing that None to asfreq does
    # not keep the original sampling -- NOTE(review): it appears to fall back
    # to a daily grid through date_range's default -- so use the most common
    # spacing between consecutive timestamps instead.
    steps = df.index.to_series().sort_values().diff().dropna()
    steps = steps[steps > pd.Timedelta(0)]
    if steps.empty:
        print('index frequency is None and could not be inferred, '
              'leaving the data as loaded')
        return df

    in_freq = steps.mode().iloc[0]
    print('index frequency is None and could not be inferred, '
          f'falling back to the most common interval {in_freq}')
    regular = df.asfreq(in_freq)

    # asfreq only keeps rows that sit exactly on the regular grid; say so
    # rather than dropping observations silently
    off_grid = len(df.index.difference(regular.index))
    if off_grid:
        print(f'{off_grid} rows do not fall on the {in_freq} grid and were dropped')
    return regular


def do_load(click):
    """Button callback: load the selected file into the global ``df``.

    Args:
        click: the object handed over by the button's ``on_click``; unused.
    """
    global df
    selected = file_dropdown.value
    if not selected:
        # the file dropdown starts with value=None, so a click before any
        # selection has nothing to load
        print('no file selected, nothing loaded')
        return
    df = load_df(selected)


def dropdown_handler(change):
    """Observer for the file dropdown: refresh the column choices.

    Reads the parquet schema of the newly selected file into the global
    ``schema`` and offers its field names in ``col_dropdown``.

    Args:
        change: the change notification; only ``change.new`` (the newly
            selected file name) is read.
    """
    global schema
    # check the selection before touching the file system: an empty
    # selection cannot be turned into a path
    if not change.new:
        return
    schema = pq.read_schema(os.path.join(srcdir, change.new))
    col_dropdown.options = schema.names

## The plotting function

In [4]:
def do_plot(df, file, col):
    """Plot one column of a time-indexed DataFrame with a range selector.

    Shows four stacked Bokeh figures: a close-up of the observations, a
    close-up of the null / not-null mask, the full observation series carrying
    the RangeTool that drives both close-ups, and the full mask.

    Args:
        df: DataFrame whose index holds the timestamps to plot.
        file: name of the source file; only used in the plot title.
        col: name of the column in ``df`` to plot.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    if df.index.size == 0:
        raise ValueError(f'no data to plot for file {file!r}')

    tooltips = HoverTool(
        tooltips=[('date', '@date{%F %T %Z}'), ('obs', '@obs')],
        formatters={'@date': 'datetime'},
        mode='vline')

    # create a column data source for the observations
    # is_valid is 0.5 where the column has data and 1.5 where it is null;
    # presumably these line up with the centres of the two categories below
    dates = np.array(df.index, dtype=np.datetime64)
    source = ColumnDataSource(data=dict(date=dates, obs=df[col], is_valid=pd.isnull(df[col])+0.5))

    # sampling interval: taken from the index frequency when it is a fixed
    # duration, otherwise estimated from the median spacing of the timestamps
    step = None
    if df.index.freq is not None:
        try:
            step = pd.to_timedelta(df.index.freq)
        except (TypeError, ValueError):
            step = None
    if step is None or pd.isnull(step):
        spacing = df.index.to_series().diff().dropna()
        if not spacing.empty:
            step = spacing.median()

    # index of the right edge of the initial close-up: 5 days of samples,
    # capped at the last valid position (size - 1, not size) so that short
    # series do not index past the end of `dates`
    last_ix = df.index.size - 1
    if step is not None and not pd.isnull(step) and step > pd.Timedelta(0):
        periods_in_5d = int(datetime.timedelta(days=5) / step)
        range_ix = min(periods_in_5d, last_ix)
    else:
        range_ix = last_ix

    # plot the selected range
    obs_range_view = figure(height=300, width=800,
                            tools='xpan', toolbar_location=None,
                            x_axis_type='datetime', x_axis_location='above',
                            x_range=(dates[0], dates[range_ix]),
                            y_axis_label=col,
                            background_fill_color='#efefef',
                            title=f"File: {file}")
    obs_range_view.line('date', 'obs', source=source)
    obs_range_view.add_tools(tooltips)

    # create a RangeTool where the x_range corresponds to the range_view
    range_tool = RangeTool(x_range=obs_range_view.x_range, start_gesture='pan')
    range_tool.overlay.fill_color = 'navy'
    range_tool.overlay.fill_alpha = 0.2

    # plot the entire series with the range selection tool
    obs_series_select = figure(height=130, width=800,
                               tools="", toolbar_location=None,
                               x_axis_type='datetime', x_axis_location='above',
                               y_range=obs_range_view.y_range, y_axis_type=None,
                               y_axis_label=col,
                               background_fill_color='#efefef',
                               title="Range selection")
    obs_series_select.line('date', 'obs', source=source)
    obs_series_select.ygrid.grid_line_color = None
    obs_series_select.add_tools(range_tool)

    # setup y-axis category styling for the valid data plots
    categories = ['Is Data', 'Is Null']  # we're testing for null so True is 1
    cmap = linear_cmap(field_name='is_valid', palette='Spectral6', low=0.5, high=1.5)

    # plot the valid data mask over the selected range; sharing the x_range
    # object of the observation close-up keeps the two views linked
    valid_range_view = figure(height=80, width=800,
                              tools='xpan',
                              x_axis_type='datetime',
                              x_range=obs_range_view.x_range,
                              y_range=categories)
    # dash markers rotated by ~pi/2 so each sample is drawn as a vertical tick
    valid_range_view.scatter(x='date', y='is_valid', source=source, marker='dash', angle=1.571, color=cmap)

    # plot the entire valid data mask
    valid_series_select = figure(height=80, width=800,
                                 y_range=categories,
                                 x_axis_type='datetime')
    valid_series_select.scatter(x='date', y='is_valid', source=source, marker='dash', angle=1.571, color=cmap)
    valid_series_select.add_tools(range_tool)

    # arrange the subplots in a column and show the plot
    show(column(obs_range_view, valid_range_view, obs_series_select, valid_series_select))

## Create the data selection UI

In [5]:
# data selection widgets: a file picker, a column picker and a load button
file_dropdown = widgets.Dropdown(options=parquet_files, value=None, disabled=False)
col_dropdown = widgets.Dropdown(options=[], value=None, disabled=False)
button = widgets.Button(
    description='Load data',
    tooltip='Load data',
    button_style='primary',
    disabled=False,
)

# every row of the form shares the same horizontal flex layout
form_item_layout = widgets.Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between',
)

# one labelled row per dropdown, followed by a row holding only the button
form_items = [
    widgets.Box([widgets.Label(value=caption), picker], layout=form_item_layout)
    for caption, picker in (('File:', file_dropdown), ('Column:', col_dropdown))
]
form_items.append(widgets.Box([button], layout=form_item_layout))

# stack the rows vertically inside a bordered box
form_layout = widgets.Layout(
    display='flex',
    flex_flow='column',
    border='solid 1px',
    align_items='stretch',
    width='50%',
    padding='1%',
)
form = widgets.Box(form_items, layout=form_layout)

# wire the widgets to their handlers: choosing a file refreshes the column
# list, clicking the button loads the data
file_dropdown.observe(dropdown_handler, names='value')
button.on_click(do_load)

## Display the form

In [6]:
# render the selection form: pick a file and a column, then click "Load data"
display(form)

Box(children=(Box(children=(Label(value='File:'), Dropdown(options=('21852517 2024-07-22 10_25_20 UTC (Data UT…

## Display the plot

In [7]:
# plot the selected column of the loaded file; the global `df` only exists
# once the "Load data" button has been clicked (do_load assigns it)
do_plot(df, file_dropdown.value, col_dropdown.value)

  dates = np.array(df.index, dtype=np.datetime64)


In [None]:
# capture the current file / column selection and echo it
file, col = file_dropdown.value, col_dropdown.value
print(file, col)