# Implement Kalman model using FastAI

> need to implement custom data preparation pipeline and loss function 

## Data Preparation

The aim of the data preparation pipeline is to:
- take the original time series and split it into time blocks
- for each block generate a random gap (need to figure out the properties of the gap)
- split some time blocks for testing

the input of the pipeline is:
- a dataframe containing all observations

the input of the model is:
- observed data (potentially containing NaN where data is missing)
- missing data mask (which is telling where the data is missing)
- the data needs to be standardized

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| hide
#| default_exp kalman.fastai

In [None]:
#| export
from meteo_imp.utils import *
from meteo_imp.gaussian import *

In [None]:
reset_seed()

In [None]:
import torch

In [None]:
from fastai.tabular.core import *
from fastai.data.core import *

In [None]:
#| export
from fastcore.transform import *
from fastcore.basics import *
from fastcore.foundation import *
from fastcore.all import *
from fastai.tabular import *
from fastai.torch_core import default_device, to_device

from meteo_imp.data import read_fluxnet_csv, hai_path

import collections

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
@cache_disk(cache_dir / "full_hai")
def load_data(dtype=np.float32):
    """Read the FLUXNET csv at `hai_path`, using `dtype` for the numeric columns.

    The result goes through `cache_disk`, presumably so that the csv is parsed
    only once -- TODO confirm how the cache treats different `dtype` values.
    """
    hai_df = read_fluxnet_csv(hai_path, None, num_dtype=dtype)
    return hai_df

hai = load_data()
hai64 = load_data(np.float64)

### 1) Blocks

The first step is to transform the original dataframe into blocks of a specified `block_len`.

two different strategies are possible:

- contiguous blocks
- random block in the dataframe

In [None]:
#| export
class BlockDfTransform(Transform):
    """Split a time series DataFrame into consecutive blocks of `block_len` rows.

    Encoding the integer `i` returns the `i`-th block, i.e. the rows from
    `i * block_len` (included) to `(i + 1) * block_len` (excluded).
    """
    def __init__(self, df, block_len=200):
        self.df, self.block_len = df, block_len
        self.n = len(df)  # number of rows, used to check that a block is complete

    def encodes(self, i:int) -> pd.DataFrame:
        first_row = self.block_len * i
        last_row = first_row + self.block_len
        # only complete blocks are allowed
        assert last_row <= self.n
        return self.df[first_row:last_row]

In [None]:
blk = BlockDfTransform(hai, 10)

In [None]:
blk

BlockDfTransform:
encodes: (int,object) -> encodes
decodes: 

In [None]:
blk(1)

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 05:30:00,-0.23,0.0,0.138
2000-01-01 06:00:00,-0.23,0.0,0.122
2000-01-01 06:30:00,-0.22,0.0,0.098
2000-01-01 07:00:00,-0.24,0.0,0.066
2000-01-01 07:30:00,-0.23,0.0,0.044
2000-01-01 08:00:00,-0.22,0.0,0.026
2000-01-01 08:30:00,-0.19,0.45,0.016
2000-01-01 09:00:00,-0.14,3.7,0.01
2000-01-01 09:30:00,-0.03,7.26,0.006
2000-01-01 10:00:00,0.04,12.24,0.006


In [None]:
180 * 24 * 2 / 10

864.0

Here we take a day in the summer, so the variables have higher values.

In [None]:
blk(800)

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 16:30:00,14.65,468.190002,6.454
2000-06-15 17:00:00,14.22,224.800003,5.799
2000-06-15 17:30:00,14.11,195.279999,6.577
2000-06-15 18:00:00,14.23,244.169998,6.931
2000-06-15 18:30:00,14.4,253.919998,7.286
2000-06-15 19:00:00,14.09,177.309998,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.709999,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164


In [None]:
tfms1 = TfmdLists([800,801,802,803], [BlockDfTransform(hai, 10)])

In [None]:
tfms1[0]

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 16:30:00,14.65,468.190002,6.454
2000-06-15 17:00:00,14.22,224.800003,5.799
2000-06-15 17:30:00,14.11,195.279999,6.577
2000-06-15 18:00:00,14.23,244.169998,6.931
2000-06-15 18:30:00,14.4,253.919998,7.286
2000-06-15 19:00:00,14.09,177.309998,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.709999,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164


### 2) Gaps

adds a mask which includes a random gap

In [None]:
#| export
class MaskedDf:
    """Pair of dataframes: `data` (first argument) and its `mask` (second argument).

    Iterating over the object yields `data` and then `mask`, so it can be
    unpacked like a 2-tuple.
    """
    def __init__(self,*args):
        self.data, self.mask = args[0], args[1]

    def __iter__(self):
        pair = (self.data, self.mask,)
        return iter(pair)

    __repr__ = basic_repr("data, mask")

    def _repr_html_(self):
        # rich notebook display: data and mask side by side
        frames = {'data': self.data, 'mask': self.mask}
        return row_dfs(frames, title="Masked Df")

In [None]:
#| exports
def _make_random_gap(
    gap_length: int, # The length of the gap
    total_length: int, # The total number of observations
    gap_start: int = None # Optional start of gap
): # (total_length) array of bools to indicate if the data is missing or not
    """Add a continuous gap of given length at a random position.

    The returned array is `True` where the data is missing. When `gap_length`
    is at least `total_length` the whole array is a gap.

    Raises `ValueError` if `gap_length` is negative or if an explicit
    `gap_start` does not leave room for the whole gap.
    """
    if gap_length < 0:
        raise ValueError(f"gap_length must be non-negative, got {gap_length}")
    if gap_length >= total_length:
        return np.repeat(True, total_length)
    last_start = total_length - gap_length  # latest start that still fits the gap
    if gap_start is None:
        # `randint` excludes its upper bound: the +1 allows the gap to end
        # exactly on the last observation
        gap_start = np.random.randint(last_start + 1)
    elif not 0 <= gap_start <= last_start:
        raise ValueError(
            f"gap_start must be between 0 and {last_start}, got {gap_start}")
    gap = np.zeros(total_length, dtype=bool)
    gap[gap_start:gap_start + gap_length] = True
    return gap

In [None]:
#| export
from fastcore.basics import *

In [None]:
#| export
class AddGapTransform(DisplayedTransform):
    """Pair a dataframe with a mask that has one random gap in `variables`.

    The mask has the same index and columns as the dataframe. It is `False`
    inside the gap (only for the columns listed in `variables`) and `True`
    everywhere else.
    """
    def __init__(self,
                variables,
                gap_length,
                ):
        store_attr()

    def encodes(self, df: pd.DataFrame):
        n_obs = df.shape[0]
        # (n_gap, 1) row positions: broadcast against the column positions below
        gap_rows = np.argwhere(_make_random_gap(self.gap_length, n_obs))
        is_present = np.ones_like(df, dtype=bool)
        gap_cols = L(*df.columns).argwhere(lambda col: col in self.variables)
        is_present[gap_rows, gap_cols] = False
        mask_df = pd.DataFrame(is_present, index=df.index, columns=df.columns)
        return MaskedDf(df, mask_df)

In [None]:
a_gap = AddGapTransform(['TA', 'VPD'], 5)
a_gap

AddGapTransform -- {'variables': ['TA', 'VPD'], 'gap_length': 5}:
encodes: (DataFrame,object) -> encodes
decodes: 

In [None]:
a_gap(blk(800))

TA,SW_IN,VPD
14.65,468.19,6.454
14.22,224.8,5.799
14.11,195.28,6.577
14.23,244.17,6.931
14.4,253.92,7.286
14.09,177.31,7.251
13.71,97.07,6.683
13.08,39.71,5.851
12.41,10.65,5.254
12.27,0.32,5.164

TA,SW_IN,VPD
True,True,True
True,True,True
True,True,True
False,True,False
False,True,False
False,True,False
False,True,False
False,True,False
True,True,True
True,True,True


In [None]:
m_df = a_gap(blk(800))

In [None]:
display_as_row({'data': m_df.data, 'mask': m_df.mask})

TA,SW_IN,VPD
14.65,468.19,6.454
14.22,224.8,5.799
14.11,195.28,6.577
14.23,244.17,6.931
14.4,253.92,7.286
14.09,177.31,7.251
13.71,97.07,6.683
13.08,39.71,5.851
12.41,10.65,5.254
12.27,0.32,5.164

TA,SW_IN,VPD
False,True,False
False,True,False
False,True,False
False,True,False
False,True,False
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True


In [None]:
tfms2 = TfmdLists([800,801,802,803], [BlockDfTransform(hai, 10), AddGapTransform(['TA','SW_IN'], 2)])

In [None]:
tfms2[0]

TA,SW_IN,VPD
14.65,468.19,6.454
14.22,224.8,5.799
14.11,195.28,6.577
14.23,244.17,6.931
14.4,253.92,7.286
14.09,177.31,7.251
13.71,97.07,6.683
13.08,39.71,5.851
12.41,10.65,5.254
12.27,0.32,5.164

TA,SW_IN,VPD
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
False,False,True
False,False,True
True,True,True


In [None]:
#| export
@patch
def tidy(self: MaskedDf):
    """Return data and mask as one long dataframe.

    The result has one row for each (`time`, `variable`) pair, with the
    observation in `value` and the mask in `is_present`.
    """
    id_col = "time"
    values_long = self.data.reset_index().melt(id_col)
    presence_long = self.mask.reset_index().melt(id_col, value_name="is_present")
    return values_long.merge(presence_long, on=[id_col, "variable"])

In [None]:
m_df.tidy()

Unnamed: 0,time,variable,value,is_present
0,2000-06-15 16:30:00,TA,14.65,False
1,2000-06-15 17:00:00,TA,14.22,False
2,2000-06-15 17:30:00,TA,14.11,False
3,2000-06-15 18:00:00,TA,14.23,False
4,2000-06-15 18:30:00,TA,14.4,False
5,2000-06-15 19:00:00,TA,14.09,True
6,2000-06-15 19:30:00,TA,13.71,True
7,2000-06-15 20:00:00,TA,13.08,True
8,2000-06-15 20:30:00,TA,12.41,True
9,2000-06-15 21:00:00,TA,12.27,True


#### Plotting

In [None]:
#| export
import altair as alt
from altair import datum

In [None]:
#| exporti
def def_selection():
    "Default selection of the plots: an interval selection bound to the scales"
    scales_selection = alt.selection_interval(bind="scales")
    return scales_selection

##### Rug

In [None]:
#| exporti
def plot_rug(df, sel = def_selection(), props = {}):
    """Rug plot with one tick for each `time`: white if `is_present`, black otherwise.

    `props` are passed to the chart properties, except `height`.
    """
    # rug should have default height
    rug_props = {key: val for key, val in props.items() if key != 'height'}
    tick_color = alt.condition(datum.is_present, alt.value('white'), alt.value('black'))
    ticks = alt.Chart(df).mark_tick(color='black').encode(x = "time", color = tick_color)
    return ticks.add_params(sel).properties(**rug_props)

In [None]:
plot_rug(m_df.tidy())

In [None]:
df = m_df.tidy()

In [None]:
df = df[df.variable=="TA"].copy()

In [None]:
df['row_number'] = df.reset_index().index

In [None]:
df

Unnamed: 0,time,variable,value,is_present,row_number
0,2000-06-15 16:30:00,TA,14.65,False,0
1,2000-06-15 17:00:00,TA,14.22,False,1
2,2000-06-15 17:30:00,TA,14.11,False,2
3,2000-06-15 18:00:00,TA,14.23,False,3
4,2000-06-15 18:30:00,TA,14.4,False,4
5,2000-06-15 19:00:00,TA,14.09,True,5
6,2000-06-15 19:30:00,TA,13.71,True,6
7,2000-06-15 20:00:00,TA,13.08,True,7
8,2000-06-15 20:30:00,TA,12.41,True,8
9,2000-06-15 21:00:00,TA,12.27,True,9


In [None]:
df.iloc[1]

time          2000-06-15 17:00:00
variable                       TA
value                       14.22
is_present                  False
row_number                      1
Name: 1, dtype: object

In [None]:
df.loc[2, "is_present"] = True

In [None]:
df

Unnamed: 0,time,variable,value,is_present,row_number
0,2000-06-15 16:30:00,TA,14.65,False,0
1,2000-06-15 17:00:00,TA,14.22,False,1
2,2000-06-15 17:30:00,TA,14.11,True,2
3,2000-06-15 18:00:00,TA,14.23,False,3
4,2000-06-15 18:30:00,TA,14.4,False,4
5,2000-06-15 19:00:00,TA,14.09,True,5
6,2000-06-15 19:30:00,TA,13.71,True,6
7,2000-06-15 20:00:00,TA,13.08,True,7
8,2000-06-15 20:30:00,TA,12.41,True,8
9,2000-06-15 21:00:00,TA,12.27,True,9


In [None]:
i = 1
prev, curr, next = df.iloc[i-1], df.iloc[i], df.iloc[i+1]

In [None]:
prev, curr, next

(time          2000-06-15 16:30:00
 variable                       TA
 value                       14.65
 is_present                  False
 row_number                      0
 Name: 0, dtype: object,
 time          2000-06-15 17:00:00
 variable                       TA
 value                       14.22
 is_present                  False
 row_number                      1
 Name: 1, dtype: object,
 time          2000-06-15 17:30:00
 variable                       TA
 value                       14.11
 is_present                   True
 row_number                      2
 Name: 2, dtype: object)

In [None]:
df

Unnamed: 0,time,variable,value,is_present,row_number
0,2000-06-15 16:30:00,TA,14.65,False,0
1,2000-06-15 17:00:00,TA,14.22,False,1
2,2000-06-15 17:30:00,TA,14.11,True,2
3,2000-06-15 18:00:00,TA,14.23,False,3
4,2000-06-15 18:30:00,TA,14.4,False,4
5,2000-06-15 19:00:00,TA,14.09,True,5
6,2000-06-15 19:30:00,TA,13.71,True,6
7,2000-06-15 20:00:00,TA,13.08,True,7
8,2000-06-15 20:30:00,TA,12.41,True,8
9,2000-06-15 21:00:00,TA,12.27,True,9


In [None]:
# Scan the rows in order and print the time of the first and of the last
# missing observation of every gap.
for i in range(len(df)):
    # handle boundaries
    # the rows outside the dataframe are considered present, so a gap touching
    # the first/last row is still reported with a start/end
    prev = df.iloc[i-1].is_present if i>0 else True 
    next = df.iloc[i+1].is_present if i<(len(df)-1) else True 
    curr = df.iloc[i]
    # a gap starts at a missing row that follows a present one
    if not curr.is_present and prev:
        print("gap start", curr.time)
    # a gap ends at a missing row that is followed by a present one
    if not curr.is_present and next:
        print("gap end", curr.time)

gap start 2000-06-15 16:30:00
gap end 2000-06-15 17:00:00
gap start 2000-06-15 18:00:00
gap end 2000-06-15 18:30:00


In [None]:
#| export
def find_gap_limits(df):
    """Find the first and the last `time` of every gap in `df`.

    A gap is a run of consecutive rows (in positional order) where
    `is_present` is false. Rows outside the dataframe count as present, so a
    gap that touches the first or the last row is closed at the border.

    Returns a dataframe with one row for each gap and the columns `gap_start`
    and `gap_end`. It is empty when there is no gap.
    """
    present = df["is_present"].to_numpy(dtype=bool)
    # pad with `True` on both sides to get the previous/next neighbour of each
    # row without special-casing the borders
    padded = np.concatenate(([True], present, [True]))
    prev_present, next_present = padded[:-2], padded[2:]
    missing = ~present
    times = df["time"]
    # every run of missing rows has exactly one start and one end, so the two
    # columns always have the same length
    gap_starts = times[missing & prev_present].reset_index(drop=True)
    gap_ends = times[missing & next_present].reset_index(drop=True)
    return pd.DataFrame({'gap_start': gap_starts, 'gap_end': gap_ends})
    

In [None]:
find_gap_limits(df)

Unnamed: 0,gap_start,gap_end
0,2000-06-15 16:30:00,2000-06-15 17:00:00
1,2000-06-15 18:00:00,2000-06-15 18:30:00


In [None]:
#| export
def plot_missing_area(df, sel = def_selection(), props={}):
    """Highlight the gaps of `df`: a vertical rule at each gap limit and a shaded area in between.

    `sel` and `props` are accepted but currently not applied to the chart.
    """
    def _hidden_axis():
        # x axis without domain line, labels, ticks and title
        return alt.Axis(domain=False, labels = False, ticks=False, title=None)

    limits = find_gap_limits(df)
    start_rule = alt.Chart(limits).mark_rule().encode(
        x = alt.X('gap_start', axis=_hidden_axis()),
    )
    end_rule = alt.Chart(limits).mark_rule().encode(
        x = alt.X('gap_end', axis=_hidden_axis())
    )
    shaded = alt.Chart(limits).mark_rect(color='black', opacity=.2).encode(
        x = alt.X('gap_start', axis=_hidden_axis()),
        x2 = 'gap_end'
    )
    layers = start_rule + end_rule + shaded
    return layers  # the selection and the properties are not added here

In [None]:
plot_missing_area(df)

##### Points

In [None]:
#| export
def plot_points(df, y = "value", y_label = "", sel = def_selection(), props = {}):
    """Scatter plot of column `y` against `time`, where fill and shape depend on `is_present`.

    `sel` and `props` are accepted but not applied to the chart.
    """
    x_enc = alt.X("time", axis=alt.Axis(domain=True, labels = True, ticks=True, title="time"))
    y_enc = alt.Y(y, title = y_label, scale=alt.Scale(zero=False))
    # second colour of the range is fully transparent
    fill_enc = alt.Fill("is_present", scale = alt.Scale(range=["black", "#ffffff00"]),
                        legend = alt.Legend(title =["Observed data"]))
    markers = alt.Chart(df).mark_point(color='black', strokeWidth = 1, fillOpacity = 1)
    return markers.encode(x = x_enc, y = y_enc, fill = fill_enc, shape = "is_present")

In [None]:
plot_points(m_df.tidy())

##### Line

In [None]:
#| exporti
def plot_line(df, only_present=True, y="value", y_label = "", sel = def_selection(), props = {}):
    """Line plot of column `y` against `time`, with one colour for each `variable`.

    `only_present` is currently ignored: all the rows are plotted.
    """
    # TODO remove only_present (the filter on `is_present` is disabled)
    y_enc = alt.Y(y, title = y_label, scale=alt.Scale(zero=False))
    line = alt.Chart(df).mark_line().encode(x = "time", y = y_enc, color='variable')
    return line.add_params(sel).properties(**props)

    

In [None]:
plot_line(m_df.tidy())

##### Errorband

In [None]:
#| exporti
def plot_error(df, y = "value", y_label = "", sel = def_selection(), props = {}):
    """Error band of `y` +/- 2 `std` against `time`, with one colour for each `variable`.

    `df` needs the columns `time`, `variable`, `std` and the column named by `y`.
    The band limits are computed on a copy, so the dataframe of the caller is
    left untouched.
    """
    half_width = 2 * df['std']
    # `assign` returns a new dataframe instead of adding columns to the input
    band_df = df.assign(err_low = df[y] - half_width, err_high = df[y] + half_width)
    return alt.Chart(band_df).mark_errorband().encode(
        x = "time",
        y = alt.Y("err_low:Q", title = y_label, scale=alt.Scale(zero=False)),
        y2 = "err_high:Q",
        color=alt.Color("variable",
                        legend = alt.Legend(title=["Line: pred. mean", "area: +/- 2 std"])
                       )
    ).add_params(
        sel
    ).properties(
        **props
    )
    

In [None]:
plot_error(m_df.tidy().assign(std=5))

##### Variable

In [None]:
#| exporti
def plot_variable(df, variable, ys=["value", "value"], title="", y_label="", sel = None, error=False, props = {}):
    """Plot one `variable` of a tidy dataframe: points, gap areas, line and optional error band.

    `ys[0]` is the column used for the points and `ys[1]` the one used for the
    line (and for the error band when `error` is true).
    """
    var_df = df[df.variable == variable].copy()
    sel = ifnone(sel, def_selection())
    y_points, y_line = ys[0], ys[1]
    # the rug plot is disabled
    points = plot_points(var_df, y_points, y_label, sel, props)
    has_gap = not var_df.is_present.all()
    if has_gap:
        points += plot_missing_area(var_df, sel, props)
    line = plot_line(var_df, True, y_line, y_label, sel, props)
    if error:
        band = plot_error(var_df, y=y_line, y_label=y_label, sel=sel, props=props)
        line = band + line
    combined = points + line
    return combined.properties(title=title)
    
    # return alt.VConcatChart(vconcat=[(points + line), rug], spacing=-10).properties(title=title)

In [None]:
plot_variable(m_df.tidy(), "TA", title="title TA")

In [None]:
plot_variable(m_df.tidy().assign(std=.5), "TA", title="title TA", error=True)

##### Facet

In [None]:
#| export
def facet_variable(df, # tidy dataframe
                   n_cols: int = 3,
                   bind_interaction: bool =True, # Whether the sub-plots for each variable should be connected for zooming/panning
                   error:bool=False, # plot error bar
                   ys:list=["value", "value"],
                   props:dict|None = None, # additional properties for altair plot (eg. size)
                   ) -> alt.Chart:
    """Plot each value of the column `variable` in its own subplot, `n_cols` subplots per row"""
    props = ifnone(props, {'width': 200, 'height': 150})
    variables = df.variable.unique()
    # one (initially empty) horizontal concatenation for each row of the grid
    n_rows = len(range(0, len(variables), n_cols))
    rows = [alt.hconcat() for _ in range(n_rows)]
    # a single selection shared by all the subplots links their x axis
    shared_sel = None
    if bind_interaction:
        shared_sel = alt.selection_interval(bind="scales", encodings=['x'])
    for pos, variable in enumerate(variables):
        subplot = plot_variable(df,
                                variable,
                                ys = ys,
                                title = variable,
                                y_label = variable,
                                sel = shared_sel,
                                props=props,
                                error=error)
        rows[pos // n_cols] |= subplot

    return alt.vconcat(*rows)

##### Show

In [None]:
#| export
@patch
def show(self: MaskedDf, ax=None, ctx=None, 
        n_cols: int = 3,
        bind_interaction: bool =True, # Whether the sub-plots for each variable should be connected for zooming/panning
        props:dict = None # additional properties (eg. size) for altair plot
       ) -> alt.Chart:
    """Plot every variable of the masked dataframe in its own subplot.

    `ax` and `ctx` are accepted but not used.
    """
    df = self.tidy()
    # keyword arguments are required here: the 4th positional parameter of
    # `facet_variable` is `error`, not `props`
    return facet_variable(df, n_cols=n_cols, bind_interaction=bind_interaction, props=props)

In [None]:
m_df.show(bind_interaction = False)

In [None]:
a_gap(blk(799)).show()

In [None]:
idx = L(*blk(1).columns).argwhere(lambda x: x in ['TA','SW_IN'])

In [None]:
mask = np.ones_like(blk(1), dtype=bool)

In [None]:
mask

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [None]:
gap = _make_random_gap(2, 10, 2)

In [None]:
gap

array([False, False,  True,  True, False, False, False, False, False,
       False])

In [None]:
np.argwhere(gap)

array([[2],
       [3]])

In [None]:
mask[np.argwhere(gap), idx] = False

In [None]:
mask

array([[ True,  True,  True],
       [ True,  True,  True],
       [False, False,  True],
       [False, False,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [None]:
mask[gap]

array([[False, False,  True],
       [False, False,  True]])

### 3) To Tensor

The constructor needs to handle two cases: being called with two separate arguments (`data`, `mask`), and being called with a single argument that is itself a sequence holding the two items.

In [None]:
#| export
class MaskedTensor(collections.abc.Sequence):
    """Pair of `data` and `mask` that behaves like a sequence of length 2.

    It can be built either from two arguments (`data`, `mask`) or from a single
    argument of length 2 that holds them. Any other call raises `ValueError`.
    """
    def __init__(self,*args):
        if len(args)==2:
            self.data = args[0]
            self.mask = args[1]
        elif len(args)==1 and len(args[0])==2:
            self.data = args[0][0]
            self.mask = args[0][1]
        else:
            raise ValueError(f"Incorrect number of arguments. got {len(args)} args")

    def __iter__(self): return iter((self.data, self.mask,))
    # this must be a method: with a plain int attribute `len(obj)` raises
    # TypeError, which also breaks the single-argument branch of `__init__`
    # when the argument is another MaskedTensor
    def __len__(self): return 2
    def __getitem__(self, key):
        # only the positions 0 (data) and 1 (mask) exist
        if key == 0: return self.data
        elif key == 1: return self.mask
        else: raise IndexError("index bigger than 2")
    __repr__ = basic_repr('data, mask')

In [None]:
#| export
class MaskedDf2Tensor(Transform):
    """Convert a `MaskedDf` into a `MaskedTensor` and back.

    The column names are read from the first item during setup and are used to
    label the dataframes when decoding. The decoded dataframes get a default
    index.
    """
    def setups(self, items):
        first_item = items[0]
        self.columns = list(first_item.data.columns)

    def encodes(self, df: MaskedDf) -> MaskedTensor:
        data_t = torch.tensor(df.data.to_numpy())
        mask_t = torch.tensor(df.mask.to_numpy())
        return MaskedTensor(data_t, mask_t)

    def decodes(self, x: MaskedTensor) -> MaskedDf:
        data_np = x.data.detach().cpu().numpy()
        mask_np = x.mask.cpu().numpy()
        data_df = pd.DataFrame(data_np, columns = self.columns)
        mask_df = pd.DataFrame(mask_np, columns = self.columns)
        return MaskedDf(data_df, mask_df)

In [None]:
to_t = MaskedDf2Tensor()

In [None]:
to_t.setup(tfms2)

In [None]:
to_t(tfms2[0])

__main__.MaskedTensor(data=tensor([[1.4650e+01, 4.6819e+02, 6.4540e+00],
        [1.4220e+01, 2.2480e+02, 5.7990e+00],
        [1.4110e+01, 1.9528e+02, 6.5770e+00],
        [1.4230e+01, 2.4417e+02, 6.9310e+00],
        [1.4400e+01, 2.5392e+02, 7.2860e+00],
        [1.4090e+01, 1.7731e+02, 7.2510e+00],
        [1.3710e+01, 9.7070e+01, 6.6830e+00],
        [1.3080e+01, 3.9710e+01, 5.8510e+00],
        [1.2410e+01, 1.0650e+01, 5.2540e+00],
        [1.2270e+01, 3.2000e-01, 5.1640e+00]]), mask=tensor([[ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [False, False,  True],
        [False, False,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True]]))

In [None]:
to_t.decode(to_t(tfms2[0]));

In [None]:
tfms2[0]

TA,SW_IN,VPD
14.65,468.19,6.454
14.22,224.8,5.799
14.11,195.28,6.577
14.23,244.17,6.931
14.4,253.92,7.286
14.09,177.31,7.251
13.71,97.07,6.683
13.08,39.71,5.851
12.41,10.65,5.254
12.27,0.32,5.164

TA,SW_IN,VPD
True,True,True
False,False,True
False,False,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True


In [None]:
type(MaskedDf2Tensor())

__main__.MaskedDf2Tensor

In [None]:
tfms3 = TfmdLists([800, 801, 802], [BlockDfTransform(hai, 10), AddGapTransform(['TA','SW_IN'], 2), MaskedDf2Tensor()])

In [None]:
tfms3[0]

__main__.MaskedTensor(data=tensor([[1.4650e+01, 4.6819e+02, 6.4540e+00],
        [1.4220e+01, 2.2480e+02, 5.7990e+00],
        [1.4110e+01, 1.9528e+02, 6.5770e+00],
        [1.4230e+01, 2.4417e+02, 6.9310e+00],
        [1.4400e+01, 2.5392e+02, 7.2860e+00],
        [1.4090e+01, 1.7731e+02, 7.2510e+00],
        [1.3710e+01, 9.7070e+01, 6.6830e+00],
        [1.3080e+01, 3.9710e+01, 5.8510e+00],
        [1.2410e+01, 1.0650e+01, 5.2540e+00],
        [1.2270e+01, 3.2000e-01, 5.1640e+00]]), mask=tensor([[ True,  True,  True],
        [ True,  True,  True],
        [False, False,  True],
        [False, False,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True]]))

In [None]:
type(tfms3[0])

__main__.MaskedTensor

In [None]:
tfms3.decode(tfms3[0])

TA,SW_IN,VPD
14.65,468.19,6.454
14.22,224.8,5.799
14.11,195.28,6.577
14.23,244.17,6.931
14.4,253.92,7.286
14.09,177.31,7.251
13.71,97.07,6.683
13.08,39.71,5.851
12.41,10.65,5.254
12.27,0.32,5.164

TA,SW_IN,VPD
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
False,False,True
False,False,True
True,True,True


### 4) Normalize

In [None]:
#| export
from meteo_imp.utils import *
from fastai.torch_core import to_cpu

from torch import Tensor

In [None]:
collections.namedtuple

<function collections.namedtuple(typename, field_names, *, rename=False, defaults=None, module=None)>

In [None]:
nt = collections.namedtuple("nt", "a")

In [None]:
isinstance(nt(1), tuple)

True

In [None]:
#| export
class NormalsParams(list):
    """Pair of `mean` and `std` that behaves like a sequence of length 2.

    Accepted constructor calls:
    - two arguments: `mean`, `std`
    - one `Generator`, whose first two items are `mean` and `std`
    - one argument of length 2 holding `mean` and `std`
    Any other call with at least one argument raises `ValueError`.

    The underlying `list` storage is never filled: the values live in the
    attributes `mean` and `std`, and the sequence methods are overridden to
    read them.
    """
    def __init__(self,*args):
        if len(args)==2:
            self.mean = args[0]
            self.std = args[1]
        # NOTE(review): with no arguments `args[0]` raises IndexError before the ValueError below
        elif isinstance(args[0], Generator):
            # a generator has no length, so it is consumed into a list first
            args = list(args[0])
            self.mean = args[0]
            self.std = args[1]
        elif len(args)==1 and len(args[0])==2:
            self.mean = tuple(args[0])[0]
            self.std = tuple(args[0])[1]                     
        else:
            raise ValueError(f"Incorrect number of arguments. got {len(args)} args")
    def __iter__(self): return iter((self.mean, self.std,))
    # a new iterator is created at every call, so this always returns `mean`
    def __next__(self): return next(self.__iter__())
    def __len__(self): return 2
    def __getitem__(self, key):
        # only the positions 0 (mean) and 1 (std) exist
        if key == 0: return self.mean
        elif key == 1: return self.std
        else: raise IndexError("index bigger than 2")
    __repr__ = basic_repr('mean, std')

In [None]:
NormalsParams(0,1)

__main__.NormalsParams(mean=0, std=1)

In [None]:
#| export
def get_stats(df, device='cpu'):
    """Return the column-wise mean and standard deviation of `df` as two tensors on `device`"""
    col_mean = df.mean(axis=0).to_numpy()
    col_std = df.std(axis=0).to_numpy()
    mean_t = torch.tensor(col_mean, device=device)
    std_t = torch.tensor(col_std, device=device)
    return mean_t, std_t

In [None]:
#| export
class NormalizeMasked(Transform):
    """Normalize/denorm MaskedTensor column-wise

    Encoding computes `(data - mean) / std` and keeps the mask unchanged.
    Decoding reverts it, both for a `MaskedTensor` and for a `NormalsParams`
    (where the standard deviation is only rescaled by `std`).
    """
    @property
    def name(self): return f"{super().name} -- {getattr(self,'__stored_args__',{})}"

    def __init__(self, mean=None, std=None): store_attr()

    def encodes(self, x:MaskedTensor)-> MaskedTensor:
        return MaskedTensor((x.data -self.mean) / self.std, x.mask)

    # the two `decodes` below have different type annotations: both are
    # registered (type dispatch), the second does not replace the first
    def decodes(self, x:MaskedTensor)->MaskedTensor:
        # move the statistics to the device of the data, as done for
        # NormalsParams, so that decoding works wherever the batch lives
        f = partial(to_device, device=(x[0].device))
        return MaskedTensor(x[0] * f(self.std) + f(self.mean), x[1])
    
    def decodes(self, x:NormalsParams):
        f = partial(to_device, device=(x[0].device))
        mean = x.mean * f(self.std) + f(self.mean)
        # a standard deviation is only rescaled, not shifted
        std = x.std * f(self.std)
        
        return NormalsParams(mean, std)

In [None]:
norm = NormalizeMasked(*get_stats(hai))

In [None]:
tfms3[0]

__main__.MaskedTensor(data=tensor([[1.4650e+01, 4.6819e+02, 6.4540e+00],
        [1.4220e+01, 2.2480e+02, 5.7990e+00],
        [1.4110e+01, 1.9528e+02, 6.5770e+00],
        [1.4230e+01, 2.4417e+02, 6.9310e+00],
        [1.4400e+01, 2.5392e+02, 7.2860e+00],
        [1.4090e+01, 1.7731e+02, 7.2510e+00],
        [1.3710e+01, 9.7070e+01, 6.6830e+00],
        [1.3080e+01, 3.9710e+01, 5.8510e+00],
        [1.2410e+01, 1.0650e+01, 5.2540e+00],
        [1.2270e+01, 3.2000e-01, 5.1640e+00]]), mask=tensor([[ True,  True,  True],
        [ True,  True,  True],
        [False, False,  True],
        [False, False,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True]]))

In [None]:
test_close(norm.decode(norm(tfms3[0]))[0], tfms3[0][0], eps=2e-5)

Test that NormalsParams decode actually works

In [None]:
Npars = NormalsParams(torch.tensor(1), torch.tensor(.1))

In [None]:
norm.decode(Npars)

__main__.NormalsParams(mean=tensor([ 16.2585, 324.9604,   7.7491]), std=tensor([ 0.7925, 20.4003,  0.4368]))

In [None]:
tfms4 = TfmdLists([800,801,803], [BlockDfTransform(hai, 10), 
                           AddGapTransform(['TA','SW_IN'], 2),
                           MaskedDf2Tensor(),
                           NormalizeMasked(*get_stats(hai,device='cpu'), ) ])

In [None]:
tfms4[0]

__main__.MaskedTensor(data=tensor([[ 0.7970,  1.7021,  0.7035],
        [ 0.7428,  0.5090,  0.5536],
        [ 0.7289,  0.3643,  0.7317],
        [ 0.7440,  0.6040,  0.8127],
        [ 0.7655,  0.6518,  0.8940],
        [ 0.7264,  0.2762,  0.8860],
        [ 0.6784, -0.1171,  0.7560],
        [ 0.5989, -0.3983,  0.5655],
        [ 0.5144, -0.5407,  0.4288],
        [ 0.4967, -0.5914,  0.4082]]), mask=tensor([[ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [False, False,  True],
        [False, False,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True],
        [ True,  True,  True]]))

In [None]:
tfms4.decode(tfms4[0])

TA,SW_IN,VPD
14.65,468.19,6.454
14.22,224.8,5.799
14.11,195.28,6.577
14.23,244.17,6.931
14.4,253.92,7.286
14.09,177.31,7.251
13.71,97.07,6.683
13.08,39.71,5.851
12.41,10.65,5.254
12.27,0.32,5.164

TA,SW_IN,VPD
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
True,True,True
False,False,True
False,False,True
True,True,True
True,True,True


It is working: decoding the normalized block gives back the original values.

### 5) To Tuple

Fastai likes to work with tuples ... for now convert to a tuple. Should add a decode step and maybe find a way to mimic a tuple in MaskedTensor

In [None]:
#| export
def to_tuple(x):
    "Collect the items of the iterable `x` into a plain tuple"
    return (*x,)

### Pipeline

In [None]:
#| export
from fastai.data.transforms import *

In [None]:
block_len = 10
block_ids = list(range(0, (len(hai) // block_len) - 1))[:10]
gap_len = 2

In [None]:
#| export
def imp_pipeline(df,
                 block_len,
                 gap_len,
                 variables=('TA','SW_IN') # variables where the gap is added
                ):
    """Build the list of transforms for imputation and the ids of the blocks.

    The transforms, in order: take a block of `block_len` rows from `df`, add a
    random gap of `gap_len` observations to `variables`, convert to tensors,
    normalize with the column statistics of `df` and convert to a tuple.

    Returns `(transforms, block_ids)`.
    """
    # NOTE(review): the `- 1` leaves out the last complete block, which
    # `BlockDfTransform` would accept -- confirm whether this is intended
    block_ids = list(range(0, (len(df) // block_len) - 1))
    return [BlockDfTransform(df, block_len),
            AddGapTransform(list(variables), gap_len),
            MaskedDf2Tensor,  # passed as a class, not as an instance
            NormalizeMasked(*get_stats(df)),
            to_tuple
           ], block_ids

In [None]:
pipeline, block_ids = imp_pipeline(hai, block_len, gap_len)

In [None]:
pipeline

[BlockDfTransform:
 encodes: (int,object) -> encodes
 decodes: ,
 AddGapTransform -- {'variables': ['TA', 'SW_IN'], 'gap_length': 2}:
 encodes: (DataFrame,object) -> encodes
 decodes: ,
 __main__.MaskedDf2Tensor,
 NormalizeMasked -- {'mean': tensor([  8.3339, 120.9578,   3.3807]), 'std': tensor([  7.9246, 204.0026,   4.3684])}:
 encodes: (MaskedTensor,object) -> encodes
 decodes: (MaskedTensor,object) -> decodes
 (NormalsParams,object) -> decodes,
 <function __main__.to_tuple(x)>]

In [None]:
pp = Pipeline(pipeline)

In [None]:
pp

Pipeline: BlockDfTransform -> AddGapTransform -- {'variables': ['TA', 'SW_IN'], 'gap_length': 2} -> MaskedDf2Tensor -> NormalizeMasked -- {'mean': tensor([  8.3339, 120.9578,   3.3807]), 'std': tensor([  7.9246, 204.0026,   4.3684])} -> to_tuple

### Dataloader

random splitter for validation/training set

In [None]:
reset_seed()

In [None]:
splits = RandomSplitter()(block_ids)

The pipeline is repeated twice, since the same pipeline is used both for the training data and for the labels.

In [None]:
import collections

In [None]:
def to_tuple(x):
    "Collect the items of the iterable `x` into a plain tuple"
    items = tuple(iter(x))
    return items

In [None]:
isinstance(tfms4[0], Sequence)

True

In [None]:
ds = Datasets(block_ids, [pipeline, pipeline], splits=splits)

In [None]:
dls = ds.dataloaders(bs=1)

In [None]:
dls.device

device(type='cuda', index=0)

In [None]:
dls.one_batch()

((tensor([[[ 1.1857,  2.3551,  2.1478],
           [ 1.2122,  2.2766,  2.1885],
           [ 1.2172,  1.8685,  2.2476],
           [ 1.2235,  1.4807,  2.2382],
           [ 1.2008,  0.9235,  2.1695],
           [ 1.1769,  0.5868,  2.1063],
           [ 1.1314,  0.2027,  2.0113],
           [ 1.0544, -0.2726,  1.7872],
           [ 0.9472, -0.5001,  1.5363],
           [ 0.8525, -0.5864,  1.3749]]], device='cuda:0'),
  tensor([[[False, False,  True],
           [False, False,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True]]], device='cuda:0')),
 (tensor([[[ 1.1857,  2.3551,  2.1478],
           [ 1.2122,  2.2766,  2.1885],
           [ 1.2172,  1.8685,  2.2476],
           [ 1.2235,  1.4807,  2.2382],
           [ 1.2008,  0.9235,  2.1695],
           [ 1.1769

In [None]:
@typedispatch
def show_batch(x: MaskedDf, y, samples, ctxs=None, max_n=6):
    # Placeholder registered for `MaskedDf` inputs: nothing is plotted,
    # `x` is returned unchanged and the other arguments are ignored
    return x

In [None]:
# dls.show_batch()

In [None]:
dls._types

{tuple: [{tuple: [torch.Tensor, torch.Tensor]},
  {tuple: [torch.Tensor, torch.Tensor]}]}

In [None]:
Datasets

fastai.data.core.Datasets

In [None]:
#| export
def make_dataloader(df, block_len, gap_len, bs=10):
    """Create the dataloaders of batch size `bs` for the blocks of `df`.

    The block ids are split with `RandomSplitter` and the same list of
    transforms is used both for the inputs and for the targets.
    """
    tfms, ids = imp_pipeline(df, block_len, gap_len)
    split_idxs = RandomSplitter()(ids)
    datasets = Datasets(ids, [tfms, tfms], splits=split_idxs)
    return datasets.dataloaders(bs=bs)
    

In [None]:
dls = make_dataloader(hai, 200, 10)

In [None]:
dls.one_batch()[0][0].shape

torch.Size([10, 200, 3])

In [None]:
dls = dls.cpu()

## Model

### Forward Function

In order to use the Kalman filter as a PyTorch module, we need to add a `forward` method to it.

In [None]:
#| export
from meteo_imp.kalman.filter import *
from torch.distributions import MultivariateNormal

In [None]:
#| export
@patch
def _predict_filter(self: KalmanFilter, data, mask):
    """Predict every observation using only the filter step"""
    obs, mask = self._parse_obs(data, mask)
    # keep the *predicted* state (first two outputs), not the filtered one
    state_mean, state_cov, _, _ = self._filter_all(obs, mask)
    state = ListMNormal(state_mean.squeeze(-1), state_cov)
    obs_mean, obs_cov = self._obs_from_state(state)
    # return standard deviations instead of the covariances
    return ListNormal(obs_mean, cov2std(obs_cov))

In [None]:
model = KalmanFilter.init_simple(n_dim = hai.shape[-1])

In [None]:
model._predict_filter(*dls.one_batch()[0]);

In [None]:
#| export
@patch
def forward(self: KalmanFilter, masked_data: MaskedTensor):
    """Predict mean and std for `masked_data`, a `(data, mask)` pair.

    The smoother is used unless a `use_smooth` attribute set on the model is
    falsy, in which case only the filter predictions are returned.
    """
    data, mask = masked_data
    assert not data.isnan().any()
    # models without a `use_smooth` attribute default to smoothing
    if getattr(self, 'use_smooth', True):
        mean, std = self.predict(obs=data, mask=mask, smooth=True)
    else:
        mean, std = self._predict_filter(data, mask)
    # fastai needs the output to be a tuple subclass
    return NormalsParams(mean, std)

In [None]:
input = dls.one_batch()[0]
target = dls.one_batch()[1]

In [None]:
model.state_dict()

OrderedDict([('trans_matrix',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('trans_off', tensor([0., 0., 0.])),
             ('trans_cov_raw',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('obs_matrix',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('obs_off', tensor([0., 0., 0.])),
             ('obs_cov_raw',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('init_state_mean', tensor([0., 0., 0.])),
             ('init_state_cov_raw',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]]))])

In [None]:
data = input[0][0]
data.shape

torch.Size([200, 3])

In [None]:
mask = input[1][0]

In [None]:
mask.shape

torch.Size([200, 3])

In [None]:
data.device

device(type='cpu')

In [None]:
torch.device

torch.device

In [None]:
data.shape, mask.shape

(torch.Size([200, 3]), torch.Size([200, 3]))

In [None]:
model.predict(data.unsqueeze(0), mask.unsqueeze(0));

In [None]:
model.use_smooth = True

In [None]:
pred = model(input)

In [None]:
pred[0].shape

torch.Size([10, 200, 3])

In [None]:
pred[1].shape

torch.Size([10, 200, 3])

In [None]:
model.use_smooth = False

In [None]:
pred_filt = model(input)

In [None]:
pred_filt[1].shape

torch.Size([10, 200, 3])

In [None]:
pred

__main__.NormalsParams(mean=tensor([[[-0.2289, -0.3637, -0.3121],
         [-0.3050, -0.4982, -0.4284],
         [-0.3160, -0.5379, -0.4667],
         ...,
         [-0.3424, -0.5090, -0.6207],
         [-0.3303, -0.5105, -0.6043],
         [-0.3213, -0.4977, -0.5894]],

        [[-0.4270, -0.3664, -0.4671],
         [-0.5904, -0.5064, -0.6465],
         [-0.6533, -0.5599, -0.7156],
         ...,
         [-0.3523, -0.5929, -0.7021],
         [-0.3310, -0.5929, -0.6569],
         [-0.3109, -0.5929, -0.6206]],

        [[ 0.1127,  0.4905, -0.3426],
         [ 0.1886,  0.5478, -0.4317],
         [ 0.2603,  0.8844, -0.4022],
         ...,
         [ 1.1013,  0.0036,  0.0374],
         [ 1.1363,  0.0922,  0.0757],
         [ 1.1496,  0.0142,  0.0759]],

        ...,

        [[-0.3895, -0.3659, -0.4285],
         [-0.5319, -0.5048, -0.5966],
         [-0.5825, -0.5555, -0.6701],
         ...,
         [-0.1904,  0.2193, -0.6186],
         [-0.0656,  0.4846, -0.5678],
         [ 0.0168,  0.

In [None]:
type(pred), type(pred_filt)

(__main__.NormalsParams, __main__.NormalsParams)

In [None]:
test_ne(pred, pred_filt)

### Loss Function

Add support for the complete loss (also outside the gap) and for the filter loss (without running the smoother).

There are two ways to compute the loss: one is to compute it for all predictions, the other is to compute it only for the gap
- only_gap

Play around with flattening + diagonal

In [None]:
a = torch.diag(torch.tensor([1,2,3]))
d = torch.stack([a, a*10])
m = torch.stack([a.diag(), a.diag()*10])
d

tensor([[[ 1,  0,  0],
         [ 0,  2,  0],
         [ 0,  0,  3]],

        [[10,  0,  0],
         [ 0, 20,  0],
         [ 0,  0, 30]]])

In [None]:
m.flatten()

tensor([ 1,  2,  3, 10, 20, 30])

In [None]:
d

tensor([[[ 1,  0,  0],
         [ 0,  2,  0],
         [ 0,  0,  3]],

        [[10,  0,  0],
         [ 0, 20,  0],
         [ 0,  0, 30]]])

In [None]:
torch.diagonal(d, dim1=1, dim2=2).flatten()

tensor([ 1,  2,  3, 10, 20, 30])

In [None]:
means, stds = pred
data, mask = target

In [None]:
# make a big matrix with all variables and observations and compute ll
mask = mask.flatten() 
obs = data.flatten()[mask]
# NOTE(review): `means` is overwritten with the target data here, so the predicted means unpacked above are not used and `obs == means`
means = data.flatten()[mask]
stds = stds.flatten()[mask] # need to support batches

# NOTE(review): the second positional argument of MultivariateNormal is the covariance matrix, so the standard deviations are used as variances here
MultivariateNormal(means, torch.diag(stds)).log_prob(obs)

tensor(-5930.4453, grad_fn=<SubBackward0>)

In [None]:
#| export
class KalmanLoss():
    """Negative log-likelihood of the target under independent Normal predictions.

    `pred` is a `(means, stds)` pair and `target` a `(data, mask)` pair, all
    with a leading batch dimension. `mask` is True where the value is observed
    and False inside the gap. One loss is computed per batch item and then
    reduced according to `reduction`.
    """
    def __init__(self,
                 only_gap:bool=True, # loss for all predictions or only gap
                 reduction:str='sum' # one of ['sum', 'mean', 'none']
                ):
        store_attr()
    
    def __call__(self, pred: NormalsParams, target: MaskedTensor):
        """Return the reduced loss (or one loss per batch item if `reduction='none'`).

        Raises `ValueError` for an unknown `reduction`.
        """
        if self.reduction not in ('sum', 'mean', 'none'):
            # fail loudly instead of silently returning None
            raise ValueError(f"reduction must be one of 'sum', 'mean', 'none', got {self.reduction!r}")
        data, mask = target
        means, stds = pred        
        assert not stds.isnan().any()
        losses = torch.empty(data.shape[0], device=data.device, dtype=data.dtype)
        for i, (d, m, mean, std) in enumerate(zip(data, mask, means, stds)):
            losses[i] = self._loss_batch(d, m, mean, std)
        if self.reduction == 'none': return losses
        if self.reduction == 'mean': return losses.mean()
        return losses.sum()
    
    def _loss_batch(self, data, mask, mean, std):
        """Negative log-likelihood of one batch item over the selected points."""
        # the mask is False inside the gap, so it is inverted to select only the gap
        selected = ~mask.bool() if self.only_gap else torch.ones_like(mask, dtype=torch.bool)
        selected = selected.flatten()
        # make a big vector with all variables and observations and compute ll
        obs = data.flatten()[selected]
        mean = mean.flatten()[selected]
        std = std.flatten()[selected]
        # nothing selected (e.g. no gap in this item): zero loss, still attached to the graph
        if obs.numel() == 0: return std.sum()
        # `std` holds standard deviations, so it is the scale (Cholesky factor) of the
        # diagonal covariance, not the covariance itself
        return - MultivariateNormal(mean, scale_tril=torch.diag(std)).log_prob(obs)
        

In [None]:
pred = model(input)

In [None]:
data, mask = input

In [None]:
data.shape, mask.shape

(torch.Size([10, 200, 3]), torch.Size([10, 200, 3]))

In [None]:
pred.mean.shape

torch.Size([10, 200, 3])

In [None]:
means, stds = pred

In [None]:
stds.shape

torch.Size([10, 200, 3])

In [None]:
means.shape

torch.Size([10, 200, 3])

In [None]:
data.isnan().any()

tensor(False)

In [None]:
mask.isnan().any()

tensor(False)

In [None]:
means.isnan().any()

tensor(False)

In [None]:
stds.isnan().sum()

tensor(0)

In [None]:
stds.shape

torch.Size([10, 200, 3])

In [None]:
is_posdef_eigv(torch.diag(stds.flatten()))

(tensor(True),
 tensor([1.4142, 1.4142, 1.4142,  ..., 3.5522, 3.5522, 3.5522],
        grad_fn=<LinalgEighBackward0>))

In [None]:
KalmanLoss(only_gap=True)(pred, target)

tensor(9620.4248, grad_fn=<SumBackward0>)

In [None]:
KalmanLoss(only_gap=False)(pred, target)

tensor(9901.6641, grad_fn=<SumBackward0>)

In [None]:
pred.mean.device, target[0].device

(device(type='cpu'), device(type='cpu'))

In [None]:
pred.mean.shape, target[0].shape

(torch.Size([10, 200, 3]), torch.Size([10, 200, 3]))

In [None]:
target[0].shape

torch.Size([10, 200, 3])

In [None]:
KalmanLoss(only_gap=False, reduction='mean')(pred, target)

tensor(990.1664, grad_fn=<MeanBackward0>)

### Metrics

Wrapper around fastai metrics to support masked tensors and normal distributions

In [None]:
#| export
def to_msk_metric(metric, name):
    """Wrap `metric` so that it is computed on the first element of both prediction and target.

    The returned function is named `name`.
    """
    def msk_metric(imp, targ):
        # element 0 holds the means for the prediction and the data for the target
        pred_mean, targ_data = imp[0], targ[0]
        return metric(pred_mean, targ_data)
    setattr(msk_metric, "__name__", name)
    return msk_metric

In [None]:
#| export
from fastai.metrics import *

In [None]:
#| export
msk_rmse = to_msk_metric(rmse, 'rmse')

In [None]:
msk_rmse.__name__

'rmse'

In [None]:
msk_rmse(pred, target)

TensorBase(1.2510)

In [None]:
#| export
msk_r2 = to_msk_metric(R2Score(), 'r2')

In [None]:
msk_r2(pred, target)

-0.3594884242364691

### Callback

save the model state 

In [None]:
#| export
from fastai.callback.all import *

In [None]:
#| export
class SaveParams(Callback):
    """Callback that stores the value of the model attribute `param_name` after every batch.

    The recorded values are appended to `self.params`.
    """
    def __init__(self, param_name):
        super().__init__()
        self.params = []
        self.param_name = param_name
    def after_batch(self):
        # `detach()` alone shares storage with the live tensor: if the attribute is a
        # parameter updated in place by the optimizer, every saved entry would end up
        # showing the latest value. `clone()` takes a real snapshot.
        param = getattr(self.model, self.param_name).detach().clone()
        self.params.append(param)

In [None]:
#| export
# NOTE(review): this cell repeats the `SaveParams` definition of the previous cell; one of the two can be dropped
class SaveParams(Callback):
    """Callback that stores the value of the model attribute `param_name` after every batch.

    The recorded values are appended to `self.params`.
    """
    def __init__(self, param_name):
        super().__init__()
        self.params = []
        self.param_name = param_name
    def after_batch(self):
        # `detach()` alone shares storage with the live tensor: if the attribute is a
        # parameter updated in place by the optimizer, every saved entry would end up
        # showing the latest value. `clone()` takes a real snapshot.
        param = getattr(self.model, self.param_name).detach().clone()
        self.params.append(param)

In [None]:
debug_preds = []

In [None]:
class DebugPredCallback(Callback):
    """Debug helper: after each validation, append the gathered predictions to the global `debug_preds`."""
    order = 0
    def after_validate(self):
        # nothing to record when no `gather_preds` attribute is reachable
        if not hasattr(self, 'gather_preds'):
            return
        debug_preds.append(self.gather_preds.preds)

### Learner

In [None]:
#| export
from fastai.learner import * 

from fastai.tabular.all import *

from fastai.tabular.learner import *

from fastai.callback.progress import ShowGraphCallback

In [None]:
obs_cov_history = SaveParams('obs_cov')

In [None]:
all_data = CollectDataCallback()

In [None]:
model = KalmanFilter.init_random(n_dim_obs = hai.shape[1], n_dim_state = hai.shape[1]).cuda()

In [None]:
model.use_smooth = False

In [None]:
# model._set_constraint('obs_cov', model.obs_cov, train=False)

In [None]:
pipeline, block_ids = imp_pipeline(hai[:20000], block_len, gap_len)
    
splits = RandomSplitter()(block_ids)
ds = Datasets(block_ids, [pipeline, pipeline], splits=splits)

In [None]:
dls = ds.dataloaders(bs=10, device='cuda')

In [None]:
dls.one_batch()[0][0].device

device(type='cuda', index=0)

In [None]:
input, target = dls.one_batch()

In [None]:
pred = model(input)
KalmanLoss()(pred, target)

tensor(2270.8555, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
learn = Learner(dls, model, loss_func=KalmanLoss(only_gap=False), cbs = [DebugPredCallback] , metrics = [msk_rmse, msk_r2])

In [None]:
learn.fit(1, 1e-3)

epoch,train_loss,valid_loss,rmse,r2,time
0,767.793274,621.327209,2.006266,-5.094438,00:10


#### Float64

In [None]:
model64 = KalmanFilter.init_random(hai.shape[1], hai.shape[1], dtype=torch.float64).cuda()

In [None]:
#| export
class Float64Callback(Callback):
    """Reset the recorder's smoothed loss to a float64 zero before fitting."""
    order = Recorder.order + 10 # run after Recorder 
    def before_fit(self):
        # default is a float 32
        zero64 = torch.zeros((), dtype=torch.float64)
        self.recorder.smooth_loss.val = zero64

In [None]:
dls64 = make_dataloader(hai64, 10, 2, bs=10)

In [None]:
input64 = dls64.one_batch()[0]
target64 = dls64.one_batch()[1]

In [None]:
data64, mask64 = input64

In [None]:
data64.device, data64.dtype

(device(type='cuda', index=0), torch.float64)

In [None]:
model64.predict(data64);

In [None]:
# NOTE(review): `input` is the batch taken from the earlier `dls` (built from `hai`), not `input64` defined above — confirm this is intended
pred = model64(input)

In [None]:
KalmanLoss()(pred, target)

tensor(340.9094, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
model64.use_smooth = False

In [None]:
learn64 = Learner(dls64, model64, loss_func=KalmanLoss(), cbs = [Float64Callback] )

In [None]:
learn64.fit(1, 1e-3)

epoch,train_loss,valid_loss,time
0,95.226314,88.306383,01:41


### Predictions

The transformation pipeline is not working properly: there is a problem in `decode_batch`, as the `_types` are more nested than the predictions, which results in an error. In addition, the pipeline is not reproducible and the test dataloaders do not seem to be deterministic.
So almost everything is reimplemented from scratch.

see https://github.com/mone27/meteo_imp/blob/0335003405ec9bd3e3bd2641bc6d7924f34a0788/lib_nbs/kalman/10_fastai.ipynb for all details

In [None]:
#| export
class NormalsDf:
    """Pair of DataFrames with the parameters (mean and std) of Normal distributions"""
    def __init__(self, mean, std): store_attr()
    def tidy(self, prefix=""):
        """Long format: `mean` and `std` are melted on "time" and merged on time and variable.

        The value columns are named `prefix + "mean"` and `prefix + "std"`.
        """
        long_mean, long_std = (
            getattr(self, param).reset_index().melt("time", value_name=prefix + param)
            for param in ("mean", "std")
        )
        return pd.merge(long_mean, long_std, on=["time", "variable"])
    __repr__ = basic_repr("mean, std")

In [None]:
pipe0, pipe1 = tfms4.fs[0,1], tfms4.fs[2,3] 

In [None]:
pipe0, pipe1

((#2) [BlockDfTransform:
encodes: (int,object) -> encodes
decodes: ,AddGapTransform -- {'variables': ['TA', 'SW_IN'], 'gap_length': 2}:
encodes: (DataFrame,object) -> encodes
decodes: ],
 (#2) [MaskedDf2Tensor:
encodes: (MaskedDf,object) -> encodes
decodes: (MaskedTensor,object) -> decodes
,NormalizeMasked -- {'mean': tensor([  8.3339, 120.9578,   3.3807]), 'std': tensor([  7.9246, 204.0026,   4.3684])}:
encodes: (MaskedTensor,object) -> encodes
decodes: (MaskedTensor,object) -> decodes
(NormalsParams,object) -> decodes
])

In [None]:
#| export
def preds2df(preds, targs):
    """Final decoding step: turn each prediction into a `NormalsDf`.

    Each element of `preds` is indexed as `(mean, std)`; the columns and index
    of the resulting DataFrames are copied from the `data` of the matching target.
    """
    def _as_frame(tensor, like):
        # drop the leading dimension of size 1 and move the values to numpy
        values = tensor.squeeze(0).detach().cpu().numpy()
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    return [
        NormalsDf(_as_frame(pred[0], targ.data), _as_frame(pred[1], targ.data))
        for pred, targ in zip(preds, targs)
    ]

In [None]:
#| export
def predict_items(items, learn, pipe0, pipe1):
    """Run the model of `learn` on each item and return `(predictions, targets, losses)`.

    For every item, `pipe0` produces the target and `pipe1` turns it into the
    `(data, mask)` tensors fed to the model as a batch of size 1. The loss is
    computed against that same input and the predictions are decoded with
    `pipe1` and converted with `preds2df`.
    """
    pipe0, pipe1 = Pipeline(pipe0), Pipeline(pipe1)
    # use the device the model lives on instead of assuming CUDA is available
    first_param = next(iter(learn.model.parameters()), None)
    device = first_param.device if first_param is not None else default_device()
    preds, targs, losses = [], [], []
    for item in items:
        targ = pipe0(item)
        data, mask = pipe1(targ)
        # add the batch dimension expected by the model
        masked_input = MaskedTensor(data.to(device).unsqueeze(0), mask.to(device).unsqueeze(0))
        pred = learn.model(masked_input)
        loss = learn.loss_func(pred, masked_input)
        # denormalize
        pred = pipe1.decode(pred)
        preds.append(pred)
        targs.append(targ)
        losses.append(loss)

    return preds2df(preds, targs), targs, losses
        

In [None]:
preds, targs, losses = predict_items([0,1,3], learn, pipe0, pipe1)

this is the same data!!

In [None]:
predict_items([0], learn, pipe0, pipe1)[1][0].data == predict_items([0], learn, pipe0, pipe1)[1][0].data

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:30:00,True,True,True
2000-01-01 01:00:00,True,True,True
2000-01-01 01:30:00,True,True,True
2000-01-01 02:00:00,True,True,True
2000-01-01 02:30:00,True,True,True
2000-01-01 03:00:00,True,True,True
2000-01-01 03:30:00,True,True,True
2000-01-01 04:00:00,True,True,True
2000-01-01 04:30:00,True,True,True
2000-01-01 05:00:00,True,True,True


#### Plot results

In [None]:
#| export
def plot_result(pred, targ, loss, **kwargs):
    """Plot the target "value" against the predicted "mean" for one item, with the loss in the title.

    Extra keyword arguments are forwarded to `facet_variable`.
    """
    merged = pd.merge(targ.tidy(), pred.tidy(), on=["time", "variable"])
    chart = facet_variable(merged, ys=["value", "mean"], error=True, **kwargs)
    return chart.properties(title=f"loss: {loss.item():.6f}")

In [None]:
y = "mean"

In [None]:
plot_result(preds[0], targs[0], torch.tensor(1))

In [None]:
#| export
def plot_results(preds, targs, losses, **kwargs):
    """Plot every (prediction, target, loss) triple in a single column and concatenate the plots horizontally."""
    charts = []
    for pred, targ, loss in zip(preds, targs, losses):
        charts.append(plot_result(pred, targ, loss, n_cols=1, **kwargs))
    return alt.hconcat(*charts)

In [None]:
plot_results(preds, targs, losses)

#### Show Results

In [None]:
random.choices(learn.dls.items, k=3)

[503, 1855, 1365]

In [None]:
#|export
def get_results(learn, n=3, items=None, dls=None):
    """Predict `items` with `learn` and return the output of `predict_items`.

    When `items` is None, `n` items are drawn at random (with replacement)
    from `dls.items`. `dls` defaults to `learn.dls`.
    """
    dls = ifnone(dls, learn.dls)
    # draw only when needed, and honour `n` instead of always taking 3 items
    if items is None:
        items = random.choices(dls.items, k=n)
    # first two transforms build the target, the last two the model input
    pipe0, pipe1 = dls.fs[0][0,1], dls.fs[0][2,3]
    return predict_items(items, learn, pipe0, pipe1)

In [None]:
#| export
def show_results(learn, n=3, items=None, **kwargs):
    """Plot the results returned by `get_results`; extra keyword arguments go to `plot_results`."""
    results = get_results(learn, n, items)
    return plot_results(*results, **kwargs)
    

In [None]:
learn.model.use_smooth = False

In [None]:
show_results(learn)

In [None]:
show_results(learn, items=[1,2,3])

In [None]:
display_as_row(learn.model.get_info())

latent,z_0,z_1,z_2
z_0,0.7677,0.8855,0.2192
z_1,0.8028,0.1462,0.3398
z_2,0.1828,0.8168,0.1924

latent,z_0,z_1,z_2
z_0,0.4421,0.865,0.7292
z_1,0.865,1.7721,1.4024
z_2,0.7292,1.4024,1.6913

latent,offset
z_0,0.5653
z_1,0.1405
z_2,0.8181

variable,z_0,z_1,z_2
x_0,0.1918,0.3717,-0.0126
x_1,0.2597,0.493,0.9982
x_2,0.1968,0.9201,0.9421

variable,x_0,x_1,x_2
x_0,1.2067,1.1546,1.0598
x_1,1.1546,1.1593,1.1029
x_2,1.0598,1.1029,1.2102

variable,offset
x_0,0.7426
x_1,0.568
x_2,0.561

latent,mean
z_0,0.0635
z_1,0.6929
z_2,0.9313

latent,z_0,z_1,z_2
z_0,1.1366,0.9955,0.6543
z_1,0.9955,1.1963,0.6032
z_2,0.6543,0.6032,1.3697


#### Interactive

In [None]:
#| export
from ipywidgets import IntSlider, interact_manual, Text

In [None]:
#| export
def results_custom_gap(learn, df, var_sel, gap_len, items_idx, block_len):
    """Results of `learn` on the items `items_idx` of `df`, using a pipeline built with the given block and gap settings."""
    tfms = [
        BlockDfTransform(df, block_len),
        AddGapTransform(var_sel, gap_len),
        MaskedDf2Tensor,
        NormalizeMasked(*get_stats(df)),
    ]
    # one batch containing all the requested items
    datasets = Datasets(items_idx, [tfms, tfms])
    custom_dls = datasets.dataloaders(bs=len(items_idx))
    return get_results(learn, items=items_idx, dls=custom_dls)

In [None]:
plot_results(*results_custom_gap(learn64, hai64, ['TA'], 10, [800, 801], 200))

In [None]:
#| export
def interact_results(learn, df):
    """Interactive widget to plot the results of `learn` on `df` for a custom gap.

    The widget exposes the gap length, the block length, the indices of the
    items (comma separated) and one checkbox per column of `df` to choose the
    variables passed as `var_sel`.
    """
    interact_args = {
        'gap_len': IntSlider(10, 1, 100),
        # the text is split on commas below, so the hint must say so
        'items_idx': Text(value='10, 100', placeholder="comma separated indices"),
        'block_len': IntSlider(200, 10, 1000, 10),
        **{var_name: True for var_name in df.columns}
    }
    
    def _inner(gap_len, items_idx, block_len, **var_names):
        var_sel = [var_name for var_name, var_use in var_names.items() if var_use]
        # skip empty tokens (e.g. a trailing comma) that would make `int` fail
        items_idx = [int(token) for token in items_idx.split(",") if token.strip()]
        return plot_results(*results_custom_gap(learn=learn, df=df, var_sel=var_sel, gap_len=gap_len, items_idx=items_idx, block_len=block_len))
    return interact_manual(_inner, **interact_args)

In [None]:
interact_results(learn64, hai64)

interactive(children=(IntSlider(value=10, description='gap_len', min=1), Text(value='10, 100', description='it…

<function __main__.interact_results.<locals>._inner(gap_len, items_idx, block_len, **var_names)>

## Export 

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()