# Implement Kalman model using FastAI

> Implementing the Kalman model with FastAI requires a custom data preparation pipeline and a custom loss function

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| hide
#| default_exp kalman.fastai

In [None]:
from fastai.tabular.core import *
from fastai.data.core import *

In [None]:
#| export
from meteo_imp.utils import *
from meteo_imp.gaussian import *
from meteo_imp.data import *

from fastcore.transform import *
from fastcore.basics import *
from fastcore.foundation import *
from fastcore.all import *
from fastai.tabular import *
from fastai.torch_core import default_device, to_device

import torch

import collections

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
reset_seed()

In [None]:
hai_path

Path('/home/simone/Documents/uni/Thesis/GPFA_imputation/data/FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4_float32.parquet')

In [None]:
hai = pd.read_parquet(hai_path)
hai64 = pd.read_parquet(hai_path64)
hai_era = pd.read_parquet(hai_era_path)
hai_era64 = pd.read_parquet(hai_era_path64)

## Data Preparation

The aim of the data preparation pipeline is to:
- take the original time series and split it into time blocks
- for each block generate a random gap (need to figure out the properties of the gap)
- split some time blocks for testing

the input of the pipeline is:
- a dataframe containing all observations

the input of the model is:
- observed data (potentially containing NaN where data is missing)
- missing data mask (which is telling where the data is missing)
- the data needs to be standardized

### 1) Block Index

The first step is to transform the original dataframe into blocks of a specified `block_len`

two different strategies are possible:

- contiguous blocks
- random block in the dataframe

For now, contiguous blocks are used

In [None]:
from dataclasses import dataclass

In [None]:
#| export
class BlockIndexTransform(Transform):
    """Map an integer block number to the slice of a time index covering that block.

    Blocks are contiguous and non-overlapping: block `i` spans the positions
    `[i * block_len + offset, (i + 1) * block_len + offset)` of `idx`.
    """
    def __init__(self, idx: pd.DatetimeIndex, block_len:int =200, offset=1):
        store_attr()
        self.n = len(idx)

    def encodes(self, i:int) -> pd.DatetimeIndex:
        first = self.offset + i * self.block_len
        last = first + self.block_len
        # the whole block has to fit inside the index
        assert last <= self.n

        return self.idx[first:last]

In [None]:
blk = BlockIndexTransform(hai.index, 10)

In [None]:
blk

BlockIndexTransform:
encodes: (int,object) -> encodes
decodes: 

In [None]:
hai.index

DatetimeIndex(['2000-01-01 00:30:00', '2000-01-01 01:00:00',
               '2000-01-01 01:30:00', '2000-01-01 02:00:00',
               '2000-01-01 02:30:00', '2000-01-01 03:00:00',
               '2000-01-01 03:30:00', '2000-01-01 04:00:00',
               '2000-01-01 04:30:00', '2000-01-01 05:00:00',
               ...
               '2012-12-31 19:30:00', '2012-12-31 20:00:00',
               '2012-12-31 20:30:00', '2012-12-31 21:00:00',
               '2012-12-31 21:30:00', '2012-12-31 22:00:00',
               '2012-12-31 22:30:00', '2012-12-31 23:00:00',
               '2012-12-31 23:30:00', '2013-01-01 00:00:00'],
              dtype='datetime64[ns]', name='time', length=227952, freq=None)

In [None]:
blk(2)

DatetimeIndex(['2000-01-01 11:00:00', '2000-01-01 11:30:00',
               '2000-01-01 12:00:00', '2000-01-01 12:30:00',
               '2000-01-01 13:00:00', '2000-01-01 13:30:00',
               '2000-01-01 14:00:00', '2000-01-01 14:30:00',
               '2000-01-01 15:00:00', '2000-01-01 15:30:00'],
              dtype='datetime64[ns]', name='time', freq=None)

### 2) Meteo Imp Block DataFrames

Get a chunk out of the dataframes given an index

In [None]:
#| export
@dataclass
class DataControl:
    """Container holding a `data` frame together with its `control` frame."""
    data: pd.DataFrame
    control: pd.DataFrame

    def _repr_html_(self):
        # render both frames side by side in the notebook
        frames = {'data': self.data, 'control': self.control}
        return row_dfs(frames, title="Data Control", hide_idx=False)

    def __iter__(self):
        # allows `data, control = dc`
        return iter((self.data, self.control))

In [None]:
df = hai.loc[blk(1)]

In [None]:
#| export
def _rename_lag(lag):
    def _inner(col_name):
        return f"{col_name}_lag_{lag}"
    return _inner

In [None]:
df.rename(columns=_rename_lag(1))

Unnamed: 0_level_0,TA_lag_1,SW_IN_lag_1,VPD_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 06:00:00,-0.23,0.0,0.122
2000-01-01 06:30:00,-0.22,0.0,0.098
2000-01-01 07:00:00,-0.24,0.0,0.066
2000-01-01 07:30:00,-0.23,0.0,0.044
2000-01-01 08:00:00,-0.22,0.0,0.026
2000-01-01 08:30:00,-0.19,0.45,0.016
2000-01-01 09:00:00,-0.14,3.7,0.01
2000-01-01 09:30:00,-0.03,7.26,0.006
2000-01-01 10:00:00,0.04,12.24,0.006
2000-01-01 10:30:00,0.1,16.51,0.006


In [None]:
#| export
def _lag_df(df, lag):
    "Shift every column of `df` by `lag` rows and tag the column names with the lag"
    return df.shift(lag).rename(columns=_rename_lag(lag))
    

In [None]:
_lag_df(df, 1)

Unnamed: 0_level_0,TA_lag_1,SW_IN_lag_1,VPD_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 06:00:00,,,
2000-01-01 06:30:00,-0.23,0.0,0.122
2000-01-01 07:00:00,-0.22,0.0,0.098
2000-01-01 07:30:00,-0.24,0.0,0.066
2000-01-01 08:00:00,-0.23,0.0,0.044
2000-01-01 08:30:00,-0.22,0.0,0.026
2000-01-01 09:00:00,-0.19,0.45,0.016
2000-01-01 09:30:00,-0.14,3.7,0.01
2000-01-01 10:00:00,-0.03,7.26,0.006
2000-01-01 10:30:00,0.04,12.24,0.006


In [None]:
#| export
def _add_lags_df(df, lags):
    "Return `df` with one extra set of lagged columns for each entry of `lags`"
    merged = df
    for n_steps in listify(lags):
        shifted = _lag_df(df, n_steps)
        # index-on-index merge: lagged columns are appended next to the existing ones
        merged = merged.merge(shifted, left_index=True, right_index=True)
    return merged

In [None]:
_add_lags_df(df, [1,2])

Unnamed: 0_level_0,TA,SW_IN,VPD,TA_lag_1,SW_IN_lag_1,VPD_lag_1,TA_lag_2,SW_IN_lag_2,VPD_lag_2
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-01 06:00:00,-0.23,0.0,0.122,,,,,,
2000-01-01 06:30:00,-0.22,0.0,0.098,-0.23,0.0,0.122,,,
2000-01-01 07:00:00,-0.24,0.0,0.066,-0.22,0.0,0.098,-0.23,0.0,0.122
2000-01-01 07:30:00,-0.23,0.0,0.044,-0.24,0.0,0.066,-0.22,0.0,0.098
2000-01-01 08:00:00,-0.22,0.0,0.026,-0.23,0.0,0.044,-0.24,0.0,0.066
2000-01-01 08:30:00,-0.19,0.45,0.016,-0.22,0.0,0.026,-0.23,0.0,0.044
2000-01-01 09:00:00,-0.14,3.7,0.01,-0.19,0.45,0.016,-0.22,0.0,0.026
2000-01-01 09:30:00,-0.03,7.26,0.006,-0.14,3.7,0.01,-0.19,0.45,0.016
2000-01-01 10:00:00,0.04,12.24,0.006,-0.03,7.26,0.006,-0.14,3.7,0.01
2000-01-01 10:30:00,0.1,16.51,0.006,0.04,12.24,0.006,-0.03,7.26,0.006


In [None]:
#| export
class BlockDfTransform(Transform):
    """Select the rows of the data and control frames that belong to a block index.

    The lagged control columns are computed once, on the whole `control` frame,
    when the transform is created. Each block is then sliced out of that lagged
    frame, so the lagged values at the start of a block come from the rows that
    precede the block.
    """
    def __init__(self, data: pd.DataFrame, control: pd.DataFrame, control_lags: int|Iterable[int]):
        store_attr()
        # replace the stored raw control frame with its lagged version
        self.control = _add_lags_df(control, control_lags)
    def encodes(self, idx: pd.DatetimeIndex) -> DataControl:
        # label-based selection of the same rows in both frames
        return DataControl(self.data.loc[idx], self.control.loc[idx])

In [None]:
blkdf = BlockDfTransform(hai, hai_era, 1)

In [None]:
blkdf(blk(1))

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 06:00:00,-0.23,0.0,0.122
2000-01-01 06:30:00,-0.22,0.0,0.098
2000-01-01 07:00:00,-0.24,0.0,0.066
2000-01-01 07:30:00,-0.23,0.0,0.044
2000-01-01 08:00:00,-0.22,0.0,0.026
2000-01-01 08:30:00,-0.19,0.45,0.016
2000-01-01 09:00:00,-0.14,3.7,0.01
2000-01-01 09:30:00,-0.03,7.26,0.006
2000-01-01 10:00:00,0.04,12.24,0.006
2000-01-01 10:30:00,0.1,16.51,0.006

Unnamed: 0_level_0,TA_ERA,SW_IN_ERA,VPD_ERA,TA_ERA_lag_1,SW_IN_ERA_lag_1,VPD_ERA_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-01 06:00:00,-0.28,0.0,0.501,-0.349,0.0,0.497
2000-01-01 06:30:00,-0.211,0.0,0.506,-0.28,0.0,0.501
2000-01-01 07:00:00,-0.142,0.0,0.51,-0.211,0.0,0.506
2000-01-01 07:30:00,-0.074,0.0,0.515,-0.142,0.0,0.51
2000-01-01 08:00:00,0.044,0.0,0.527,-0.074,0.0,0.515
2000-01-01 08:30:00,0.161,0.0,0.539,0.044,0.0,0.527
2000-01-01 09:00:00,0.279,2.655,0.551,0.161,0.0,0.539
2000-01-01 09:30:00,0.396,7.08,0.563,0.279,2.655,0.551
2000-01-01 10:00:00,0.514,10.988,0.575,0.396,7.08,0.563
2000-01-01 10:30:00,0.631,18.012,0.586,0.514,10.988,0.575


Take a block from a summer day, so that the variables have higher values

In [None]:
blkdf(blk(800)).data

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,14.22,224.800003,5.799
2000-06-15 17:30:00,14.11,195.279999,6.577
2000-06-15 18:00:00,14.23,244.169998,6.931
2000-06-15 18:30:00,14.4,253.919998,7.286
2000-06-15 19:00:00,14.09,177.309998,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.709999,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164
2000-06-15 21:30:00,12.2,0.0,5.037


In [None]:
tfms1 = TfmdLists([800,801,802,803], [BlockIndexTransform(hai.index, 10), BlockDfTransform(hai, hai_era, control_lags=1)])

In [None]:
tfms1[0]

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,14.22,224.8,5.799
2000-06-15 17:30:00,14.11,195.28,6.577
2000-06-15 18:00:00,14.23,244.17,6.931
2000-06-15 18:30:00,14.4,253.92,7.286
2000-06-15 19:00:00,14.09,177.31,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.71,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164
2000-06-15 21:30:00,12.2,0.0,5.037

Unnamed: 0_level_0,TA_ERA,SW_IN_ERA,VPD_ERA,TA_ERA_lag_1,SW_IN_ERA_lag_1,VPD_ERA_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-15 17:00:00,15.05,255.193,5.102,15.139,287.1,4.9
2000-06-15 17:30:00,14.961,221.427,5.305,15.05,255.193,5.102
2000-06-15 18:00:00,14.872,186.38,5.507,14.961,221.427,5.305
2000-06-15 18:30:00,14.783,150.65,5.71,14.872,186.38,5.507
2000-06-15 19:00:00,14.694,114.849,5.912,14.783,150.65,5.71
2000-06-15 19:30:00,14.606,34.728,6.114,14.694,114.849,5.912
2000-06-15 20:00:00,14.38,19.843,6.074,14.606,34.728,6.114
2000-06-15 20:30:00,14.155,5.712,6.034,14.38,19.843,6.074
2000-06-15 21:00:00,13.929,0.0,5.994,14.155,5.712,6.034
2000-06-15 21:30:00,13.704,0.0,5.954,13.929,0.0,5.994


### 3) Gaps

adds a mask which includes a random gap

#### Make random Gap

In [None]:
idx = L(*tfms1[0].data.columns).argwhere(lambda x: x in ['TA','SW_IN'])

In [None]:
mask = np.ones_like(tfms1[0].data, dtype=bool)

In [None]:
mask

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [None]:
#| exports
def _make_random_gap(
    gap_length: int, # The length of the gap
    total_length: int, # The total number of observations
    gap_start: Optional[int] = None # Optional start of gap
)-> np.ndarray: # [total_length] array of bools to indicicate if the data is missing or not
    "Add a continous gap of given length at random position"
    if(gap_length >= total_length):
        return np.repeat(True, total_length)
    gap_start = np.random.randint(total_length - gap_length) if gap_start is None else gap_start
    return np.hstack([
        np.repeat(False, gap_start),
        np.repeat(True, gap_length),
        np.repeat(False, total_length - (gap_length + gap_start))
    ])

In [None]:
gap = _make_random_gap(2, 10, 2)

In [None]:
gap

array([False, False,  True,  True, False, False, False, False, False,
       False])

In [None]:
np.argwhere(gap)

array([[2],
       [3]])

In [None]:
mask[np.argwhere(gap), idx] = False

In [None]:
mask

array([[ True,  True,  True],
       [ True,  True,  True],
       [False, False,  True],
       [False, False,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [None]:
mask[gap]

array([[False, False,  True],
       [False, False,  True]])

#### Add Gap Transform

In [None]:
#| export
class MeteoImpDf:
    """Bundle of three objects taken positionally: `data`, `mask` and `control`."""
    def __init__(self,*args):
        self.data, self.mask, self.control = args[0], args[1], args[2]

    def __iter__(self):
        # allows `data, mask, control = m_df`
        return iter((self.data, self.mask, self.control))

    __repr__ = basic_repr("data, mask, control")

    def _repr_html_(self):
        frames = {'data': self.data, 'mask': self.mask, 'control': self.control}
        return row_dfs(frames, title="Meteo Imp Df", hide_idx=False)

In [None]:
#| export
class AddGapTransform(DisplayedTransform):
    """Adds a random gap to a dataframe

    The data itself is passed through unchanged. The gap is described by a boolean
    mask with the same index and columns as the data, where `False` marks a value
    inside the gap. The same rows are masked for every column listed in `variables`.
    """
    def __init__(self,
                variables, # names of the columns that get the gap
                gap_length, # number of consecutive rows in the gap
                ):
        store_attr()
    def encodes(self, dc: DataControl) -> MeteoImpDf:
        df, control = dc
        # a new gap position is drawn on every call
        gap = _make_random_gap(self.gap_length, df.shape[0])
        # start from "everything observed"
        mask = np.ones_like(df, dtype=bool)
        # positions of the columns that should receive the gap
        col_sel = L(*df.columns).argwhere(lambda x: x in self.variables)
        # `np.argwhere(gap)` is a column of row positions: indexing it together with the
        # column positions broadcasts to every (gap row, selected column) pair
        mask[np.argwhere(gap), col_sel] = False
        mask = pd.DataFrame(mask, index=df.index, columns=df.columns)
        return MeteoImpDf(df, mask, control)

In [None]:
a_gap = AddGapTransform(['TA', 'VPD'], 5)
a_gap

AddGapTransform -- {'variables': ['TA', 'VPD'], 'gap_length': 5}:
encodes: (DataControl,object) -> encodes
decodes: 

In [None]:
a_gap(tfms1[0])

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,14.22,224.8,5.799
2000-06-15 17:30:00,14.11,195.28,6.577
2000-06-15 18:00:00,14.23,244.17,6.931
2000-06-15 18:30:00,14.4,253.92,7.286
2000-06-15 19:00:00,14.09,177.31,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.71,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164
2000-06-15 21:30:00,12.2,0.0,5.037

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,True,True,True
2000-06-15 17:30:00,True,True,True
2000-06-15 18:00:00,True,True,True
2000-06-15 18:30:00,True,True,True
2000-06-15 19:00:00,False,True,False
2000-06-15 19:30:00,False,True,False
2000-06-15 20:00:00,False,True,False
2000-06-15 20:30:00,False,True,False
2000-06-15 21:00:00,False,True,False
2000-06-15 21:30:00,True,True,True

Unnamed: 0_level_0,TA_ERA,SW_IN_ERA,VPD_ERA,TA_ERA_lag_1,SW_IN_ERA_lag_1,VPD_ERA_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-15 17:00:00,15.05,255.193,5.102,15.139,287.1,4.9
2000-06-15 17:30:00,14.961,221.427,5.305,15.05,255.193,5.102
2000-06-15 18:00:00,14.872,186.38,5.507,14.961,221.427,5.305
2000-06-15 18:30:00,14.783,150.65,5.71,14.872,186.38,5.507
2000-06-15 19:00:00,14.694,114.849,5.912,14.783,150.65,5.71
2000-06-15 19:30:00,14.606,34.728,6.114,14.694,114.849,5.912
2000-06-15 20:00:00,14.38,19.843,6.074,14.606,34.728,6.114
2000-06-15 20:30:00,14.155,5.712,6.034,14.38,19.843,6.074
2000-06-15 21:00:00,13.929,0.0,5.994,14.155,5.712,6.034
2000-06-15 21:30:00,13.704,0.0,5.954,13.929,0.0,5.994


In [None]:
tfms2 = TfmdLists([800,801,802,803], [*tfms1.fs, AddGapTransform(['TA','SW_IN'], 2)])

In [None]:
tfms2[0]

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,14.22,224.8,5.799
2000-06-15 17:30:00,14.11,195.28,6.577
2000-06-15 18:00:00,14.23,244.17,6.931
2000-06-15 18:30:00,14.4,253.92,7.286
2000-06-15 19:00:00,14.09,177.31,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.71,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164
2000-06-15 21:30:00,12.2,0.0,5.037

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,True,True,True
2000-06-15 17:30:00,True,True,True
2000-06-15 18:00:00,True,True,True
2000-06-15 18:30:00,True,True,True
2000-06-15 19:00:00,True,True,True
2000-06-15 19:30:00,True,True,True
2000-06-15 20:00:00,False,False,True
2000-06-15 20:30:00,False,False,True
2000-06-15 21:00:00,True,True,True
2000-06-15 21:30:00,True,True,True

Unnamed: 0_level_0,TA_ERA,SW_IN_ERA,VPD_ERA,TA_ERA_lag_1,SW_IN_ERA_lag_1,VPD_ERA_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-15 17:00:00,15.05,255.193,5.102,15.139,287.1,4.9
2000-06-15 17:30:00,14.961,221.427,5.305,15.05,255.193,5.102
2000-06-15 18:00:00,14.872,186.38,5.507,14.961,221.427,5.305
2000-06-15 18:30:00,14.783,150.65,5.71,14.872,186.38,5.507
2000-06-15 19:00:00,14.694,114.849,5.912,14.783,150.65,5.71
2000-06-15 19:30:00,14.606,34.728,6.114,14.694,114.849,5.912
2000-06-15 20:00:00,14.38,19.843,6.074,14.606,34.728,6.114
2000-06-15 20:30:00,14.155,5.712,6.034,14.38,19.843,6.074
2000-06-15 21:00:00,13.929,0.0,5.994,14.155,5.712,6.034
2000-06-15 21:30:00,13.704,0.0,5.954,13.929,0.0,5.994


In [None]:
m_df = tfms2[0]

In [None]:
#| export
@patch
def tidy(self: MeteoImpDf):
    """Long-format view of data and mask: one row per (`time`, `variable`).

    The result has the columns `time`, `variable`, `value` and `is_present`.
    The control frame is not included.
    """
    join_keys = ["time", "variable"]
    values_long = self.data.reset_index().melt(id_vars="time")
    mask_long = self.mask.reset_index().melt(id_vars="time", value_name="is_present")
    return values_long.merge(mask_long, on=join_keys)

In [None]:
m_df.tidy()

Unnamed: 0,time,variable,value,is_present
0,2000-06-15 17:00:00,TA,14.22,True
1,2000-06-15 17:30:00,TA,14.11,True
2,2000-06-15 18:00:00,TA,14.23,True
3,2000-06-15 18:30:00,TA,14.4,True
4,2000-06-15 19:00:00,TA,14.09,True
5,2000-06-15 19:30:00,TA,13.71,True
6,2000-06-15 20:00:00,TA,13.08,True
7,2000-06-15 20:30:00,TA,12.41,False
8,2000-06-15 21:00:00,TA,12.27,False
9,2000-06-15 21:30:00,TA,12.2,True


#### Plotting

In [None]:
#| export
import altair as alt
from altair import datum

In [None]:
#| exporti
def def_selection():
    "Default interaction for the plots: an interval selection bound to the chart scales"
    return alt.selection_interval(bind="scales")

##### Rug

In [None]:
#| exporti
def plot_rug(df, sel = def_selection(), props = {}):
    "Tick (rug) plot along `time`: white where `is_present` is true, black otherwise"
    # rug should have default height, so any requested height is dropped
    rug_props = {key: val for key, val in props.items() if key != 'height'}
    presence_color = alt.condition(datum.is_present, alt.value('white'), alt.value('black'))
    ticks = alt.Chart(df).mark_tick(color='black').encode(x = "time", color = presence_color)
    return ticks.add_params(sel).properties(**rug_props)

In [None]:
plot_rug(m_df.tidy())

In [None]:
df = m_df.tidy()

In [None]:
df = df[df.variable=="TA"].copy()

In [None]:
df['row_number'] = df.reset_index().index

In [None]:
df

Unnamed: 0,time,variable,value,is_present,row_number
0,2000-06-15 17:00:00,TA,14.22,True,0
1,2000-06-15 17:30:00,TA,14.11,True,1
2,2000-06-15 18:00:00,TA,14.23,True,2
3,2000-06-15 18:30:00,TA,14.4,True,3
4,2000-06-15 19:00:00,TA,14.09,True,4
5,2000-06-15 19:30:00,TA,13.71,True,5
6,2000-06-15 20:00:00,TA,13.08,True,6
7,2000-06-15 20:30:00,TA,12.41,False,7
8,2000-06-15 21:00:00,TA,12.27,False,8
9,2000-06-15 21:30:00,TA,12.2,True,9


In [None]:
df.iloc[1]

time          2000-06-15 17:30:00
variable                       TA
value                       14.11
is_present                   True
row_number                      1
Name: 1, dtype: object

In [None]:
df.loc[2, "is_present"] = True

In [None]:
df

Unnamed: 0,time,variable,value,is_present,row_number
0,2000-06-15 17:00:00,TA,14.22,True,0
1,2000-06-15 17:30:00,TA,14.11,True,1
2,2000-06-15 18:00:00,TA,14.23,True,2
3,2000-06-15 18:30:00,TA,14.4,True,3
4,2000-06-15 19:00:00,TA,14.09,True,4
5,2000-06-15 19:30:00,TA,13.71,True,5
6,2000-06-15 20:00:00,TA,13.08,True,6
7,2000-06-15 20:30:00,TA,12.41,False,7
8,2000-06-15 21:00:00,TA,12.27,False,8
9,2000-06-15 21:30:00,TA,12.2,True,9


In [None]:
i = 1
prev, curr, next = df.iloc[i-1], df.iloc[i], df.iloc[i+1]

In [None]:
prev, curr, next

(time          2000-06-15 17:00:00
 variable                       TA
 value                       14.22
 is_present                   True
 row_number                      0
 Name: 0, dtype: object,
 time          2000-06-15 17:30:00
 variable                       TA
 value                       14.11
 is_present                   True
 row_number                      1
 Name: 1, dtype: object,
 time          2000-06-15 18:00:00
 variable                       TA
 value                       14.23
 is_present                   True
 row_number                      2
 Name: 2, dtype: object)

In [None]:
df

Unnamed: 0,time,variable,value,is_present,row_number
0,2000-06-15 17:00:00,TA,14.22,True,0
1,2000-06-15 17:30:00,TA,14.11,True,1
2,2000-06-15 18:00:00,TA,14.23,True,2
3,2000-06-15 18:30:00,TA,14.4,True,3
4,2000-06-15 19:00:00,TA,14.09,True,4
5,2000-06-15 19:30:00,TA,13.71,True,5
6,2000-06-15 20:00:00,TA,13.08,True,6
7,2000-06-15 20:30:00,TA,12.41,False,7
8,2000-06-15 21:00:00,TA,12.27,False,8
9,2000-06-15 21:30:00,TA,12.2,True,9


In [None]:
for i in range(len(df)):
    # handle boundaries
    prev = df.iloc[i-1].is_present if i>0 else True 
    next = df.iloc[i+1].is_present if i<(len(df)-1) else True 
    curr = df.iloc[i]
    if not curr.is_present and prev:
        print("gap start", curr.time)
    if not curr.is_present and next:
        print("gap end", curr.time)

gap start 2000-06-15 20:30:00
gap end 2000-06-15 21:00:00


In [None]:
#| export
def find_gap_limits(df):
    """Find the first and last timestamp of every run of missing values.

    `df` needs a `time` column and a boolean `is_present` column and is read in row
    order. Rows before the first and after the last one count as present, so a gap
    touching either end of `df` is still closed.

    Returns a DataFrame with one row per gap and the columns `gap_start` and `gap_end`.
    """
    present = df["is_present"].to_numpy(dtype=bool)
    missing = ~present
    # presence of the previous / following row, with both boundaries treated as present
    prev_present = np.ones_like(present)
    prev_present[1:] = present[:-1]
    next_present = np.ones_like(present)
    next_present[:-1] = present[1:]
    times = df["time"]
    gap_starts = times[missing & prev_present].tolist()
    gap_ends = times[missing & next_present].tolist()
    return pd.DataFrame({'gap_start': gap_starts, 'gap_end': gap_ends})
    

In [None]:
find_gap_limits(df)

Unnamed: 0,gap_start,gap_end
0,2000-06-15 20:30:00,2000-06-15 21:00:00


In [None]:
#| export
def plot_missing_area(df, sel = def_selection(), props={}):
    """Shade every gap of `df`: a rule at each gap limit plus a translucent rectangle between them.

    `sel` and `props` are accepted but currently not applied to the chart.
    """
    limits = find_gap_limits(df)

    def _bare_x(field):
        # x channel drawn without axis line, labels, ticks or title
        return alt.X(field, axis=alt.Axis(domain=False, labels = False, ticks=False, title=None))

    start_rule = alt.Chart(limits).mark_rule().encode(x = _bare_x('gap_start'))
    end_rule = alt.Chart(limits).mark_rule().encode(x = _bare_x('gap_end'))
    shaded = alt.Chart(limits).mark_rect(color='black', opacity=.2).encode(
        x = _bare_x('gap_start'),
        x2 = 'gap_end',
    )
    return start_rule + end_rule + shaded

In [None]:
plot_missing_area(df)

##### Points

In [None]:
#| export
def plot_points(df, y = "value", y_label = "", sel = def_selection(), props = {}):
    """Scatter plot of column `y` against `time`, with fill and shape encoding `is_present`.

    `sel` and `props` are accepted but currently not applied to the chart.
    """
    x_enc = alt.X("time", axis=alt.Axis(domain=True, labels = True, ticks=True, title="time"))
    y_enc = alt.Y(y, title = y_label, scale=alt.Scale(zero=False))
    fill_enc = alt.Fill(
        "is_present",
        scale = alt.Scale(range=["black", "#ffffff00"]),
        legend = alt.Legend(title =["Observed data"]),
    )
    markers = alt.Chart(df).mark_point(color='black', strokeWidth = 1, fillOpacity = 1)
    return markers.encode(x = x_enc, y = y_enc, fill = fill_enc, shape = "is_present")

In [None]:
plot_points(m_df.tidy())

##### Line

In [None]:
#| exporti
def plot_line(df, only_present=True, y="value", y_label = "", sel = def_selection(), props = {}):
    "Line plot of column `y` against `time`, one colour per `variable`"
    # TODO remove `only_present`: the filter on `is_present` is disabled, so it has no effect
    y_enc = alt.Y(y, title = y_label, scale=alt.Scale(zero=False))
    lines = alt.Chart(df).mark_line().encode(x = "time", y = y_enc, color='variable')
    return lines.add_params(sel).properties(**props)

    

In [None]:
plot_line(m_df.tidy())

##### Errorband

In [None]:
#| exporti
def plot_error(df, y = "value", y_label = "", sel = def_selection(), props = {}):
    """Error band of +/- 2 standard deviations around column `y`, one colour per `variable`.

    `df` needs the columns `time`, `variable`, `std` and the column named by `y`.
    The band limits are computed on a new frame, so `df` itself is not modified.
    """
    half_width = 2 * df['std']
    band = df.assign(err_low = df[y] - half_width, err_high = df[y] + half_width)
    return alt.Chart(band).mark_errorband().encode(
        x = "time",
        y = alt.Y("err_low:Q", title = y_label, scale=alt.Scale(zero=False)),
        y2 = "err_high:Q",
        color=alt.Color("variable",
                        legend = alt.Legend(title=["Line: pred. mean", "area: +/- 2 std"])
                       )
    ).add_params(
        sel
    ).properties(
        **props
    )
    

In [None]:
plot_error(m_df.tidy().assign(std=5))

##### Variable

In [None]:
#| exporti
def plot_variable(df, variable, ys=["value", "value"], title="", y_label="", sel = None, error=False, props = {}):
    """Layered chart for a single `variable` of a tidy dataframe.

    `ys[0]` is the column drawn as points and `ys[1]` the column drawn as a line.
    Gaps are shaded when at least one value is missing, and an error band is
    added behind the line when `error` is true.
    """
    var_df = df[df.variable == variable].copy()
    sel = ifnone(sel, def_selection())
    layers = plot_points(var_df, ys[0], y_label, sel, props)
    has_gap = not var_df.is_present.all()
    if has_gap:
        layers += plot_missing_area(var_df, sel, props)
    line = plot_line(var_df, True, ys[1], y_label, sel, props)
    if error:
        line = plot_error(var_df, y=ys[1], y_label=y_label, sel=sel, props=props) + line

    return (layers + line).properties(title=title)
    
    # return alt.VConcatChart(vconcat=[(points + line), rug], spacing=-10).properties(title=title)

In [None]:
plot_variable(m_df.tidy(), "TA", title="title TA")

In [None]:
plot_variable(m_df.tidy().assign(std=.5), "TA", title="title TA", error=True)

##### Facet

In [None]:
#| export
def facet_variable(df, # tidy dataframe
                   n_cols: int = 3,
                   bind_interaction: bool =True, # Whether the sub-plots for each variable should be connected for zooming/panning
                   error:bool=False, # plot error bar
                   ys:list=["value", "value"],
                   props:dict|None = None, # additional properties for altair plot (eg. size)
                   ) -> alt.Chart:
    """Plot all values of the column `variable` in different subplots"""
    props = ifnone(props, {'width': 200, 'height': 150})
    variables = df.variable.unique()
    # one (initially empty) horizontal strip for every group of `n_cols` variables
    rows = [alt.hconcat() for _ in range(0, len(variables), n_cols)]
    # a single x-only selection shared by every sub-plot; with `None` each sub-plot makes its own
    shared_sel = None
    if bind_interaction:
        shared_sel = alt.selection_interval(bind="scales", encodings=['x'])
    for position, name in enumerate(variables):
        rows[position // n_cols] |= plot_variable(
            df, name,
            ys = ys, title = name, y_label = name,
            sel = shared_sel, props = props, error = error,
        )

    return alt.vconcat(*rows)

##### Show

In [None]:
#| export
@patch
def show(self: MeteoImpDf, ax=None, ctx=None, 
        n_cols: int = 3,
        bind_interaction: bool =True, # Whether the sub-plots for each variable should be connected for zooming/panning
        props:dict = None # additional properties (eg. size) for altair plot
       ) -> alt.Chart:
    """Plot every variable in its own sub-plot, with the gaps highlighted.

    `ax` and `ctx` are accepted but not used.
    """
    df = self.tidy()
    # keywords are required here: the 4th positional parameter of `facet_variable` is `error`, not `props`
    return facet_variable(df, n_cols=n_cols, bind_interaction=bind_interaction, props=props)

In [None]:
m_df.show(bind_interaction = False)

In [None]:
tfms2[0].show()

In [None]:
tfms2[3].show()

### 4) To Tensor

The constructor needs to handle both cases: being called with the items as separate arguments, and being called with a single argument that is a sequence of the items

In [None]:
#| export
class MeteoImpTensor(collections.abc.Sequence):
    """Fixed-length sequence of three items: `data`, `mask` and `control`.

    It can be built from three separate arguments or from a single sequence
    holding the items. A single sequence of two items is also accepted: it is
    read as `(data, mask)` and `control` is set to `None`.

    Raises `ValueError` for any other combination of arguments.
    """
    def __init__(self,*args):
        if len(args)==3:
            self.data = args[0]
            self.mask = args[1]
            self.control = args[2]
        elif len(args)==1 and len(args[0])==3:
            # a single sequence holding all three items (eg. another MeteoImpTensor)
            self.data = args[0][0]
            self.mask = args[0][1]
            self.control = args[0][2]
        elif len(args)==1 and len(args[0])==2:
            self.data = args[0][0]
            self.mask = args[0][1]
            # always define `control`, so iteration and indexing cannot fail on a missing attribute
            self.control = None
        else:
            raise ValueError(f"Incorrect number of arguments. got {len(args)} args")

    def __iter__(self): return iter((self.data, self.mask,self.control))
    # must be a method: `len()` calls it, so a plain integer attribute raises TypeError
    def __len__(self): return 3
    def __getitem__(self, key):
        if key == 0: return self.data
        elif key == 1: return self.mask
        elif key == 2: return self.control
        else: raise IndexError("index bigger than 2")
    __repr__ = basic_repr('data, mask, control')
    def _repr_html_(self):
        return row_items(data = self.data, mask = self.mask, control = self.control)

In [None]:
#| export
class MeteoImpDf2Tensor(Transform):
    "Convert the three frames of a `MeteoImpDf` into torch tensors, returned as a `MeteoImpTensor`"
    def setups(self, items):
        # remember the data column names of the first item
        # NOTE(review): nothing in the active code reads `self.columns`; only the commented-out `decodes` does
        self.columns = list(items[0].data.columns)
    def encodes(self, df: MeteoImpDf) -> MeteoImpTensor:
        # each frame is converted as a whole; index and column labels are not kept
        data = torch.tensor(df.data.to_numpy())
        mask = torch.tensor(df.mask.to_numpy())
        control = torch.tensor(df.control.to_numpy())
        return MeteoImpTensor(data, mask, control)
        
    # def decodes(self, x: MeteoImpTensor) -> MeteoImpDf:
    #     data = pd.DataFrame(x.data.detach().cpu().numpy(), columns = self.columns)
    #     mask = pd.DataFrame(x.mask.cpu().numpy(), columns = self.columns)
    #     control = pd.DataFrame(x.control.cpu().numpy(), columns = self.columns)
    #     return MeteoImpDf(data, mask, control)

In [None]:
to_t = MeteoImpDf2Tensor()

In [None]:
to_t.setup(tfms2)

In [None]:
to_t(tfms2[0])

In [None]:
to_t.decode(to_t(tfms2[0]));

In [None]:
tfms2[0]

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,14.22,224.8,5.799
2000-06-15 17:30:00,14.11,195.28,6.577
2000-06-15 18:00:00,14.23,244.17,6.931
2000-06-15 18:30:00,14.4,253.92,7.286
2000-06-15 19:00:00,14.09,177.31,7.251
2000-06-15 19:30:00,13.71,97.07,6.683
2000-06-15 20:00:00,13.08,39.71,5.851
2000-06-15 20:30:00,12.41,10.65,5.254
2000-06-15 21:00:00,12.27,0.32,5.164
2000-06-15 21:30:00,12.2,0.0,5.037

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-06-15 17:00:00,False,False,True
2000-06-15 17:30:00,False,False,True
2000-06-15 18:00:00,True,True,True
2000-06-15 18:30:00,True,True,True
2000-06-15 19:00:00,True,True,True
2000-06-15 19:30:00,True,True,True
2000-06-15 20:00:00,True,True,True
2000-06-15 20:30:00,True,True,True
2000-06-15 21:00:00,True,True,True
2000-06-15 21:30:00,True,True,True

Unnamed: 0_level_0,TA_ERA,SW_IN_ERA,VPD_ERA,TA_ERA_lag_1,SW_IN_ERA_lag_1,VPD_ERA_lag_1
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-06-15 17:00:00,15.05,255.193,5.102,15.139,287.1,4.9
2000-06-15 17:30:00,14.961,221.427,5.305,15.05,255.193,5.102
2000-06-15 18:00:00,14.872,186.38,5.507,14.961,221.427,5.305
2000-06-15 18:30:00,14.783,150.65,5.71,14.872,186.38,5.507
2000-06-15 19:00:00,14.694,114.849,5.912,14.783,150.65,5.71
2000-06-15 19:30:00,14.606,34.728,6.114,14.694,114.849,5.912
2000-06-15 20:00:00,14.38,19.843,6.074,14.606,34.728,6.114
2000-06-15 20:30:00,14.155,5.712,6.034,14.38,19.843,6.074
2000-06-15 21:00:00,13.929,0.0,5.994,14.155,5.712,6.034
2000-06-15 21:30:00,13.704,0.0,5.954,13.929,0.0,5.994


In [None]:
tfms3 = TfmdLists([800, 801, 802], [*tfms2.fs, MeteoImpDf2Tensor()])

In [None]:
tfms3[0]

In [None]:
type(tfms3[0])

__main__.MeteoImpTensor

In [None]:
# tfms3.decode(tfms3[0])

### 5) Normalize

In [None]:
#| export
from meteo_imp.utils import *
from fastai.torch_core import to_cpu

from torch import Tensor

In [None]:
collections.namedtuple

<function collections.namedtuple(typename, field_names, *, rename=False, defaults=None, module=None)>

In [None]:
nt = collections.namedtuple("nt", "a")

In [None]:
isinstance(nt(1), tuple)

True

In [None]:
#| export
class NormalsParams(list):
    """Parameters (`mean`, `std`) of Normal distributions, usable like a 2-element sequence.

    Accepted call forms:

    - `NormalsParams(mean, std)`
    - `NormalsParams(gen)` where `gen` is a generator yielding the mean and then the std
    - `NormalsParams(pair)` where `pair` is a single sized object of length 2

    Any other combination (including no arguments at all) raises `ValueError`.

    Note: `list.__init__` is never called, so the underlying list storage stays empty;
    iteration, `len` and indexing are all served from the `mean`/`std` attributes.
    """
    def __init__(self,*args):
        if len(args)==2:
            self.mean = args[0]
            self.std = args[1]
        # `args and ...` guards the empty call, which used to die with IndexError on args[0]
        elif args and isinstance(args[0], Generator):
            args = list(args[0])
            self.mean = args[0]
            self.std = args[1]
        elif len(args)==1 and len(args[0])==2:
            self.mean = tuple(args[0])[0]
            self.std = tuple(args[0])[1]
        else:
            raise ValueError(f"Incorrect number of arguments. got {len(args)} args")
    def __iter__(self): return iter((self.mean, self.std,))
    # builds a fresh iterator on every call, so this always returns `mean`
    def __next__(self): return next(self.__iter__())
    def __len__(self): return 2
    def __getitem__(self, key):
        # only the literal positions 0 and 1 are supported (no negative indices or slices)
        if key == 0: return self.mean
        elif key == 1: return self.std
        else: raise IndexError("index bigger than 2")
    __repr__ = basic_repr('mean, std')

In [None]:
NormalsParams(0,1)

__main__.NormalsParams(mean=0, std=1)

In [None]:
#| export
def get_stats(df, repeat=1, device='cpu'):
    """Column-wise mean and standard deviation of `df` as a pair of tensors.

    Both statistics are computed along axis 0 with the pandas defaults, converted to
    tensors on `device`, and tiled `repeat` times (useful when the same columns appear
    several times, e.g. once per control lag).

    Returns a tuple `(mean, std)`.
    """
    def _tiled(col_stat):
        # pandas Series -> numpy -> tensor, then tile along the only dimension
        as_tensor = torch.tensor(col_stat.to_numpy(), device=device)
        return as_tensor.repeat(repeat)

    col_mean = _tiled(df.mean(axis=0))
    col_std = _tiled(df.std(axis=0))
    return col_mean, col_std

In [None]:
#| export
class MeteoImpNormalize(Transform):
    """Normalize/denorm MeteoImpTensor column-wise.

    `encodes` standardizes `data` with (`mean_data`, `std_data`) and `control` with
    (`mean_control`, `std_control`); the mask is passed through untouched.
    `decodes` is dispatched on the input type: a `MeteoImpTensor` gets both data and
    control de-standardized, a `NormalsParams` gets its mean/std rescaled with the
    data statistics only.
    """
    @property
    def name(self): return f"{super().name} -- {getattr(self,'__stored_args__',{})}"

    def __init__(self, mean_data, std_data, mean_control, std_control): store_attr()

    def encodes(self, x:MeteoImpTensor)-> MeteoImpTensor:
        # move the stored stats to the input's device, exactly as both `decodes` do,
        # so that encoding does not fail when the input is not on the stats' device
        f = partial(to_device, device=(x.data.device))
        return MeteoImpTensor((x.data - f(self.mean_data)) / f(self.std_data), x.mask, (x.control - f(self.mean_control))/f(self.std_control))

    def decodes(self, x:MeteoImpTensor)->MeteoImpTensor:
        f = partial(to_device, device=(x[0].device))
        return MeteoImpTensor(x.data * f(self.std_data) + f(self.mean_data), x.mask, x.control * f(self.std_control) + f(self.mean_control))
    
    def decodes(self, x:NormalsParams):
        f = partial(to_device, device=(x[0].device))
        # the mean is shifted and scaled, the std is only scaled
        mean = x.mean * f(self.std_data) + f(self.mean_data)
        std = x.std * f(self.std_data)
        
        return NormalsParams(mean, std)

In [None]:
norm = MeteoImpNormalize(*get_stats(hai), *get_stats(hai_era,2))

In [None]:
tfms3[0]

In [None]:
norm

MeteoImpNormalize -- {'mean_data': tensor([  8.3339, 120.9578,   3.3807]), 'std_data': tensor([  7.9246, 204.0026,   4.3684]), 'mean_control': tensor([  8.1948, 120.6864,   3.3253,   8.1948, 120.6864,   3.3253]), 'std_control': tensor([  7.5459, 187.1730,   3.6871,   7.5459, 187.1730,   3.6871])}:
encodes: (MeteoImpTensor,object) -> encodes
decodes: (MeteoImpTensor,object) -> decodes
(NormalsParams,object) -> decodes

In [None]:
norm(tfms3[0])

In [None]:
test_close(norm.decode(norm(tfms3[0]))[0], tfms3[0][0], eps=2e-5)

Test that NormalsParams decode actually works

In [None]:
Npars = NormalsParams(torch.tensor(1), torch.tensor(.1))

In [None]:
norm.decode(Npars)

__main__.NormalsParams(mean=tensor([ 16.2585, 324.9604,   7.7491]), std=tensor([ 0.7925, 20.4003,  0.4368]))

In [None]:
tfms4 = TfmdLists([800,801,803], [*tfms3.fs,MeteoImpNormalize(*get_stats(hai),*get_stats(hai_era, 2) ) ])

In [None]:
tfms4[0]

In [None]:
tfms4.decode(tfms4[0])

It is working.

### 6) To Tuple

Fastai likes to work with tuples (in particular for collating)... for now convert to a tuple. Maybe find a way to mimic a tuple in `MeteoImpTensor`

In [None]:
#| export
class ToTuple(Transform):
    "Convert an item to a plain `tuple` on encode and wrap it in `MeteoImpTensor` on decode"
    def encodes(self, x):
        # plain tuple so that fastai can collate the item
        as_tuple = tuple(x)
        return as_tuple
    def decodes(self, x):
        restored = MeteoImpTensor(x)
        return restored

In [None]:
ToTuple()

ToTuple:
encodes: (object,object) -> encodes
decodes: (object,object) -> decodes

In [None]:
tfms5 = TfmdLists([800,801,803], [*tfms4.fs,ToTuple])

In [None]:
tfms5[0];

### Pipeline

In [None]:
#| export
from fastai.data.transforms import *

In [None]:
block_len = 10
control_lags = [1]
control_repeat = len(control_lags)
block_ids = list(range(control_lag, (len(hai) // block_len) - 1))[:10]
gap_len = 2

In [None]:
#| export
def imp_pipeline(df,
                 control,
                 block_len,
                 gap_len,
                 control_lags,
                 var_sel=None
                ):
    """Build the list of transforms of the imputation pipeline and the block ids to feed it.

    Parameters
    - df: dataframe with the observations
    - control: dataframe with the control variables
    - block_len: number of rows in each block
    - gap_len: length of the artificial gap added to each block
    - control_lags: lags of the control variables; the largest one is used as offset
    - var_sel: variables in which the gap is added, defaults to `['TA', 'SW_IN']`

    Returns a tuple `(transforms, block_ids)`.
    """
    var_sel = ['TA','SW_IN'] if var_sel is None else list(var_sel)
    offset = max(control_lags)
    block_ids = list(range(offset, (len(df) // block_len) - 1))
    return [BlockIndexTransform(df.index, block_len=block_len, offset=offset),
            BlockDfTransform(data = df, control = control,  control_lags = control_lags),
            AddGapTransform(var_sel, gap_len),
            MeteoImpDf2Tensor,
            # control stats are tiled once for the un-lagged columns plus once per lag
            MeteoImpNormalize(*get_stats(df), *get_stats(control, 1+len(control_lags))),
            ToTuple
           ], block_ids

In [None]:
pipeline, block_ids = imp_pipeline(hai, hai_era, block_len, gap_len, control_lags)

In [None]:
pipeline

[BlockIndexTransform:
 encodes: (int,object) -> encodes
 decodes: ,
 BlockDfTransform:
 encodes: (DatetimeIndex,object) -> encodes
 decodes: ,
 AddGapTransform -- {'variables': ['TA', 'SW_IN'], 'gap_length': 2}:
 encodes: (DataControl,object) -> encodes
 decodes: ,
 __main__.MeteoImpDf2Tensor,
 MeteoImpNormalize -- {'mean_data': tensor([  8.3339, 120.9578,   3.3807]), 'std_data': tensor([  7.9246, 204.0026,   4.3684]), 'mean_control': tensor([  8.1948, 120.6864,   3.3253,   8.1948, 120.6864,   3.3253]), 'std_control': tensor([  7.5459, 187.1730,   3.6871,   7.5459, 187.1730,   3.6871])}:
 encodes: (MeteoImpTensor,object) -> encodes
 decodes: (MeteoImpTensor,object) -> decodes
 (NormalsParams,object) -> decodes,
 __main__.ToTuple]

In [None]:
pp = Pipeline(pipeline)

In [None]:
pp

Pipeline: BlockIndexTransform -> BlockDfTransform -> AddGapTransform -- {'variables': ['TA', 'SW_IN'], 'gap_length': 2} -> MeteoImpDf2Tensor -> MeteoImpNormalize -- {'mean_data': tensor([  8.3339, 120.9578,   3.3807]), 'std_data': tensor([  7.9246, 204.0026,   4.3684]), 'mean_control': tensor([  8.1948, 120.6864,   3.3253,   8.1948, 120.6864,   3.3253]), 'std_control': tensor([  7.5459, 187.1730,   3.6871,   7.5459, 187.1730,   3.6871])} -> ToTuple

### Dataloader

random splitter for validation/training set

In [None]:
reset_seed()

In [None]:
splits = RandomSplitter()(block_ids)

The pipeline is repeated twice because the same pipeline is used both for the training data and for the labels.

In theory could optimize the label creation and get the data only from the gap and not the control, but for now it works and the overhead is minimal

In [None]:
ds = Datasets(block_ids, [pipeline, pipeline], splits=splits)

In [None]:
dls = ds.dataloaders(bs=1)

In [None]:
dls.device

device(type='cuda', index=0)

In [None]:
dls.one_batch()

((tensor([[[ 0.4752, -0.5929, -0.1684],
           [ 0.5030, -0.5929, -0.1091],
           [ 0.4992, -0.5929, -0.1313],
           [ 0.5106, -0.5929, -0.0899],
           [ 0.5207, -0.5929, -0.0613],
           [ 0.5181, -0.5929, -0.0707],
           [ 0.4841, -0.5929, -0.1480],
           [ 0.4651, -0.5929, -0.2121],
           [ 0.4828, -0.5929, -0.2050],
           [ 0.4853, -0.5929, -0.2108]]], device='cuda:0'),
  tensor([[[ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [False, False,  True],
           [False, False,  True],
           [ True,  True,  True],
           [ True,  True,  True],
           [ True,  True,  True]]], device='cuda:0'),
  tensor([[[ 0.3370, -0.6448, -0.5119,  0.3316, -0.6448, -0.5287],
           [ 0.3423, -0.6448, -0.4951,  0.3370, -0.6448, -0.5119],
           [ 0.3622, -0.6448, -0.4880,  0.3423, -0.6448, -0.4951],
           [ 0.3820

In [None]:
@typedispatch
def show_batch(x: tuple, y, samples, ctxs=None, max_n=6):
    "Stub `show_batch` registered through `typedispatch` for tuple inputs: returns `x` unchanged, nothing is displayed"
    return x

In [None]:
# dls.show_batch()

In [None]:
dls._types

{tuple: [{tuple: [torch.Tensor, torch.Tensor, torch.Tensor]},
  {tuple: [torch.Tensor, torch.Tensor, torch.Tensor]}]}

In [None]:
#| export
def make_dataloader(df, control, block_len, gap_len, control_lags, bs):
    """Create the dataloaders for `df`/`control` with a random train/validation split.

    The same pipeline from `imp_pipeline` is used twice, once for the inputs and once
    for the targets. `bs` is the batch size.
    """
    tfms, ids = imp_pipeline(df, control, block_len, gap_len, control_lags)
    train_valid_split = RandomSplitter()(ids)
    datasets = Datasets(ids, [tfms, tfms], splits=train_valid_split)
    return datasets.dataloaders(bs=bs)
    

In [None]:
dls = make_dataloader(hai, hai_era, 200, 10, [1], 10)

In [None]:
dls.one_batch()[0][0].shape

torch.Size([10, 200, 3])

In [None]:
dls = dls.cpu()

## Model

### Forward Function

In order to use the Kalman filter as a PyTorch module, we need to add a `forward` method to it.

In [None]:
#| export
from meteo_imp.kalman.filter import *
from torch.distributions import MultivariateNormal

In [None]:
#| export
@patch
def _predict_filter(self: KalmanFilter, data, mask, control):
    """Predict every observation using only the filter step (no smoothing)"""
    # the first two outputs of `_filter_all` are the *predicted* state, which is what
    # we want here (not the filtered state); the remaining two outputs are not needed
    state_mean, state_cov, _, _ = self._filter_all(data, mask, control)
    state_dist = ListMNormal(state_mean.squeeze(-1), state_cov)
    obs_mean, obs_cov = self._obs_from_state(state_dist)
    obs_std = cov2std(obs_cov)
    return ListNormal(obs_mean, obs_std)

In [None]:
model = KalmanFilter.init_random(n_dim_obs = hai.shape[-1], n_dim_state = hai.shape[-1], n_dim_contr = hai_era.shape[-1]*control_repeat)

In [None]:
model._predict_filter(*dls.one_batch()[0]);

NameError: name 'contr_matrix' is not defined

In [None]:
#| export
@patch
def forward(self: KalmanFilter, masked_data: MaskedTensor):
    """Run the model on a batch and return the predicted `NormalsParams(mean, std)`.

    `masked_data` unpacks as `(data, mask)` or `(data, mask, control)`; `data` must not
    contain NaN. If the model has a `use_smooth` attribute set to a false value only the
    filter is used (`_predict_filter`), otherwise `predict` is called with `smooth=True`.
    """
    data, mask, *rest = masked_data
    # the pipeline yields (data, mask, control); 2-element inputs are still accepted
    control = rest[0] if rest else None
    assert not data.isnan().any()
    use_smooth = getattr(self, 'use_smooth', True)

    if use_smooth:
        # NOTE(review): assumes `predict` accepts a `control` keyword like `_filter_all`
        # does -- confirm against meteo_imp.kalman.filter. It is only passed when present.
        extra = {} if control is None else {'control': control}
        mean, std = self.predict(obs=data, mask=mask, smooth=True, **extra)
    else:
        # `_predict_filter` requires `control` as third argument
        mean, std = self._predict_filter(data, mask, control)
    # fastai needs a sequence-like output: NormalsParams unpacks as (mean, std)
    return NormalsParams(mean, std)

In [None]:
input = dls.one_batch()[0]
target = dls.one_batch()[1]

In [None]:
model.state_dict()

OrderedDict([('trans_matrix',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('trans_off', tensor([0., 0., 0.])),
             ('trans_cov_raw',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('obs_matrix',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('obs_off', tensor([0., 0., 0.])),
             ('obs_cov_raw',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]])),
             ('init_state_mean', tensor([0., 0., 0.])),
             ('init_state_cov_raw',
              tensor([[1., 0., 0.],
                      [0., 1., 0.],
                      [0., 0., 1.]]))])

In [None]:
data = input[0][0]
data.shape

torch.Size([200, 3])

In [None]:
mask = input[1][0]

In [None]:
mask.shape

torch.Size([200, 3])

In [None]:
data.device

device(type='cpu')

In [None]:
torch.device

torch.device

In [None]:
data.shape, mask.shape

(torch.Size([200, 3]), torch.Size([200, 3]))

In [None]:
model.predict(data.unsqueeze(0), mask.unsqueeze(0));

In [None]:
model.use_smooth = True

In [None]:
pred = model(input)

In [None]:
pred[0].shape

torch.Size([10, 200, 3])

In [None]:
pred[1].shape

torch.Size([10, 200, 3])

In [None]:
model.use_smooth = False

In [None]:
pred_filt = model(input)

In [None]:
pred_filt[1].shape

torch.Size([10, 200, 3])

In [None]:
pred

__main__.NormalsParams(mean=tensor([[[-3.0440e-02, -3.6128e-01, -2.0692e-01],
         [-4.9185e-02, -4.9092e-01, -3.1615e-01],
         [-5.6050e-02, -5.1855e-01, -3.5977e-01],
         ...,
         [ 6.0673e-02, -5.5834e-01, -6.3950e-01],
         [ 5.4511e-02, -5.5101e-01, -6.6336e-01],
         [ 5.1616e-02, -5.4834e-01, -6.8085e-01]],

        [[-1.1211e+00, -3.5555e-01, -4.6935e-01],
         [-1.5444e+00, -4.7373e-01, -6.4904e-01],
         [-1.7021e+00, -4.8982e-01, -7.1740e-01],
         ...,
         [-1.5386e+00,  2.9052e-01, -5.1738e-01],
         [-1.4834e+00,  4.1266e-01, -4.9085e-01],
         [-1.4373e+00,  4.7991e-01, -4.6789e-01]],

        [[-3.1530e-01, -1.8284e-01, -3.5549e-01],
         [-4.0533e-01, -1.2309e-01, -4.5077e-01],
         [-4.0048e-01,  8.7761e-02, -4.4589e-01],
         ...,
         [-7.5815e-01, -9.1102e-02, -3.1101e-01],
         [-7.6516e-01, -1.7842e-01, -3.4100e-01],
         [-7.8852e-01, -1.9300e-01, -3.8816e-01]],

        ...,

        [[

In [None]:
type(pred), type(pred_filt)

(__main__.NormalsParams, __main__.NormalsParams)

In [None]:
test_ne(pred, pred_filt)

### Loss Function

add support for complete loss (also outside gap) and for filter loss (don't run the smoother)

There are two ways to compute the loss: one is to compute it for all predictions, the other is to compute it only for the gap
- only_gap

Play around with flattening + diagonal

In [None]:
a = torch.diag(torch.tensor([1,2,3]))
d = torch.stack([a, a*10])
m = torch.stack([a.diag(), a.diag()*10])
d

tensor([[[ 1,  0,  0],
         [ 0,  2,  0],
         [ 0,  0,  3]],

        [[10,  0,  0],
         [ 0, 20,  0],
         [ 0,  0, 30]]])

In [None]:
m.flatten()

tensor([ 1,  2,  3, 10, 20, 30])

In [None]:
d

tensor([[[ 1,  0,  0],
         [ 0,  2,  0],
         [ 0,  0,  3]],

        [[10,  0,  0],
         [ 0, 20,  0],
         [ 0,  0, 30]]])

In [None]:
torch.diagonal(d, dim1=1, dim2=2).flatten()

tensor([ 1,  2,  3, 10, 20, 30])

In [None]:
means, stds = pred
data, mask = target

In [None]:
# make a big matrix with all variables and observations and compute ll
mask = mask.flatten() 
obs = data.flatten()[mask]
means = data.flatten()[mask]
stds = stds.flatten()[mask] # need to support batches

MultivariateNormal(means, torch.diag(stds)).log_prob(obs)

tensor(-5930.4453, grad_fn=<SubBackward0>)

In [None]:
#| export
class KalmanLoss():
    """Negative log-likelihood of the target data under the predicted Normal distributions.

    The predictions are treated as independent Normals (one `mean`/`std` per entry), which
    is the same as a multivariate Normal with diagonal covariance `std**2`.
    The loss is computed separately for each item of the batch and then reduced.
    """
    def __init__(self,
                 only_gap:bool=True, # loss for all predictions or only gap
                 reduction:str='sum' # one of ['sum', 'mean', 'none']
                ):
        store_attr()
    
    def __call__(self, pred: NormalsParams, target: MaskedTensor):
        # only the first two elements are needed, so targets that also carry the
        # control variables (data, mask, control) are accepted as well
        data, mask = target[0], target[1]
        means, stds = pred        
        assert not stds.isnan().any()
        losses = torch.empty(data.shape[0], device=data.device, dtype=data.dtype)
        for i, (d, m, mean, std) in enumerate(zip(data, mask, means, stds)):
            losses[i] = self._loss_batch(d,m,mean, std)
        if self.reduction == 'none': return losses
        elif self.reduction == 'mean': return losses.mean()
        elif self.reduction == 'sum': return losses.sum()
        # an unknown reduction used to fall through and silently return None
        raise ValueError(f"reduction must be one of 'sum', 'mean', 'none', got {self.reduction!r}")
    
    def _loss_batch(self, data, mask, mean, std):
        """Negative log-likelihood of one item, over the gap only or over all entries"""
        # the mask is False where the gap is (see the AddGapTransform outputs above),
        # so the gap is selected by the *negated* mask
        flat_mask = mask.flatten().bool()
        sel = ~flat_mask if self.only_gap else torch.ones_like(flat_mask)
        obs = data.flatten()[sel]
        mean = mean.flatten()[sel]
        std = std.flatten()[sel]

        # `std` is a standard deviation: passing diag(std) as covariance matrix would
        # treat it as a variance. Independent Normals give the diagonal-covariance
        # log-likelihood directly, without building a dense (n x n) matrix
        return - torch.distributions.Normal(mean, std).log_prob(obs).sum()
        

In [None]:
pred = model(input)

In [None]:
data, mask = input

In [None]:
data.shape, mask.shape

(torch.Size([10, 200, 3]), torch.Size([10, 200, 3]))

In [None]:
pred.mean.shape

torch.Size([10, 200, 3])

In [None]:
means, stds = pred

In [None]:
stds.shape

torch.Size([10, 200, 3])

In [None]:
means.shape

torch.Size([10, 200, 3])

In [None]:
data.isnan().any()

tensor(False)

In [None]:
mask.isnan().any()

tensor(False)

In [None]:
means.isnan().any()

tensor(False)

In [None]:
stds.isnan().sum()

tensor(0)

In [None]:
stds.shape

torch.Size([10, 200, 3])

In [None]:
is_posdef_eigv(torch.diag(stds.flatten()))

(tensor(True),
 tensor([1.4142, 1.4142, 1.4142,  ..., 3.5522, 3.5522, 3.5522],
        grad_fn=<LinalgEighBackward0>))

In [None]:
KalmanLoss(only_gap=True)(pred, target)

tensor(10409.3438, grad_fn=<SumBackward0>)

In [None]:
KalmanLoss(only_gap=False)(pred, target)

tensor(10763.8877, grad_fn=<SumBackward0>)

In [None]:
pred.mean.device, target[0].device

(device(type='cpu'), device(type='cpu'))

In [None]:
pred.mean.shape, target[0].shape

(torch.Size([10, 200, 3]), torch.Size([10, 200, 3]))

In [None]:
target[0].shape

torch.Size([10, 200, 3])

In [None]:
KalmanLoss(only_gap=False, reduction='mean')(pred, target)

tensor(1076.3888, grad_fn=<MeanBackward0>)

### Metrics

Wrapper around fastai metrics to support masked tensors and normal distributions

In [None]:
#| export
def to_msk_metric(metric, name):
    """Wrap `metric` so that it is applied to the first element of prediction and target.

    The returned function is called as `f(imp, targ)` and evaluates
    `metric(imp[0], targ[0])`; its `__name__` is set to `name`.
    """
    def msk_metric(imp, targ):
        # first element are the means (prediction) / the data (target)
        pred_mean, targ_data = imp[0], targ[0]
        return metric(pred_mean, targ_data)
    setattr(msk_metric, '__name__', name)
    return msk_metric

In [None]:
#| export
from fastai.metrics import *

In [None]:
#| export
msk_rmse = to_msk_metric(rmse, 'rmse')

In [None]:
msk_rmse.__name__

'rmse'

In [None]:
msk_rmse(pred, target)

TensorBase(1.4308)

In [None]:
#| export
msk_r2 = to_msk_metric(R2Score(), 'r2')

In [None]:
msk_r2(pred, target)

-1.222838662188531

### Callback

save the model state 

In [None]:
#| export
from fastai.callback.all import *

In [None]:
#| export
class SaveParams(Callback):
    "Callback that stores the value of the model attribute `param_name` after every batch in `self.params`"
    def __init__(self, param_name):
        super().__init__()
        self.params = []
        self.param_name = param_name
    def after_batch(self):
        # `detach()` alone shares storage with the live tensor: clone it, otherwise
        # in-place optimizer updates would overwrite every value saved so far
        param = getattr(self.model, self.param_name).detach().clone()
        self.params.append(param)

In [None]:
#| export
# NOTE(review): this cell duplicates the `SaveParams` cell directly above (and is exported
# a second time); one of the two should be removed
class SaveParams(Callback):
    "Callback that stores the value of the model attribute `param_name` after every batch in `self.params`"
    def __init__(self, param_name):
        super().__init__()
        self.params = []
        self.param_name = param_name
    def after_batch(self):
        # `detach()` alone shares storage with the live tensor: clone it, otherwise
        # in-place optimizer updates would overwrite every value saved so far
        param = getattr(self.model, self.param_name).detach().clone()
        self.params.append(param)

In [None]:
debug_preds = []

In [None]:
class DebugPredCallback(Callback):
    "Debug helper: after each validation append the gathered predictions to the global `debug_preds`"
    order = 0
    def after_validate(self):
        # nothing to collect when no `gather_preds` attribute is reachable
        if not hasattr(self, 'gather_preds'): return
        gathered = self.gather_preds.preds
        debug_preds.append(gathered)

### Learner

In [None]:
#| export
from fastai.learner import * 

from fastai.tabular.all import *

from fastai.tabular.learner import *

from fastai.callback.progress import ShowGraphCallback

In [None]:
obs_cov_history = SaveParams('obs_cov')

In [None]:
all_data = CollectDataCallback()

In [None]:
model = KalmanFilter.init_random(n_dim_obs = hai.shape[1], n_dim_state = hai.shape[1]).cuda()

In [None]:
model.use_smooth = False

In [None]:
# model._set_constraint('obs_cov', model.obs_cov, train=False)

In [None]:
pipeline, block_ids = imp_pipeline(hai[:20000], block_len, gap_len)
    
splits = RandomSplitter()(block_ids)
ds = Datasets(block_ids, [pipeline, pipeline], splits=splits)

In [None]:
dls = ds.dataloaders(bs=10, device='cuda')

In [None]:
dls.one_batch()[0][0].device

device(type='cuda', index=0)

In [None]:
input, target = dls.one_batch()

In [None]:
pred = model(input)
KalmanLoss()(pred, target)

tensor(2345.7764, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
learn = Learner(dls, model, loss_func=KalmanLoss(only_gap=False), cbs = [DebugPredCallback] , metrics = [msk_rmse, msk_r2])

In [None]:
learn.fit(1, 1e-3)

epoch,train_loss,valid_loss,rmse,r2,time
0,771.520874,618.335266,1.989478,-4.994122,00:08


#### Float64

In [None]:
model64 = KalmanFilter.init_random(hai.shape[1], hai.shape[1], dtype=torch.float64).cuda()

In [None]:
#| export
class Float64Callback(Callback):
    "Reset the Recorder's smoothed-loss value to a float64 zero before fitting"
    order = Recorder.order + 10 # run after Recorder 
    def before_fit(self):
        # default is a float 32: replace it with a 0-dim float64 zero
        zero64 = torch.zeros((), dtype=torch.float64)
        self.recorder.smooth_loss.val = zero64

In [None]:
dls64 = make_dataloader(hai64, 10, 2, bs=10)

In [None]:
input64 = dls64.one_batch()[0]
target64 = dls64.one_batch()[1]

In [None]:
data64, mask64 = input64

In [None]:
data64.device, data64.dtype

(device(type='cuda', index=0), torch.float64)

In [None]:
model64.predict(data64);

In [None]:
pred = model64(input)

In [None]:
KalmanLoss()(pred, target)

tensor(424.9265, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
model64.use_smooth = False

In [None]:
learn64 = Learner(dls64, model64, loss_func=KalmanLoss(), cbs = [Float64Callback] )

In [None]:
learn64.fit(1, 1e-3)

epoch,train_loss,valid_loss,time
0,88.283538,91.264939,01:29


### Predictions

The transformation pipeline is not working properly: there is a problem in `decode_batch`, as the `_types` are more nested than the predictions, which results in an error. In addition, the pipeline is not reproducible and the test dataloaders seem not to be deterministic.
So everything is reimplemented almost from scratch.

see https://github.com/mone27/meteo_imp/blob/0335003405ec9bd3e3bd2641bc6d7924f34a0788/lib_nbs/kalman/10_fastai.ipynb for all details

In [None]:
#| export
class NormalsDf:
    """Pair of DataFrames holding the parameters of Normal distributions (`mean` and `std`)"""
    def __init__(self, mean, std): store_attr()
    def tidy(self, prefix=""):
        """Long ("tidy") version: one row per time and variable.

        The value columns are named `prefix + "mean"` and `prefix + "std"`.
        """
        id_col, keys = "time", ["time", "variable"]
        long_mean = self.mean.reset_index().melt(id_col, value_name=prefix + "mean")
        long_std = self.std.reset_index().melt(id_col, value_name=prefix + "std")
        return pd.merge(long_mean, long_std, on=keys)
    __repr__ = basic_repr("mean, std")

In [None]:
pipe0, pipe1 = tfms4.fs[0,1], tfms4.fs[2,3] 

In [None]:
pipe0, pipe1

((#2) [BlockDfTransform:
encodes: (int,object) -> encodes
decodes: ,AddGapTransform -- {'variables': ['TA', 'SW_IN'], 'gap_length': 2}:
encodes: (DataFrame,object) -> encodes
decodes: ],
 (#2) [MaskedDf2Tensor:
encodes: (MaskedDf,object) -> encodes
decodes: (MaskedTensor,object) -> decodes
,NormalizeMasked -- {'mean': tensor([  8.3339, 120.9578,   3.3807]), 'std': tensor([  7.9246, 204.0026,   4.3684])}:
encodes: (MaskedTensor,object) -> encodes
decodes: (MaskedTensor,object) -> decodes
(NormalsParams,object) -> decodes
])

In [None]:
#| export
def preds2df(preds, targs):
    """Final step to decode preds: turn each prediction into a `NormalsDf`.

    For every `(pred, targ)` pair, `pred[0]` (means) and `pred[1]` (stds) are converted
    to DataFrames that reuse the columns and index of `targ.data`.
    Returns a list with one `NormalsDf` per prediction.
    """
    def _frame_like(tensor, targ):
        # drop the leading dimension and move to numpy before building the frame
        values = tensor.squeeze(0).detach().cpu().numpy()
        return pd.DataFrame(values, columns = targ.data.columns, index=targ.data.index)

    return [NormalsDf(_frame_like(pred[0], targ), _frame_like(pred[1], targ))
            for pred, targ in zip(preds, targs)]

In [None]:
#| export
def predict_items(items, learn, pipe0, pipe1):
    """Predict each of `items` separately with `learn.model`.

    `pipe0` turns an item into the target, `pipe1` turns the target into the
    `(data, mask)` tensors fed to the model and is also used to decode (denormalize)
    the prediction.

    Returns a tuple `(preds, targs, losses)` where `preds` is the output of `preds2df`.
    """
    pipe0, pipe1 = Pipeline(pipe0), Pipeline(pipe1)
    # run on the device where the model lives instead of assuming CUDA;
    # fall back to fastai's default device if the model has no parameters
    first_param = next(iter(learn.model.parameters()), None)
    device = default_device() if first_param is None else first_param.device
    preds, targs, losses = [], [], []
    for item in items:
        targ = pipe0(item)
        data, mask = pipe1(targ)
        # add a batch dimension of size 1
        batch = MaskedTensor(data.to(device).unsqueeze(0), mask.to(device).unsqueeze(0))
        pred = learn.model(batch)
        loss = learn.loss_func(pred, batch)
        # denormalize
        pred = pipe1.decode(pred)
        preds.append(pred)
        targs.append(targ)
        losses.append(loss)
        
    return preds2df(preds, targs), targs, losses
        

In [None]:
preds, targs, losses = predict_items([0,1,3], learn, pipe0, pipe1)

this is the same data!!

In [None]:
predict_items([0], learn, pipe0, pipe1)[1][0].data == predict_items([0], learn, pipe0, pipe1)[1][0].data

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:30:00,True,True,True
2000-01-01 01:00:00,True,True,True
2000-01-01 01:30:00,True,True,True
2000-01-01 02:00:00,True,True,True
2000-01-01 02:30:00,True,True,True
2000-01-01 03:00:00,True,True,True
2000-01-01 03:30:00,True,True,True
2000-01-01 04:00:00,True,True,True
2000-01-01 04:30:00,True,True,True
2000-01-01 05:00:00,True,True,True


#### Plot results

In [None]:
#| export
def plot_result(pred, targ, loss, **kwargs):
    """Plot one prediction against its target, with the loss in the title.

    The tidy versions of `targ` and `pred` are merged on time and variable and passed
    to `facet_variable` (extra `kwargs` are forwarded to it).
    """
    tidy_targ = targ.tidy()
    tidy_pred = pred.tidy()
    merged = pd.merge(tidy_targ, tidy_pred, on=["time", "variable"])
    chart = facet_variable(merged, ys=["value", "mean"], error=True, **kwargs)
    return chart.properties(title=f"loss: {loss.item():.6f}")

In [None]:
y = "mean"

In [None]:
plot_result(preds[0], targs[0], torch.tensor(1))

In [None]:
#| export
def plot_results(preds, targs, losses, **kwargs):
    """Plot every (prediction, target, loss) triple in a single column and put the plots side by side"""
    plots = []
    for pred, targ, loss in zip(preds, targs, losses):
        plots.append(plot_result(pred, targ, loss, n_cols=1, **kwargs))
    return alt.hconcat(*plots)

In [None]:
plot_results(preds, targs, losses)

#### Show Results

In [None]:
random.choices(learn.dls.items, k=3)

[402, 269, 1864]

In [None]:
#|export
def get_results(learn, n=3, items=None, dls=None):
    """Predictions, targets and losses for `items`, or for `n` random items of `dls`.

    - n: number of items drawn at random (with replacement) when `items` is None
    - items: explicit items to predict; when given `n` is not used
    - dls: dataloaders providing items and transforms, defaults to `learn.dls`
    """
    dls = ifnone(dls, learn.dls)
    # draw the random items only when needed, and honour `n` (it used to be ignored)
    if items is None: items = random.choices(dls.items, k=n)
    # first two transforms build the target, the next two the model input
    pipe0, pipe1 = dls.fs[0][0,1], dls.fs[0][2,3]
    return predict_items(items, learn, pipe0, pipe1)

In [None]:
#| export
def show_results(learn, n=3, items=None, **kwargs):
    """Get the results with `get_results` and plot them; `kwargs` are forwarded to `plot_results`"""
    preds, targs, losses = get_results(learn,n,items)
    return plot_results(preds, targs, losses, **kwargs)
    

In [None]:
learn.model.use_smooth = False

In [None]:
show_results(learn)

In [None]:
show_results(learn, items=[1,2,3])

In [None]:
display_as_row(learn.model.get_info())

latent,z_0,z_1,z_2
z_0,0.762,0.883,0.2147
z_1,0.7923,0.1411,0.3346
z_2,0.1847,0.8327,0.1961

latent,z_0,z_1,z_2
z_0,0.4607,0.8907,0.727
z_1,0.8907,1.8019,1.3905
z_2,0.727,1.3905,1.6383

latent,offset
z_0,0.5652
z_1,0.1393
z_2,0.8201

variable,z_0,z_1,z_2
x_0,0.1849,0.3723,-0.0151
x_1,0.2541,0.4939,1.0004
x_2,0.1905,0.9132,0.9354

variable,x_0,x_1,x_2
x_0,1.2213,1.1493,1.0629
x_1,1.1493,1.1359,1.0939
x_2,1.0629,1.0939,1.2183

variable,offset
x_0,0.7413
x_1,0.5732
x_2,0.548

latent,mean
z_0,0.0672
z_1,0.6931
z_2,0.9265

latent,z_0,z_1,z_2
z_0,1.1024,0.9936,0.6432
z_1,0.9936,1.2147,0.5984
z_2,0.6432,0.5984,1.3727


#### Interactive

In [None]:
#| export
from ipywidgets import IntSlider, interact_manual, Text

In [None]:
#| export
def results_custom_gap(learn, df, var_sel, gap_len, items_idx, block_len):
    """Results of `learn` on the blocks `items_idx` of `df`, using a custom gap.

    A dedicated pipeline is built with blocks of `block_len` rows and a gap of
    `gap_len` in the variables `var_sel`; the batch size equals the number of items.
    """
    tfms = [
        BlockDfTransform(df, block_len),
        AddGapTransform(var_sel, gap_len),
        MaskedDf2Tensor,
        NormalizeMasked(*get_stats(df)),
    ]
    datasets = Datasets(items_idx, [tfms, tfms])
    dls = datasets.dataloaders(bs=len(items_idx))
    return get_results(learn, items=items_idx, dls=dls)

In [None]:
plot_results(*results_custom_gap(learn64, hai64, ['TA'], 10, [800, 801], 200))

In [None]:
#| export
def interact_results(learn, df):
    """Interactive widget to plot the results of `learn` on `df` with a custom gap.

    The widget exposes the gap length, the block length, a comma separated list of
    block indices and one checkbox for each column of `df` (gap added when checked).
    """
    interact_args = {
        'gap_len': IntSlider(10, 1, 100),
        # the indices are parsed by splitting on commas, so say so in the placeholder
        'items_idx': Text(value='10, 100', placeholder="comma separated indices"),
        'block_len': IntSlider(200, 10, 1000, 10),
        **{var_name: True for var_name in df.columns}
    }
    
    def _inner(gap_len, items_idx, block_len, **var_names):
        var_sel = [var_name for var_name, var_use in var_names.items() if var_use]
        # skip blank tokens so that a trailing comma does not crash `int('')`
        items_idx = [int(token) for token in items_idx.split(",") if token.strip()]
        return plot_results(*results_custom_gap(learn=learn, df=df, var_sel=var_sel, gap_len=gap_len, items_idx=items_idx, block_len=block_len))
    return interact_manual(_inner, **interact_args)

In [None]:
interact_results(learn64, hai64)

interactive(children=(IntSlider(value=10, description='gap_len', min=1), Text(value='10, 100', description='it…

<function __main__.interact_results.<locals>._inner(gap_len, items_idx, block_len, **var_names)>

## Export 

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()