# Imputation Kalman Model
> Imputation using Kalman Models

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
#| default_exp kalman.imputation

In [None]:
#| export
import pandas as pd
from meteo_imp.kalman.model import KalmanModel, LocalLevelModel
from meteo_imp.results import ImputationResult
from meteo_imp.utils import *
from fastcore.basics import store_attr, patch
from numpy.ma import masked_array
import numpy as np
from sklearn.preprocessing import StandardScaler
import sklearn

## Standard Scaler

Make a standard scaler that can also inverse-transform standard deviations. See `Standardizer` for details of the implementation.

In [None]:
reset_seed()
xx = np.random.random((4, 10))

In [None]:
s = StandardScaler().fit(xx)

In [None]:
s.transform(xx)

array([[ 0.07263978,  0.63279488, -0.9975139 ,  0.50899177,  0.15537652,
         1.45555506,  1.56629646, -1.60237369,  1.51674974,  1.29584745],
       [ 1.58579521,  0.83086419, -0.68281902,  0.51578245, -0.62395756,
        -1.19720248, -0.43000476,  1.1539719 , -0.74724819, -0.85525414],
       [-1.05809926, -1.69049694,  0.0895118 , -1.72684476, -1.08418417,
         0.32617669, -1.16657374,  0.2345773 ,  0.26525847,  0.64349108],
       [-0.60033573,  0.22683787,  1.59082112,  0.70207053,  1.55276521,
        -0.58452927,  0.03028204,  0.21382449, -1.03476002, -1.08408439]])

In [None]:
s.mean_

array([0.40358703, 0.6758362 , 0.77934606, 0.70748673, 0.34417949,
       0.62067044, 0.48500116, 0.54921643, 0.34604713, 0.3660338 ])

In [None]:
s.scale_

array([0.30471427, 0.21926148, 0.04405831, 0.31536161, 0.25229864,
       0.24649441, 0.26061043, 0.21187396, 0.26093989, 0.22927816])

In [None]:
#| export
@patch
def inverse_transform_std(self: sklearn.preprocessing.StandardScaler, 
                         x_std # standard deviations
                        ):
    """Scale standard deviations from the standardized space back to the original one.

    Only the per-feature scale is applied; the mean is not added back.
    """
    # `scale_` is None when the scaler was created with `with_std=False`:
    # no scaling was applied, so the standard deviations are returned unchanged
    if self.scale_ is None:
        return x_std
    return x_std * self.scale_

## Imputation

In [None]:
# three 3x3 diagonal matrices (1*I, 2*I, 3*I) stacked along a new first axis
x = np.stack([np.eye(3)*i for i in  range(1,4)])

In [None]:
x

array([[[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[2., 0., 0.],
        [0., 2., 0.],
        [0., 0., 2.]],

       [[3., 0., 0.],
        [0., 3., 0.],
        [0., 0., 3.]]])

In [None]:
# take the diagonal of each stacked matrix (axes 1 and 2): one row per matrix
np.diagonal(x, axis1=1, axis2=2)

array([[1., 1., 1.],
       [2., 2., 2.],
       [3., 3., 3.]])

In [None]:
#| export
class KalmanImputation:
    """Imputation using a kalman model

    The data is standardized with a `StandardScaler` before being passed to the
    model; `impute` transforms the predicted means and standard deviations back
    to the original scale.
    """
    def __init__(self, data: pd.DataFrame,
                 model: KalmanModel, # model class (not an instance), it is called as `model(train_data)`
                 # model_args: dict = {}, # Optional args for model
                 pred_all: bool = False, # If the dataset should be replaced by the model predictions
                                         # NOTE(review): not used by the constructor, pass `pred_all` to `impute`
                ):
        self.data = data
        # True for the rows where no variable is missing
        self.train_idx = ~self.data.isna().any(axis=1)
        # uses numpy masks for pykalman
        train_data = data.to_numpy()
        self.scaler = StandardScaler().fit(train_data)
        train_data = self.scaler.transform(train_data)
        self.train_data = masked_array(train_data, mask=data.isna())

        # integer time steps, one for each row of `data`
        self.T = np.arange(self.data.shape[0])
        self.model = model(self.train_data)

    def fit(self, **kwargs) -> 'KalmanImputation':
        """Fit model parameters, `kwargs` are forwarded to the model `fit`. Returns `self`"""
        self.model.fit(**kwargs)
        return self

    def impute(self,
               pred_all = False, # If the dataset should be replaced by the model predictions
                                # or only the gaps imputed using the model
              ):
        """Impute data in tidy format using model

        Returns a DataFrame with one row for each time step and variable and
        the columns `time`, `variable`, `mean` and `std`.
        """
        # predict either on the whole dataset or only on the rows with gaps
        if pred_all:
            time_mask = self.T
            data_mask = np.ones_like(self.train_idx, dtype=bool)
        else:
            time_mask = self.T[~self.train_idx]
            data_mask = ~self.train_idx.to_numpy()

        pred = self.model.predict(time_mask)

        imp_mean = self.data.copy()
        mean = self.scaler.inverse_transform(pred.mean)
        # NOTE(review): the whole row is replaced, so values that were observed in a
        # row with at least one gap are overwritten by the prediction - confirm this is intended
        imp_mean.iloc[data_mask, :] = mean
        imp_mean = imp_mean.assign(time=self.T).melt('time', value_name = 'mean')

        # for observations std is 0
        imp_std = pd.DataFrame(np.zeros_like(self.data), columns=self.data.columns)
        # get the diagonal of the covariance matrices (the variance) and transform to std
        # the sqrt is taken after extracting the diagonal: off-diagonal covariances
        # can be negative and their sqrt would only produce NaNs and warnings
        std = np.sqrt(np.diagonal(pred.cov, axis1=1, axis2=2))
        std = self.scaler.inverse_transform_std(std)
        imp_std.iloc[data_mask, :] = std
        imp_std = imp_std.assign(time=self.T).melt('time',value_name = 'std')

        return pd.merge(imp_mean, imp_std, on=['time', 'variable'])

In [None]:
from meteo_imp.data_preparation import MeteoDataTest

In [None]:
reset_seed(1)
data = MeteoDataTest.generate_gpfa(2, 5).add_random_missing()

In [None]:
data.data

Unnamed: 0,x0,x1
0,0.023263,
1,0.219627,0.268028
2,-0.039892,0.063075
3,,
4,-0.64549,-0.144866


In [None]:
k_imp = KalmanImputation(data.data, LocalLevelModel)

In [None]:
k_imp.fit()

<__main__.KalmanImputation>

In [None]:
k_imp.impute()

Unnamed: 0,time,variable,mean,std
0,0,x0,-0.117147,0.394932
1,1,x0,0.219627,0.0
2,2,x0,-0.039892,0.0
3,3,x0,-0.170874,0.395551
4,4,x0,-0.64549,0.0
5,0,x1,0.062976,0.206893
6,1,x1,0.268028,0.0
7,2,x1,0.063075,0.0
8,3,x1,0.062976,0.206893
9,4,x1,-0.144866,0.0


In [None]:
k_imp.impute(pred_all=True)

Unnamed: 0,time,variable,mean,std
0,0,x0,-0.117147,0.394932
1,1,x0,-0.118482,0.383
2,2,x0,-0.100529,0.384614
3,3,x0,-0.170874,0.395551
4,4,x0,-0.241218,0.389215
5,0,x1,0.062976,0.206893
6,1,x1,0.062976,0.206893
7,2,x1,0.062976,0.206893
8,3,x1,0.062976,0.206893
9,4,x1,0.062976,0.206893


#### Result

In [None]:
#| export
@patch
def to_result(self: KalmanImputation, data_compl, var_names=None, units=None, pred_all=False):
    """Run `impute` and wrap its output, `data_compl`, the model info and `units` in an `ImputationResult`"""
    imputed = self.impute(pred_all)
    model_info = self.model.get_info(var_names)
    return ImputationResult(imputed, data_compl, model_info, units)

In [None]:
X = np.hstack([np.arange(0,3.), np.arange(3., 0, -1)]).reshape(6, 1)

In [None]:
res = k_imp.to_result(data.data_compl_tidy)

In [None]:
res.display_results()

  for col_name, dtype in df.dtypes.iteritems():




variable,r2
x0,0.63
x1,0.6336

variable,rmse
x0,0.1959
x1,0.0975

variable,r2
x0,
x1,-0.182

variable,rmse
x0,0.415
x1,0.1541


z0,z1
1.0,0.0
0.0,1.0

z0,z1
1.0,0.0
0.0,1.0

0,1
1.1428,1.1225
1.1225,1.2565

0,1
0.2046,-0.0
-0.0,-0.0


## Export 

In [2]:
#| hide
from nbdev import nbdev_export
nbdev_export()