the goal of this notebook is to be able to:

- take a timeseries that contains gaps
- train a GPFA Learner using the available data
- impute the gap, by using the predictions of the learner
- to improve the imputation by conditioning the distribution on the other observation at the same time stamp

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
#| default_exp imputation

In [None]:
#| export
from gpfa_imputation.learner import *
from gpfa_imputation.data_preparation import *

import torch

import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error, r2_score

from fastcore.foundation import patch, patch_to
from fastcore.meta import delegates
from fastcore.basics import store_attr, listify
from fastcore.test import test_close
from itertools import zip_longest

import matplotlib.pyplot as plt
import altair as alt
from altair import datum

from functools import lru_cache

In [None]:
torch.manual_seed(277); # set seed for reproducibility of examples

## GPFA Imputation

This is the core class that does the imputation using a GPFA

The inputs is:

- a dataframe containing the observed data, where the row with missing data have been removed
- a vector of times where the data is missing

it returns:

- a complete dataframe with the prediction of the model

The goal is that `GPFAImputation` takes as imput a dataframe containing missing values and then it imputes them using GPFALearner.
Therefore it needs to divide the dataframe in 3 sections:

- training data (rows with no NAs)
- times to be imputed (rows with some NAs)
- observations (variables in the pred rows that are not missing) for conditional predictions

In [None]:
t_df = pd.DataFrame([
    [1., 3., 4.],
    [2., 6., np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 8., np.nan],
    [3., 4., 5.]
]
)

In [None]:
train_idx = ~t_df.isna().any(1)

  train_idx = ~t_df.isna().any(1)


In [None]:
t_df[train_idx]

Unnamed: 0,0,1,2
0,1.0,3.0,4.0
4,3.0,4.0,5.0


In [None]:
pred_data = t_df[~train_idx].to_numpy() 

In [None]:
pred_data

array([[ 2.,  6., nan],
       [nan, nan, nan],
       [nan,  8., nan]])

In [None]:
idx_cond = ~t_df[~train_idx].isna()

In [None]:
idx_cond

Unnamed: 0,0,1,2
1,True,True,False
2,False,False,False
3,False,True,False


In [None]:
idx_cond = idx_cond.to_numpy().flatten() # learner needs a 1D index

pred_data.flatten()[idx_cond]

train_idx = t_df.isna().any(axis=1)

torch.tensor(~t_df[~train_idx].isna().to_numpy().flatten())

tensor([True, True, True, True, True, True])

Implement this into a function

In [None]:
#| exports
class GPFAImputation:
    def __init__(
        self,
        data: pd.DataFrame , #observed data with missing data as NA
        latent_dims = 1,
        cuda = False, # Use GPU?
        units = None # Dict of unit for each column. Used for plotting
    ):
        self.data = data.copy()
        self.units=units
        self.latent_dims = latent_dims
        
        
        device = 'cuda' if cuda else 'cpu'
        
        self.T = torch.arange(0, len(data), dtype=torch.float32, device=device) # time is encoded with a increase of 1
        
        # Training data
        self.train_idx = ~self.data.isna().any(axis=1)
        self.train_data = torch.tensor(self.data[self.train_idx].to_numpy().astype(np.float32), device=device)
        self.train_T = self.T[self.train_idx]
        
        self.learner = GPFALearner(X = self.train_data, T = self.train_T, latent_dims=latent_dims)
        

        # Prediction data
        self.pred_T = self.T[~self.train_idx]
        self.cond_idx = torch.tensor(~self.data[~self.train_idx].isna().to_numpy().flatten(), device=device) # conditional obsevations
        self.cond_obs = torch.tensor(self.data[~self.train_idx].to_numpy().astype(np.float32).flatten()[self.cond_idx.cpu()], device=device)
        
        if cuda: self.learner.cuda()
        
    def fit(self):
        "Fit learner to training data"
        self.learner.train()
        return self

    def impute(self,
               add_time = True, # add column with time?
               tidy = True, # tidy data?
               ):
        
        self.pred = self.learner.predict(self.pred_T, obs = self.cond_obs, idx = self.cond_idx)
        if not hasattr(self, "pred"):
            self.fit()

        
        if tidy: return self._impute_tidy(add_time)
        else: return self._impute_wide(add_time)
        
        
    def _impute_wide(self, add_time):
        """ Impute in wide format"""
        
        imp_data = self.data.copy()
        for col_idx, col_name in enumerate(imp_data.columns):
            imp_data.loc[~self.train_idx, col_name] = self.pred.mean[:, col_idx].cpu().numpy()
            imp_data.loc[~self.train_idx, col_name + "_std"] = self.pred.std[:, col_idx].cpu().numpy()
        
        if add_time:
            imp_data["time"] = self.T.cpu()
        
        return imp_data 
    
    def _impute_tidy(self, add_time):
        """ transform the pred output into a tidy dataframe suitable for plotting"""
        feature_names = self.data.columns

        pred_mean = pd.DataFrame(self.pred.mean.cpu(), columns = feature_names).assign(time = self.pred_T.cpu()).melt("time", value_name="mean")
        pred_std = pd.DataFrame(self.pred.std.cpu(), columns = feature_names).assign(time = self.pred_T.cpu()).melt("time", value_name="std")
        
        pred = pd.merge(pred_mean, pred_std, on=['time', 'variable'])  
        
        train_data = self.data[self.train_idx].assign(time = self.train_T.cpu()).melt("time", value_name = "mean")
               
        imp_data = pd.concat((train_data, pred))
        
        self.pred_wide = imp_data
        
        return imp_data

In [None]:
fd = GPFADataTest.generate(2, 10, Lambda=[1,2.]).add_random_missing()

In [None]:
fd.data

Unnamed: 0,x0,x1
0,-0.024085,0.089268
1,-0.133942,0.258532
2,-0.60465,-0.603501
3,,
4,-0.478994,-1.161096
5,,
6,-0.594717,-1.284512
7,,1.251743
8,,-2.001107
9,,


In [None]:
imp = GPFAImputation(fd.data)

In [None]:
imp

<__main__.GPFAImputation>

Tidy

In [None]:
imp.impute()

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.024085,
1,1.0,x0,-0.133942,
2,2.0,x0,-0.60465,
3,4.0,x0,-0.478994,
4,6.0,x0,-0.594717,
5,0.0,x1,0.089268,
6,1.0,x1,0.258532,
7,2.0,x1,-0.603501,
8,4.0,x1,-1.161096,
9,6.0,x1,-1.284512,


wide

In [None]:
imp.impute(tidy=False)

Unnamed: 0,x0,x1,x0_std,x1_std,time
0,-0.024085,0.089268,,,0.0
1,-0.133942,0.258532,,,1.0
2,-0.60465,-0.603501,,,2.0
3,-0.416516,-0.562824,0.386124,0.83395,3.0
4,-0.478994,-1.161096,,,4.0
5,-0.419649,-0.564259,0.386142,0.833952,5.0
6,-0.594717,-1.284512,,,6.0
7,-0.358267,1.251743,0.387923,0.0,7.0
8,-0.397247,-2.001107,0.390719,0.0,8.0
9,-0.38412,-0.547979,0.391765,0.834502,9.0


### GPU 

check that the GPU support is working

In [None]:
imp_gpu = GPFAImputation(fd.data, cuda=True)

In [None]:
imp_gpu.impute()

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.024085,
1,1.0,x0,-0.133942,
2,2.0,x0,-0.60465,
3,4.0,x0,-0.478994,
4,6.0,x0,-0.594717,
5,0.0,x1,0.089268,
6,1.0,x1,0.258532,
7,2.0,x1,-0.603501,
8,4.0,x1,-1.161096,
9,6.0,x1,-1.284512,


the gpu and cpu version return similar results!

In [None]:
imp.impute()[["mean", "std"]].to_numpy() - imp_gpu.impute()[["mean", "std"]].to_numpy()

array([[ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [-4.6333283e-02,  6.7106664e-02],
       [-4.6722233e-02,  6.7123741e-02],
       [-2.6827753e-03,  6.9049209e-02],
       [-2.0056546e-02,  7.1836144e-02],
       [-1.1047989e-02,  7.2726548e-02],
       [ 7.1919739e-02, -1.6455758e-01],
       [ 1.5970773e-01, -1.6455603e-01],
       [ 0.0000000e+00,  0.0000000e+00],
       [ 2.3841858e-07,  0.0000000e+00],
       [ 1.8069166e-01, -1.7099941e-01]], dtype=float32)

### Repr

add `__repr__` and `__str__` to imputation objects

In [None]:
#| exports
@patch
def __repr__(self: GPFAImputation):
    return f"""GPFA Imputation:
    N obs: {self.data.shape[0]}
    N features {self.data.shape[1]} ({', '.join(self.data.columns)})
    N missing observations {(~self.cond_idx).sum()}
    N latent: {self.learner.latent_dims}"""

@patch
def __str__(self: GPFAImputation):
    return self.__repr__()

In [None]:
imp

GPFA Imputation:
    N obs: 10
    N features 2 (x0, x1)
    N missing observations 8
    N latent: 1

In [None]:
str(imp)

'GPFA Imputation:\n    N obs: 10\n    N features 2 (x0, x1)\n    N missing observations 8\n    N latent: 1'

## GPFA Imputation Explorer

This is a class that is used for exploring the results for a GPFAImputation, the main difference is that it always return the model predictions and not only the training data

In [None]:
#| exports
class GPFAImputationExplorer:
    def __init__(
        self,
        data: pd.DataFrame , #observed data with missing data as NA
        latent_dims = 1,
        cuda = False, # Use GPU?
    ):
        self.data = data
        self.latent_dims = latent_dims
        
        device = 'cuda' if cuda else 'cpu'
        
        self.T = torch.arange(0, len(data), dtype=torch.float32, device=device) # time is encoded with a increase of 1
        
        # Training data
        self.train_idx = ~self.data.isna().any(axis=1)
        self.train_data = torch.tensor(self.data[self.train_idx].to_numpy().astype(np.float32), device=device)
        self.train_T = self.T[self.train_idx]
        
        self.learner = GPFALearner(X = self.train_data, T = self.train_T, latent_dims=latent_dims)
        
        
        # There is no conditional observation here since it probably doesn't make much sense here
               
        if cuda: self.learner.cuda()
        
    def fit(self):
        "Fit learner to training data"
        self.learner.train()
        return self

    def predict(self):
        
        # return always tidy df
        
        self.pred = self.learner.predict(self.T)
        
        feature_names = self.data.columns
        pred_mean = pd.DataFrame(self.pred.mean.cpu(), columns = feature_names).assign(time = self.T.cpu()).melt("time", value_name="mean")
        pred_std = pd.DataFrame(self.pred.std.cpu(), columns = feature_names).assign(time = self.T.cpu()).melt("time", value_name="std")
        
        return pd.merge(pred_mean, pred_std, on=['time', 'variable'])
    
    def fit_predict(self):
        self.fit()
        return self.predict()

### Repr

add `__repr__` and `__str__` to imputation objects

In [None]:
#| exports
@patch
def __repr__(self: GPFAImputationExplorer):
    return f"""GPFA Imputation Explorer:
    N obs: {self.data.shape[0]}
    N features {self.data.shape[1]} ({', '.join(self.data.columns)})
    N missing observations {self.data.isna().to_numpy().flatten().sum()}
    N latent: {self.learner.latent_dims}"""

@patch
def __str__(self: GPFAImputationExplorer):
    return self.__repr__()

In [None]:
imp_exp = GPFAImputationExplorer(fd.data)

In [None]:
imp_exp

GPFA Imputation Explorer:
    N obs: 10
    N features 2 (x0, x1)
    N missing observations 8
    N latent: 1

In [None]:
str(imp)

'GPFA Imputation:\n    N obs: 10\n    N features 2 (x0, x1)\n    N missing observations 8\n    N latent: 1'

In [None]:
imp_exp.predict()

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.297978,0.341699
1,1.0,x0,-0.320394,0.341324
2,2.0,x0,-0.391404,0.341698
3,3.0,x0,-0.388192,0.345208
4,4.0,x0,-0.390354,0.342071
5,5.0,x0,-0.389361,0.345208
6,6.0,x0,-0.408291,0.342072
7,7.0,x0,-0.381646,0.345724
8,8.0,x0,-0.367911,0.34624
9,9.0,x0,-0.367281,0.346241


## Results

In [None]:
#| export
class GPFAResult:
    def __init__(self,
                 data_imputed, #imputed data in tidy format
                 data_complete, # complete data in tidy format
                 learner, # learner for parameters display
                 units = None, # units for plots
                ):
        store_attr()
        
    

constructor methods from `GPFAImputation` and `GPFAImputationResults`

In [None]:
#| export
@patch
def to_result(self: GPFAImputation, data_complete, units=None):
    return GPFAResult(self.impute(add_time=True), data_complete, self.learner, units)

In [None]:
#| export
@patch
def to_result(self: GPFAImputationExplorer, data_complete, units=None):
    return GPFAResult(self.predict(), data_complete, self.learner, units)

In [None]:
res = imp.to_result(fd.data_compl_tidy, units = {'x0': 'x0 units', 'x1': 'x1 unitssss'})

In [None]:
res

<__main__.GPFAResult>

### Metrics

In [None]:
#| export
@patch
def compute_metric(self: GPFAResult,
                   metric, # function that takes as argument true and pred and returns the metric
                   metric_name = 'metric',
                   all_data = False # whether to consider whole dataset or only where there is the gap
                  ):
    df = pd.merge(self.data_imputed, self.data_complete, on = ['time','variable'])
    
    vars = []
    
    for var in df.variable.unique():
        mask = (df.variable == var) & (df.is_missing == True) if not all_data else df.variable == var
        df_var = df[mask]
        vars.append({'variable': var,
                      metric_name: metric(df_var['value'], df_var['mean'])})
    
    return pd.DataFrame(vars)

In [None]:
#| exports
@patch
def rmse(self: GPFAResult, all_data=False):
    return self.compute_metric(lambda x, y: np.sqrt(mean_squared_error(x,y)), "rmse", all_data=all_data)
    

In [None]:
res.rmse()

Unnamed: 0,variable,rmse
0,x0,1.008747
1,x1,1.927404


In [None]:
res.rmse(all_data=True)

Unnamed: 0,variable,rmse
0,x0,0.713292
1,x1,1.055683


In [None]:
#| exports
@patch
def r2(self: GPFAResult, all_data=True):
    return self.compute_metric(r2_score, "r2", all_data)

In [None]:
res.r2()

Unnamed: 0,variable,r2
0,x0,-0.182329
1,x1,0.245124


### Prediction plot

In [None]:
#| exports
def _plot_variable(imp, complete, variable, y_label="", sel=None, properties = {}):
    
    imp = imp[imp.variable == variable]

    
    error = alt.Chart(imp).mark_errorband().encode(
        x = "time",    
        y = alt.Y("err_low:Q", title = y_label, scale=alt.Scale(zero=False)),
        y2 = "err_high:Q",
        color=alt.Color("variable",
                        legend = alt.Legend(title=["Line: pred. mean", "area: +/- 2 std", "(variable)"])
                       ),
        tooltip = alt.Tooltip(['std', 'mean'], format=".4")
    ).transform_calculate(
        err_low = "datum.mean - 2 * datum.std",
        err_high = "datum.mean + 2 * datum.std"
    ).properties( **properties)

    pred = alt.Chart(imp).mark_line().encode(
        x = "time",    
        y = alt.Y("mean:Q", title = y_label, scale=alt.Scale(zero=False)),
        color="variable",
    ).add_selection(
        sel if sel is not None else alt.selection_interval(bind="scales")
    ).properties(title = variable)

    base_plot = error + pred
    
    if complete is not None:

        complete = complete[complete.variable == variable]
        truth_plt = alt.Chart(complete).mark_point(
            color='black',
            strokeWidth = 1,
            fillOpacity = 1
        ).encode(
            x = "time",
            y = alt.Y("value", title = y_label, scale=alt.Scale(zero=False)),
            fill= alt.Fill("is_missing", scale = alt.Scale(range=["#ffffff00", "black"]),
                           legend = alt.Legend(title =["Observed data","(is missing)"])),
            shape = "is_missing",
        )

        base_plot = truth_plt + base_plot
        
    return base_plot
    

In [None]:
imp_d = imp.impute(tidy=True, add_time=True)

_plot_variable(imp_d, None, "x1")

  for col_name, dtype in df.dtypes.iteritems():


In [None]:
#| exports
@patch()
def plot_pred(
    self: GPFAResult,
    n_cols: int = 2,
    bind_interaction: bool =True, # Whether the sub-plots for each variable should be connected for zooming/panning
    properties:dict = {} # additional properties (eg. size) for altair plot
):
    "Plot the prediction for each variable"
   
    plot_list = [alt.hconcat() for _ in range(0, self.data_imputed.shape[0], n_cols)]
    selection_scale = alt.selection_interval(bind="scales", encodings=['x']) if bind_interaction else None
    for idx, variable in enumerate(pd.unique(self.data_imputed.variable)):
        plot_list[idx // n_cols] |= _plot_variable(self.data_imputed,
                                                   self.data_complete,
                                                   variable,
                                                   y_label = f"{variable} [{self.units[variable]}]" if self.units is not None else variable,
                                                   sel = selection_scale, properties=properties)
    
    plot = alt.vconcat(*plot_list)
    
    return plot

In [None]:
res.plot_pred()

The code is running correctly and as expected around the missing data point the error is band is wider

### Display results

show the prediction plot, metrics and model parameters in one convinient view

In [None]:
#| export
from IPython.display import HTML

from ipywidgets import HBox, VBox, interact, widgets
from ipywidgets.widgets import Output

In [None]:
#| export
def _display_as_row(dfs, titles=None):
    """display multiple dataframes in the same row"""
    dfs =  listify(dfs)
    titles = listify(titles)
    out = []
    for df, title in zip_longest(dfs, titles, fillvalue=""):
        out.append(f"<div> <p style='font-size: 1.3rem; font-decoration: bold'>{title}<p> {df.to_html()} </div>")
    out = f"<div style=\"display: flex; gap: 20px;\"> {''.join(out)}</div>"
    display(HTML("".join(out)))

def _style_df(df):
    """style dataframe for better printing """
    return df.style.hide(axis="index").format(precision = 4)

In [None]:
_display_as_row(pd.DataFrame([1,2]), titles="test")

Unnamed: 0,0
0,1
1,2


In [None]:
_display_as_row([pd.DataFrame([1,2]), pd.DataFrame([3,4])], titles=["test", "t2"])

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,3
1,4


In [None]:
_display_as_row([pd.DataFrame([1,2]) for i in range(10)], titles=[f"test{i}" for i in range(10)])

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2

Unnamed: 0,0
0,1
1,2


In [None]:
#| exports
@patch 
def display_results(self: GPFAResult, plot_args={}):
    plot_args = {'properties': {'height': 150 , 'width': 300}, **plot_args} # set default plot size
    plot = self.plot_pred(**plot_args)
    
    r2 = self.r2()
    
    # there is no GPFA leaner so don't display metrics and return early
    if self.learner is None:
        display(plot)
        _display_as_row(_style_df(r2), "r2")
        return
    
    variables = pd.DataFrame({'variable': self.data_imputed.columns})
    latent_names = [f"z{i}" for i in range(self.learner.latent_dims)]

    
    Lambda = pd.concat([
        variables,
        pd.DataFrame(
            self.learner.model.covar_module.Lambda.detach().cpu().numpy(),
            columns=latent_names)
    ], axis=1)
    
    
    lengthscale = pd.DataFrame({
        'latent': latent_names,
        'lengthscale': [self.learner.model.covar_module.latent_kernels[i].lengthscale.detach().item() for i in range(self.learner.latent_dims)]
    })
    
    #loss = plt.plot(self.learner.losses)
    
    
    display(plot)
    _display_as_row([_style_df(df) for df in [r2, Lambda, lengthscale]], ["r2", "Λ", "Lengthscale"])
    

In [None]:
res.display_results()

  for col_name, dtype in df.dtypes.iteritems():


variable,r2
x0,-0.6488
x1,-20.9363

variable,z0
time,0.8437
variable,0.1487
mean,
std,

latent,lengthscale
z0,0.6931


## Export 

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()