the goal of this notebook is to be able to:

- take a timeseries that contains gaps
- train a GPFA Learner using the available data
- impute the gap, by using the predictions of the learner
- to improve the imputation by conditioning the distribution on the other observation at the same time stamp

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#| hide
#| default_exp imputation

In [3]:
#| export
from gpfa_imputation.learner import *
from gpfa_imputation.data_preparation import *

import torch

import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error, r2_score

from fastcore.foundation import patch, patch_to
from fastcore.meta import delegates

import matplotlib.pyplot as plt
import altair as alt
from altair import datum

from functools import lru_cache

In [4]:
torch.manual_seed(277); # set seed for reproducibility of examples

## GPFA Imputation

This is the core class that does the imputation using a GPFA

The inputs is:

- a dataframe containing the observed data, where the row with missing data have been removed
- a vector of times where the data is missing

it returns:

- a complete dataframe with the prediction of the model

The goal is that `GPFAImputation` takes as imput a dataframe containing missing values and then it imputes them using GPFALearner.
Therefore it needs to divide the dataframe in 3 sections:

- training data (rows with no NAs)
- times to be imputed (rows with some NAs)
- observations (variables in the pred rows that are not missing) for conditional predictions

In [5]:
t_df = pd.DataFrame([
    [1., 3., 4.],
    [2., 6., np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 8., np.nan],
    [3., 4., 5.]
]
)

In [6]:
train_idx = ~t_df.isna().any(1)

  train_idx = ~t_df.isna().any(1)


In [7]:
t_df[train_idx]

Unnamed: 0,0,1,2
0,1.0,3.0,4.0
4,3.0,4.0,5.0


In [8]:
pred_data = t_df[~train_idx].to_numpy() 

In [9]:
pred_data

array([[ 2.,  6., nan],
       [nan, nan, nan],
       [nan,  8., nan]])

In [10]:
idx_cond = ~t_df[~train_idx].isna()

In [11]:
idx_cond

Unnamed: 0,0,1,2
1,True,True,False
2,False,False,False
3,False,True,False


In [12]:
idx_cond = idx_cond.to_numpy().flatten() # learner needs a 1D index

pred_data.flatten()[idx_cond]

train_idx = t_df.isna().any(axis=1)

torch.tensor(~t_df[~train_idx].isna().to_numpy().flatten())

tensor([True, True, True, True, True, True])

Implement this into a function

In [13]:
#| exports
class GPFAImputation:
    def __init__(
        self,
        data: pd.DataFrame , #observed data with missing data as NA
        complete_data: pd.DataFrame = None, # Optional complete dataframe (for testing)
        latent_dims = 1,
        cuda = True, # Use GPU?
        units = None # Dict of unit for each column. Used for plotting
    ):
        self.data = data
        self.data_complete = complete_data # TODO polish this, is this tidy or wide? check has required cols
        self.units=units
        self.latent_dims = latent_dims
        
        
        device = 'cuda' if cuda else 'cpu'
        
        self.T = torch.arange(0, len(data), dtype=torch.float32, device=device) # time is encoded with a increase of 1
        
        # Training data
        self.train_idx = ~self.data.isna().any(axis=1)
        self.train_data = torch.tensor(self.data[self.train_idx].to_numpy().astype(np.float32), device=device)
        self.train_T = self.T[self.train_idx]
        
        self.learner = GPFALearner(X = self.train_data, T = self.train_T, latent_dims=latent_dims)
        

        # Prediction data
        self.pred_T = self.T[~self.train_idx]
        self.cond_idx = torch.tensor(~self.data[~self.train_idx].isna().to_numpy().flatten(), device=device) # conditional obsevations
        self.cond_obs = torch.tensor(self.data[~self.train_idx].to_numpy().astype(np.float32).flatten()[self.cond_idx.cpu()], device=device)
        
        if cuda: self.learner.cuda()
        
        
    @lru_cache
    def impute(self,
               add_time = True, # add column with time?
               tidy = True # tidy data?
               ):
        
        if not hasattr(self, "pred"):
            self.learner.train()
            self.pred = self.learner.predict(self.pred_T, obs = self.cond_obs, idx = self.cond_idx)
        
        if tidy: return self._impute_tidy(add_time)
        else: return self._impute_wide(add_time)
        
        
    def _impute_wide(self, add_time):
        """ Impute in wide format"""
        
        imp_data = self.data.copy()
        for col_idx, col_name in enumerate(imp_data.columns):
            imp_data.loc[~self.train_idx, col_name] = self.pred.mean[:, col_idx].cpu().numpy()
            imp_data.loc[~self.train_idx, col_name + "_std"] = self.pred.std[:, col_idx].cpu().numpy()
        
        idx_vars = []
        if add_time:
            imp_data["time"] = self.T.cpu()
            idx_vars.append("time")
        
        return imp_data 
    
    def _impute_tidy(self, add_time):
        """ transform the pred output into a tidy dataframe suitable for plotting"""
        feature_names = self.data.columns

        pred_mean = pd.DataFrame(self.pred.mean.cpu(), columns = feature_names).assign(time = self.pred_T.cpu()).melt("time", value_name="mean")
        pred_std = pd.DataFrame(self.pred.std.cpu(), columns = feature_names).assign(time = self.pred_T.cpu()).melt("time", value_name="std")
        
        pred = pd.merge(pred_mean, pred_std, on=['time', 'variable'])  
        
        train_data = self.data[self.train_idx].assign(time = self.train_T.cpu()).melt("time", value_name = "mean")
               
        imp_data = pd.concat((train_data, pred))
        
        self.pred_wide = imp_data
        
        return imp_data
    

In [14]:
fd = GPFADataTest.generate(2, 10, Lambda=[1,2.]).add_random_missing()

In [15]:
fd.data

Unnamed: 0,x0,x1
0,-0.024085,0.089268
1,-0.133942,0.258532
2,-0.60465,-0.603501
3,,
4,-0.478994,-1.161096
5,,
6,-0.594717,-1.284512
7,,1.251743
8,,-2.001107
9,,


In [16]:
imp = GPFAImputation(fd.data, fd.tidy_df(complete=True, is_missing=True))

In [17]:
imp

<__main__.GPFAImputation at 0x7f88c8c46ce0>

Tidy

In [18]:
imp.impute()

  0%|          | 0/100 [00:00<?, ?it/s]

torch.linalg.solve_triangular has its arguments reversed and does not return a copy of one of the inputs.
X = torch.triangular_solve(B, A).solution
should be replaced with
X = torch.linalg.solve_triangular(A, B). (Triggered internally at  ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2183.)
  res = torch.triangular_solve(right_tensor, self.evaluate(), upper=self.upper).solution
 does not have profile information (Triggered internally at  ../torch/csrc/jit/codegen/cuda/graph_fuser.cpp:104.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.024085,
1,1.0,x0,-0.133942,
2,2.0,x0,-0.60465,
3,4.0,x0,-0.478994,
4,6.0,x0,-0.594717,
5,0.0,x1,0.089268,
6,1.0,x1,0.258532,
7,2.0,x1,-0.603501,
8,4.0,x1,-1.161096,
9,6.0,x1,-1.284512,


wide

In [19]:
imp.impute(tidy=False)

Unnamed: 0,x0,x1,x0_std,x1_std,time
0,-0.024085,0.089268,,,0.0
1,-0.133942,0.258532,,,1.0
2,-0.60465,-0.603501,,,2.0
3,-0.471591,-0.846165,0.1492,0.291451,3.0
4,-0.478994,-1.161096,,,4.0
5,-0.992677,-2.374261,0.149305,0.291914,5.0
6,-0.594717,-1.284512,,,6.0
7,0.123035,1.251743,0.123644,0.0,7.0
8,-0.784389,-2.001107,0.123953,0.0,8.0
9,-1.27474,-3.201419,0.183295,0.427122,9.0


### CPU 

check that the CPU support is still working

In [20]:
imp_cpu = GPFAImputation(fd.data, fd.tidy_df(complete=True, is_missing=True), cuda=False)

In [21]:
imp_cpu.impute()

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.024085,
1,1.0,x0,-0.133942,
2,2.0,x0,-0.60465,
3,4.0,x0,-0.478994,
4,6.0,x0,-0.594717,
5,0.0,x1,0.089268,
6,1.0,x1,0.258532,
7,2.0,x1,-0.603501,
8,4.0,x1,-1.161096,
9,6.0,x1,-1.284512,


### Repr

add `__repr__` and `__str__` to imputation objects

In [22]:
#| exports
@patch
def __repr__(self: GPFAImputation):
    return f"""GPFA Imputation:
    N obs: {self.data.shape[0]}
    N features {self.data.shape[1]} ({', '.join(self.data.columns)})
    N missing observations {(~self.cond_idx).sum()}"""

@patch
def __str__(self: GPFAImputation):
    return self.__repr__()

In [23]:
imp

GPFA Imputation:
    N obs: 10
    N features 2 (x0, x1)
    N missing observations 8

In [24]:
str(imp)

'GPFA Imputation:\n    N obs: 10\n    N features 2 (x0, x1)\n    N missing observations 8'

### From GPFADataTest

make constrution from a `GPFADataTest` easier

In [25]:
@patch_to(GPFADataTest)
@delegates(GPFAImputation, but='complete_data')
def to_imp(self: GPFADataTest, **kwargs):
    "Makes GPFAImputation object from data"
    return GPFAImputation(self.data, self.tidy_df(complete=True, is_missing=True), **kwargs)

In [26]:
fd.to_imp()

GPFA Imputation:
    N obs: 10
    N features 2 (x0, x1)
    N missing observations 8

## Results

### Metrics

In [27]:
#| exports
@patch
def compute_metric(self: GPFAImputation,
                   metric, # function that takes as argument true and pred and returns the metric
                   metric_name = 'metric'):
    pred = self.impute(tidy=True, add_time=True)
    
    df = pd.merge(pred, self.data_complete, on = ['time','variable'])
    
    vars = []
    
    for var in df.variable.unique():
        df_var = df[df.variable == var]
        vars.append({'variable': var,
                      metric_name: metric(df_var['value'], df_var['mean'])})
    
    return pd.DataFrame(vars)

In [28]:
#| exports
@patch
def rmse(self: GPFAImputation):
    
    return self.compute_metric(lambda x, y: np.sqrt(mean_squared_error(x,y)), "rmse")
    

In [29]:
imp.rmse()

Unnamed: 0,variable,rmse
0,x0,0.918873
1,x1,2.042767


In [30]:
#| exports
@patch
def r2(self: GPFAImputation):
    return self.compute_metric(r2_score, "r2")

In [31]:
imp.r2()

Unnamed: 0,variable,r2
0,x0,-0.962072
1,x1,-1.826482


### Prediction plot

In [32]:
#| exports
def _plot_variable(imp, complete, variable, y_label="", sel=None, properties = {}):
    
    imp = imp[imp.variable == variable]

    
    error = alt.Chart(imp).mark_errorband().encode(
        x = "time",    
        y = alt.Y("err_low:Q", title = y_label, scale=alt.Scale(zero=False)),
        y2 = "err_high:Q",
        color=alt.Color("variable",
                        legend = alt.Legend(title=["Line: pred. mean", "area: +/- 2 std", "(variable)"])
                       ),
        tooltip = alt.Tooltip(['std', 'mean'], format=".4")
    ).transform_calculate(
        err_low = "datum.mean - 2 * datum.std",
        err_high = "datum.mean + 2 * datum.std"
    ).properties( **properties)

    pred = alt.Chart(imp).mark_line().encode(
        x = "time",    
        y = alt.Y("mean:Q", title = y_label, scale=alt.Scale(zero=False)),
        color="variable",
    ).add_selection(
        sel if sel is not None else alt.selection_interval(bind="scales")
    ).properties(title = variable)

    base_plot = error + pred
    
    if complete is not None:

        complete = complete[complete.variable == variable]
        truth_plt = alt.Chart(complete).mark_point(
            color='black',
            strokeWidth = 1,
            fillOpacity = 1
        ).encode(
            x = "time",
            y = alt.Y("value", title = y_label, scale=alt.Scale(zero=False)),
            fill= alt.Fill("is_missing", scale = alt.Scale(range=["#ffffff00", "black"]),
                           legend = alt.Legend(title =["Observed data","(is missing)"])),
            shape = "is_missing",
        )

        base_plot = truth_plt + base_plot
        
    return base_plot
    

In [33]:
imp_d = imp.impute(tidy=True, add_time=True)

_plot_variable(imp_d, None, "x1")

  for col_name, dtype in df.dtypes.iteritems():


In [34]:
#| exports
@patch()
def plot_pred(
    self: GPFAImputation,
    complete = True, # Optional true data to be plotted agaist predictions
    units: dict = None, # Optional dict where keys are col name and value the unit (y axis labels)
    n_cols: int = 2,
    bind_interaction: bool =True, # Whether the sub-plots for each variable should be connected for zooming/panning
    properties:dict = {} # addtional properties (eg. size) for altair plot
):
    "Plot the prediction for each variable, optionally including true values"
    imp = self._impute_tidy(add_time=True) if hasattr(self, "pred") else self.impute(tidy=True, add_time=True)
    
   
    plot_list = [alt.hconcat() for _ in range(0, imp.shape[0], n_cols)]
    selection_scale = alt.selection_interval(bind="scales", encodings=['x']) if bind_interaction else None
    for idx, variable in enumerate(pd.unique(imp.variable)):
        plot_list[idx // n_cols] |= _plot_variable(imp,
                                                   self.data_complete if complete else None,
                                                   variable,
                                                   y_label = f"{variable} [{units[variable]}]" if units is not None else variable,
                                                   sel = selection_scale, properties=properties)
    
    plot = alt.vconcat(*plot_list)
    
    return plot

In [35]:
imp.plot_pred(units = {'x0': 'x0 units', 'x1': 'x1 unitssss'})

  for col_name, dtype in df.dtypes.iteritems():


The code is running correctly and as expected around the missing data point the error is band is wider

### Display results

show the prediction plot, metrics and model parameters in one convinient view

In [36]:
from IPython.display import HTML

from ipywidgets import HBox, VBox, interact, widgets
from ipywidgets.widgets import Output

In [37]:
def _to_widget(x, title=""):
    """Convert an object into a output widget"""
    out = Output()
    with out:
        display(HTML(f"<h4>{title}</h4>"))
        display(x)
    return out

In [38]:
def _plot_to_widget(x, title=""):
    """Convert an matplotlib plot into a output widget"""
    out = Output()
    with out:
        display(HTML(f"<h4>{title}</h4>"))
        plt.show(x)
    return out

In [39]:
def _style_df(df):
    """style dataframe for better printing """
    return df.style.hide(axis="index").format(precision = 4)

In [40]:
@patch 
def display_results(self: GPFAImputation):
    plot = self.plot_pred(units=self.units, properties =  {'height': 150 , 'width': 300})
    
    r2 = self.r2()
    
    variables = pd.DataFrame({'variable': self.data.columns})
    latent_names = [f"z{i}" for i in range(self.latent_dims)]
    
    Lambda = pd.concat([
        variables,
        pd.DataFrame(
            self.learner.model.covar_module.Lambda.detach().cpu().numpy(),
            columns=latent_names)
    ], axis=1)
    
    
    lengthscale = pd.DataFrame({
        'latent': latent_names,
        'lengthscale': [self.learner.model.covar_module.latent_kernels[i].lengthscale.detach().item() for i in range(self.latent_dims)]
    })
    
    loss = plt.plot(self.learner.losses)
    
    
    metrics =  [_to_widget(_style_df(df), title=title) for df, title in zip([r2, Lambda, lengthscale], ["r2", "Λ", "Lengthscale"])]
    metrics.append(_plot_to_widget(loss, title="Loss"))
    
    # use ipywidget layout
    bottom = HBox(metrics)
    
    return VBox([_to_widget(plot), bottom])
        

In [41]:
imp.display_results()

VBox(children=(Output(), HBox(children=(Output(), Output(), Output(), Output()))))

In [42]:
fd.to_imp(latent_dims=2).display_results()

  0%|          | 0/100 [00:00<?, ?it/s]

VBox(children=(Output(), HBox(children=(Output(), Output(), Output(), Output()))))

## Export 

In [43]:
#| hide
from nbdev import nbdev_export
nbdev_export()