The goal of this notebook is to be able to:

- take a timeseries that contains gaps
- train a GPFA Learner using the available data
- impute the gap, by using the predictions of the learner
- improve the imputation by conditioning the distribution on the other observations at the same time stamp

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
#| default_exp imputation

In [None]:
#| export
from gpfa_imputation.learner import *
from gpfa_imputation.data_preparation import *
from gpfa_imputation.gpfa import GPFA

import torch

import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error, r2_score

from fastcore.foundation import patch, patch_to
from fastcore.meta import delegates
from fastcore.basics import store_attr, listify
from fastcore.test import test_close
from itertools import zip_longest

import matplotlib.pyplot as plt
import altair as alt
from altair import datum

from functools import lru_cache

In [None]:
torch.manual_seed(277); # set seed for reproducibility of examples

## GPFA Imputation

This is the core class that does the imputation using a GPFA

The inputs are:

- a dataframe containing the observed data, where the rows with missing data have been removed
- a vector of times where the data is missing

It returns:

- a complete dataframe with the prediction of the model

The goal is that `GPFAImputation` takes as input a dataframe containing missing values and then imputes them using `GPFALearner`.
Therefore it needs to divide the dataframe in 3 sections:

- training data (rows with no NAs)
- times to be imputed (rows with some NAs)
- observations (variables in the pred rows that are not missing) for conditional predictions

In [None]:
# Toy frame covering the three kinds of rows handled below
t_df = pd.DataFrame([
    [1., 3., 4.],  # complete row
    [2., 6., np.nan],  # partially missing row
    [np.nan, np.nan, np.nan],  # fully missing row
    [np.nan, 8., np.nan],  # partially missing row
    [3., 4., 5.]  # complete row
]
)

In [None]:
# True for the rows without any missing value; `axis` is passed by keyword
# because the positional form of `DataFrame.any` is deprecated in pandas
train_idx = ~t_df.isna().any(axis=1)


In [None]:
t_df[train_idx]

Unnamed: 0,0,1,2
0,1.0,3.0,4.0
4,3.0,4.0,5.0


In [None]:
pred_data = t_df[~train_idx].to_numpy() 

In [None]:
pred_data

array([[ 2.,  6., nan],
       [nan, nan, nan],
       [nan,  8., nan]])

In [None]:
idx_cond = ~t_df[~train_idx].isna()

In [None]:
idx_cond

Unnamed: 0,0,1,2
1,True,True,False
2,False,False,False
3,False,True,False


In [None]:
idx_cond = idx_cond.to_numpy().flatten() # learner needs a 1D index

# keep only the non-NA values of the rows that contain missing data
pred_data.flatten()[idx_cond]

# negate, so that True marks the complete rows, as in the cells above
train_idx = ~t_df.isna().any(axis=1)

torch.tensor(~t_df[~train_idx].isna().to_numpy().flatten())

tensor([ True,  True, False, False, False, False, False,  True, False])

Implement this into a function

In [None]:
#| exports
class GPFAImputation:
    "Impute the rows of a dataframe that contain missing values using the predictions of a `GPFALearner`"
    def __init__(
        self,
        data: pd.DataFrame , #observed data with missing data as NA
        latent_dims = 1,
        cuda = False, # Use GPU?
        model = GPFA # sub-class of `GPFA` 
    ):
        self.data = data.copy()
        self.latent_dims = latent_dims
        
        device = 'cuda' if cuda else 'cpu'
        
        self.T = torch.arange(0, len(data), dtype=torch.float32, device=device) # time is encoded with an increase of 1 for each row
        
        # Training data: the rows without any missing value
        self.train_idx = ~self.data.isna().any(axis=1)
        # explicit boolean mask tensor, so that indexing `self.T` does not rely on
        # torch converting a pandas Series into an index
        train_mask = torch.tensor(self.train_idx.to_numpy(), device=device)
        self.train_data = torch.tensor(self.data[self.train_idx].to_numpy().astype(np.float32), device=device)
        self.train_T = self.T[train_mask]
        
        self.learner = GPFALearner(X = self.train_data, T = self.train_T, latent_dims=latent_dims, model=model, var_names= self.data.columns)
        

        # Prediction data: the rows with at least one missing value
        self.pred_T = self.T[~train_mask]
        pred_rows = self.data[~self.train_idx]
        observed = ~pred_rows.isna().to_numpy().flatten() # flat mask of the values that are not missing in the prediction rows
        self.cond_idx = torch.tensor(observed, device=device) # conditional observations
        self.cond_obs = torch.tensor(pred_rows.to_numpy().astype(np.float32).flatten()[observed], device=device)
        
        if cuda: self.learner.cuda()
        
                                   
    def fit(self, n_iter=100):
        "Fit learner to training data"
        self.learner.train(n_iter = n_iter)
        return self

    def impute(self,
               add_time = True, # add column with time?
               tidy = True, # tidy data?
               ):
        "Predict the rows with missing data using the learner in its current state (`fit` is not called here)"
        self.pred = self.learner.predict(self.pred_T, obs = self.cond_obs, idx = self.cond_idx)
        
        if tidy: return self._impute_tidy(add_time)
        else: return self._impute_wide(add_time)
        
        
    def _impute_wide(self, add_time):
        """ Impute in wide format"""
        
        imp_data = self.data.copy()
        missing_rows = ~self.train_idx
        pred_mean = self.pred.mean.cpu().numpy()
        pred_std = self.pred.std.cpu().numpy()
        
        # iterate over the original columns: `imp_data` gains the `*_std` columns inside the loop
        for col_idx, col_name in enumerate(self.data.columns):
            imp_data.loc[missing_rows, col_name] = pred_mean[:, col_idx]
            # f-string so that non-string column labels (e.g. integers) are supported too
            imp_data.loc[missing_rows, f"{col_name}_std"] = pred_std[:, col_idx]
        
        if add_time:
            imp_data["time"] = self.T.cpu()
        
        return imp_data 
    
    def _impute_tidy(self, add_time):
        """ transform the pred output into a tidy dataframe suitable for plotting"""
        # NOTE: `add_time` is not used here, the tidy format always has a `time` column
        feature_names = self.data.columns

        pred_mean = pd.DataFrame(self.pred.mean.cpu(), columns = feature_names).assign(time = self.pred_T.cpu()).melt("time", value_name="mean")
        pred_std = pd.DataFrame(self.pred.std.cpu(), columns = feature_names).assign(time = self.pred_T.cpu()).melt("time", value_name="std")
        
        pred = pd.merge(pred_mean, pred_std, on=['time', 'variable'])  
        
        # training rows are kept as they are, they have no `std` column so it is NaN after the concat
        train_data = self.data[self.train_idx].assign(time = self.train_T.cpu()).melt("time", value_name = "mean")
               
        imp_data = pd.concat((train_data, pred))
        
        # attribute name kept for backward compatibility, even though the frame is in tidy format
        self.pred_wide = imp_data
        
        return imp_data

In [None]:
fd = GPFADataTest.generate(2, 10, Lambda=[1,2.]).add_random_missing()

In [None]:
fd.data

Unnamed: 0,x0,x1
0,-0.024085,0.089268
1,-0.133942,0.258532
2,-0.60465,-0.603501
3,,
4,-0.478994,-1.161096
5,,
6,-0.594717,-1.284512
7,,1.251743
8,,-2.001107
9,,


In [None]:
imp = GPFAImputation(fd.data)

In [None]:
imp

<__main__.GPFAImputation>

Tidy

In [None]:
imp.impute()

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.024085,
1,1.0,x0,-0.133942,
2,2.0,x0,-0.60465,
3,4.0,x0,-0.478994,
4,6.0,x0,-0.594717,
5,0.0,x1,0.089268,
6,1.0,x1,0.258532,
7,2.0,x1,-0.603501,
8,4.0,x1,-1.161096,
9,6.0,x1,-1.284512,


wide

In [None]:
imp.impute(tidy=False)

Unnamed: 0,x0,x1,x0_std,x1_std,time
0,-0.024085,0.089268,,,0.0
1,-0.133942,0.258532,,,1.0
2,-0.60465,-0.603501,,,2.0
3,-0.416516,-0.562824,0.386124,0.83395,3.0
4,-0.478994,-1.161096,,,4.0
5,-0.419649,-0.564259,0.386142,0.833952,5.0
6,-0.594717,-1.284512,,,6.0
7,-0.358267,1.251743,0.387923,0.0,7.0
8,-0.397247,-2.001107,0.390719,0.0,8.0
9,-0.38412,-0.547979,0.391765,0.834502,9.0


### GPU 

check that the GPU support is working

In [None]:
imp_gpu = GPFAImputation(fd.data, cuda=True)

In [None]:
imp_gpu.impute()

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.024085,
1,1.0,x0,-0.133942,
2,2.0,x0,-0.60465,
3,4.0,x0,-0.478994,
4,6.0,x0,-0.594717,
5,0.0,x1,0.089268,
6,1.0,x1,0.258532,
7,2.0,x1,-0.603501,
8,4.0,x1,-1.161096,
9,6.0,x1,-1.284512,


The GPU and CPU versions return similar results!

In [None]:
imp.impute()[["mean", "std"]].to_numpy() - imp_gpu.impute()[["mean", "std"]].to_numpy()

array([[ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [ 0.0000000e+00,            nan],
       [-4.6333283e-02,  6.7106664e-02],
       [-4.6722233e-02,  6.7123741e-02],
       [-2.6827753e-03,  6.9049209e-02],
       [-2.0056546e-02,  7.1836144e-02],
       [-1.1047989e-02,  7.2726548e-02],
       [ 7.1919739e-02, -1.6455758e-01],
       [ 1.5970773e-01, -1.6455603e-01],
       [ 0.0000000e+00,  0.0000000e+00],
       [ 2.3841858e-07,  0.0000000e+00],
       [ 1.8069166e-01, -1.7099941e-01]], dtype=float32)

### Repr

add `__repr__` and `__str__` to imputation objects

In [None]:
#| exports
@patch
def __repr__(self: GPFAImputation):
    "Multi-line summary with the size of the data, the number of missing values and of latent dimensions"
    # column labels are not necessarily strings (e.g. integer labels), so convert them before joining
    feature_names = ', '.join(map(str, self.data.columns))
    return f"""GPFA Imputation:
    N obs: {self.data.shape[0]}
    N features {self.data.shape[1]} ({feature_names})
    N missing observations {(~self.cond_idx).sum()}
    N latent: {self.learner.latent_dims}"""

@patch
def __str__(self: GPFAImputation):
    "Same text as `__repr__`"
    summary = self.__repr__()
    return summary

In [None]:
imp

GPFA Imputation:
    N obs: 10
    N features 2 (x0, x1)
    N missing observations 8
    N latent: 1

In [None]:
str(imp)

'GPFA Imputation:\n    N obs: 10\n    N features 2 (x0, x1)\n    N missing observations 8\n    N latent: 1'

## GPFA Imputation Explorer

This class is used for exploring the results of a `GPFAImputation`. The main difference is that it always returns the model predictions for all times, not only for the missing data.

In [None]:
#| exports
class GPFAImputationExplorer(GPFAImputation):
    "Variant of `GPFAImputation` whose `predict` returns the model predictions at every time, not only where data is missing"
    
    def predict(self):
        "Tidy dataframe with mean and std of the predictions for all times, using the conditional predictions for the missing values"
        columns = self.data.columns
        n_features = self.data.shape[1]
        
        imp_mean = pd.DataFrame({'time': self.T.cpu()})
        imp_std = pd.DataFrame({'time': self.T.cpu()})
        
        # Step 1: predictions without conditioning, for every time
        general = self.learner.predict(self.T)
        
        for j, name in enumerate(columns):
            imp_mean.loc[:, name] = general.mean[:, j].cpu().numpy()
            imp_std.loc[:, name] = general.std[:, j].cpu().numpy()
        
        # Step 2: conditional predictions for the rows with missing data
        conditional = self.learner.predict(self.pred_T, obs = self.cond_obs, idx = self.cond_idx)
        unobserved = ~self.cond_idx.reshape(-1, n_features).cpu().numpy()
        missing_rows = ~self.train_idx.to_numpy()

        for j, name in enumerate(columns):
            cond_mean = conditional.mean[:, j].cpu().numpy()
            cond_std = conditional.std[:, j].cpu().numpy()
            col_unobserved = unobserved[:, j]

            # only the values that are actually missing are overwritten, where there is
            # a conditional observation the prediction from step 1 is kept
            target = missing_rows.copy()
            target[missing_rows] = col_unobserved

            imp_mean.loc[target, name] = cond_mean[col_unobserved]
            imp_std.loc[target, name] = cond_std[col_unobserved]
        
        # Step 3: reshape to tidy format, one row for each (time, variable)
        tidy_mean = imp_mean.melt('time', value_name = "mean")
        tidy_std = imp_std.melt('time', value_name = "std")
        
        return pd.merge(tidy_mean, tidy_std, on = ['time', 'variable'])

In [None]:
GPFAImputationExplorer(fd.data).fit().predict()

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,time,variable,mean,std
0,0.0,x0,-0.139681,0.123077
1,1.0,x0,-0.117523,0.122828
2,2.0,x0,-0.39891,0.123055
3,3.0,x0,-0.472752,0.149371
4,4.0,x0,-0.564477,0.123241
5,5.0,x0,-0.985978,0.149474
6,6.0,x0,-0.609512,0.123253
7,7.0,x0,0.120754,0.123318
8,8.0,x0,-0.781911,0.123618
9,9.0,x0,-1.257758,0.183116


### Repr

add `__repr__` and `__str__` to imputation objects

In [None]:
#| exports
@patch
def __repr__(self: GPFAImputationExplorer):
    "Multi-line summary with the size of the data, the number of missing values and of latent dimensions"
    # column labels are not necessarily strings (e.g. integer labels), so convert them before joining
    feature_names = ', '.join(map(str, self.data.columns))
    return f"""GPFA Imputation Explorer:
    N obs: {self.data.shape[0]}
    N features {self.data.shape[1]} ({feature_names})
    N missing observations {self.data.isna().to_numpy().flatten().sum()}
    N latent: {self.learner.latent_dims}"""

@patch
def __str__(self: GPFAImputationExplorer):
    "Same text as `__repr__`"
    summary = self.__repr__()
    return summary

In [None]:
imp_exp = GPFAImputationExplorer(fd.data)

In [None]:
imp_exp

In [None]:
str(imp_exp)

In [None]:
imp_exp.predict()

## Export 

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()