In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from pygrinder import mcar
from pypots.data import load_specific_dataset
from pypots.imputation import SAITS
from pypots.utils.metrics import calc_mae

In [1]:
import torch

In [4]:
print(torch.__version__)

2.1.2


In [3]:
print(torch.cuda.get_device_properties(0).total_memory)

AssertionError: Torch not compiled with CUDA enabled

In [6]:

# Data preprocessing. Tedious, but PyPOTS can help.
data = load_specific_dataset('physionet_2012')  # PyPOTS will automatically download and extract it.
X = data['X']
num_samples = len(X['RecordID'].unique())
X = X.drop(['RecordID', 'Time'], axis = 1)
X = StandardScaler().fit_transform(X.to_numpy())
X = X.reshape(num_samples, 48, -1)
X_ori = X  # keep X_ori for validation
X = mcar(X, 0.1)  # randomly hold out 10% observed values as ground truth
dataset = {"X": X}  # X for model input
print(X.shape)  # (11988, 48, 37), 11988 samples, 48 time steps, 37 features

# Model training. This is PyPOTS showtime.
saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128, n_heads=4, d_k=64, d_v=64, dropout=0.1, epochs=10, device='cuda')
# Here I use the whole dataset as the training set because ground truth is not visible to the model, you can also split it into train/val/test sets
saits.fit(dataset)
imputation = saits.impute(dataset)  # impute the originally-missing values and artificially-missing values
indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)

2024-01-25 14:15:24 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)...
2024-01-25 14:15:24 [INFO]: Starting preprocessing physionet_2012...
2024-01-25 14:15:24 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-01-25 14:15:24 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-01-25 14:15:24 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-01-25 14:15:24 [INFO]: Loaded successfully!
2024-01-25 14:15:47 [INFO]: Using the given device: cuda


(11988, 48, 37)


AssertionError: You are trying to use CUDA for model training, but CUDA is not available in your environment.

In [5]:
data

{'X':         RecordID  Time  ALP  ALT  AST  Albumin  BUN  Bilirubin  Cholesterol  \
 0         132539   0.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 1         132539   1.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 2         132539   2.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 3         132539   3.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 4         132539   4.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 ...          ...   ...  ...  ...  ...      ...  ...        ...          ...   
 575419    163037  43.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 575420    163037  44.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 575421    163037  45.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 575422    163037  46.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 575423    163037  47.0  NaN  NaN  NaN      NaN  NaN        NaN          NaN   
 
         Creatinine  ...  RespRat