In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

### Install NeuMiss

In [2]:
!git clone --depth=1 https://github.com/marineLM/NeuMiss_sota.git

Cloning into 'NeuMiss_sota'...
remote: Enumerating objects: 20, done.
remote: Counting objects: 100% (20/20), done.
remote: Compressing objects: 100% (18/18), done.
remote: Total 20 (delta 0), reused 13 (delta 0), pack-reused 0 (from 0)
Receiving objects: 100% (20/20), 62.08 KiB | 512.00 KiB/s, done.


In [3]:
# Enter the cloned repository so that the following cell can install it from `.`.
%cd NeuMiss_sota

/content/NeuMiss_sota


In [4]:
!pip install . -q

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Building wheel for neumiss (pyproject.toml) ... done


## Sample use of NeuMiss

Train a NeuMissMLP on a simple synthetic dataset.

In [5]:
# Work from the repository's `src` directory.
# NOTE(review): presumably required so that `generate_example_dataset` and
# `utils` (imported in the next cell) resolve as local modules — confirm they
# are not shipped with the installed package.
%cd src

/content/NeuMiss_sota/src


In [6]:
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from neumiss import NeuMissMLP
from generate_example_dataset import get_example_dataset
from utils import (
    get_optimizer_by_group,
    train_model,
    compute_preds,
    compute_regression_metrics
)

Generate a synthetic dataset. Here, we generate Gaussian data (10,000 samples, 10 features) in which 50% of the values are missing completely at random (MCAR). The response is generated linearly from the complete data. Note that **with NeuMiss, it is not necessary to impute the data**: the **NeuMissBlock expects and handles NaN in the inputs**.

In [7]:
# Seed PyTorch's global random number generator for reproducibility.
# NOTE(review): if get_example_dataset() draws from NumPy's RNG, this seed does
# not cover it — confirm and seed NumPy as well if needed.
torch.manual_seed(0)

<torch._C.Generator at 0x7fd1ae9719d0>

In [8]:
# Fetch the three example splits.
ds_train, ds_val, ds_test = get_example_dataset()

# Number of features = second dimension of the training feature tensor.
p = ds_train.tensors[0].size(1)

# Mini-batches of 64 for every split; only the training loader shuffles.
train_loader = DataLoader(ds_train, shuffle=True, batch_size=64)
val_loader, test_loader = (
    DataLoader(split, batch_size=64) for split in (ds_val, ds_test)
)

In [9]:
# Peek at the first five training samples: a (features, targets) pair in which
# missing feature entries show up as NaN.
ds_train[:5]

(tensor([[-3.7154,     nan, -1.6264,     nan,     nan,  1.4869,     nan, -2.5403,
           2.2790,     nan],
         [    nan,  0.8611, -0.8832, -1.5880,  2.0258,  0.2589,     nan, -1.3742,
          -0.8846,  0.6854],
         [-0.2808, -2.5026, -0.9402,     nan, -9.7543,     nan,     nan, -0.6970,
              nan,     nan],
         [-2.1774,  1.0111,     nan,     nan, -3.3075,     nan,  0.2433,     nan,
              nan, -4.1947],
         [    nan,     nan,     nan,     nan,     nan,     nan,     nan,     nan,
           5.1190, -2.6865]]),
 tensor([ 0.7345,  0.8464, -2.0679,  0.0636, -0.1527]))

Instantiate a NeuMissMLP network.


In [10]:
# Build the network. Hyperparameter meanings below are inferred from their
# names — TODO confirm against the NeuMissMLP documentation.
model = NeuMissMLP(
    n_features=p,  # input dimension, read from the training features
    neumiss_depth=10,  # presumably the number of NeuMiss layers/iterations
    mlp_depth=0,  # presumably no hidden MLP layer after the NeuMiss block
    mlp_width=p
)

Instantiate an optimizer, a scheduler and a loss.

In [11]:
# Hyperparameters, grouped up front so they are easy to tune.
optim_hyperparams = dict(weight_decay=0, lr=1e-3)
sched_hyperparams = dict(factor=0.2, patience=10, threshold=1e-4)

# Regression objective.
criterion = nn.MSELoss()

# The scheduler wraps the optimizer and is configured to track a quantity
# that should decrease (mode='min').
optimizer = get_optimizer_by_group(model, optim_hyperparams)
scheduler = ReduceLROnPlateau(optimizer, mode='min', **sched_hyperparams)

Train the model.


In [12]:
# Fit the model on train_loader, with val_loader passed for validation.
# Early stopping is enabled and training is capped at 200 epochs.
# NOTE(review): lr_threshold presumably ends training once the scheduler has
# reduced the learning rate below 1e-6 — confirm in utils.train_model.
train_model(model, criterion, train_loader, val_loader, optimizer,
            scheduler, early_stopping=True, n_epochs=200, lr_threshold=1e-6)

EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 2 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 2 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 2 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 2 out of                       12
EarlyStopping counter: 1 out of                       12
EarlyStopping counter: 1 out of

Compute prediction scores.

In [13]:
# Evaluate the training split through a dedicated, non-shuffled loader: the
# predictions are scored against `ds_train.tensors[1]` below, so they must
# presumably come back in dataset order.
# A separate name is used on purpose. Rebinding `train_loader` here would
# silently change the batch size and shuffling used by the training cell if
# that cell were re-run afterwards.
train_eval_loader = DataLoader(ds_train, batch_size=256, shuffle=False)
pred = compute_preds(model, train_eval_loader, val_loader, test_loader,
                     classif=False)

res = {}
splits = ['train', 'val', 'test']
preds = [pred[split] for split in splits]
y_labels = [ds_train.tensors[1], ds_val.tensors[1], ds_test.tensors[1]]

# `split_pred` (rather than `pred`) keeps the object returned by compute_preds
# intact once the loop has finished.
for split, split_pred, y_label in zip(splits, preds, y_labels):
    res_split = compute_regression_metrics(split_pred, y_label)
    # Flatten into a single dict keyed "<metric>_<split>".
    for metric, value in res_split.items():
        res[f'{metric}_{split}'] = value

In [14]:
# Display the collected metrics as a one-column table, one row per
# "<metric>_<split>" key of `res`.
pd.DataFrame.from_dict(
    data=res,
    orient='index',
    columns=['score'],
)

Unnamed: 0,score
r2_train,0.815609
mse_train,0.198918
r2_val,0.796459
mse_val,0.22698
r2_test,0.7997
mse_test,0.22108
