# MuViS Benchmark - Data Loading Guide

Quick-start notebook for loading and preparing data from the **MuViS Multimodal Virtual Sensing Benchmark**.

---

## 1. Load Dataset

In [1]:
from muvis.data_utils.muvis_dataset import MuViSDataset

In [2]:
# Dataset identifiers; one of them is selected by index below.
dataset_ids = [
    "BeijingPM10Quality",
    "BeijingPM25Quality",
    "Panasonic18650PFData",
    "PPGDalia",
    "REVS/2013_Monterey_Motorsports_Reunion",
    "REVS/2013_Targa_Sixty_Six",
    "REVS/2014_Targa_Sixty_Six",
    "TennesseeEastmanProcess",
    "VehicleDynamicsDataset",
]

# Use the first identifier in the list; change the index to load another one.
selected_id = dataset_ids[0]
dataset = MuViSDataset.get_dataset(selected_id, base_path="../data/processed")

# Each split is returned as an (inputs, targets) pair.
X_train, y_train = dataset.get_data(split='train')
X_test, y_test = dataset.get_data(split='test')

# Report the array shapes of both splits.
print(f"Train dataset shape: {X_train.shape}, {y_train.shape}")
print(f"Test dataset shape: {X_test.shape}, {y_test.shape}")

Train dataset shape: (11918, 24, 9), (11918,)
Test dataset shape: (5048, 24, 9), (5048,)


## 2. Standardize & Split Data
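Hold out 10% of the training data as a validation split, then standardize each feature. The scaler is fitted on the remaining training portion only and then applied unchanged to the validation and test sets.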

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Start from a freshly fetched copy of the full training split so this cell is
# idempotent. Splitting `X_train` into itself would hold out another 10% and
# shrink the training set a little more on every re-run of the cell.
X_train_full, y_train_full = dataset.get_data(split='train')

# Hold out 10% of the training samples for validation; the fixed seed keeps the
# shuffled split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1, random_state=42, shuffle=True
)

# The last axis indexes the features that get scaled.
num_features = X_train.shape[2]

# Standard-scale features. Each array is reshaped to (-1, num_features) so the
# scaler sees one column per feature, then reshaped back to its original shape.
# The scaler is fitted on the training portion only; validation and test data
# are transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.reshape(-1, num_features)).reshape(X_train.shape)
X_val_scaled = scaler.transform(X_val.reshape(-1, num_features)).reshape(X_val.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, num_features)).reshape(X_test.shape)

print(f"Scaled Train dataset shape: {X_train_scaled.shape}, {y_train.shape}")
print(f"Scaled Val dataset shape: {X_val_scaled.shape}, {y_val.shape}")
print(f"Scaled Test dataset shape: {X_test_scaled.shape}, {y_test.shape}")

Scaled Train dataset shape: (10726, 24, 9), (10726,)
Scaled Val dataset shape: (1192, 24, 9), (1192,)
Scaled Test dataset shape: (5048, 24, 9), (5048,)


## 3. PyTorch DataLoaders (Sequential)
For sequence models (LSTM, Transformer, ResNet1D) — data shape: `(N, T, D)`, i.e. samples × sequence length × features.

In [5]:
from torch.utils.data import DataLoader
from muvis.utils import datasets

In [6]:
dataset_class = "SequentialDataset"
batchsize = 256

# Resolve the dataset wrapper by name once. getattr() is the idiomatic way to
# look up a module attribute (rather than indexing the module's __dict__).
dataset_cls = getattr(datasets, dataset_class)

# Only the training loader shuffles; validation and test keep their order.
train_dataset = dataset_cls(X_train_scaled, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batchsize, shuffle=True)

val_dataset = dataset_cls(X_val_scaled, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=batchsize, shuffle=False)

test_dataset = dataset_cls(X_test_scaled, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False)

In [7]:
# Pull one batch from the training loader to check the tensor shapes.
first_batch = next(iter(train_dataloader))
X, y = first_batch
print(f"Batch X shape: {X.shape}, y shape: {y.shape}")

Batch X shape: torch.Size([256, 24, 9]), y shape: torch.Size([256])


## 4. Flattened Data
For tree-based models and MLPs — data shape: `(N, T*D)`

### Tree-based Models (XGBoost, CatBoost, RandomForest)

In [8]:
# Keep one row per sample and collapse all remaining axes into a single one.
n_train = len(X_train_scaled)
n_val = len(X_val_scaled)
n_test = len(X_test_scaled)

X_train_flat = X_train_scaled.reshape(n_train, -1)
X_val_flat = X_val_scaled.reshape(n_val, -1)
X_test_flat = X_test_scaled.reshape(n_test, -1)

# Report the flattened shapes next to the (unchanged) target shapes.
print(f"Flattened Train dataset shape: {X_train_flat.shape}, {y_train.shape}")
print(f"Flattened Val dataset shape: {X_val_flat.shape}, {y_val.shape}")
print(f"Flattened Test dataset shape: {X_test_flat.shape}, {y_test.shape}")

Flattened Train dataset shape: (10726, 216), (10726,)
Flattened Val dataset shape: (1192, 216), (1192,)
Flattened Test dataset shape: (5048, 216), (5048,)


### MLP (with PyTorch DataLoader)

In [9]:
from torch.utils.data import DataLoader
from muvis.utils import datasets

In [10]:
dataset_class = "FlattenedDataset"
batchsize = 256

# Resolve the dataset wrapper by name once. getattr() is the idiomatic way to
# look up a module attribute (rather than indexing the module's __dict__).
dataset_cls = getattr(datasets, dataset_class)

# The wrapper is given the scaled 3-D arrays, not the pre-flattened `*_flat`
# arrays; presumably it flattens each sample itself (verify in muvis.utils.datasets).
# Only the training loader shuffles; validation and test keep their order.
train_dataset = dataset_cls(X_train_scaled, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batchsize, shuffle=True)

val_dataset = dataset_cls(X_val_scaled, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=batchsize, shuffle=False)

test_dataset = dataset_cls(X_test_scaled, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False)

In [11]:
# Pull one batch from the training loader to check the tensor shapes.
first_batch = next(iter(train_dataloader))
X, y = first_batch
print(f"Batch X shape: {X.shape}, y shape: {y.shape}")

Batch X shape: torch.Size([256, 216]), y shape: torch.Size([256])
