In [1]:
import torch
from torch.utils.data import Dataset

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx], dtype=torch.float32)
        target = torch.tensor(self.targets[idx], dtype=torch.float32)
        return sequence, target


mps


In [2]:
import os
import pandas as pd
from glob import glob

# Define the path to the directory containing CSV files
data_dir = "/Users/vinny/-hd-net/data-csv"  # Replace this with the actual path
file_paths = glob(os.path.join(data_dir, "*.csv"))

# Parameters
channels = ['ch.1', 'ch.2', 'ch.3']
T = 50  # Sequence length

In [3]:
# Step 1: Compute global mean and std across all files
def compute_global_stats(file_paths, channels):
    all_data = []
    for file in file_paths:
        df = pd.read_csv(file)
        print(f"Processing file: {os.path.basename(file)}")
        print(f"Columns in file: {df.columns.tolist()}")
        # Check if the required columns exist
        if not all(col in df.columns for col in channels):
            print(f"Skipping file {os.path.basename(file)}: Missing required columns.")
            continue
        all_data.append(df[channels])
    if not all_data:
        raise ValueError("No valid data found in the files for the specified channels.")
    combined_data = pd.concat(all_data)
    global_mean = combined_data.mean()
    global_std = combined_data.std()
    return global_mean, global_std

global_mean, global_std = compute_global_stats(file_paths, channels)

Processing file: 081-3623.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: 081-4831.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: 013-4831.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: 013-3623.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: 051-4831.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: 051-3623.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: none-3623.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']
Processing file: none-4831.csv
Columns in file: ['time', 'ch.1', 'ch.2', 'ch.3']


In [4]:
# Step 2: Normalize each file and prepare sequences
def prepare_sequences(file_paths, global_mean, global_std, T, channels):
    dataset_sequences = {}
    for file in file_paths:
        df = pd.read_csv(file)
        # Normalize
        df[channels] = (df[channels] - global_mean) / global_std
        # Create sequences
        sequences, targets = [], []
        for i in range(len(df) - T):
            seq = df.iloc[i:i + T][channels].values
            target = df.iloc[i:i + T][channels].values
            sequences.append(seq)
            targets.append(target)
        # Store sequences for this file
        dataset_sequences[os.path.basename(file)] = {
            "sequences": sequences,
            "targets": targets
        }
    return dataset_sequences

dataset_sequences = prepare_sequences(file_paths, global_mean, global_std, T, channels)

In [5]:
# Output example: Number of sequences per file
for file_name, data in dataset_sequences.items():
    print(f"File: {file_name}, Number of Sequences: {len(data['sequences'])}")

File: 081-3623.csv, Number of Sequences: 272750
File: 081-4831.csv, Number of Sequences: 230050
File: 013-4831.csv, Number of Sequences: 202150
File: 013-3623.csv, Number of Sequences: 291350
File: 051-4831.csv, Number of Sequences: 198450
File: 051-3623.csv, Number of Sequences: 248850
File: none-3623.csv, Number of Sequences: 300250
File: none-4831.csv, Number of Sequences: 191950


In [6]:
# from torch.utils.data import Dataset, DataLoader

# # Step 3: Define PyTorch Dataset class
# class TimeSeriesDataset(Dataset):
#     def __init__(self, sequences, targets):
#         """
#         Args:
#             sequences (list of np.ndarray): List of input sequences.
#             targets (list of np.ndarray): Corresponding target values.
#         """
#         self.sequences = sequences
#         self.targets = targets

#     def __len__(self):
#         return len(self.sequences)

#     def __getitem__(self, idx):
#         # Convert to PyTorch tensors
#         sequence = torch.tensor(self.sequences[idx], dtype=torch.float32)
#         target = torch.tensor(self.targets[idx], dtype=torch.float32)
#         return sequence, target


In [6]:
import torch
import torch.nn as nn
import pytorch_lightning as pl

class CNNLSTM(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, kernel_size=3, learning_rate=1e-3):
        super(CNNLSTM, self).__init__()

        self.save_hyperparameters()

        # CNN layer (1D convolution)
        self.cnn = nn.Conv1d(in_channels=input_dim, out_channels=hidden_dim, kernel_size=kernel_size, padding=kernel_size // 2)

        # LSTM layers
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)

        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Learning rate
        self.learning_rate = learning_rate

    def forward(self, x):
        # Input shape: (batch_size, sequence_length, input_dim)

        # CNN expects (batch_size, input_dim, sequence_length), so we permute
        x = x.permute(0, 2, 1)
        x = self.cnn(x)

        # LSTM expects (batch_size, sequence_length, hidden_dim), so we permute back
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)

        # Apply fully connected layer to the last timestep
        x = self.fc(x)
        return x

    def training_step(self, batch, batch_idx):
        sequences, targets = batch
        predictions = self(sequences)
        loss = nn.MSELoss()(predictions, targets)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        sequences, targets = batch
        predictions = self(sequences)
        loss = nn.MSELoss()(predictions, targets)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        sequences, targets = batch
        predictions = self(sequences)
        loss = nn.MSELoss()(predictions, targets)
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return [optimizer], [scheduler]

# Example usage
# model = CNNLSTM(input_dim=3, hidden_dim=64, num_layers=2, output_dim=3)


In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Example: Evaluate on test data
def evaluate_model(model, test_loader):
    all_targets = []
    all_predictions = []

    model.eval()
    with torch.no_grad():
        for sequences, targets in test_loader:
            predictions = model(sequences)
            all_targets.append(targets.numpy())
            all_predictions.append(predictions.numpy())

    # Convert to numpy arrays
    all_targets = np.concatenate(all_targets, axis=0)
    all_predictions = np.concatenate(all_predictions, axis=0)

    # Compute metrics
    mse = mean_squared_error(all_targets, all_predictions)
    mae = mean_absolute_error(all_targets, all_predictions)
    r2 = r2_score(all_targets, all_predictions)

    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R^2: {r2:.4f}")

    # Visualization
    plt.figure(figsize=(12, 6))
    plt.plot(all_targets[:100], label="Actual")
    plt.plot(all_predictions[:100], label="Predicted")
    plt.title("Time-Series Forecast (Sample)")
    plt.legend()
    plt.show()

    # Residual plot
    residuals = all_targets - all_predictions
    plt.figure(figsize=(10, 5))
    plt.plot(residuals, label="Residuals")
    plt.axhline(0, color="r", linestyle="--")
    plt.title("Residual Plot")
    plt.legend()
    plt.show()

In [8]:
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 4: Perform Leave-One-Out Validation
def leave_one_out_validation(dataset_sequences):
    results = []

    for test_file in dataset_sequences.keys():
        print(f"Leave-One-Out: Testing on {test_file}")

        # Prepare training data (all files except the test file)
        train_sequences = []
        train_targets = []
        for file, data in dataset_sequences.items():
            if file != test_file:
                train_sequences.extend(data["sequences"])
                train_targets.extend(data["targets"])

        # Prepare test data
        test_data = dataset_sequences[test_file]
        test_sequences = test_data["sequences"]
        test_targets = test_data["targets"]

        # Create DataLoaders
        train_dataset = TimeSeriesDataset(train_sequences, train_targets)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=5, persistent_workers=True)

        test_dataset = TimeSeriesDataset(test_sequences, test_targets)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=5, persistent_workers=True)

        # Initialize the model
        model = CNNLSTM(
            input_dim=3,        # Number of channels
            hidden_dim=64,      # Number of LSTM hidden units
            num_layers=2,       # Stacked LSTM layers
            output_dim=3,       # Output channels (same as input channels)
            kernel_size=3,      # Convolution kernel size
            learning_rate=1e-3  # Learning rate
        )

        # Define callbacks
        early_stopping = EarlyStopping(monitor="val_loss", patience=5, mode="min")
        checkpoint = ModelCheckpoint(monitor="val_loss", mode="min", filename="best_model")

        # Train the model
        trainer = Trainer(
            max_epochs=20,
            callbacks=[early_stopping, checkpoint],
            log_every_n_steps=10
        )
        trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=test_loader)

        # **Use evaluate_model() during this iteration**
        print(f"Evaluation for {test_file}:")
        evaluate_model(model, test_loader)  # Visualization for this iteration

        # Calculate metrics
        all_targets = []
        all_predictions = []
        model.eval()
        with torch.no_grad():
            for sequences, targets in test_loader:
                predictions = model(sequences)
                all_targets.append(targets.numpy())
                all_predictions.append(predictions.numpy())

        all_targets = np.concatenate(all_targets, axis=0)
        all_predictions = np.concatenate(all_predictions, axis=0)

        mse = mean_squared_error(all_targets, all_predictions)
        mae = mean_absolute_error(all_targets, all_predictions)
        r2 = r2_score(all_targets, all_predictions)

        print(f"Results for {test_file}: MSE={mse:.4f}, MAE={mae:.4f}, R^2={r2:.4f}")
        results.append({"file": test_file, "mse": mse, "mae": mae, "r2": r2})

    return results

In [None]:
# print(f"CPU cores available: {os.cpu_count()}")

In [9]:
# Run Leave-One-Out Validation
results = leave_one_out_validation(dataset_sequences)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/vinny/miniconda3/envs/ml-pytorch-312/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name | Type   | Params | Mode 
----------------------------------------
0 | cnn  | Conv1d | 640    | train
1 | lstm | LSTM   | 66.6 K | train
2 | fc   | Linear | 195    | train
----------------------------------------
67.4 K    Trainable params
0         Non-trainable params
67.4 K    Total params
0.270     Total estimate

Leave-One-Out: Testing on 081-3623.csv
Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/vinny/miniconda3/envs/ml-pytorch-312/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vinny/miniconda3/envs/ml-pytorch-312/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'TimeSeriesDataset' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>

Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# Post-Validation: Detailed evaluation for specific test datasets
for test_file in dataset_sequences.keys():
    print(f"\nPost-Validation Evaluation for {test_file}:")
    test_data = dataset_sequences[test_file]
    test_sequences = test_data["sequences"]
    test_targets = test_data["targets"]

    # Create DataLoader for the test data
    test_dataset = TimeSeriesDataset(test_sequences, test_targets)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Assuming the model is trained (or reload the model checkpoint if needed)
    trained_model = CNNLSTM(
        input_dim=3,
        hidden_dim=64,
        num_layers=2,
        output_dim=3,
        kernel_size=3,
        learning_rate=1e-3
    )

    # Perform post-validation evaluation
    evaluate_model(trained_model, test_loader)