
PCA

In [12]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from utils.dataset import get_data_loaders
import hydra
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose
import torch

# Select the compute device once, then report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is NOT available. Using CPU.")

# Reset Hydra's global singleton before initialising, so this cell can be
# executed more than once in the same kernel.
hydra.core.global_hydra.GlobalHydra.instance().clear()
with initialize(version_base=None, config_path="../src/configs"):
    cfg = compose(
        config_name="base",
        overrides=["project_root=/home/paperspace/DeepEmotion"],
    )
print(cfg)

# Override the configured training batch size for this notebook.
cfg.train.batch_size = 1024

CUDA is available. Using GPU: Quadro M4000
{'project_root': '/home/paperspace/DeepEmotion', 'data': {'data_path': '${project_root}/data/raw/derivatives/non-linear_anatomical_alignment', 'zarr_path': '${project_root}/dataset.zarr', 'label_path': '${project_root}/data/resampled_annotations/av1o6_resampled.tsv', 'sessions': ['01', '02', '03', '04', '05', '06', '07', '08'], 'file_pattern_template': '*_ses-forrestgump_task-forrestgump_rec-dico7Tad2grpbold7TadNL_run-{}_bold.nii.gz', 'subjects': ['sub-01', 'sub-02', 'sub-03', 'sub-04', 'sub-05', 'sub-06', 'sub-07', 'sub-08', 'sub-09', 'sub-11', 'sub-12', 'sub-13', 'sub-14', 'sub-15', 'sub-16', 'sub-17', 'sub-18', 'sub-19', 'sub-20'], 'session_offsets': [0, 902, 1784, 2660, 3636, 4560, 5438, 6522], 'emotion_idx': {'NONE': 0, 'HAPPINESS': 1, 'FEAR': 2, 'SADNESS': 3, 'LOVE': 4, 'ANGERRAGE': 5, 'CONTEMPT': 6, 'GRATITUDE': 7, 'ADMIRATION': 8, 'COMPASSION': 9, 'PRIDE': 10, 'REMORSE': 11, 'DISAPPOINTMENT': 12, 'HAPPYFOR': 13, 'GLOATING': 14, 'SATISF

In [13]:
# Build the train / validation loaders from the composed config.
train_dataloader, val_dataloader = get_data_loaders(cfg)

# Pull a single batch so the tensor layout can be inspected below.
sample_batch = next(iter(train_dataloader))
batch_data = sample_batch["data_tensor"]
num_train_batches = len(train_dataloader)

print("Cell 2 complete: Loaded the dataset.")
print(f"Number of train batches: {num_train_batches}")
print(f"Shape of one batch of data: {batch_data.shape}")

Dataset contains 152 files.
Spatial dimensions: (132, 175, 48)
Maximum timepoints per file: 542
Subjects: ['sub-01' 'sub-02' 'sub-03' 'sub-04' 'sub-05' 'sub-06' 'sub-07' 'sub-08'
 'sub-09' 'sub-11' 'sub-12' 'sub-13' 'sub-14' 'sub-15' 'sub-16' 'sub-17'
 'sub-18' 'sub-19' 'sub-20']
Sessions: ['01' '02' '03' '04' '05' '06' '07' '08']
Emotion categories: ['NONE', 'HAPPINESS', 'FEAR', 'SADNESS', 'LOVE', 'ANGERRAGE', 'CONTEMPT', 'GRATITUDE', 'ADMIRATION', 'COMPASSION', 'PRIDE', 'REMORSE', 'DISAPPOINTMENT', 'HAPPYFOR', 'GLOATING', 'SATISFACTION', 'HOPE', 'HATE', 'RELIEF', 'SHAME', 'GRATIFICATION', 'FEARSCONFIRMED']
Total valid labeled timepoints: 28424
Cell 2 complete: Loaded the dataset.
Number of train batches: 23
Shape of one batch of data: torch.Size([1024, 132, 175, 48])


In [11]:
# Streaming PCA over the training set.
#
# The data is visited one batch at a time. For every batch we
#   1. update a running mean of the flattened volumes,
#   2. centre the batch on that running mean, and
#   3. apply one block power-iteration step, W <- orth(X^T (X W)),
#      which pulls the column space of W towards the dominant directions of
#      the batch covariance X^T X.
# After the loop W holds an orthonormal basis (num_features x n_components) of
# the estimated principal subspace. The columns are NOT sorted by explained
# variance.
#
# NOTE: centring uses the mean of all samples seen *so far*, so early batches
# are centred on a less accurate mean than later ones. This is an
# approximation, not an exact PCA.

# Define PCA parameters
n_components = 50  # Set desired number of principal components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

# Get dataset dimensions from a single batch
sample_batch = next(iter(train_dataloader))["data_tensor"]
num_features = sample_batch[0].numel()  # product of all non-batch dimensions

# Initialise the basis from a seeded, local generator so reruns start from the
# same subspace without touching the global RNG (which the loader may use).
pca_rng = torch.Generator(device=device)
pca_rng.manual_seed(0)
W = torch.randn(num_features, n_components, generator=pca_rng, device=device)
W, _ = torch.linalg.qr(W)  # orthonormal start; shape (num_features, n_components)
mean_running = torch.zeros(num_features, device=device)  # Running mean
num_samples = 0  # Total number of processed samples

# Process data batch by batch
for batch_idx, batch in enumerate(train_dataloader):
    print(f"Processing batch {batch_idx}")

    allocated_before = torch.cuda.memory_allocated() / 1e6
    reserved_before = torch.cuda.memory_reserved() / 1e6

    # Move batch to the device and flatten every sample to a row vector.
    # flatten() (unlike view()) also works when the tensor is not contiguous.
    batch_data = batch["data_tensor"].float().to(device, non_blocking=True)
    batch_data = batch_data.flatten(start_dim=1)  # [B, num_features]
    batch_size = batch_data.shape[0]

    # Update running mean: mean += (batch_mean - mean) * B / N
    batch_mean = batch_data.mean(dim=0)
    num_samples += batch_size
    mean_running += (batch_mean - mean_running) * (batch_size / num_samples)

    # Center batch data (remove mean); in place to avoid a second full-size copy
    batch_data -= mean_running

    # Block power-iteration step. The previous update (W @ V from the SVD of
    # X W) only rotated W inside its initial random subspace and therefore
    # never learned from the data. Multiplying by X^T X does.
    # A batch with too few rows cannot span n_components directions, so it
    # only contributes to the mean and leaves W untouched.
    if batch_size > n_components:
        batch_projection = batch_data @ W  # [B, n_components]
        W, _ = torch.linalg.qr(batch_data.T @ batch_projection)  # re-orthonormalise
        del batch_projection

    # Log GPU memory after processing batch
    allocated_after = torch.cuda.memory_allocated() / 1e6
    reserved_after = torch.cuda.memory_reserved() / 1e6

    print(f"Batch {batch_idx} GPU memory usage:")
    print(f"  Before processing: allocated={allocated_before:.2f} MB, reserved={reserved_before:.2f} MB")
    print(f"  After processing: allocated={allocated_after:.2f} MB, reserved={reserved_after:.2f} MB")

    # Free GPU memory held by this batch before loading the next one
    del batch_data, batch_mean
    torch.cuda.empty_cache()

print(f"Incremental PCA training complete. Principal components shape: {W.shape}")


device: cuda
Running Incremental PCA on cuda...
Processing batch 0
Batch 0 GPU memory usage:
  Before processing: allocated=235.25 MB, reserved=486.54 MB
  After processing: allocated=1375.21 MB, reserved=1625.29 MB
Processing batch 1
Batch 1 GPU memory usage:
  Before processing: allocated=235.25 MB, reserved=264.24 MB
  After processing: allocated=1375.21 MB, reserved=1625.29 MB


KeyboardInterrupt: 