
PCA

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from utils.dataset import get_data_loaders
import hydra
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose
import torch
import os
from utils.dataset import get_data_loaders
from models.CNN import CNN
from models.resnet import ResNet, BasicBlock
import pandas as pd
from tqdm import tqdm
from utils.dataset import get_data_loaders
from models.CNN import CNN
from sklearn.decomposition import IncrementalPCA

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is NOT available. Using CPU.")

hydra.core.global_hydra.GlobalHydra.instance().clear()
with initialize(version_base=None, config_path="../../src/configs"):
    cfg = compose(config_name="base", overrides=["project_root=/home/paperspace/DeepEmotion"])
print(cfg)

ModuleNotFoundError: No module named 'matplotlib'

In [3]:
train_dataloader, val_dataloader = get_data_loaders(cfg)

num_train_batches = len(train_dataloader)
sample_batch = next(iter(train_dataloader))
batch_data = sample_batch["data_tensor"]

print(f"Cell 2 complete: Loaded the dataset.")
print(f"Number of train batches: {num_train_batches}")
print(f"Shape of one batch of data: {batch_data.shape}")
print(f"batch_size: {cfg.train.batch_size}")

Dataset contains 152 files.
Spatial dimensions: (132, 175, 48)
Maximum timepoints per file: 542
Subjects: ['sub-01' 'sub-02' 'sub-03' 'sub-04' 'sub-05' 'sub-06' 'sub-07' 'sub-08'
 'sub-09' 'sub-11' 'sub-12' 'sub-13' 'sub-14' 'sub-15' 'sub-16' 'sub-17'
 'sub-18' 'sub-19' 'sub-20']
Sessions: ['01' '02' '03' '04' '05' '06' '07' '08']
Emotion categories: ['NONE', 'HAPPINESS', 'FEAR', 'SADNESS', 'LOVE', 'ANGER']
Total valid labeled timepoints: 13813
Cell 2 complete: Loaded the dataset.
Number of train batches: 442
Shape of one batch of data: torch.Size([25, 132, 175, 48])
batch_size: 25


In [5]:
# Invert the label dictionary
inverse_emotion_idx = {v: k for k, v in cfg.data.emotion_idx.items()}

n_components = 2  # if you're only using 2 now
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get dataset dimensions
sample_batch = next(iter(train_dataloader))["data_tensor"]
num_features = sample_batch.shape[1] * sample_batch.shape[2] * sample_batch.shape[3]  # e.g., 132 * 175 * 48

W = torch.randn(num_features, n_components, device=device)
mean_running = torch.zeros(num_features, device=device)
num_samples = 0
results = []

for batch_idx, batch in enumerate(train_dataloader):
    batch_data = batch["data_tensor"].float().to(device, non_blocking=True)
    batch_data = batch_data.view(batch_data.size(0), -1)

    batch_mean = batch_data.mean(dim=0)
    num_samples += batch_data.shape[0]
    mean_running = (mean_running * (num_samples - batch_data.shape[0]) 
                    + batch_data.shape[0] * batch_mean) / num_samples

    batch_data -= mean_running

    # Power iteration / simple incremental update
    temp_projection = batch_data @ W
    U, S, Vh = torch.linalg.svd(temp_projection, full_matrices=False)
    W = W @ Vh[:n_components].T  # W remains [num_features, 2]

    batch_projection = batch_data @ W  # shape [B, 2]

    # Retrieve labels
    labels = batch["label_tensor"]
    
    # Convert to CPU for final results
    batch_projection_cpu = batch_projection.cpu().numpy()
    labels_cpu = labels.cpu().numpy()

    # Store PC values plus label name
    for i in range(batch_projection_cpu.shape[0]):
        pc_values = list(batch_projection_cpu[i])  
        label_idx = labels_cpu[i]
        label_name = inverse_emotion_idx.get(label_idx, "UNK")
        pc_values.append(label_name)
        results.append(pc_values)

# Write to CSV
columns = [f"PC{i+1}" for i in range(n_components)] + ["EmotionLabel"]
df = pd.DataFrame(results, columns=columns)
csv_path = "/home/paperspace/DeepEmotion/output/PCA/raw/sub_ALL.csv"
df.to_csv(csv_path, index=False)
print(f"PCA projections saved to {csv_path}")


OSError: Cannot save file into a non-existent directory: '/home/paperspace/DeepEmotion/src/output/PCA/raw'

In [14]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
from omegaconf import OmegaConf
from utils.dataset import get_data_loaders
from models.CNN import CNN

# Config variables
PROJECT_ROOT = os.path.abspath("../../")

cfg = OmegaConf.create({
    "project_root": PROJECT_ROOT,
    "verbose": True,
    "wandb": True,
    "sys_log": True,
    "model": "CNN",
    "CNN": {
        "c1": 16, "c2": 32, "c3": 64, "k1": 3, "k2": 3, "k3": 3,
        "pk": 2, "ps": 2, "kernel_size": 3, "stride": 1, "padding": 1
    },
    "train": {
        "epochs": 50, "batch_size": 20, "shuffle": True, "train_ratio": 0.8,
        "print_label_frequencies": True
    },
    "data": {
        "data_path": f"{PROJECT_ROOT}/data/raw/derivatives/non-linear_anatomical_alignment",
        "zarr_dir_path": f"{PROJECT_ROOT}/zarr_datasets",
        "zarr_path": f"{PROJECT_ROOT}/zarr_datasets/pool_emotions",
        "label_path": f"{PROJECT_ROOT}/data/updated_annotations/pooled_annotations_structured.tsv",
        "sessions": ["01", "02", "03", "04", "05", "06", "07", "08"],
        "file_pattern_template": "*_ses-forrestgump_task-forrestgump_rec-dico7Tad2grpbold7TadNL_run-{}_bold.nii.gz",
        "subjects": ["sub-7"],
        "session_offsets": [0, 902, 1784, 2660, 3636, 4560, 5438, 6522],
        "emotion_idx": {"NONE": 0, "HAPPINESS": 1, "FEAR": 2, "SADNESS": 3, "LOVE": 4, "ANGER": 5},
        "normalization": False,
        "weight_decay": 0,
        "learning_rate": 0.0001,
        "seed": 42,
        "save_model": True,
        "load_model": False,
        "save_model_path": "output/models",
        "load_model_path": f"{PROJECT_ROOT}/output/models/model-sub02-20.pth",
        "output_csv_path": f"{PROJECT_ROOT}/output/PCA/hidden/sub_ALL.csv"
    }
})

print(cfg.data.zarr_path)

# Load dataloaders
train_dataloader, val_dataloader = get_data_loaders(cfg)

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN(cfg=cfg, output_dim=len(cfg.data.emotion_idx))
model.load_state_dict(torch.load(cfg.data.load_model_path, map_location=device))
model.to(device)
model.eval()

# Get hidden state dimension
sample_batch = next(iter(train_dataloader))["data_tensor"].to(device).float()
with torch.no_grad():
    _, hidden_sample = model(sample_batch[:1], return_hidden=True)
hidden_dim = hidden_sample.shape[1]

# Invert emotion label map
inverse_emotion_idx = {v: k for k, v in cfg.data.emotion_idx.items()}

# PCA loop
n_components = 2
num_epochs = 2

for epoch in range(1, num_epochs + 1):
    print(f"\nEpoch {epoch}/{num_epochs}")
    
    ipca = IncrementalPCA(n_components=n_components)

    # Fit PCA incrementally
    for batch in tqdm(train_dataloader, desc=f"Fitting IncrementalPCA Epoch {epoch}"):
        data = batch["data_tensor"].float().to(device)
        with torch.no_grad():
            _, hidden = model(data, return_hidden=True)
        ipca.partial_fit(hidden.cpu().numpy())

# After final epoch, transform and save
all_hidden = []
all_labels = []

for batch in tqdm(train_dataloader, desc="Transforming after final epoch"):
    data = batch["data_tensor"].float().to(device)
    labels = batch["label_tensor"]
    with torch.no_grad():
        _, hidden = model(data, return_hidden=True)
    all_hidden.append(hidden.cpu().numpy())
    all_labels.append(labels.cpu().numpy())

all_hidden = np.concatenate(all_hidden, axis=0)
all_labels = np.concatenate(all_labels, axis=0)
pca_proj = ipca.transform(all_hidden)

results = []
for row, label_idx in zip(pca_proj, all_labels):
    row = list(row)
    row.append(inverse_emotion_idx.get(label_idx, "UNK"))
    results.append(row)

df = pd.DataFrame(results, columns=["PC1", "PC2", "EmotionLabel"])
save_dir = os.path.join(PROJECT_ROOT, "output/PCA/hidden")
os.makedirs(save_dir, exist_ok=True)
csv_path = os.path.join(save_dir, "sub_AL_ao_annotations.csv")
df.to_csv(csv_path, index=False)
print(f"Saved PCA to {csv_path}")


/home/paperspace/DeepEmotion/zarr_datasets/pool_emotions
Dataset contains 152 files.
Spatial dimensions: (132, 175, 48)
Maximum timepoints per file: 542
Subjects: ['sub-01' 'sub-02' 'sub-03' 'sub-04' 'sub-05' 'sub-06' 'sub-07' 'sub-08'
 'sub-09' 'sub-11' 'sub-12' 'sub-13' 'sub-14' 'sub-15' 'sub-16' 'sub-17'
 'sub-18' 'sub-19' 'sub-20']
Sessions: ['01' '02' '03' '04' '05' '06' '07' '08']
Emotion categories: ['NONE', 'HAPPINESS', 'FEAR', 'SADNESS', 'LOVE', 'ANGER']
Total valid labeled timepoints: 13813

Epoch 1/2


Fitting IncrementalPCA Epoch 1: 100%|██████████| 553/553 [06:20<00:00,  1.45it/s]



Epoch 2/2


Fitting IncrementalPCA Epoch 2: 100%|██████████| 553/553 [06:13<00:00,  1.48it/s]
Transforming after final epoch: 100%|██████████| 553/553 [06:19<00:00,  1.46it/s]


Saved PCA to /home/paperspace/DeepEmotion/output/PCA/hidden/sub_AL_ao_annotations.csv
