To keep the data-proportion experiments fair, the k folds of the graph dataset must be constructed beforehand, so that every run is trained and evaluated on exactly the same splits. This notebook builds those folds and saves them to disk.

In [1]:
### SET PATH TO BE ROOT ###

import os

# Directory expected to exist in the project root: the cells below load their
# configs through paths relative to it (e.g. 'cfgs/train_gnn.yaml').
ROOT_MARKER = "cfgs"

# Move one directory up only when the working directory is not already the
# project root. Without this guard every re-run of the cell climbs one level
# higher and the relative config paths used later stop resolving.
if not os.path.isdir(ROOT_MARKER):
    os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

# Verify the change
print("Current working directory:", os.getcwd())

Current working directory: /home/toy-problem


In [2]:
import argparse
import numpy as np
from pathlib import Path

from easydict import EasyDict
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import Subset
from torch_geometric.loader import DataLoader as GeomDataLoader

from tqdm import tqdm

import pytorch_lightning as pl
from models.pytorch_lightning import MAELightningModule, GAELightningModule
from datasets.pytorch import GraphDataset
from datasets.pytorch_lightning import GNNDataModule, PartNetDataModule, CeasarDataModule, PartNetEmbeddingsDataModule
from models.pytorch_models import Point_MAE
from utils.config import *


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [9]:
# Encode every point cloud with the frozen pretrained encoder and store the
# resulting embeddings and labels on disk, one folder per sample.

# args = argparse.Namespace(config='cfgs/build_gnn_ds.yaml', wandb=False)
args = argparse.Namespace(config='cfgs/build_gnn_ds_ceasar.yaml', wandb=False)

cfg = get_cfg(args=args, logger=None)

# Load and freeze the encoder
pretrained_frozen_encoder = MAELightningModule.load_and_freeze_encoder(cfg.group_and_encode_model.pretrained_ckpnt, cfg, args)
group_and_encode = GAELightningModule(cfg, args=args, pretrained_encoder=pretrained_frozen_encoder, base_type=False)
# load the data module
# data_module = PartNetDataModule(cfg=cfg, args=args)
data_module = CeasarDataModule(cfg=cfg, args=args)


trainer = pl.Trainer(
    accelerator='gpu',
    devices=[int(cfg.device.device_id)],
    max_epochs=1,  # one epoch to extract all latents
    logger=None,
    # default_root_dir=args.experiment_path,
    # limit_val_batches=0,
)

# Inference only (nothing is trained here): collect the module's predictions
# for every batch served by the data module.
predictions = trainer.predict(model=group_and_encode, datamodule=data_module)

# Root folder that receives one sub-folder per encoded sample.
embeddings_root = Path("/srv/healthcare/datascience/data/GRAPE/caesar_raw_embeddings")

# loop through all predictions
print("creating dataset.")
sample = 0
for encoded_batch, labels_batch in tqdm(predictions):
    for encoded_pcd, labels in zip(encoded_batch, labels_batch):
        # Take the folder name from the counter *before* incrementing it, so
        # folders are numbered 0..N-1 like the per-sample folders written by
        # the fold-building cells. Previously the counter was bumped first and
        # the folders started at 1.
        # NOTE(review): folders left over from an earlier 1-based run are not
        # removed; clear the target directory before regenerating.
        file_name = str(sample)
        sample += 1
        save_dir = embeddings_root / file_name
        save_dir.mkdir(parents=True, exist_ok=True)
        # save the data
        encoded_pcd = encoded_pcd.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        # Shift every label down by one.
        # NOTE(review): presumably converts 1-based part labels to 0-based
        # class ids - confirm against the dataset definition.
        labels = labels - 1

        np.save(save_dir / 'embeddings.npy', encoded_pcd)
        np.save(save_dir / 'labels.npy', labels)

# print(f'{len([p for p in graph_data_path.iterdir() if p.is_dir()])} graphs created')



2024-08-11 17:38:54,149 - Point_MAE - INFO - [Point_MAE] 
2024-08-11 17:38:54,153 - Transformer - INFO - [args] {'mask_ratio': 0.6, 'mask_type': 'rand', 'trans_dim': 384, 'encoder_dims': 384, 'depth': 12, 'drop_path_rate': 0.1, 'num_heads': 6, 'decoder_depth': 4, 'decoder_num_heads': 6}
2024-08-11 17:38:54,479 - Point_MAE - INFO - [Point_MAE] divide point cloud into G256 x S32 points ...
2024-08-11 17:38:54,974 - Transformer - INFO - [args] {'trans_dim': 384, 'encoder_dims': 384, 'depth': 12, 'drop_path_rate': 0.1, 'num_heads': 6}
2024-08-11 17:38:55,200 - GroupAndEncode - INFO - [GroupAndEncode] divide point cloud into G256 x S32 points ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2024-08-11 17:38:55,224 - Ceasar - INFO - [DATASET] 80 instances were loaded
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


PRETRAINED ENCODER LOADED SUCCESFULLY
FINAL MASK RATIO:  0.0


Predicting: |          | 0/? [00:00<?, ?it/s]

creating dataset.


100%|██████████| 10/10 [00:00<00:00, 124.34it/s]


Running the below code will output something like:

2024-06-13 10:34:17,478 - GraphData - INFO - [DATASET] 5159 instances were loaded

This message can be ignored, because we access the `full_dataset` below, not the train/val/test splits created by the data module.

In [5]:
# Build a fixed 90/10 train+val / test split of the graph dataset, cut the
# train+val part into 5 folds and write every split to disk.

args = argparse.Namespace(config='cfgs/train_gnn.yaml')

cfg = get_cfg(args=args, logger=None)
# cfg.dataset.train.return_raw_data = True

# init data module
data_module = GNNDataModule(cfg=cfg, args=None)
full_dataset = data_module.full_dataset

# make a 90% train+val and 10% test split
train_indices, test_indices = train_test_split(list(range(len(full_dataset))), test_size=0.1, random_state=42)

train_dataset = Subset(full_dataset, train_indices)
test_dataset = Subset(full_dataset, test_indices)


def _save_graph_split(split, split_dir):
    """Write every sample of `split` to `split_dir/<position>/`.

    Args:
        split: indexable dataset whose items expose 'nodes' and 'edges'.
        split_dir: pathlib.Path of the folder that receives one numbered
            sub-folder (0, 1, ...) per sample.
    """
    for s in range(len(split)):
        sample = split[s]
        sample_folder = split_dir / f"{s}"
        sample_folder.mkdir(parents=True, exist_ok=True)

        # Save the sample
        np.save(sample_folder / 'nodes.npy', sample['nodes'])
        np.save(sample_folder / 'hierarchy_edges.npy', sample['edges'])


# make 5 folds of the train+val part
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# create 5 folds, then save each fold as fold_x
for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
    # save_dir = Path(f"/srv/healthcare/datascience/data/GRAPE/table/folds/fold_{fold}")
    save_dir = Path(f"/srv/healthcare/datascience/data/GRAPE/table_occluded/folds/fold_{fold}")
    save_dir.mkdir(parents=True, exist_ok=True)

    # KFold yields positions *within train_dataset*, not indices of
    # full_dataset. Wrapping them in a Subset of train_dataset maps them back
    # through train_indices; indexing full_dataset with them directly would
    # pull in held-out test samples.
    train_fold = Subset(train_dataset, train_idx)
    val_fold = Subset(train_dataset, val_idx)

    # save training split
    _save_graph_split(train_fold, save_dir / 'training')

    # save validation split
    _save_graph_split(val_fold, save_dir / 'validation')

    print(f"Fold {fold} saved")

# Save test data separately
# test_save_dir = Path("/srv/healthcare/datascience/data/GRAPE/table/test")
test_save_dir = Path("/srv/healthcare/datascience/data/GRAPE/table_occluded/test")

test_save_dir.mkdir(parents=True, exist_ok=True)

_save_graph_split(test_dataset, test_save_dir)

print("Test data saved")


2024-08-11 17:32:06,823 - GraphData - INFO - [DATASET] 5159 instances were loaded


DATA SPLIT: 3611 train, 1031 val, 517 test samples


KeyboardInterrupt: 

In [10]:
# Build a fixed 90/10 train+val / test split of the CAESAR embeddings dataset,
# cut the train+val part into 5 folds and write every split to disk.

args = argparse.Namespace(config='cfgs/train_mlp_caesar.yaml')

cfg = get_cfg(args=args, logger=None)
# cfg.dataset.train.return_raw_data = True

# init data module
data_module = PartNetEmbeddingsDataModule(cfg=cfg, args=None)
full_dataset = data_module.full_dataset

# make a 90% train+val and 10% test split
train_indices, test_indices = train_test_split(list(range(len(full_dataset))), test_size=0.1, random_state=42)

train_dataset = Subset(full_dataset, train_indices)
test_dataset = Subset(full_dataset, test_indices)


def _save_embedding_split(split, split_dir):
    """Write every sample of `split` to `split_dir/<position>/`.

    Args:
        split: indexable dataset whose items unpack into
            (embeddings, labels).
        split_dir: pathlib.Path of the folder that receives one numbered
            sub-folder (0, 1, ...) per sample.
    """
    for s in range(len(split)):
        encoded_pcd, labels = split[s]
        sample_folder = split_dir / f"{s}"
        sample_folder.mkdir(parents=True, exist_ok=True)

        # Save the sample
        np.save(sample_folder / 'embeddings.npy', encoded_pcd)
        np.save(sample_folder / 'labels.npy', labels)


# make 5 folds of the train+val part
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# create 5 folds, then save each fold as fold_x
for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
    # save_dir = Path(f"/srv/healthcare/datascience/data/GRAPE/table/folds/fold_{fold}")
    save_dir = Path(f"/srv/healthcare/datascience/data/GRAPE/caesar_raw_embeddings_folds/folds/fold_{fold}")
    save_dir.mkdir(parents=True, exist_ok=True)

    # KFold yields positions *within train_dataset*, not indices of
    # full_dataset. Wrapping them in a Subset of train_dataset maps them back
    # through train_indices; indexing full_dataset with them directly would
    # pull in held-out test samples.
    train_fold = Subset(train_dataset, train_idx)
    val_fold = Subset(train_dataset, val_idx)

    # save training split
    _save_embedding_split(train_fold, save_dir / 'training')

    # save validation split
    _save_embedding_split(val_fold, save_dir / 'validation')

    print(f"Fold {fold} saved")

# Save test data separately
# test_save_dir = Path("/srv/healthcare/datascience/data/GRAPE/table/test")
test_save_dir = Path("/srv/healthcare/datascience/data/GRAPE/caesar_raw_embeddings_folds/test")

test_save_dir.mkdir(parents=True, exist_ok=True)

_save_embedding_split(test_dataset, test_save_dir)
print("Test data saved")


2024-08-11 17:39:03,866 - PartNetEmbeddings - INFO - [DATASET] 80 instances were loaded


DATA SPLIT: 56 train, 16 val, 8 test samples
Fold 0 saved
Fold 1 saved
Fold 2 saved
Fold 3 saved
Fold 4 saved
Test data saved
