In [None]:
# Load IPython's autoreload extension and switch it to mode 2 so edits to the
# local modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the extension was already loaded two lines above, so this
# reload looks redundant - confirm it does not reset the `%autoreload 2` mode.
%reload_ext autoreload


In [None]:
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn

from model import MultiSacleUNet
from dataset import ShapeNetPartDataset


def train(model, train_dataloader, val_dataloader, device, config):
    """
    Optimise `model` with Adam and a cross-entropy loss, validating periodically.

    Each batch is expected to be a dict holding the network input under
    '3d_points' and the target labels under 'part_label'. Batches are handed to
    `ShapeNetPartDataset.move_batch_to_device`, which is assumed to move the
    tensors in place (its return value is not used).

    Whenever the mean validation loss improves, the model's state dict is
    written to "runs/<experiment_name>/model_best.ckpt", relative to the current
    working directory (the same folder `main` creates).

    :param model: torch module to train; called as `model(batch['3d_points'])`
    :param train_dataloader: iterable of training batches that supports `len()`
    :param val_dataloader: iterable of validation batches that supports `len()`
    :param device: device the loss and the batches are moved to
    :param config: dict with the keys 'learning_rate', 'max_epochs',
                   'print_every_n', 'validate_every_n' and 'experiment_name'
    :return: None
    """
    # The criterion keeps its own name for the whole run. The per-batch loss
    # tensor must never be assigned to the same variable, otherwise the second
    # iteration would try to call a tensor.
    loss_criterion = nn.CrossEntropyLoss()
    loss_criterion.to(device)

    # Declare optimizer with learning rate given in config
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])

    # Relative checkpoint location; created here as well so that train() also
    # works when it is called without going through main().
    checkpoint_dir = Path('runs') / config['experiment_name']
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_path = checkpoint_dir / 'model_best.ckpt'

    model.train()
    best_loss_val = np.inf

    # Keep track of running average of train loss for printing
    train_loss_running = 0.

    for epoch in range(config['max_epochs']):
        for batch_idx, batch in enumerate(train_dataloader):
            # Move batch to device
            ShapeNetPartDataset.move_batch_to_device(batch, device)

            # Reset gradients, run the forward pass, then backpropagate and step
            optimizer.zero_grad()
            predicted_part_label = model(batch['3d_points'])
            loss = loss_criterion(predicted_part_label, batch['part_label'])
            loss.backward()
            optimizer.step()

            # Logging
            train_loss_running += loss.item()
            iteration = epoch * len(train_dataloader) + batch_idx

            if iteration % config['print_every_n'] == (config['print_every_n'] - 1):
                print(f'[{epoch:03d}/{batch_idx:05d}] train_loss: {train_loss_running / config["print_every_n"]:.6f}')
                train_loss_running = 0.

            # Validation evaluation and logging
            if iteration % config['validate_every_n'] == (config['validate_every_n'] - 1):
                model.eval()

                # Mean loss over the entire validation set; no gradients needed
                loss_val = 0.
                with torch.no_grad():
                    for batch_val in val_dataloader:
                        ShapeNetPartDataset.move_batch_to_device(batch_val, device)
                        prediction = model(batch_val['3d_points'])
                        loss_val += loss_criterion(prediction, batch_val['part_label']).item()

                loss_val /= len(val_dataloader)
                if loss_val < best_loss_val:
                    torch.save(model.state_dict(), checkpoint_path)
                    best_loss_val = loss_val

                print(f'[{epoch:03d}/{batch_idx:05d}] val_loss: {loss_val:.6f} | best_loss_val: {best_loss_val:.6f}')

                # Set model back to train
                model.train()

In [None]:
def main(config):
    """
    Function for training multi-scale U-Net on ShapeNetPart
    :param config: configuration for training - has the following keys
                   'experiment_name': name of the experiment, checkpoints are saved to folder "./runs/<experiment_name>"
                   'device': device on which model is trained, e.g. 'cpu' or 'cuda:0'
                   'train_batch_size': batch size for the training dataloader
                   'val_batch_size': batch size for the validation dataloader
                   'resume_ckpt': None (or any falsy value) if training from scratch, otherwise path to checkpoint (saved weights)
                   'learning_rate': learning rate for optimizer
                   'max_epochs': total number of epochs after which training should stop
                   'print_every_n': print train loss every n iterations
                   'validate_every_n': print validation loss every n iterations
                   'is_overfit': if truthy, both the training and the validation dataset use the 'overfit' split,
                                 so error close to 0 means a good overfit. Useful for debugging.
    """

    # Declare device; anything that is not an available CUDA device falls back to the CPU
    device = torch.device('cpu')
    if torch.cuda.is_available() and config['device'].startswith('cuda'):
        device = torch.device(config['device'])
        print('Using device:', config['device'])
    else:
        print('Using CPU')

    # In overfit mode training and validation read the same split
    if config['is_overfit']:
        train_split, val_split = 'overfit', 'overfit'
    else:
        train_split, val_split = 'train', 'val'

    # Create Dataloaders
    train_dataset = ShapeNetPartDataset(path='shapenet_prepared.h5', split=train_split)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,   # Datasets return data one sample at a time; Dataloaders use them and aggregate samples into batches
        batch_size=config['train_batch_size'],   # The size of batches is defined here
        shuffle=True,    # Shuffling the order of samples is useful during training to prevent that the network learns to depend on the order of the input data
        num_workers=4,   # Data is usually loaded in parallel by num_workers
        pin_memory=True,  # This is an implementation detail to speed up data uploading to the GPU
    )

    val_dataset = ShapeNetPartDataset(path='shapenet_prepared.h5', split=val_split)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,     # Datasets return data one sample at a time; Dataloaders use them and aggregate samples into batches
        batch_size=config['val_batch_size'],   # The size of batches is defined here
        shuffle=False,   # During validation, shuffling is not necessary anymore
        num_workers=4,   # Data is usually loaded in parallel by num_workers
        pin_memory=True,  # This is an implementation detail to speed up data uploading to the GPU
    )

    # Instantiate model
    model = MultiSacleUNet()

    # Load weights if resuming from a checkpoint. A truthiness test (rather than
    # `is not None`) also treats a missing key or `False` as "train from scratch".
    resume_ckpt = config.get('resume_ckpt')
    if resume_ckpt:
        model.load_state_dict(torch.load(resume_ckpt, map_location='cpu'))

    # Move model to specified device
    model.to(device)

    # Create folder for saving checkpoints
    Path(f'./runs/{config["experiment_name"]}').mkdir(exist_ok=True, parents=True)

    # Start training
    train(model, train_dataloader, val_dataloader, device, config)

In [None]:
# Run configuration passed to main(config); see its docstring for the meaning of each key.
config = {
    "experiment_name": 'test_1',
    "is_overfit": False,
    "device": 'cuda:0',
    "max_epochs": 1,
    "train_batch_size": 4,
    "val_batch_size": 1,
    "learning_rate": 0.0001,
    # None (not False) is the documented "train from scratch" value
    "resume_ckpt": None,
    "print_every_n": 1,
    "validate_every_n": 1,
}

In [None]:
# main(config)

In [None]:
import h5py

# Open the prepared dataset strictly read-only: the cells below only inspect it,
# and an explicit mode avoids depending on the h5py version's default.
data = h5py.File('shapenet_prepared.h5', 'r')

In [None]:
# Show the names of the top-level entries stored in the opened HDF5 file.
data.keys()

In [None]:
# Inspect the 'x_train' entry (the cell output is its repr).
data['x_train']

In [None]:
# Inspect the 'p_train' entry (the cell output is its repr).
data['p_train']

In [None]:
# Inspect the 's_train' entry (the cell output is its repr).
data['s_train']

In [None]:
# Inspect the 'y_train' entry (the cell output is its repr).
data['y_train']

In [None]:
from dataset import ShapeNetPartDataset

# Choose the split from the overfit flag, then build the training dataset.
split_name = 'overfit' if config['is_overfit'] else 'train'
train_dataset = ShapeNetPartDataset(path='shapenet_prepared.h5', split=split_name)


In [None]:
# Loader settings for interactive debugging: batches of two samples, reshuffled
# on every pass, fetched by four worker processes into pinned host memory.
loader_options = dict(
    batch_size=2,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
train_dataloader = torch.utils.data.DataLoader(train_dataset, **loader_options)


In [None]:
# Pull a single batch for interactive debugging; `d` stays None when the
# loader yields nothing.
d = next(iter(train_dataloader), None)
if d is not None:
    print(d.keys())

In [None]:
# Show which entries the sampled batch `d` contains.
d.keys()

In [17]:
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# mirroring the device selection done in main().
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [30]:
# Build the network and place it on the selected device; the trailing bare
# expression keeps the module summary as this cell's output.
# NOTE(review): relies on `.to(device)` returning the module itself, as it does
# for torch.nn.Module - confirm MultiSacleUNet does not override it.
model = MultiSacleUNet().to(device)
model

MultiSacleUNet(
  (inception_1): Inception(
    (cv1): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (inception_2): Inception(
    (cv1): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (inception_3): Inception(
    (cv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(512, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (inception_4): Inception(
    (cv1): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(384, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (maxPool_1): MaxPool2d(kernel_size=16, stride=16, padding=(1, 1), dilation=1, ceil_mode=False)
  (maxPool_2): MaxPool2d(kernel_size=16, stride=16, padding=(1, 1), dilatio

In [31]:
# Cross-entropy criterion, created directly on the selected device; the bare
# name on the last line keeps its repr as the cell output.
loss = nn.CrossEntropyLoss().to(device)
loss

CrossEntropyLoss()

In [32]:
# Adam over every model parameter, using the learning rate from the config dict
learning_rate = config['learning_rate']
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Start from cleared gradients before the forward/backward pass in later cells
optimizer.zero_grad()

In [33]:
# Switch the model to training mode (the cell output is the module summary).
model.train()

MultiSacleUNet(
  (inception_1): Inception(
    (cv1): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (inception_2): Inception(
    (cv1): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (inception_3): Inception(
    (cv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(512, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (inception_4): Inception(
    (cv1): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), padding=same)
    (cv2): Conv2d(384, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (relu): ReLU()
  )
  (maxPool_1): MaxPool2d(kernel_size=16, stride=16, padding=(1, 1), dilation=1, ceil_mode=False)
  (maxPool_2): MaxPool2d(kernel_size=16, stride=16, padding=(1, 1), dilatio

In [40]:
# Move the network input and the labels of the sampled batch onto the device
for tensor_key in ("3d_points", "part_label"):
    d[tensor_key] = d[tensor_key].to(device)

In [41]:
# Forward pass on the sampled batch `d`.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error
# (GPU reported with 1.96 GiB total) for an input of torch.Size([4, 3, 256, 256]).
# A batch of 4 does not match batch_size=2 in the DataLoader cell above, so `d`
# presumably came from an earlier kernel state - TODO confirm on a fresh run.
predicted_part_label = model(d['3d_points'])

input to forward pass torch.Size([4, 3, 256, 256])


RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 1.96 GiB total capacity; 1.34 GiB already allocated; 40.12 MiB free; 1.35 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Compute loss, compute gradients, update network parameters.
# The batch sampled earlier lives in `d`; no `batch` variable exists at
# notebook scope (it is only a local inside train()).
# NOTE: this rebinds `loss` from the criterion module to the resulting tensor
# (the logging cell below reads `loss.item()`), so the criterion cell must be
# re-run before this cell can be executed a second time.
loss = loss(predicted_part_label, d['part_label'])

loss.backward()

optimizer.step()

In [None]:
# Logging
# NOTE(review): in the visible cells `train_loss_running`, `epoch` and
# `batch_idx` are only assigned inside train(); confirm they are defined at
# notebook scope before running this cell on a fresh kernel.
train_loss_running += loss.item()
iteration = epoch * len(train_dataloader) + batch_idx