In [1]:
import os

import torch

from cs330_project.datasets.video_data import TinyVIRAT
from cs330_project.models import ViTAutoEncoder
from cs330_project.datasets.data_loading import MaskedVideoAutoencoderTransform, TransformDataset, DataLoader
from cs330_project.training import train_mae_model, make_optimizer, make_scheduler
from cs330_project.losses import autoencoder_loss
from cs330_project.utils import get_rel_pkg_path

# Select the compute device once: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# --- Clip geometry -----------------------------------------------------------
num_channels = 3       # passed to the model as in_channels
num_frames = 16        # frames per clip: dataset new_length / model in_num_frames
img_size = (32, 32)    # passed to the model as in_img_size and to the transform as input_size

# --- Sampling / tokenisation -------------------------------------------------
sampling_rate = 4      # passed to the dataset as new_step
patch_size = 8         # passed to the model as patch_size
tubelet_size = 4       # passed to the model as tubelet_size

In [3]:
# Root directory of the TinyVIRAT data used for training.
# Set the TINY_VIRAT_ROOT environment variable to point at a copy stored
# elsewhere (for example D:\tiny_virat_composite_dataset). When the variable is
# unset, the original workstation path below is used, so existing runs behave
# exactly as before.
root_dir = os.environ.get(
    "TINY_VIRAT_ROOT",
    r"C:\Users\Windows\Desktop\Shahir\cs330-final-project-2022\resources\tiny_virat_processed")

# Training split of the dataset. `new_length` and `new_step` are driven by the
# configuration cell (num_frames and sampling_rate respectively).
# NOTE(review): presumably new_length is the number of frames per clip and
# new_step the stride between sampled frames -- confirm against TinyVIRAT.
dataset_train_orig = TinyVIRAT(
    root_dir=root_dir,
    train=True,
    new_length=num_frames,
    new_step=sampling_rate,
    temporal_jitter=False,
    verbose=False)

In [4]:
# Build the ViT autoencoder in spatio-temporal mode with masking enabled.
# Geometry values come from the configuration cell; the remaining sizes are
# fixed here. Keyword arguments are grouped by role.
model = ViTAutoEncoder(
    # -- input geometry --
    in_channels=num_channels,
    in_img_size=img_size,
    in_num_frames=num_frames,
    # -- tokenisation --
    spatio_temporal=True,
    patch_size=patch_size,
    tubelet_size=tubelet_size,
    # -- encoder --
    encoder_depth=6,
    encoder_embed_dim=96,
    encoder_num_heads=8,
    # -- decoder --
    decoder_depth=3,
    decoder_embed_dim=48,
    decoder_num_heads=8,
    # -- other width settings --
    head_dim=16,
    mlp_dim_ratio=2,
    # -- variant switches --
    use_masking=True,
    class_embed=True,
    is_lsa=False,
    is_spt=True)

In [5]:
# The transform is configured with the frame size and with the patch count
# reported by the model's encoder.
train_transform = MaskedVideoAutoencoderTransform(
    num_patches=model.encoder.num_patches,
    input_size=img_size)

# Wrap the raw training dataset with the transform.
# NOTE(review): presumably TransformDataset applies transform_func to every
# item and labeled=True keeps the label alongside it -- confirm in data_loading.
dataset_train = TransformDataset(
    dataset_train_orig,
    transform_func=train_transform,
    labeled=True)

In [6]:
# Create the optimizer for the model using the project helper's defaults, then
# a scheduler bound to that optimizer.
# NOTE(review): `scheduler` is not referenced by any later cell visible in this
# notebook (the train_mae_model call receives only `optimizer`) -- confirm
# whether it should be passed to training or removed.
optimizer = make_optimizer(model)
scheduler = make_scheduler(optimizer)

In [7]:
# Training loader over the transformed dataset, 20 samples per batch.
# NOTE(review): the argument names match torch.utils.data.DataLoader; assuming
# the project's DataLoader is that class (or a thin wrapper), no `shuffle`
# argument is passed here -- confirm whether training order should be
# randomised.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=20,
    # -- worker / prefetch settings --
    num_workers=20,
    persistent_workers=True,
    prefetch_factor=10,
    # -- host-to-device transfer --
    pin_memory=True)

In [8]:
# Resolve the "weights/" location through the project's path helper; this value
# is handed to train_mae_model in the training cell.
# NOTE(review): presumably resolved relative to the package root -- confirm in utils.
weights_dir = get_rel_pkg_path("weights/")

In [9]:
# Move the model to the selected device. The name is rebound to whatever .to()
# returns, so later cells use the moved model.
model = model.to(device)

In [10]:
# Run training for 10 epochs with model/log saving enabled and keep the
# returned tracker object.
# NOTE(review): in the recorded run this cell failed during epoch 1 with
# "The size of tensor a (17) must match the size of tensor b (65) at
# non-singleton dimension 1", so `tracker` was never bound in that session.
# NOTE(review): the `scheduler` created earlier is not passed here -- confirm
# whether train_mae_model is expected to receive it.
tracker = train_mae_model(
    device,
    model,
    dataloader_train,
    autoencoder_loss,
    optimizer,
    weights_dir,
    num_epochs=10,
    save_model=True,
    save_latest=True,
    save_log=True)

----------
Epoch 1/10
----------
Training


  0%|          | 0/384 [00:40<?, ?it/s]

RuntimeError: The size of tensor a (17) must match the size of tensor b (65) at non-singleton dimension 1

In [None]:
# Pull a single batch from the training loader for inspection.
# next(iter(...)) raises StopIteration if the loader yields nothing, instead of
# silently leaving `x` unbound (or holding a stale value from an earlier run).
x = next(iter(dataloader_train))

In [None]:
# Display the tracker's `save_dir` attribute.
# Requires the training cell to have completed, since that is where `tracker`
# is assigned.
tracker.save_dir

In [None]:
# Debug: show the dtype of one nested element of the sampled batch `x`.
# NOTE(review): the meaning of the [0][1][3] indices depends on the batch
# structure produced by the transform/collate code, which is not visible here.
x[0][1][3].dtype

In [16]:
# Debug: display the shape of the encoder's positional embedding, for
# comparison with the sizes reported in the training error.
model.encoder.pos_embedding.shape

torch.Size([1, 65, 96])

In [11]:
# Debugging alias: bind the encoder's patch embedder to the name `self` so that
# expressions written against `self` can be evaluated directly in later cells.
# NOTE(review): `self` as a notebook-level variable is easy to misread; if it is
# renamed, the cells that use it must be renamed at the same time.
self = model.encoder.patch_embedder

In [None]:
# Recompute the token count from the aliased object's own attributes and
# overwrite its stored `num_patches` in place: temporal positions
# (frames // tubelet_size) times the positions along each of the two image
# axes. Note that the spatial divisor used here is `shift_size`.
self.num_patches = (
    (self.in_num_frames // self.tubelet_size)
    * (self.in_img_size[0] // self.shift_size)
    * (self.in_img_size[1] // self.shift_size)
)

In [12]:
# Debug: display the current `num_patches` value of the object aliased as
# `self` (the encoder's patch embedder).
self.num_patches

64