In the previous notebook we saw that the parallel loading now works properly. Let's compare the output of the sequential and parallel loaders to make sure they agree; we can then use that comparison to build a test before we refactor.

In [1]:
import numpy as np
import torch

from torch.utils.data import DataLoader
from flytracker.io.dataset import VideoDataset

from flytracker.tracker import _run, _initialize, _localize
from flytracker.preprocessing.preprocessing import  preprocessing_torch
from flytracker.localization.blob import localize_blob, default_blob_detector_params
from flytracker.localization.kmeans import localize_kmeans_torch

%load_ext autoreload
%autoreload 2

In [2]:
movie_path = "../../data/experiments/bruno/videos/seq_1.mp4"

# Regions of the 1080x1280 frame to exclude, as (rows, cols) index expressions:
# first the four border strips, then a patch in each of the four corners.
excluded_regions = [
    np.s_[:130, :],
    np.s_[-160:, :],
    np.s_[:, :270],
    np.s_[:, -205:],
    np.s_[:190, :350],
    np.s_[:195, -270:],
    np.s_[-220:, :340],
    np.s_[870:, 1010:],
]

# Start with every pixel enabled and switch off each excluded region in turn.
mask = torch.ones((1080, 1280), dtype=bool)
for region in excluded_regions:
    mask[region] = False

In [3]:
# Frame count for the comparison (10,000).
n_frames = 10_000

In [5]:
%%time
dataset = VideoDataset(movie_path, parallel=False)
loader = DataLoader(dataset, batch_size=None, pin_memory=True)

preprocessor_ini = preprocessing_torch(mask, torch.tensor(255, dtype=torch.uint8))
initial_position, initial_frame = _initialize(loader, localize_blob, (default_blob_detector_params(), preprocessor_ini), 100)
preprocessor_main= preprocessing_torch(mask.cuda(), torch.tensor(255, dtype=torch.uint8).cuda())

locs_sequential = _localize(loader, localize_kmeans_torch, (preprocessor_main, 120, 'cuda'), initial_position, 10000)

Done with frame 0
Done with frame 1000
Done with frame 2000
Done with frame 3000
Done with frame 4000
Done with frame 5000
Done with frame 6000
Done with frame 7000
Done with frame 8000
Done with frame 9000
Done with frame 10000
CPU times: user 10min 6s, sys: 10.6 s, total: 10min 17s
Wall time: 36.5 s


Now with the parallel loader:

In [6]:
%%time
dataset = VideoDataset(movie_path, parallel=True)
loader = DataLoader(dataset, batch_size=None, pin_memory=True)

preprocessor_ini = preprocessing_torch(mask, torch.tensor(255, dtype=torch.uint8))
initial_position, initial_frame = _initialize(loader, localize_blob, (default_blob_detector_params(), preprocessor_ini), 100)
preprocessor_main= preprocessing_torch(mask.cuda(), torch.tensor(255, dtype=torch.uint8).cuda())

locs_parallel = _localize(loader, localize_kmeans_torch, (preprocessor_main, 120, 'cuda'), initial_position, 10000)
dataset.reader.stop()

Done with frame 0
Done with frame 1000
Done with frame 2000
Done with frame 3000
Done with frame 4000
Done with frame 5000
Done with frame 6000
Done with frame 7000
Done with frame 8000
Done with frame 9000
Done with frame 10000
CPU times: user 7min 5s, sys: 6.54 s, total: 7min 11s
Wall time: 23.3 s


Whooosh. Now let's compare the two sets of results.

In [10]:
# Stack each run's list of localisations into a single tensor, then check
# that the two tensors agree within torch.allclose's default tolerances.
stacked_parallel = torch.stack(locs_parallel)
stacked_sequential = torch.stack(locs_sequential)
torch.allclose(stacked_parallel, stacked_sequential)

True

That's good enough. Great. Now let's turn it into a script for easy testing, and then we can refactor.