<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

# Training a Multi-Object Tracking Model

## 00 Initialization

In [1]:
import sys

sys.path.append("../../")

import os
import os.path as osp
import time
from ipywidgets import Video
import matplotlib.pyplot as plt
import torch
import torchvision

from utils_cv.tracking.data import Urls
from utils_cv.tracking.dataset import TrackingDataset
from utils_cv.tracking.model import TrackingLearner, write_video

from utils_cv.common.data import data_path, download, unzip_url
from utils_cv.common.gpu import which_processor, is_windows

# Change matplotlib backend so that plots are shown for windows
if is_windows():
    plt.switch_backend("TkAgg")

# Print the installed torchvision version, then report which processor
# (GPU or CPU) torch is using.
print(f"TorchVision: {torchvision.__version__}")
which_processor()

TorchVision: 0.4.0a0
Torch is using GPU: Tesla K80


This shows your machine's GPUs (if it has any) and the computing device `torch/torchvision` is using.

In [2]:
# Reload edited library modules automatically before each cell is executed,
# and render matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Next, set some model runtime parameters.

In [3]:
# --- Training settings (consumed by TrackingDataset and tracker.fit) ---
EPOCHS = 1
LEARNING_RATE = 0.0001
BATCH_SIZE = 1

# Gate for the explicit tracker.save(...) call that follows training.
SAVE_MODEL = True

# --- Tracking settings (forwarded to tracker.predict) ---
FRAME_RATE = 30
CONF_THRES = 0.3
TRACK_BUFFER = 300
IM_SIZE = (1080, 1920)  # presumably (height, width) in pixels -- TODO confirm

# --- Datasets: downloaded/unzipped on first use, reused afterwards ---
TRAIN_DATA_PATH = unzip_url(Urls.fridge_objects_path, exist_ok=True)
EVAL_DATA_PATH = unzip_url(Urls.carcans_annotations_path, exist_ok=True)

# --- Model checkpoint locations ---
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook -- confirm whether they are still needed.
BASELINE_MODEL = "./models/all_dla34_new.pth"
FT_MODEL = "./models/model_30.pth"

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using torch device: {device}")

Using torch device: cuda


## 01 Finetune a Pretrained Model

Initialize the training dataset.

In [4]:
# Build the training dataset from the unzipped training data, batching
# samples BATCH_SIZE at a time.
data_train = TrackingDataset(
    TRAIN_DATA_PATH,
    batch_size=BATCH_SIZE
)

dataset summary
OrderedDict([('default', 4.0)])
total # identities: 5
start index
OrderedDict([('default', 0)])


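Initialize and load the model. We use the baseline FairMOT model, which can be downloaded [here](https://drive.google.com/file/d/1udpOPum8fJdoEQm6n0jsIgMMViOMFinu/view). The training log below shows the weights being loaded from `models/all_dla34.pth`, so save the downloaded file there.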

In [5]:
# Create the learner for the training dataset.
# NOTE(review): the training log reports "Model saved to ./models/fairmot_ft.pth",
# so the path argument looks like the output location of the finetuned
# model -- confirm against TrackingLearner.
tracker = TrackingLearner(data_train, "./models/fairmot_ft.pth")
# Show the concrete class of the underlying network.
print(f"Model: {type(tracker.model)}")

Model: <class 'utils_cv.tracking.references.fairmot.models.networks.pose_dla_dcn.DLASeg'>


In [6]:
# Finetune for EPOCHS epochs at LEARNING_RATE; resume=True continues from the
# loaded checkpoint (the log shows the epoch counter resuming at 11/11).
tracker.fit(num_epochs=EPOCHS, lr=LEARNING_RATE, resume=True)

Loading /home/jihon/computervision-recipes/scenarios/tracking/models/all_dla34.pth
loaded /home/jihon/computervision-recipes/scenarios/tracking/models/all_dla34.pth, epoch 10
Resumed optimizer with start lr 0.0001
=====  Epoch: 11/11  =====




loss: 1.1128346400433464
hm_loss: 0.06353224289612051
wh_loss: 1.57920023114543
off_loss: 0.18636367223715702
id_loss: 0.8860541224528692
time: 44.016666666666666
Model saved to ./models/fairmot_ft.pth


In [7]:
# Optionally save the trained model under a file name that encodes the
# zero-padded number of epochs (e.g. model_01.pth for EPOCHS = 1).
if SAVE_MODEL:
    tracker.save(f"./models/model_{EPOCHS:02d}.pth")

Model saved to ./models/model_01.pth


## 02 Evaluate

Note that the evaluation data at `EVAL_DATA_PATH` follows the FairMOT input format.

In [10]:
# Run the tracker on the evaluation data with the tracking settings from the
# configuration cell; the results are scored in the next cell.
eval_results = tracker.predict(
    EVAL_DATA_PATH,
    conf_thres=CONF_THRES,
    track_buffer=TRACK_BUFFER,
    im_size=IM_SIZE,
    frame_rate=FRAME_RATE
)

Creating model...
loaded ./models/fairmot_ft.pth, epoch 11


In [11]:
# Score the tracking results against the annotated evaluation data.
eval_metrics = tracker.evaluate(eval_results, EVAL_DATA_PATH)

# Show the metrics so the Evaluate section has a visible result.
# print() keeps a multi-line text summary readable.
# NOTE(review): assumes evaluate() returns a printable summary -- confirm.
print(eval_metrics)

## 03 Predict

In [12]:
# Fetch the sample video into the data directory as carcans.mp4; the result
# is used below as the input video path.
input_video = download(
    Urls.carcans_video_path, osp.join(data_path(), "carcans.mp4")
)

In [15]:
# Run the tracker on the downloaded video. All tracking settings, including
# the frame rate, come from the configuration cell so this call uses the
# same parameters as the evaluation run.
test_results = tracker.predict(
    input_video,
    conf_thres=CONF_THRES,
    track_buffer=TRACK_BUFFER,
    im_size=IM_SIZE,
    frame_rate=FRAME_RATE,
)

Creating model...
loaded ./models/fairmot_ft.pth, epoch 11
Lenth of the video: 251 frames


In [16]:
# Destination file for the output video, placed in the data directory.
output_video = osp.join(data_path(), "carcans_output.mp4")

In [None]:
# Write the output video to output_video from the tracking results and the
# input video (presumably with the tracks drawn on each frame -- TODO confirm).
write_video(test_results, input_video, output_video)

In [None]:
# Display the output video in the notebook with an ipywidgets Video widget.
Video.from_file(output_video)