# Phase 1: LSTM motion predictor training

Train the improved LSTM predictor on the same datasets and with the same setup as the transformer (phase1_improved). Each sample from `GTSequenceDataset` is a tuple `(src, trg, gt_src, gt_trg)`, and training uses `LossFunction` with CIoU + confidence terms.

In [4]:
from dataset import GTSequenceDataset
from torch.utils.data import DataLoader, ConcatDataset

# Window layout: each sample holds SEQ_IN_LEN input frames followed by
# SEQ_OUT_LEN target frames (see the batch shapes printed in the next cell).
SEQ_IN_LEN = 30
SEQ_OUT_LEN = 20
SEQ_TOTAL_LEN = SEQ_IN_LEN + SEQ_OUT_LEN
BATCH_SIZE = 512
# Forwarded to the dataset as `steps` — presumably the sampling stride; confirm
# against GTSequenceDataset.
STEPS = 4
# Forwarded as `noise_coeff` / `noise_prob`; a probability of 0 presumably
# disables the noise augmentation — confirm against GTSequenceDataset.
NOISE_COEFFICIENT = 0.15
NOISE_PROB = 0

# BASE_DIR already ends with a slash, so dataset names are appended directly.
# (The SportsMOT roots used to be built with an extra '/', unlike the others.)
BASE_DIR = '../../Datasets/'
DATASET_NAMES = ('SportsMOT', 'DanceTrack', 'MOT17', 'MOT20')

# Shared by the train and val splits so the two can never drift apart.
dataset_kwargs = dict(
    seq_in_len=SEQ_IN_LEN,
    seq_out_len=SEQ_OUT_LEN,
    seq_total_len=SEQ_TOTAL_LEN,
    steps=STEPS,
    noise_coeff=NOISE_COEFFICIENT,
    noise_prob=NOISE_PROB,
)

train_dataset = GTSequenceDataset.from_roots(
    [f'{BASE_DIR}{name}/train' for name in DATASET_NAMES],
    **dataset_kwargs,
)

val_dataset = GTSequenceDataset.from_roots(
    [f'{BASE_DIR}{name}/val' for name in DATASET_NAMES],
    **dataset_kwargs,
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation only aggregates a loss over the whole split, so the sample order
# does not need to be randomised.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f'Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}')

Train samples: 372563, Val samples: 221031


In [2]:
# Optional sanity check: draw a single batch from the training loader and show
# the shape of every tensor it contains (src, trg, gt_src, gt_trg).
d = next(iter(train_loader))
shapes = [tensor.shape for tensor in d]
print(shapes)

[torch.Size([512, 30, 13]), torch.Size([512, 20, 13]), torch.Size([512, 30, 13]), torch.Size([512, 20, 13])]


In [6]:
from lstm_improved import ImprovedLSTMPredictor
from loss import LossFunction
from torch import optim

# Build the LSTM predictor and the training criterion, then report model size.
DEVICE = 'cuda'

# Hyper-parameters of the predictor. input_dim=13 matches the last dimension
# of the tensors produced by the data loader.
lstm_config = {
    'input_dim': 13,
    'output_dim': 5,
    'd_model': 512,
    'hidden_dim': 1024,
    'num_layers': 1,
    'dropout': 0,
    'teacher_forcing_ratio': 1,
}
model = ImprovedLSTMPredictor(**lstm_config).to(DEVICE)

# Only the first loss term is enabled; the other three coefficients are zero.
criterion = LossFunction(loss1_coeff=1, loss2_coeff=0, loss3_coeff=0, loss4_coeff=0)

n_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {n_params:,}")

Model parameters: 6,763,525


In [7]:
# Optional: load a pretrained LSTM checkpoint into `model` instead of training
# from scratch. The file name (d512-h1024) agrees with the d_model=512 /
# hidden_dim=1024 configuration used when the model is constructed.
# NOTE(review): confirm the checkpoint was actually saved from this architecture.
model.load_weight('pretrained/lstm-new-d512-h1024.pth')

In [None]:
# Evaluate the current weights on the validation loader. As the last
# expression of the cell, the value returned by `evaluate` (used as the
# validation loss in the training loop) is echoed by the notebook.
model.evaluate(val_loader, criterion)

In [8]:
# Single-stage training: AdamW + cosine annealing, keeping the checkpoint with
# the lowest validation loss and appending one log line per epoch.
LR = 5e-4
NUM_EPOCHS = 30
# NOTE(review): this file name says d256-h256, but the model in this notebook
# is built with d_model=512 / hidden_dim=1024, and the load cells read
# 'pretrained/lstm-new-d512-h1024.pth'. The path is left unchanged here so an
# existing checkpoint is not silently overwritten — confirm the intended name.
CHECKPOINT_PATH = 'pretrained/lstm-new-d256-h256-e3-d3.pth'
LOG_PATH = 'file_lstm.log'

optimizer = optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=1e-4,
    eps=1e-8
)
# T_max is one step longer than the run, so the cosine schedule never reaches
# its minimum (eta_min defaults to 0) within NUM_EPOCHS epochs.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS + 1)
best_val_loss = float("inf")

# Start from an empty log file for this run.
with open(LOG_PATH, 'w'):
    pass

for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = model.train_one_epoch(train_loader, optimizer, criterion, device=DEVICE)
    val_loss = model.evaluate(val_loader, criterion, device=DEVICE)

    scheduler.step()

    # Keep only the weights with the best validation loss seen so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_weight(CHECKPOINT_PATH)

    # Read after scheduler.step(): this is the learning rate the NEXT epoch
    # will use, not the one that produced the losses on this line.
    current_lr = scheduler.get_last_lr()[0]
    message = f"Epoch {epoch}: Train Loss = {train_loss:.8f}, Val Loss = {val_loss:.8f}, LR = {current_lr:.8f}"
    print(message)
    # Re-open per epoch so every line is on disk even if training is interrupted;
    # the context manager closes the handle even when the write fails.
    with open(LOG_PATH, 'a') as log_file:
        log_file.write(message + "\n")

print("Training complete. Best Val Loss:", best_val_loss)

Epoch 1: Train Loss = 1.01493908, Val Loss = 0.98717827, LR = 0.00049872
Epoch 2: Train Loss = 0.28754904, Val Loss = 0.11128036, LR = 0.00049488
Epoch 3: Train Loss = 0.13727466, Val Loss = 0.09865989, LR = 0.00048853
Epoch 4: Train Loss = 0.11444886, Val Loss = 0.09831966, LR = 0.00047974
Epoch 5: Train Loss = 0.10489155, Val Loss = 0.10434487, LR = 0.00046859
Epoch 6: Train Loss = 0.09625999, Val Loss = 0.09516920, LR = 0.00045519
Epoch 7: Train Loss = 0.09020396, Val Loss = 0.09390839, LR = 0.00043969
Epoch 8: Train Loss = 0.09116861, Val Loss = 0.09483480, LR = 0.00042224
Epoch 9: Train Loss = 0.08809761, Val Loss = 0.09624782, LR = 0.00040303
Epoch 10: Train Loss = 0.08433444, Val Loss = 0.09237980, LR = 0.00038224
Epoch 11: Train Loss = 0.08076018, Val Loss = 0.09301889, LR = 0.00036010
Epoch 12: Train Loss = 0.07463898, Val Loss = 0.06794308, LR = 0.00033683
Epoch 13: Train Loss = 0.06083539, Val Loss = 0.06693332, LR = 0.00031266
Epoch 14: Train Loss = 0.05818817, Val Loss = 0

In [4]:
# Reload the d512-h1024 checkpoint into the current `model` (same path as the
# optional pretrained-load cell earlier in the notebook).
model.load_weight('pretrained/lstm-new-d512-h1024.pth')

In [15]:
from dataset import GTSequenceDataset
from torch.utils.data import DataLoader, ConcatDataset
from transformer_encoder import MotionTransformer
from loss import LossFunction
from torch import optim
import math, torch, torch.nn as nn
import torch.nn.functional as F
import random
import os


# Staged ("curriculum") training of the LSTM predictor: every stage rebuilds the
# datasets with a different noise probability and picks a matching criterion,
# while the model, optimizer and LR schedule are shared across all stages.

# This cell re-imports everything else it needs but not the model class;
# import it here so the cell also runs on a fresh kernel.
from lstm_improved import ImprovedLSTMPredictor

SEQ_IN_LEN = 30
SEQ_OUT_LEN = 20
SEQ_TOTAL_LEN = 50
BATCH_SIZE = 1024
STEPS = 4
NOISE_PROB = 0
NOISE_COEFFICIENT = 0.15
LR = 5e-4
DEVICE = 'cuda'
# One entry per stage: [number of epochs, noise probability].
PARAMS = [[30, 0], [15, 0.2], [10, 0.4], [10, 0.6]]

# BASE_DIR already ends with a slash, so dataset names are appended directly.
BASE_DIR = '../../Datasets/'
DATASET_NAMES = ('SportsMOT', 'DanceTrack', 'MOT17', 'MOT20')
CHECKPOINT_PATH = 'pretrained/lstm-new-d512-h1024.pth'
LOG_PATH = 'file_lstm.log'

model = ImprovedLSTMPredictor(
    input_dim=13,
    output_dim=5,
    d_model=512,
    hidden_dim=1024,
    num_layers=1,
    dropout=0.15,
    teacher_forcing_ratio=1,
).to(DEVICE)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

optimizer = optim.AdamW(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    weight_decay=1e-4,
    eps=1e-8
)
# A single cosine schedule spans all stages; T_max is one step longer than the
# total epoch count so the LR does not reach its minimum within the run.
total_epochs = sum(num_epochs for num_epochs, _ in PARAMS)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_epochs + 1)

# Start from an empty log file for this run.
with open(LOG_PATH, 'w'):
    pass

for NUM_EPOCHS, NOISE_PROB in PARAMS:
    # Noisy stages switch to a different mix of loss terms than the clean stage.
    if NOISE_PROB > 0:
        criterion = LossFunction(loss1_coeff=0.5, loss2_coeff=0, loss3_coeff=1, loss4_coeff=0)
    else:
        criterion = LossFunction(loss1_coeff=1, loss2_coeff=0, loss3_coeff=0, loss4_coeff=0)
    # The criterion and noise level change between stages, so the best loss is
    # tracked per stage. All stages write to the same checkpoint file, which
    # therefore ends up holding the best weights of the most recent stage.
    best_val_loss = float("inf")

    # Validation deliberately uses the same noise settings as training for this
    # stage (the noise-free alternative would be noise_coeff=0, noise_prob=0).
    dataset_kwargs = dict(
        seq_in_len=SEQ_IN_LEN,
        seq_out_len=SEQ_OUT_LEN,
        seq_total_len=SEQ_TOTAL_LEN,
        steps=STEPS,
        noise_coeff=NOISE_COEFFICIENT,
        noise_prob=NOISE_PROB,
    )
    train_dataset = GTSequenceDataset.from_roots(
        [f'{BASE_DIR}{name}/train' for name in DATASET_NAMES],
        **dataset_kwargs,
    )
    val_dataset = GTSequenceDataset.from_roots(
        [f'{BASE_DIR}{name}/val' for name in DATASET_NAMES],
        **dataset_kwargs,
    )
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    # Validation only aggregates a loss, so no shuffling is needed.
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    for epoch in range(1, NUM_EPOCHS + 1):
        # Pass the device explicitly, as the single-stage training loop does.
        train_loss = model.train_one_epoch(train_loader, optimizer, criterion, device=DEVICE)
        val_loss = model.evaluate(val_loader, criterion, device=DEVICE)

        scheduler.step()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save_weight(CHECKPOINT_PATH)

        # Read after scheduler.step(): this is the LR the next epoch will use.
        current_lr = scheduler.get_last_lr()[0]
        message = f"Epoch {epoch}: Train Loss = {train_loss:.8f}, Val Loss = {val_loss:.8f}, LR = {current_lr:.8f}"
        print(message)
        # Re-open per epoch so each line reaches disk; closed even on failure.
        with open(LOG_PATH, 'a') as log_file:
            log_file.write(message + "\n")

    print("Training complete. Best Val Loss:", best_val_loss)

    

    

Model parameters: 6,763,525
Epoch 1: Train Loss = 0.61372989, Val Loss = 0.27502551, LR = 0.00049972
Epoch 2: Train Loss = 0.27050042, Val Loss = 0.17924333, LR = 0.00049887
Epoch 3: Train Loss = 0.21027387, Val Loss = 0.14968078, LR = 0.00049746
Epoch 4: Train Loss = 0.19658001, Val Loss = 0.16729819, LR = 0.00049548
Epoch 5: Train Loss = 0.15750353, Val Loss = 0.12727606, LR = 0.00049295
Epoch 6: Train Loss = 0.13628713, Val Loss = 0.11548214, LR = 0.00048987
Epoch 7: Train Loss = 0.12436708, Val Loss = 0.10598248, LR = 0.00048625
Epoch 8: Train Loss = 0.12670715, Val Loss = 0.13624554, LR = 0.00048209
Epoch 9: Train Loss = 0.12757746, Val Loss = 0.09944880, LR = 0.00047741
Epoch 10: Train Loss = 0.10750821, Val Loss = 0.09436887, LR = 0.00047221
Epoch 11: Train Loss = 0.09445427, Val Loss = 0.09535331, LR = 0.00046651
Epoch 12: Train Loss = 0.09057219, Val Loss = 0.09372558, LR = 0.00046031
Epoch 13: Train Loss = 0.09011029, Val Loss = 0.09613086, LR = 0.00045364
Epoch 14: Train Los

In [10]:
import torch
from dataset import GTSequenceDataset

# Evaluate the model on a single sequence with short windows and report the
# mean CIoU between predicted and ground-truth boxes in pixel units.

# Alternative sequences to inspect (uncomment exactly one):
# SEQ_PATH = '../../Datasets/MOT20/val/MOT20-01/'
SEQ_PATH = '../../Datasets/MOT17/val/MOT17-05-FRCNN//'
# SEQ_PATH = '../../Datasets/DanceTrack/val/dancetrack0018/'
# SEQ_PATH = '../../Datasets/DanceTrack/train/dancetrack0006//'
# SEQ_PATH = '../../Datasets/SportsMOT/val/v_0kUtTtmLaJA_c004//'
SEQ_IN_LEN = 5
SEQ_OUT_LEN = 2
SEQ_TOTAL_LEN = SEQ_IN_LEN + SEQ_OUT_LEN
BATCH_SIZE = 512
STEPS = 4


def _scale_boxes_to_pixels_(boxes, width, height):
    """Scale channels 0/2 by *width* and channels 1/3 by *height*, in place.

    Presumably converts normalized box coordinates to pixels — confirm the
    channel layout against GTSequenceDataset.
    """
    boxes[..., 0:4:2] *= width   # channels 0 and 2
    boxes[..., 1:4:2] *= height  # channels 1 and 3


# Noise-free variant: noise_prob=0, noise_coeff=0 (and no `steps` argument).
d = GTSequenceDataset.from_sequence(SEQ_PATH, seq_in_len=SEQ_IN_LEN, seq_out_len=SEQ_OUT_LEN, seq_total_len=SEQ_TOTAL_LEN, steps=STEPS, noise_coeff=0.15, noise_prob=0.4)
sources = torch.tensor(d.sources).to(DEVICE)[:BATCH_SIZE]
gt_sources = torch.tensor(d.gt_sources).to(DEVICE)[:BATCH_SIZE]
# The ground-truth targets (not d.targets) are used both as decoder input
# and as the reference boxes for the metric.
targets = torch.tensor(d.gt_targets).to(DEVICE)[:BATCH_SIZE]

# Only a metric is computed here, so skip building the autograd graph.
# NOTE(review): the model's train/eval mode is left as-is; call model.eval()
# beforehand if dropout should be disabled for this check.
with torch.no_grad():
    # Alternative: model.inference(sources, targets, num_steps=targets.size(1) - 1)
    o = model.forward(sources, targets[:, :-1])

image_width = d.image_width.item()
image_height = d.image_height.item()
_scale_boxes_to_pixels_(o, image_width, image_height)
_scale_boxes_to_pixels_(targets, image_width, image_height)
_scale_boxes_to_pixels_(sources, image_width, image_height)

# The model is fed targets[:, :-1], so its outputs are compared against the
# targets shifted by one step; only the first four channels enter the metric.
t_ = targets[:, 1:, :4]
o_ = o[:, :, :4]

# Sample index for manual inspection, e.g. t_[index], o_[index].
index = 9
# Other metrics that can be swapped in for the final expression:
# torch.diagonal(criterion.iou(t_, o_), dim1=1, dim2=2).mean()
# criterion.iou(t_, o_).mean()
# (t_ - o_).abs().mean()
criterion.ciou(t_, o_).mean()


tensor(0.8807, device='cuda:0', grad_fn=<MeanBackward0>)