# Data

In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import configparser

class MOTSequenceDataset(Dataset):
    def __init__(self, root_dirs, seq_in_len=20, seq_out_len=10, seq_total_len=20, transform=None):
        """
        root_dirs: list of dataset root paths, e.g., ['dancetrack/train', 'mot17/train']
        seq_len: sequence length (e.g., 10 or 20)
        """
        self.transform = transform
        self.sources = []  # will hold all sequences across datasets
        self.targets = []

        for root in root_dirs:
            sequences = [os.path.join(root, d) for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
            for seq_path in sequences:
                gt_path = os.path.join(seq_path, 'gt', 'gt.txt')
                if not os.path.exists(gt_path):
                    continue
                cfp = configparser.ConfigParser()
                cfp.read(os.path.join(seq_path, 'seqinfo.ini'))
                df = pd.read_csv(gt_path, header=None)
                df.columns = ['frame', 'id', 'x', 'y', 'w', 'h', 'conf', 'class', 'visibility']

                # group by object id
                for obj_id, obj_df in df.groupby('id'):
                    obj_df = obj_df.sort_values('frame')
                    obj_df['x'] /= np.array(cfp['Sequence']['imWidth']).astype(float)
                    obj_df['y'] /= np.array(cfp['Sequence']['imHeight']).astype(float)
                    obj_df['w'] /=np.array(cfp['Sequence']['imWidth']).astype(float)
                    obj_df['h'] /= np.array(cfp['Sequence']['imHeight']).astype(float)
                    bboxes = obj_df[['x', 'y', 'w', 'h']].to_numpy()
                    # create sequences of length seq_len
                    for i in range(len(bboxes) - seq_total_len):
                        seq = bboxes[i:i+seq_total_len]
                        self.sources.append(seq[:seq_in_len])
                        self.targets.append(seq[-seq_out_len:])

        self.sources = np.array(self.sources, dtype=np.float64)
        self.targets = np.array(self.targets, dtype=np.float64)

    def __len__(self):
        return len(self.sources)

    def __getitem__(self, idx):
        source = self.sources[idx]
        target = self.targets[idx]
        return torch.tensor(source), torch.tensor(target)
    
# -----------------------------
# Create train and val loaders
# -----------------------------
seq_in_len = 20
seq_out_len = 10
seq_total_len = 20
batch_size = 8192

base_dir = './'

train_dataset = MOTSequenceDataset([
    f'{base_dir}DanceTrack/train',
    f'{base_dir}MOT17/train',
    f'{base_dir}MOT20/train'
], seq_in_len=seq_in_len, seq_out_len=seq_out_len, seq_total_len=seq_total_len)

val_dataset = MOTSequenceDataset([
    f'{base_dir}DanceTrack/val',
    f'{base_dir}MOT17/val',
    f'{base_dir}MOT20/val'
], seq_in_len=seq_in_len, seq_out_len=seq_out_len, seq_total_len=seq_total_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f'Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}')

# Model

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

# ----------------------------
# LSTM Seq2Seq for Bounding Box Prediction
# ----------------------------

class LSTMPredictor(nn.Module):
    def __init__(self, input_dim=4, hidden_dim=64, num_layers=2, dropout=0.2):
        super(LSTMPredictor, self).__init__()

        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, dtype=torch.float64)
        self.decoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, dtype=torch.float64)
        self.fc_out = nn.Linear(hidden_dim, input_dim, dtype=torch.float64)  # predict offset (dx, dy, dw, dh)

    def forward(self, src, trg=None, teacher_forcing_ratio=0.5):
        # src: (batch, seq_len, 4)
        # trg: (batch, seq_len, 4) - ground truth future sequence
        outputs = []

        batch_size, trg_size, _ = trg.size()
        # Encode
        _, (hidden, cell) = self.encoder(src)

        # First input to decoder is the last frame of src
        decoder_input = trg[:, 0, :].unsqueeze(1)  # shape (batch, 1, 4)
        

        for t in range(1, trg_size + 1):
            out, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            pred = self.fc_out(out)  # (batch, 1, 4)
            outputs.append(pred)

            # Decide if we use teacher forcing
            if t != trg_size:
                use_teacher = trg is not None and random.random() < teacher_forcing_ratio
                decoder_input = trg[:, t, :].unsqueeze(1) if use_teacher else pred

        outputs = torch.cat(outputs, dim=1)  # (batch, seq_len, 4)
        return outputs

# ----------------------------
# Training & Evaluation Loops
# ----------------------------

def train_one_epoch(model, dataloader, optimizer, criterion, device, teacher_forcing_ratio=0.5):
    model.train()
    total_loss = 0

    for src, trg in dataloader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg[:, :-1], teacher_forcing_ratio)

        loss = criterion(output, trg[:, 1:])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg[:, :-1], teacher_forcing_ratio=0.0)  # no teacher forcing at eval
            loss = criterion(output, trg[:, 1:])
            total_loss += loss.item()

    return total_loss / len(dataloader)


num_epochs = 50
lr = 1e-3
teacher_forcing_ratio = 1
device="cuda"

model = LSTMPredictor().to(device)
criterion = nn.MSELoss()  # predicting offsets → regression
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

# best_val_loss = float("inf")

# for epoch in range(1, num_epochs + 1):
#     train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, teacher_forcing_ratio)
#     val_loss = evaluate(model, val_loader, criterion, device)

#     scheduler.step()

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), "best_lstm_model.pth")

#     current_lr = scheduler.get_last_lr()[0]
#     print(f"Epoch {epoch}: Train Loss = {train_loss:.8f}, Val Loss = {val_loss:.8f}, LR = {current_lr:.8f}")

# print("Training complete. Best Val Loss:", best_val_loss)

In [2]:
model.load_state_dict(torch.load('best_lstm_model.pth', map_location='cuda'))

  model.load_state_dict(torch.load('best_lstm_model.pth', map_location='cuda'))


<All keys matched successfully>

In [16]:
import os
import configparser
import pandas as pd
import numpy as np

seq_in_len = 20
seq_out_len = 10
seq_total_len = 20
batch_size = 8192


sources = []
targets = []

seq_path = '../../.Datasets/DanceTrack/train/dancetrack0001/'
gt_path = os.path.join(seq_path, 'gt', 'gt.txt')
cfp = configparser.ConfigParser()
cfp.read(os.path.join(seq_path, 'seqinfo.ini'))
df = pd.read_csv(gt_path, header=None)
df.columns = ['frame', 'id', 'x', 'y', 'w', 'h', 'conf', 'class', 'visibility']

# group by object id
for obj_id, obj_df in df.groupby('id'):
    obj_df = obj_df.sort_values('frame')
    obj_df['x'] /= np.array(cfp['Sequence']['imWidth']).astype(float)
    obj_df['y'] /= np.array(cfp['Sequence']['imHeight']).astype(float)
    obj_df['w'] /=np.array(cfp['Sequence']['imWidth']).astype(float)
    obj_df['h'] /= np.array(cfp['Sequence']['imHeight']).astype(float)
    bboxes = obj_df[['x', 'y', 'w', 'h']].to_numpy()
    # create sequences of length seq_len
    for i in range(len(bboxes) - seq_total_len):
        seq = bboxes[i:i+seq_total_len]
        sources.append(seq[:seq_in_len])
        targets.append(seq[-seq_out_len:])

sources = np.array(sources, dtype=np.float64)
targets = np.array(targets, dtype=np.float64)
sources = torch.tensor(sources).to(device)
targets = torch.tensor(targets).to(device)

In [17]:
o = model(sources, targets, teacher_forcing_ratio=0)
o[:, :, 0] *= 1920
o[:, :, 2] *= 1920
o[:, :, 1] *= 1080
o[:, :, 3] *= 1080
o[2]

tensor([[399.9756, 561.1806, 171.0167, 293.3949],
        [398.9274, 563.7619, 166.7873, 293.1987],
        [405.7996, 565.9647, 157.4878, 290.4375],
        [404.9469, 572.3746, 149.9252, 288.4795],
        [405.9105, 578.7210, 139.6302, 286.6032],
        [408.2653, 585.8526, 128.8247, 283.1529],
        [414.9424, 589.2550, 124.1929, 279.6036],
        [418.2337, 592.8661, 118.5904, 279.2181],
        [420.5799, 597.7014, 113.1811, 279.0897],
        [416.2885, 599.9997, 115.6383, 278.0821]], device='cuda:0',
       dtype=torch.float64, grad_fn=<SelectBackward0>)

In [18]:
sources[:, :, 0] *= 1920
sources[:, :, 2] *= 1920
sources[:, :, 1] *= 1080
sources[:, :, 3] *= 1080
sources[2]

tensor([[375.0000, 567.0000, 154.5000, 288.0000],
        [390.0000, 567.0000, 148.5000, 286.5000],
        [405.0000, 568.5000, 138.0000, 285.0000],
        [409.5000, 564.0000, 126.0000, 289.5000],
        [415.5000, 558.0000, 108.0000, 300.0000],
        [411.0000, 549.0000, 118.5000, 309.0000],
        [418.5000, 546.0000, 111.0000, 306.0000],
        [424.5000, 544.5000, 127.5000, 306.0000],
        [415.5000, 546.0000, 145.5000, 304.5000],
        [417.0000, 553.5000, 153.0000, 297.0000],
        [420.0000, 553.5000, 171.0000, 295.5000],
        [421.5000, 559.5000, 183.0000, 289.5000],
        [418.5000, 564.0000, 195.0000, 283.5000],
        [417.0000, 567.0000, 187.5000, 280.5000],
        [415.5000, 567.0000, 177.0000, 279.0000],
        [415.5000, 571.5000, 123.0000, 285.0000],
        [412.5000, 576.0000, 117.0000, 297.0000],
        [406.5000, 592.5000, 117.0000, 282.0000],
        [402.0000, 598.5000, 121.5000, 276.0000],
        [414.0000, 598.5000, 114.0000, 277.5000]],