# Solving a constrained Integer Linear Program using learned cost

We consider the problem of solving a constrained integer linear program (ILP) whose cost vector is learned. Formally, the problem is
\begin{equation}
\begin{aligned}
\mathbf{x}^{\ast} =& \mathop{\arg\min}_{\mathbf{x} \in \mathcal{X}} \mathbf{c}(\mathbf{f};\mathbf{w})^T \mathbf{x} \\
&\text{s.t.} \begin{aligned}[t]
     \mathbf{A}\mathbf{x} & = \mathbf{b} \\
     \mathbf{G}\mathbf{x} & \leq \mathbf{h}
  \end{aligned}
\end{aligned}
\end{equation}
where $\mathbf{c}(\mathbf{f};\mathbf{w}) \in \mathbb{R}^n$ is the cost vector, parametrized by $\mathbf{w}$ and computed from the input $\mathbf{f}$, and $(\mathbf{A},\mathbf{b})$ and $(\mathbf{G},\mathbf{h})$ define the equality and inequality constraints, respectively.

In [9]:
%load_ext autoreload
%autoreload 2

import os, time
import cv2, random
import pickle, joblib
import sklearn.metrics
import numpy as np
np.set_printoptions(suppress=True)
import gurobipy as gp
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from tqdm import tqdm

from lib.tracking import Tracker
from lib.utils import getIoU, computeBoxFeatures, interpolateTrack, interpolateTracks

class Net(nn.Module):
    """Edge scorer: a small MLP over 6-dimensional edge attributes.

    The attribute name ``fc`` and the order of its layers determine the
    state-dict keys (``fc.0.*``, ``fc.2.*``), so both must stay as they are
    for saved checkpoints to load.
    """

    def __init__(self):
        super().__init__()
        hidden_layer = nn.Linear(6, 6)
        output_layer = nn.Linear(6, 1)
        self.fc = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, data):
        """Return the sigmoid of the MLP output for every row of ``data.edge_attr``.

        Args:
            data: any object exposing an ``edge_attr`` tensor whose last
                dimension is 6.

        Returns:
            Tensor with the same leading dimensions as ``data.edge_attr`` and
            a last dimension of 1.
        """
        return torch.sigmoid(self.fc(data.edge_attr))
    
# Checkpoint used by the tracking cells below. Other checkpoints that were
# tried at some point:
#   'ckpt/qp/epoch_11.pth'
#   '../ckpt/bce/epoch_0010.pth'
#   '../ckpt/spo_mlp/epoch_15.pth'
#   '../ckpt/qptl_l2/epoch_0009.pth'
#   '../ckpt/qptl_l1/epoch_0009.pth'
CKPT_PATH = 'ckpt/visdrone/epoch-20.pth'

net = Net()
# map_location='cpu' makes loading independent of the device the checkpoint
# was saved from; weights_only=True restricts unpickling to tensors/containers,
# which is all a state dict needs.
net.load_state_dict(torch.load(CKPT_PATH, map_location='cpu', weights_only=True))
net.eval()  # the network is only used for inference in this notebook

tracker = Tracker(net)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  net.load_state_dict(torch.load('ckpt/visdrone/epoch-20.pth'))


In [10]:
def get_trans_probs(tracker, curr_dets, curr_app_feats, app_thresh, max_frame_gap = 5):
    """
    Build the edge index graph and the per-edge transition probabilities
    for one batch of detections.

    Every ordered pair (i, j) is inspected in row-major order and classified by
    frame_gap = frame[j] - frame[i]:
      * frame_gap == 1 (base edge): the probability is
        sigmoid(tracker.net.fc(features)) clamped to [1e-7, 1 - 1e-7], where
        features = computeBoxFeatures(box_i, box_j) followed by IoU and the
        cosine similarity of the two appearance features.
      * 1 < frame_gap <= max_frame_gap (lifted edge): the probability is
        cos_sim * 0.9 ** frame_gap when cos_sim > app_thresh; otherwise the
        edge is pruned and keeps probability 0 (it still gets an index).
      * anything else: no edge.

    Inputs: tracker: an instance of the Tracker (only tracker.net.fc is used).
            curr_dets: frame, x1, y1, x2, y2, det_confidence, node_ind.
            curr_app_feats: normalized appearance features for curr_dets.
            app_thresh: cosine-similarity threshold for keeping a lifted edge.
            max_frame_gap: frame gap used to connect detections.
    Return: linkIndexGraph: (N, N) int32 array; entry [i, j] is the 1-based
                index of edge i->j, or 0 when there is no edge.
            probs: (num_edges,) array with
                probs[linkIndexGraph[i, j] - 1] being the probability of edge i->j.
    """
    BASE, LIFTED, PRUNED = 1, 2, -1  # edge-type codes

    num_dets = curr_dets.shape[0]
    frames = curr_dets[:, 0]
    cos_sim_mat = np.dot(curr_app_feats, curr_app_feats.T)
    linkIndexGraph = np.zeros((num_dets, num_dets), dtype=np.int32)

    edge_type = []      # one code per edge, in edge-index order
    base_feats = []     # network inputs, one row per base edge
    lifted_probs = []   # one value per kept (non-pruned) lifted edge

    for i in range(num_dets):
        for j in range(num_dets):
            frame_gap = frames[j] - frames[i]
            cos_sim = cos_sim_mat[i, j]

            if frame_gap == 1:
                feats = computeBoxFeatures(curr_dets[i, 1:5], curr_dets[j, 1:5])
                iou = getIoU(curr_dets[i, 1:5], curr_dets[j, 1:5])
                feats.extend((iou, cos_sim))
                base_feats.append(feats)
                edge_type.append(BASE)
            elif 1 < frame_gap <= max_frame_gap:
                if cos_sim > app_thresh:
                    time_weight = 0.9 ** frame_gap
                    lifted_probs.append(cos_sim * time_weight)
                    edge_type.append(LIFTED)
                else:
                    edge_type.append(PRUNED)
            else:
                continue

            # Edge indices are 1-based so that 0 can mean "no edge".
            linkIndexGraph[i, j] = len(edge_type)

    num_edges = len(edge_type)
    edge_type = np.array(edge_type)
    probs = np.zeros(num_edges)  # pruned edges keep probability 0

    # Only run the network when there is at least one base edge: an empty
    # feature list becomes a shape-(0,) tensor that the linear layer rejects.
    if base_feats:
        with torch.no_grad():
            logits = tracker.net.fc(torch.Tensor(base_feats))
            base_probs = torch.clamp(torch.sigmoid(logits), min=1e-7, max=1-1e-7)
        probs[edge_type == BASE] = base_probs.flatten().numpy()

    if lifted_probs:
        probs[edge_type == LIFTED] = lifted_probs

    return linkIndexGraph, probs

# Tracking!

In [11]:
def remove_duplicates(data):
    """Keep only the first row for every distinct (column 0, column 1) pair.

    The two key columns are compared after casting to int64. Surviving rows
    keep their original relative order.
    """
    pair_keys = data[:, :2].astype(np.int64)
    # return_index yields, for each distinct pair, the row of its first occurrence.
    first_rows = np.unique(pair_keys, axis=0, return_index=True)[1]
    keep = np.zeros(data.shape[0], dtype=bool)
    keep[first_rows] = True
    # Boolean-mask selection preserves the original row order.
    return data[keep]

In [12]:
# Sequence names iterated by the tracking loop; each one is looked up under
# VisDrone2019-MOT-test-dev/sequences/.
test_seq = [
    "uav0000009_03358_v",
    "uav0000073_00600_v",
    "uav0000077_00720_v",
    "uav0000088_00290_v",
    "uav0000119_02301_v",
    "uav0000120_04775_v",
    "uav0000188_00000_v",
    "uav0000201_00000_v",
    "uav0000249_00001_v",
    "uav0000249_02688_v",
    "uav0000297_00000_v",
    "uav0000297_02761_v",
    "uav0000306_00230_v",
    "uav0000355_00001_v",
    "uav0000370_00001_v",
]

In [13]:
# ---- Tracking configuration ----
app_thresh = 0.75            # appearance (cosine-similarity) threshold; 0.7 and 0.8 were also tried
nms_thresh, eps = 0.3, 1e-7  # eps keeps -log() finite for edges whose probability is 0
batch_overlap = 5            #Number of frames to overlap between 2 batches
output_dir = 'output_visdrone_v2_e20'
os.makedirs(output_dir, exist_ok=True)  # np.savetxt does not create missing directories

for seq in test_seq:
    # Check the skip list first so skipped sequences never touch the disk.
    if seq == "uav0000073_04464_v" or seq == "uav0000161_00000_v":
        continue

    # PIL's Image.size is (width, height); the context manager closes the file.
    with Image.open(f"/home/khanh/data/LPT/VisDrone/VisDrone2019-MOT-test-dev/sequences/{seq}/0000001.jpg") as first_frame_img:
        img_Width, img_Height = first_frame_img.size

    #Static camera, moving camera
    if seq in ['MOT17-03']:
        batch_size, dist_thresh, prune_len = 50, 50, 2 #tracklets less than 2 are pruned
    else:
        batch_size, dist_thresh, prune_len = 100, 100, 3

    for detector in ['GT']:
        print('Sequence {}, {} detection, app thresh {}, dist thresh {}, retain length {}'.format(
            seq, detector, app_thresh, dist_thresh, prune_len))

        # Ground-truth boxes and the appearance features extracted from them.
        # (Detector-based runs used det/{seq}.txt and feature/{seq} instead.)
        det_file = f"VisDrone/VisDrone2019-MOT-test-dev/annotations/{seq}.txt"
        app_file = f"VisDrone/VisDrone2019-MOT-test-dev/feature_gt_box/{seq}"

        dets = np.loadtxt(det_file, delimiter=',')
        app_feats = np.load(app_file)
        print(dets.shape, app_feats.shape)
        assert dets.shape[0] == app_feats.shape[0], 'Shape mismatch'

        num_frames = int(dets[:, 0].max()) # largest frame index in this video
        tracks_list, assignments_list, features_list, nms_list = [],[],[],[]

        for start_frame in range(1, num_frames+1, batch_size-batch_overlap):
            end_frame = min(start_frame + batch_size - 1, num_frames)

            print('Tracking from frame %d to %d'%(start_frame, end_frame))
            curr_ind = np.logical_and(dets[:, 0] >= start_frame, dets[:, 0] <= end_frame)
            curr_dets = np.concatenate([dets[curr_ind, 0][:, None], dets[curr_ind, 2:7],
                                        np.arange(dets[curr_ind].shape[0])[:, None]], axis=1)

            curr_dets[:, 3:5] = curr_dets[:, 3:5] + curr_dets[:, 1:3] # convert to frame,x1,y1,x2,y2,conf,node_ind
            curr_app_feats = app_feats[curr_ind]
            curr_app_feats = curr_app_feats / np.linalg.norm(curr_app_feats, axis=1, keepdims=True)

            # ---- Stage 1: solve the LP for this batch and recover tracklets ----
            print('0-th iteration')
            linkIndexGraph, probs = get_trans_probs(tracker, curr_dets, curr_app_feats,
                                                    app_thresh, max_frame_gap = 5)
            trans_cost = - np.log(probs + eps) #np.log((1 - probs + eps)/(probs + eps))
            det_cost = - curr_dets[:, -2]      # negative detection confidence
            entry_cost = 0.5 * np.ones(det_cost.shape[0])
            exit_cost = entry_cost
            # NOTE(review): the variable order (detection, entry, exit, transition)
            # presumably matches what tracker.build_constraint assumes -- confirm.
            cost = np.concatenate((det_cost, entry_cost, exit_cost, trans_cost))

            A_eq, b_eq, A_ub, b_ub = tracker.build_constraint(linkIndexGraph)
            sol = tracker.linprog(c=cost, A_eq=A_eq, b_eq=b_eq, A_ub=A_ub, b_ub=b_ub)
            tracklets = tracker.recoverTracklets(curr_dets, sol, linkIndexGraph, prune_len=prune_len)

            # ---- Stage 2: cluster the tracklets and interpolate the result ----
            print('1-th iteration')
            assignment_list, feature_list = tracker.clusterSkipTracklets(tracklets, curr_app_feats,
                                                                         dist_thresh, app_thresh)
            tracks = tracker.recoverClusteredTracklets(tracklets, assignment_list)
            tracks = interpolateTracks(tracks)

            assignments_list.append(assignment_list)
            feature_array = np.stack(feature_list)
            feature_array = feature_array / np.linalg.norm(feature_array, axis=1, keepdims=True)

            tracks_list.append(tracks)
            features_list.append(feature_array)

        final_tracks = tracker.stitchTracklets(tracks_list, features_list)
        final_tracks = remove_duplicates(final_tracks)
        save_file = '{}/{}-{}.txt'.format(output_dir, seq, detector)
        print('Finished tracking, saving to {}'.format(save_file))
        np.savetxt(save_file, final_tracks, fmt='%d',delimiter=',')

Sequence uav0000009_03358_v, GT detection, app thresh 0.75, dist thresh 100, retain length 3
(12740, 10) (12740, 2048)
Tracking from frame 1 to 100
0-th iteration
Set parameter Username
Set parameter LicenseID to value 2641140
Academic license - for non-commercial use only - expires 2026-03-24
1-th iteration
Tracking from frame 96 to 195
0-th iteration
1-th iteration
Tracking from frame 191 to 219
0-th iteration
1-th iteration
Finished tracking, saving to output_visdrone_v2_e20/uav0000009_03358_v-GT.txt
Sequence uav0000073_00600_v, GT detection, app thresh 0.75, dist thresh 100, retain length 3
(14721, 10) (14721, 2048)
Tracking from frame 1 to 100
0-th iteration
1-th iteration
Tracking from frame 96 to 195
0-th iteration
1-th iteration
Tracking from frame 191 to 290
0-th iteration
1-th iteration
Tracking from frame 286 to 328
0-th iteration
1-th iteration
Finished tracking, saving to output_visdrone_v2_e20/uav0000073_00600_v-GT.txt
Sequence uav0000077_00720_v, GT detection, app thresh

# Show final tracking and detection results

In [5]:
# Sequence names to visualize; each one is looked up under
# VisDrone2019-MOT-test-dev/sequences/.
test_seq = [
    "uav0000009_03358_v",
    "uav0000073_00600_v",
    "uav0000077_00720_v",
    "uav0000088_00290_v",
    "uav0000119_02301_v",
    "uav0000120_04775_v",
    "uav0000188_00000_v",
    "uav0000201_00000_v",
    "uav0000249_00001_v",
    "uav0000249_02688_v",
    "uav0000297_00000_v",
    "uav0000297_02761_v",
    "uav0000306_00230_v",
    "uav0000355_00001_v",
    "uav0000370_00001_v",
]

In [7]:
import os
import cv2
import numpy as np

# ---- Visualization configuration ----
detector = 'GT'
resize_scale = 0.5
fps = 30  # Frames per second for the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # MP4 codec
frame_template = '/home/khanh/data/LPT/VisDrone/VisDrone2019-MOT-test-dev/sequences/{}/{:07d}.jpg'

for seq in test_seq:
    print(seq)
    save_dir = 'visualization/{}-{}'.format(seq, detector)
    print('save dir {}'.format(save_dir))
    os.makedirs(save_dir, exist_ok=True)

    #tracks: frame, ID, x, y, w, h, -1, -1, -1, -1
    #dets:   frame, -1, x, y, w, h, conf, -1, -1, -1
    # NOTE(review): the tracking cell in this notebook saves to
    # 'output_visdrone_v2_e20/'; confirm 'output_visdroneV2/' is the run to show.
    # ndmin=2 keeps a single-row file two-dimensional.
    tracks = np.loadtxt(f'output_visdroneV2/{seq}-{detector}.txt', delimiter=',', ndmin=2)
    tracks = tracks.astype(np.int32)
    if tracks.shape[0] == 0:
        print('No tracks found for {}, skipping'.format(seq))
        continue

    colors = np.random.rand(1000,3)
    first_frame, last_frame = int(tracks[:, 0].min()), int(tracks[:, 0].max())

    # Read the first frame only to determine the output video dimensions.
    first_img_file = frame_template.format(seq, first_frame)
    first_img = cv2.imread(first_img_file)
    if first_img is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        raise FileNotFoundError('Cannot read image {}'.format(first_img_file))
    height, width = int(resize_scale*first_img.shape[0]), int(resize_scale*first_img.shape[1])

    # Initialize video writer
    video_path = os.path.join("visualization", '{}-{}.mp4'.format(seq, detector))
    video_writer = cv2.VideoWriter(video_path, fourcc, fps, (width, height))

    try:
        for frame in range(first_frame, last_frame+1):
            if frame % 100 == 0:
                print('Processing frame {}'.format(frame))

            img_file = frame_template.format(seq, frame)
            img = cv2.imread(img_file)
            if img is None:
                raise FileNotFoundError('Cannot read image {}'.format(img_file))
            # Every frame is resized to the writer's size so it always matches
            # the dimensions the VideoWriter was opened with.
            img = cv2.resize(img, (width, height))
            cv2.putText(img, '{:04}'.format(frame), (0,50), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255,0,255), thickness=2)

            for track_id, bx, by, bw, bh in tracks[tracks[:, 0] == frame, 1:6]:
                ID = int(track_id)
                x, y = int(resize_scale*bx), int(resize_scale*by)
                w, h = int(resize_scale*bw), int(resize_scale*bh)
                # Wrap the ID so IDs beyond the palette size do not raise IndexError.
                color = tuple(float(c) for c in 255*colors[ID % len(colors)])
                cv2.rectangle(img, (x,y), (x+w,y+h), color, thickness=2)
                cv2.putText(img, str(ID), (x,y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, thickness=2)

            # Write frame to both image file and video
            cv2.imwrite(save_dir+'/'+'{:06d}.jpg'.format(frame), img)
            video_writer.write(img)
    finally:
        # Release the writer even if a frame fails, so the file is finalized.
        video_writer.release()
    print(f"Video saved at {video_path}")

uav0000009_03358_v
save dir visualization/uav0000009_03358_v-GT
Processing frame 100
Processing frame 200
Video saved at visualization/uav0000009_03358_v-GT.mp4
uav0000073_00600_v
save dir visualization/uav0000073_00600_v-GT
Processing frame 100
Processing frame 200
Processing frame 300
Video saved at visualization/uav0000073_00600_v-GT.mp4
uav0000077_00720_v
save dir visualization/uav0000077_00720_v-GT
Processing frame 100
Processing frame 200
Processing frame 300
Processing frame 400
Processing frame 500
Processing frame 600
Processing frame 700
Video saved at visualization/uav0000077_00720_v-GT.mp4
uav0000088_00290_v
save dir visualization/uav0000088_00290_v-GT
Processing frame 100
Processing frame 200
Video saved at visualization/uav0000088_00290_v-GT.mp4
uav0000119_02301_v
save dir visualization/uav0000119_02301_v-GT
Processing frame 100
Video saved at visualization/uav0000119_02301_v-GT.mp4
uav0000120_04775_v
save dir visualization/uav0000120_04775_v-GT
Processing frame 100
Proce

In [13]:
from IPython.display import Video

# Embed the video file at this path in the cell output. Note that the path is
# a BYTE_Results/MOT17 file, not one of the 'visualization/*.mp4' files.
Video("BYTE_Results/MOT17-01-DPM/MOT17-01-DPM.mp4")

# Extract feature

In [9]:
from ultralytics import YOLO
import cv2 as cv
from PIL import Image
from torchvision import models
import torch
from torchvision import transforms
from torch.nn.functional import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np
import numpy as np
import torch
import torch.nn.functional as F
from torchvision.ops import generalized_box_iou
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os

In [10]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the ImageNet-pretrained ResNet50 and drop the final classification
#    layer. `weights=...IMAGENET1K_V1` is the non-deprecated spelling of
#    `pretrained=True` and selects the same weights.
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
resnet.to(device)
resnet.eval()

# 2. Define image transformation pipeline (standard ImageNet preprocessing)
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# 3. Custom dataset to apply transforms
class ImageDataset(Dataset):
    """Dataset over an in-memory sequence of images.

    Each item is the result of calling ``transform`` on the stored image at
    that index.
    """

    def __init__(self, images, transform):
        self.transform = transform
        self.images = images

    def __len__(self):
        """Number of stored images."""
        num_images = len(self.images)
        return num_images

    def __getitem__(self, idx):
        """Return the transformed image at position ``idx``."""
        image = self.images[idx]
        return self.transform(image)



In [11]:
feature_dir = "/home/khanh/data/LPT/VisDrone/VisDrone2019-MOT-test-dev/feature_gt_box"
os.makedirs(feature_dir, exist_ok=True)

for data_name in tqdm(test_seq):
    label_path = f"/home/khanh/data/LPT/VisDrone/VisDrone2019-MOT-test-dev/annotations/{data_name}.txt"
    video_path = f"/home/khanh/data/LPT/VisDrone/VisDrone2019-MOT-test-dev/sequences/{data_name}/"
    # Frame k (1-based) is the k-th file name in sorted order.
    frame_path_list = [os.path.join(video_path, name) for name in sorted(os.listdir(video_path))]

    # <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>, <x>, <y>, <z>
    gt = []
    with open(label_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            fields = line.split(',')
            gt.append([int(v) for v in fields[:-1]] + [float(fields[-1])])
    if not gt:
        raise ValueError(f"No annotations found in {label_path}")
    gt = np.array(gt)
    gt[:,4:6] = gt[:,2:4] + gt[:,4:6]  # (left, top, width, height) -> (left, top, right, bottom)

    frame_ids = gt[:, 0].astype(int)
    if frame_ids.min() < 1 or frame_ids.max() > len(frame_path_list):
        raise ValueError(f"Frame index out of range for {data_name}: "
                         f"{frame_ids.min()}..{frame_ids.max()} with {len(frame_path_list)} frames")

    # Group annotation rows by frame so every frame is decoded once and then
    # released, instead of keeping all decoded frames of the sequence in memory.
    rows_by_frame = {}
    for row, frame_id in enumerate(frame_ids):
        rows_by_frame.setdefault(int(frame_id), []).append(row)

    # Crops are stored by annotation row, so the saved features stay aligned
    # with the rows of the annotation file.
    instance_crops = [None] * len(gt)
    for frame_id, rows in rows_by_frame.items():
        with Image.open(frame_path_list[frame_id - 1]) as frame_img:
            frame_rgb = frame_img.convert('RGB')
        for row in rows:
            instance_crops[row] = frame_rgb.crop(tuple(gt[row, 2:6]))

    dataset = ImageDataset(instance_crops, transform)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
    batch_embeddings = []
    with torch.no_grad():
        for batch in dataloader:
            output = resnet(batch.to(device)).squeeze(-1).squeeze(-1)  # Shape: (B, 2048)
            batch_embeddings.append(output.cpu())
    instance_embedding = torch.cat(batch_embeddings)

    # Saved through an open file handle so the file name is used exactly as
    # given: np.save would append '.npy' to a bare path.
    with open(f'{feature_dir}/{data_name}', 'wb') as f:
        np.save(f, instance_embedding.numpy())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [13:28<00:00, 47.56s/it]


# Shape analysis

In [2]:
# NOTE(review): this rebinds `net` to a fresh model with the epoch-10 weights;
# a `tracker` built earlier from the previous `net` object presumably keeps
# using that older model -- confirm this is intended.
net = Net()
# map_location='cpu' makes loading independent of the device the checkpoint
# was saved from; weights_only=True restricts unpickling to tensors/containers.
net.load_state_dict(torch.load('ckpt/visdrone/epoch-10.pth', map_location='cpu', weights_only=True))

  net.load_state_dict(torch.load('ckpt/visdrone/epoch-10.pth'))


<All keys matched successfully>

In [None]:
# Scratch cell: repeats the batching steps of the tracking loop on one
# sequence and stops after get_trans_probs.
# NOTE(review): depends on names that are not defined in this cell -- `seq`,
# `detector`, `batch_size`, `app_thresh`, `tracker`, `get_trans_probs` -- so it
# only runs after the cells that define them.
# NOTE(review): det_file is fixed to MOT16-09 while app_file is built from
# `seq`/`detector`; the assert below only compares row counts, so confirm both
# files describe the same detections.
det_file = f'data/MOT16/train/MOT16-09/det/det.txt'
app_file = f'data/MOT/MOT17/train/{seq}-{detector}/feature.npy'

dets = np.loadtxt(det_file, delimiter=',')
app_feats = np.load(app_file)
assert dets.shape[0] == app_feats.shape[0], 'Shape mismatch'

batch_overlap = 5                  #Number of frames to overlap between 2 batches
num_frames = int(dets[:, 0].max()) #Largest frame index in the detections
tracks_list, assignments_list, features_list, nms_list = [],[],[],[]

# Consecutive batches start (batch_size - batch_overlap) frames apart.
for start_frame in range(1, num_frames+1, batch_size-batch_overlap):
    end_frame = start_frame + batch_size - 1
    if end_frame >= num_frames:
        end_frame = num_frames
        
    print('Tracking from frame %d to %d'%(start_frame, end_frame))
    # Select the detections of this batch; columns become
    # frame, dets[:, 2:7], row index within the batch.
    curr_ind = np.logical_and(dets[:, 0] >= start_frame, dets[:, 0] <= end_frame)
    curr_dets = np.concatenate([dets[curr_ind, 0][:, None], dets[curr_ind, 2:7],
                                np.arange(dets[curr_ind].shape[0])[:, None]], axis=1)

    curr_dets[:, 3:5] = curr_dets[:, 3:5] + curr_dets[:, 1:3] # convert to frame,x1,y1,x2,y2,conf,node_ind
    curr_app_feats = app_feats[curr_ind]
    # L2-normalize each appearance feature row.
    curr_app_feats = curr_app_feats / np.linalg.norm(curr_app_feats, axis=1, keepdims=True)
    for iteration in range(2):
        # Only iteration 0 does any work in this cell.
        if iteration == 0:
            print('%d-th iteration'%iteration)
            linkIndexGraph, probs = get_trans_probs(tracker, curr_dets, curr_app_feats, 
                                                    app_thresh, max_frame_gap = 5)

In [3]:
# Display the module's repr (its layer structure) as the cell output.
net

Net(
  (fc): Sequential(
    (0): Linear(in_features=6, out_features=6, bias=True)
    (1): ReLU()
    (2): Linear(in_features=6, out_features=1, bias=True)
  )
)