# Import libraries

In [2]:
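# Change into the repository's `src` directory so the relative paths used below (yolov5/, weights/, data/, output/) resolve.
# NOTE: this absolute path is machine-specific; edit it to match your own checkout.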
%cd C:/Users/Minh PC/tracking/src 

C:\Users\Minh PC\tracking\src


In [3]:
%load_ext autoreload
%autoreload 2

import argparse
from collections import OrderedDict, deque, defaultdict
import copy
import cv2
from cv2 import CAP_PROP_FRAME_COUNT
import dask
import glob
from hungarian_algorithm import algorithm
import numpy as np
from numpy import random
from numpy.linalg import norm
import os
from tqdm import tqdm
import time
from pathlib import Path
import pickle
from PIL.Image import Image
from scipy.optimize import linear_sum_assignment
from scipy.spatial import distance_matrix
import statistics
import shutil
import toolz

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import sys
sys.path.insert(0, 'yolov5/')

from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages, letterbox
from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \
    scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized
from detectObj_for_Tracking import detectFromImage
from reid_new.vehicle_embedder import VehicleEmbedder

  warn(f"Failed to load image Python extension: {e}")


In [4]:
# Report whether PyTorch can see a CUDA device (the config below sets 'device' to 'cuda').
print(torch.cuda.is_available())

True


In [5]:
class dotdict(dict):
    """Dictionary whose items can also be read, written and deleted as attributes.

    Reading an attribute that is not a key returns ``None`` instead of
    raising ``AttributeError``; deleting a missing one raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, i.e. for item access.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]

In [19]:
# Run configuration. `vid_name` selects the input folder (data/<vid_name>) and the
# output folder (output/<vid_name>). The whole `opt` object is also handed to
# detectFromImage, so keys not referenced in this notebook may still be read there.
vid_name = 'vungtau194'
opt = {
    'agnostic_nms': False,  # presumably a detector NMS option; consumed outside this notebook - confirm
    'batch_size': 128,  # number of track tensors concatenated per trajectory-model batch
    'conf_thres': 0.25,  # presumably the detector confidence threshold; consumed outside this notebook - confirm
    'device': 'cuda',  # passed to select_device
    'euclidean_num': 3,  # k for is_min_topk: detections within the k smallest distinct centre distances stay candidates
    'euclidean_thresh': 0.2,  # not referenced by the code visible in this notebook
    'img_size': 640,  # validated against the detector stride with check_img_size
    'iou_thres': 0.45,  # presumably the detector NMS IoU threshold; consumed outside this notebook - confirm
    # 'n_image': 100, 
    'num_embed': 5,  # re-id embeddings kept per track
    'num_trajectory': 3,  # normalized boxes kept per track; also the minimum history before the trajectory model is consulted
    'match_score_thr': 0.5,  # assignments with a cosine distance below this reuse an existing track id
    'max_age': 30,  # a track is dropped once its age counter exceeds this value
    'output':f'output/{vid_name}',  # directory for annotated frames and the result video
    'out_txt':f'output/{vid_name}/info_xywh.txt',  # text file the Tracker opens for writing
    'save_image': True,  # write every annotated frame as a .jpg
    'source': f'data/{vid_name}',  # directory searched for *.jpg frames
    'trajectory_thresh': 0.9,  # minimum cosine similarity between trajectory embeddings
    'trajectory_path':'weights/trajectory/best_400class_10_3.pth',  # checkpoint loaded into Trajectory
    'reid_path':'weights/reid/reid_vehicle_new.pt',  # weights given to VehicleEmbedder
    'video_fps': 10,  # fps passed to cv2.VideoWriter
    'video_resolution': (1280, 720),  # frame size passed to cv2.VideoWriter
#     'video_source': "datasets/ai_challenge_2020/sample_02.mp4",
    'weights': 'weights/yolov5/last_19.pt',  # detector checkpoint given to attempt_load
    }

# Wrap the dict so that settings can be read as attributes (opt.batch_size, ...).
opt = dotdict(opt)

In [7]:
class DataCFG:
    """Constants shared by the trajectory model code in this notebook."""
    n_id = 400  # output size of the final Linear layer in Trajectory
    window_size = 10  # length every track is padded to in track2tensor

# Cosine + trajectory utils


In [8]:
def pad_sequence_fixed_size(sequences, batch_first=False, padding_value=0.0, max_len=256):
    """Pad a collection of tensors along their first dimension to ``max_len``.

    Modelled on ``torch.nn.utils.rnn.pad_sequence``, except that the padded
    length is fixed by ``max_len`` rather than by the longest sequence.

    Args:
        sequences: indexable collection of tensors that share their trailing
            dimensions; element 0 decides dtype, device and trailing shape.
        batch_first: if True the result is (len(sequences), max_len, ...),
            otherwise (max_len, len(sequences), ...).
        padding_value: fill value for positions past each sequence's length.
        max_len: size of the padded time dimension.

    Returns:
        A new tensor holding every sequence followed by padding.
    """
    reference = sequences[0]
    feature_dims = reference.size()[1:]
    n_sequences = len(sequences)

    leading_dims = (n_sequences, max_len) if batch_first else (max_len, n_sequences)
    padded = reference.new_full(leading_dims + feature_dims, padding_value)

    for idx, seq in enumerate(sequences):
        steps = seq.size(0)
        # Slice assignment copies the values into the freshly allocated tensor.
        if batch_first:
            padded[idx, :steps, ...] = seq
        else:
            padded[:steps, idx, ...] = seq

    return padded

def track2tensor(track, device, half):
    """Turn a track (list of boxes) into a padded batch-of-one tensor.

    Args:
        track: nested list of numbers, one inner list per box.
        device: torch device on which the tensor is created.
        half: if True use float16, otherwise float32.

    Returns:
        Tensor of shape (1, DataCFG.window_size, box_length), zero padded
        after the last box of the track.
    """
    dtype = torch.float16 if half else torch.float
    batch = torch.tensor(track, dtype=dtype, device=device).unsqueeze(0)
    return pad_sequence_fixed_size(batch, batch_first=True, max_len=DataCFG.window_size)

def get_normalized_box(box_info, img_shape):
    """Scale a pixel box to image-relative coordinates.

    Args:
        box_info: sequence whose first four items are [x1, x2, y1, y2]
            (note the order: both x values first, then both y values).
        img_shape: (height, width, channels) of the image.

    Returns:
        [x1 / width, x2 / width, y1 / height, y2 / height] as floats.
    """
    height, width, _ = img_shape
    x1, x2, y1, y2 = (float(box_info[i]) for i in range(4))
    return [x1 / width, x2 / width, y1 / height, y2 / height]

In [9]:
class Attention(nn.Module):
    """Attention pooling over a sequence, using its last time step as the query.

    Args:
        input_size: feature size of each time step.
        units: size of the returned attention vector.
    """

    def __init__(self, input_size, units=128, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        # fc1 projects every time step before scoring; fc2 mixes the context
        # vector with the last time step (hence the doubled input width).
        self.fc1 = nn.Linear(input_size, input_size, bias=False)
        self.fc2 = nn.Linear(2 * input_size, units, bias=False)

    def forward(self, x):
        """Return a (batch, units) attention vector for ``x`` of shape (batch, steps, input_size)."""
        last_step = x[:, -1, :]
        projected = self.fc1(x)
        # Dot product between the last step and each projected step.
        scores = torch.einsum('ik,ijk->ij', last_step, projected)
        weights = F.softmax(scores, dim=1)
        # Weighted sum of the original (unprojected) steps.
        context = torch.einsum('ijk,ij->ik', x, weights)
        combined = torch.cat([context, last_step], dim=1)
        return torch.tanh(self.fc2(combined))

class Trajectory(nn.Module):
    """LSTM + attention network that embeds a sequence of 4-value boxes.

    The attribute names (lstm, attention, drop, fc) are part of the
    checkpoint layout loaded with ``load_state_dict`` and must not change.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        hidden_size = 100
        attention_units = 128
        self.lstm = nn.LSTM(
            input_size=4,
            hidden_size=hidden_size,
            num_layers=3,
            dropout=0.2,
            batch_first=True,
        )
        self.attention = Attention(input_size=hidden_size, units=attention_units)
        self.drop = nn.Dropout(0.2)
        self.fc = nn.Linear(attention_units, DataCFG.n_id)

    def forward(self, x):
        """Return ``(logits, attention_vector)`` for a batch-first input ``x``."""
        hidden_seq, _ = self.lstm(x)
        att = self.attention(self.drop(hidden_seq))
        return self.fc(att), att

In [10]:
@dask.delayed
def load_transform(half, device, track_boxes, det_box=None):
    """Lazily build the padded tensor for a track, optionally extended by one box.

    Args:
        half: forwarded to track2tensor (float16 when True).
        device: forwarded to track2tensor.
        track_boxes: list of boxes already in the track; it is not modified.
        det_box: candidate box appended to a copy of the track when truthy.

    Returns:
        The tensor produced by track2tensor (wrapped by ``dask.delayed``).
    """
    if det_box:
        history = track_boxes + [det_box]
    else:
        history = track_boxes[:]
    return track2tensor(history, device, half)

@dask.delayed
def predict(batch, model):
    """Lazily run ``model`` on ``batch`` without gradients and return its second output."""
    with torch.no_grad():
        _, attention = model(batch)
    return attention

def get_trajectory_embeddings(tensors, dmodel):
    """Run the trajectory model over ``tensors`` in batches of ``opt.batch_size``.

    Args:
        tensors: delayed tensors, each with a leading batch dimension.
        dmodel: the model wrapped in ``dask.delayed``.

    Returns:
        One tensor with all predictions concatenated along dim 0, or an
        empty tuple when ``tensors`` is empty.
    """
    pending = []
    for chunk in toolz.partition_all(opt.batch_size, tensors):
        stacked = dask.delayed(torch.cat)(chunk, dim=0)
        pending.append(predict(stacked, dmodel))

    results = dask.compute(*pending)
    if len(results) > 0:
        results = torch.cat(results, dim=0)
    return results

def get_trajectory_matrix(tracks_box_list, det_boxes_list, trajectory_model, euclidean_matrix, trajectory_thresh, device, half):
    """Decide, per track and detection, whether the detection fits the track's trajectory.

    Args:
        tracks_box_list: track id -> list of recent normalized boxes.
        det_boxes_list: normalized boxes of the current detections.
        trajectory_model: delayed trajectory model.
        euclidean_matrix: track id -> boolean mask over detections; pairs
            masked out here are rejected without consulting the model.
        trajectory_thresh: minimum cosine similarity to accept a pair.
        device, half: forwarded to load_transform.

    Returns:
        Dict track id -> list of booleans, one per detection. Tracks with
        fewer than ``opt.num_trajectory`` boxes accept every detection that
        passed the euclidean mask.
    """
    track_ids = sorted(tracks_box_list.keys())
    n_dets = len(det_boxes_list)

    # Embedding of every track on its own.
    track_tensors = [load_transform(half, device, tracks_box_list[k]) for k in track_ids]
    track_predictions = get_trajectory_embeddings(track_tensors, trajectory_model)

    # Embedding of every track extended by each plausible detection. The
    # iteration order here must match the order used when consuming below.
    candidate_tensors = [
        load_transform(half, device, tracks_box_list[k], det_boxes_list[j])
        for k in track_ids
        for j in range(n_dets)
        if euclidean_matrix[k][j] and len(tracks_box_list[k]) >= opt.num_trajectory
    ]
    predictions = get_trajectory_embeddings(candidate_tensors, trajectory_model)

    trajectory_matrix = {}
    cursor = 0  # index of the next unused candidate embedding
    for row, k in enumerate(track_ids):
        too_short = len(tracks_box_list[k]) < opt.num_trajectory
        flags = []
        for j in range(n_dets):
            if not euclidean_matrix[k][j]:
                flags.append(False)
            elif too_short:
                flags.append(True)
            else:
                similarity = F.cosine_similarity(track_predictions[row], predictions[cursor], dim=0)
                cursor += 1
                flags.append(float(similarity) >= trajectory_thresh)
        trajectory_matrix[k] = flags
    return trajectory_matrix

In [11]:
def xyxy2cxcy(box):
    """Return the centre ``(cx, cy)`` of a box laid out as [x1, x2, y1, y2].

    Despite the function name, items 0 and 1 are averaged for x and items
    2 and 3 for y, matching the boxes built by get_normalized_box.
    """
    center_x, center_y = (
        statistics.mean(pair)
        for pair in ((box[0], box[1]), (box[2], box[3]))
    )
    return (center_x, center_y)

def is_min_topk(a, k=1):
    """Boolean mask of the entries holding one of the k smallest distinct values.

    Ties are all selected, so more than ``k`` entries can be True.
    The mask has the same shape as ``a``.
    """
    # return_inverse maps every element to the rank of its value among the
    # sorted unique values, so rank < k selects the k smallest distinct values.
    _, ranks = np.unique(a, return_inverse=True)
    return (ranks < k).reshape(a.shape)

def get_euclidean_matrix(tracks_box_list, det_boxes_list, euclidean_num):
    """Mask, per track, the detections whose centres are among the nearest.

    Args:
        tracks_box_list: track id -> list of boxes; only the last box is used.
        det_boxes_list: boxes of the current detections.
        euclidean_num: ``k`` passed to is_min_topk.

    Returns:
        Dict track id -> boolean array over detections.
    """
    track_ids = sorted(tracks_box_list.keys())
    det_centers = [xyxy2cxcy(box) for box in det_boxes_list]
    track_centers = [xyxy2cxcy(tracks_box_list[k][-1]) for k in track_ids]

    # Rows follow track_ids, columns follow det_boxes_list.
    distances = distance_matrix(track_centers, det_centers)
    return {k: is_min_topk(distances[row], euclidean_num) for row, k in enumerate(track_ids)}

def cosine_distance(X, Y, track_class_list, class_id, row2id, trajectory_matrix, INFY_COST=1e5):
    """Build the assignment cost matrix from cosine distances.

    Args:
        X: tensor of track embeddings, one per row.
        Y: tensor of detection embeddings, one per row.
        track_class_list: class of each row of X.
        class_id: class of each row of Y.
        row2id: row index of X -> track id.
        trajectory_matrix: track id -> per-detection booleans; False forbids the pair.
        INFY_COST: cost written for forbidden pairs.

    Returns:
        NumPy array of shape (len(X), len(Y)) holding ``1 - cosine similarity``,
        with INFY_COST where classes differ or the trajectory gate rejects.
    """
    x_norm = torch.norm(X, dim=1, keepdim=True)
    y_norm = torch.norm(Y, dim=1, keepdim=True)
    cost = (1 - (X@Y.T)/(x_norm@y_norm.T)).cpu().numpy()

    n_rows, n_cols = len(X), len(Y)
    for i in range(n_rows):
        for j in range(n_cols):
            same_class = track_class_list[i] == class_id[j]
            if not (same_class and trajectory_matrix[row2id[i]][j]):
                cost[i][j] = INFY_COST
    return cost

# Online Inference

In [20]:
class Tracker:
    """Online tracker that links per-frame detections into identities.

    Each frame goes through: detect boxes -> crop and embed them with the
    re-identification model -> build a cosine-distance cost matrix gated by
    class, centre distance and the trajectory model -> solve the assignment
    with ``linear_sum_assignment``.

    Attributes:
        active_tracks: track id -> dict with 'embed', 'class' and
            'norm_boxes' lists (most recent entries last).
        age: track id -> number of frames in which the track was unmatched.
            NOTE(review): the counter is never reset when a track is matched
            again, so ``max_age`` bounds the total, not the consecutive,
            number of misses - confirm this is intended.
        num_tracks: id that the next new track will receive (ids start at 1).
        frame_id: 1-based index of the frame about to be processed.
        check: ids already counted in ``vehicle_count``.
        vehicle_count: number of distinct ids seen for classes 0..3.
        video_path: file the annotated video is written to.
    """

    def __init__(self, opt):
        self.opt = opt
        set_logging()
        self.device = select_device(self.opt.device)
        self.half = self.device.type != 'cpu'  # half precision only supported on CUDA

        self.detector = self.load_detector(self.opt.weights)
        self.reid = VehicleEmbedder(self.opt.reid_path)
        self.trajectory_model = self.load_trajectory_model(self.opt.trajectory_path)

        self.frame_id = 1
        self.active_tracks = defaultdict(list)
        self.age = OrderedDict()
        self.num_tracks = 1

        # makedirs also creates missing parents and tolerates an existing directory.
        os.makedirs(self.opt.output, exist_ok=True)
        # Write the video next to the other results instead of a fixed
        # '/content/video.mp4', which only exists on Colab-like machines.
        self.video_path = os.path.join(self.opt.output, 'video.mp4')
        self.vid_writer = cv2.VideoWriter(self.video_path, cv2.VideoWriter_fourcc(*'mp4v'), self.opt.video_fps, self.opt.video_resolution)
        self.check = defaultdict(list)
        self.vehicle_count = [0,0,0,0]

        self.f = open(self.opt.out_txt, 'w')

    def copy_video_to_destination(self):
        """Make sure the video is available as ``<opt.output>/video.mp4``.

        Kept for callers of the previous two-step workflow; the video is now
        written to that location directly, so a copy is only needed if
        ``video_path`` was pointed somewhere else.
        """
        destination = os.path.join(self.opt.output, 'video.mp4')
        if os.path.abspath(self.video_path) != os.path.abspath(destination):
            shutil.copy(self.video_path, destination)

    def load_detector(self, checkpoint_path):
        """Load the detector and record its stride and validated image size."""
        model = attempt_load(checkpoint_path, map_location=self.device)  # load FP32 model
        self.stride = int(model.stride.max())  # model stride
        self.imgsz = check_img_size(self.opt.img_size, s=self.stride)  # check img_size
        if self.half:
            model.half()  # to FP16
        return model

    def load_trajectory_model(self, trajectory_path):
        """Load the trajectory checkpoint and return the model wrapped in ``dask.delayed``."""
        model = Trajectory().to(self.device)
        # map_location lets a checkpoint saved on another device load here.
        checkpoint = torch.load(trajectory_path, map_location=self.device)
        model.load_state_dict(checkpoint["model_state_dict"])
        model.eval()
        if self.half:
            model.half()  # to FP16
        model = dask.delayed(model)
        return model

    def add_track_info(self, embed, cls, det_boxes, track=None):
        """Append one observation to a track and trim its history.

        Args:
            embed: re-id embedding of the detection.
            cls: class of the detection.
            det_boxes: normalized box of the detection.
            track: existing track dict, or None to start a new one.

        Returns:
            The updated track. 'embed' and 'class' keep the last
            ``opt.num_embed`` entries, 'norm_boxes' the last
            ``opt.num_trajectory``.
        """
        if track is None:
            track = defaultdict(list)
        track['embed'].append(embed)
        track['class'].append(cls)
        track['norm_boxes'].append(det_boxes)

        track['embed'] = track['embed'][-self.opt.num_embed:]
        # Trim classes together with embeddings so index i refers to the same
        # observation in both lists.
        track['class'] = track['class'][-self.opt.num_embed:]
        track['norm_boxes'] = track['norm_boxes'][-self.opt.num_trajectory:]
        return track

    def detect(self, img):
        """Run the detector and return rows ``[frame_id, x1, y1, w, h, cfs, cls]``.

        The confidence column is always written as 1.
        """
        with torch.no_grad():
            boxes = detectFromImage(self.detector, self.half, self.opt, self.device, self.imgsz, self.stride, img)
            box_info = []
            for b in boxes:
                x1, y1, x2, y2 = map(int, [b.x_min, b.y_min, b.x_max, b.y_max])
                cfs = 1
                box_info.append([self.frame_id, x1, y1, x2 - x1, y2 - y1, cfs, b.semantic_id])
        return box_info

    def get_embedding(self, img, box_info):
        """Crop every box from ``img`` and embed the crops.

        Args:
            img: image array indexed as [row, column, ...].
            box_info: rows ``[frame_id, x1, y1, w, h, cfs, cls]``.

        Returns:
            An empty OrderedDict when no box has positive area after clipping
            to the image; otherwise an OrderedDict with 'embed' (normalized
            embeddings on the CPU), 'boxes' (``[x1, x2, y1, y2, cfs, cls]``
            in pixels) and 'norm_boxes' (``[x1, x2, y1, y2]`` scaled by the
            image size).
        """
        with torch.no_grad():
            img_h, img_w = img.shape[0], img.shape[1]
            crop_imgs = []
            boxes = []
            for info in box_info:
                # Clip the box to the image before cropping.
                x1 = max(info[1], 0)
                y1 = max(info[2], 0)
                x2 = min(info[1] + info[3], img_w)
                y2 = min(info[2] + info[4], img_h)
                if x1 >= x2 or y1 >= y2:
                    continue
                crop_imgs.append(img[y1:y2, x1:x2])
                boxes.append([x1, x2, y1, y2, info[5], info[6]])
            if len(boxes)==0:
                return OrderedDict()

            embed_list = self.reid.infer(crop_imgs)
            embed_list = embed_list.cpu()
            # Normalize embed
            embed_dict = OrderedDict()
            embed_dict['embed'] = F.normalize(embed_list)
            embed_dict['boxes'] = boxes
            embed_dict['norm_boxes'] = [get_normalized_box(item, img.shape) for item in boxes]
        return embed_dict

    def _start_track(self, embed, cls, norm_box):
        """Register a new track for one detection and return its id."""
        track_id = self.num_tracks
        self.active_tracks[track_id] = self.add_track_info(embed, cls, norm_box)
        self.age[track_id] = 0
        self.num_tracks += 1
        return track_id

    def _drop_stale_tracks(self):
        """Remove every track whose age exceeds ``opt.max_age``."""
        stale = [track_id for track_id, a in self.age.items() if a > self.opt.max_age]
        for track_id in stale:
            self.active_tracks.pop(track_id)
            self.age.pop(track_id)

    def matching_cascade(self, id_list, embed_list, class_list, check, det_boxes_list):
        """Assign detections to active tracks.

        Args:
            id_list: per-detection ids, updated in place.
            embed_list: detection embeddings.
            class_list: detection classes.
            check: track id -> whether the track was matched this frame,
                updated in place.
            det_boxes_list: normalized detection boxes.

        Returns:
            ``(un_matched, id_list, check)`` where ``un_matched`` lists the
            detection indices assigned to a track that was already taken.
        """
        row2id = {}
        track_embed_list = []
        track_class_list = []
        tracks_box_list = {}
        row_idx = 0

        # One cost-matrix row per stored embedding, so a track contributes
        # several rows; row2id maps each row back to its track.
        for track_id, track in self.active_tracks.items():
            embeds = track['embed']
            # Take the classes that belong to the stored embeddings even if a
            # track dict still carries a longer class history.
            classes = track['class'][-len(embeds):]
            tracks_box_list[track_id] = track['norm_boxes'][-self.opt.num_trajectory:]
            for i in range(len(embeds)):
                track_embed_list.append(embeds[i])
                track_class_list.append(classes[i])
                row2id[row_idx] = track_id
                row_idx += 1

        euclidean_matrix = get_euclidean_matrix(tracks_box_list, det_boxes_list, self.opt.euclidean_num)
        trajectory_matrix = get_trajectory_matrix(tracks_box_list, det_boxes_list, self.trajectory_model, euclidean_matrix, self.opt.trajectory_thresh, self.device, self.half)
        reid_dists = cosine_distance(torch.stack(track_embed_list), embed_list, track_class_list, class_list, row2id, trajectory_matrix)
        row, col = linear_sum_assignment(reid_dists)

        un_matched = []  # new embeddings that have not been matched to any id
        results = [[r, c, reid_dists[r, c], row2id[r]] for r, c in zip(row, col)]
        # Handle the most confident assignments first.
        results.sort(key=lambda x: x[2])
        for r, c, dist, track_id in results:
            if dist < self.opt.match_score_thr:
                if not check[track_id]:
                    # The id has not been matched yet in this frame.
                    id_list[c] = track_id
                    check[track_id] = True
                    self.active_tracks[track_id] = self.add_track_info(embed_list[c], class_list[c], det_boxes_list[c], self.active_tracks[track_id])
                else:
                    # The id is already taken: keep the detection for a new track.
                    un_matched.append(c)
            else:
                # Too far from its assigned track: start a new one.
                new_id = self._start_track(embed_list[c], class_list[c], det_boxes_list[c])
                id_list[c] = new_id
                check[new_id] = True

        return un_matched, id_list, check

    def matching(self, info):
        """Update the tracks with one frame of embeddings.

        Args:
            info: the dict returned by get_embedding (possibly empty).

        Returns:
            An empty OrderedDict when ``info`` is empty, otherwise a dict
            with 'id' (one track id per box) and 'boxes'.
        """
        id_dict = OrderedDict()

        if len(info.keys())==0:
            # Nothing to match: every existing track misses this frame.
            for track_id in self.age:
                self.age[track_id] = self.age[track_id] + 1
            self._drop_stale_tracks()
            return id_dict

        embed_list = info['embed']
        boxes = info['boxes']
        norm_boxes = info['norm_boxes']
        class_list = [box[5] for box in boxes]
        num_dets = embed_list.size(0)

        if len(self.active_tracks.keys()) == 0: # No active tracks
            # Every detection starts its own track; detection i must seed the
            # track whose id is reported for box i.
            id_list = [self._start_track(embed_list[i], class_list[i], norm_boxes[i]) for i in range(num_dets)]
        else:
            id_list = [-1 for _ in range(num_dets)]
            # Whether each id has been matched this frame; initially False.
            check = defaultdict(list)
            for track_id in self.active_tracks:
                check[track_id] = False

            un_matched, id_list, check = self.matching_cascade(id_list, embed_list, class_list, check, norm_boxes)
            # Detections whose assigned track was already taken become new tracks.
            for c in un_matched:
                new_id = self._start_track(embed_list[c], class_list[c], norm_boxes[c])
                id_list[c] = new_id
                check[new_id] = True
            # Detections the assignment left out entirely also become new tracks.
            for k, assigned in enumerate(id_list):
                if assigned == -1:
                    new_id = self._start_track(embed_list[k], class_list[k], norm_boxes[k])
                    id_list[k] = new_id
                    check[new_id] = True

            for track_id, matched in check.items():
                if not matched:
                    self.age[track_id] = self.age[track_id] + 1
            self._drop_stale_tracks()

        id_dict = {}
        id_dict['id'] = id_list
        id_dict['boxes'] = boxes
        # Count every id once, for classes 0..3 only.
        for i, box in enumerate(boxes):
            if box[5]<4 and id_list[i] not in self.check:
                self.check[id_list[i]] = True
                self.vehicle_count[box[5]] += 1
        return id_dict

    def process_one_frame(self, img, box_info=None):
        """Detect (unless ``box_info`` is given), embed and match one frame, then advance ``frame_id``."""
        if box_info is None:
            box_info = self.detect(img)
        embed_dict = self.get_embedding(img, box_info)
        id_dict = self.matching(embed_dict)
        self.frame_id += 1
        return id_dict

    def save_result(self, image, info):
        """Draw boxes, ids and counters on a copy of ``image`` and write it out.

        The frame is appended to the video and, when ``opt.save_image`` is
        set, saved as ``<frame_id - 1>.jpg`` (zero padded to six digits).
        """
        img = copy.deepcopy(image)
        colors = [(0, 255, 255), (0, 0, 255)]
        if len(info.keys())!=0:
            boxes = info['boxes']
            ids = info['id']
            for i, box in enumerate(boxes):
                cv2.rectangle(img, (box[0], box[2]), (box[1], box[3]), colors[1], 2)
                cv2.putText(img, str(ids[i]), (box[0], box[2]), cv2.FONT_HERSHEY_SIMPLEX, 1, colors[0], 2)
        cv2.rectangle(img, (0, 0), (int(img.shape[1]*0.3), int(img.shape[0]*0.07)), (0,0,0), -1)
        cv2.putText(img, "Frame: {} Motorcycle: {} Car: {} Pedestrian: {} Truck: {} Total: {}".format(self.frame_id, *self.vehicle_count, sum(self.vehicle_count)), (20, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[0], 1)
        if self.opt.save_image:
            name_save = os.path.join(self.opt.output, str(self.frame_id-1).zfill(6) + '.jpg')
            cv2.imwrite(name_save, img)
        self.vid_writer.write(img)

    def save_txt(self, info):
        """Append ``frame,id,x,y,w,h,cfs,cls,1`` lines for one frame to the text file.

        NOTE(review): ``self.frame_id`` is written as-is. When this is called
        after process_one_frame, the counter has already been advanced, so the
        value is one higher than the image name used by save_result - confirm.
        """
        if len(info.keys()) != 0:
            ids = info['id']
            boxes = info['boxes']
            for i, track_id in enumerate(ids):
                x1, x2, y1, y2, cfs, cls = boxes[i][:6]
                vs = 1
                fields = [self.frame_id, track_id, x1, y1, x2 - x1, y2 - y1, cfs, cls, vs]
                self.f.write(','.join(str(v) for v in fields) + '\n')

    def save_txt_mot(self, info):
        """Append ``frame,id,x,y,w,h,cfs,-1,-1,-1`` lines for one frame to the text file.

        NOTE(review): uses ``self.frame_id`` exactly like save_txt, with the
        same possible offset relative to the saved image names - confirm.
        """
        if len(info.keys()) != 0:
            ids = info['id']
            boxes = info['boxes']
            for i, track_id in enumerate(ids):
                x1, x2, y1, y2, cfs = boxes[i][:5]
                res = [self.frame_id, track_id, x1, y1, x2 - x1, y2 - y1, cfs, -1, -1, -1]
                self.f.write("{},{},{},{},{},{},{},{},{},{}\n".format(*res))

    def release_all(self):
        """Close the text file and finalize the video."""
        self.f.close()
        self.vid_writer.release()

In [21]:
def load_images_path(source):
    """Return the sorted paths of all ``*.jpg`` files directly inside ``source``."""
    pattern = os.path.join(source, '*.jpg')
    paths = glob.glob(pattern)
    paths.sort()
    return paths

def get_box_from_file(source):
    """Read boxes from a comma-separated text file.

    Each non-blank line is expected to hold at least nine fields,
    ``frame,id,x,y,w,h,cfs,cls,vs``; the id column is skipped.

    Args:
        source: path of the text file.

    Returns:
        List of ``[frame, x, y, w, h, cfs, cls, vs]`` rows. ``cfs`` stays an
        int when written as one and becomes a float otherwise; ``vs`` is
        always a float.

    Raises:
        ValueError, IndexError: if a non-blank line is malformed.
    """
    def _number(text):
        # Confidences may be stored as "1" or as "0.87"; accept both.
        try:
            return int(text)
        except ValueError:
            return float(text)

    boxes = []
    with open(source, 'r') as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            fields = raw.split(",")
            boxes.append([
                int(fields[0]),
                int(fields[2]),
                int(fields[3]),
                int(fields[4]),
                int(fields[5]),
                _number(fields[6]),
                int(fields[7]),
                float(fields[8]),
            ])
    return boxes

def get_box_from_file_mot(source):
    """Read boxes from a MOT-style text file.

    Each non-blank line is expected to hold
    ``frame, id, x1, y1, w, h, cfs, x, y, z``; only the frame, the box and
    the confidence are used.

    Args:
        source: path of the text file.

    Returns:
        List of ``[frame, x1, y1, w, h, cfs, cls]`` rows with the box values
        rounded to integers and ``cls`` fixed to 1.

    Raises:
        ValueError, IndexError: if a non-blank line is malformed.
    """
    boxes = []
    with open(source, 'r') as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            fields = raw.split(",")
            frame = int(fields[0])
            x1, y1, w, h = (round(float(v)) for v in fields[2:6])
            cfs = float(fields[6])
            cls = 1  # the file carries no class column, so a fixed class is used
            boxes.append([frame, x1, y1, w, h, cfs, cls])
    return boxes

def get_frame_boxes(boxes, frame_id, conf_thres=0.25):
    """Select the boxes of one frame that are confident enough.

    Args:
        boxes: rows whose item 0 is the frame number and item 5 the confidence.
        frame_id: frame number to keep.
        conf_thres: minimum confidence (inclusive); defaults to the value
            that used to be hard-coded here.

    Returns:
        The matching rows, in their original order.
    """
    return [box for box in boxes if box[0] == frame_id and box[5] >= conf_thres]

In [22]:
def frame_iter(capture, description=''):
    """Wrap a video capture in a tqdm progress bar that yields its frames.

    Args:
        capture: object offering ``grab()``, ``retrieve()`` and ``get()``.
        description: label shown on the progress bar.

    Returns:
        A tqdm iterator over the frames; its total is the frame count
        reported by ``capture``.
    """
    def _frames():
        while True:
            if not capture.grab():
                return
            _, frame = capture.retrieve()
            yield frame

    frame_source = _frames()
    total_frames = int(capture.get(CAP_PROP_FRAME_COUNT))
    return tqdm(frame_source, desc=description, total=total_frames)

In [23]:
# video = cv2.VideoCapture(opt.video_source)
# input_fps = video.get(cv2.CAP_PROP_FPS)
# print(f"Input fps {input_fps}")
# tracker = Tracker(opt)

# for i, img in enumerate(frame_iter(video)):
#     id_dict = tracker.process_one_frame(img)
#     tracker.save_result(img, id_dict)
#     tracker.save_txt(id_dict)

# tracker.release_all()
# video.release()
# tracker.copy_video_to_destination()

In [24]:
# Collect the sorted *.jpg frame paths found under opt.source.
imgs = load_images_path(opt.source)
# boxes = get_box_from_file(opt.file_det)

In [25]:
# Run the tracker over every frame, saving the annotated image and the text
# results after each one.
tracker = Tracker(opt)

try:
    for img_path in tqdm(imgs):
        img = cv2.imread(img_path)
        # box_info = get_frame_boxes(boxes, int(img_path.split('/')[-1].split('.')[0]))
        id_dict = tracker.process_one_frame(img)
        tracker.save_result(img, id_dict)
        tracker.save_txt(id_dict)
finally:
    # Always close the result file and finalize the video, even if a frame
    # fails; otherwise buffered output can be lost.
    tracker.release_all()

YOLOv5  2021-10-12 torch 1.10.1+cu102 CUDA:cuda (NVIDIA GeForce GTX 1060, 3071.8125MB)



run


Fusing layers... 
Model Summary: 476 layers, 87198694 parameters, 0 gradients


SelecSLS42_B


100%|████████████████████████████████████████████████████████████████████████████████| 149/149 [00:56<00:00,  2.64it/s]
