# Object Tracking with YoloV7 and Pixeltable

In [None]:
import sys, glob, os
import pandas as pd
import numpy as np
sys.path.append('/home/marcel/pixeltable')
import PIL

In [None]:
import pixeltable as pt
import pixeltable.functions
%load_ext autoreload
%autoreload 2

We're loading previously stored functions in order to use them for computed columns. Note that we don't need to import the packages we needed to create the model in the first place.

In [None]:
# Create the Pixeltable client; `cl` is the handle used below to look up
# stored functions and databases.
cl = pt.Client()

The functions are stored in the `functions` database.

In [None]:
# Display whatever `list_functions()` returns as the cell output, to see which
# functions are available before fetching them by name.
cl.list_functions()

In [None]:
# Fetch the three stored functions used throughout this notebook from the
# `functions` database: the detector, the tracker and the track visualizer.
functions_db = cl.get_db('functions')
yolov7, sort_track, track_viz = (
    functions_db.get_function(fn_name)
    for fn_name in ('yolov7', 'sort_track', 'track_viz')
)

Sanity check

In [None]:
# `import PIL` on its own does not load the `Image` submodule; import it
# explicitly so `PIL.Image.open` does not depend on some other package having
# imported it first.
import PIL.Image

# Alternative sample image:
#img_file = '/home/marcel/pixeltable/pixeltable/tests/data/imagenette2-160/n03445777_2563.JPEG'
img_file = '/home/marcel/pixeltable/pixeltable/tests/data/imagenette2-160/n03888257_50622.JPEG'
img = PIL.Image.open(img_file)

# Print the (width, height) tuple and render the image inline.
print(img.size)
display(img)

In [None]:
# Call the detector's `eval_fn` directly on the sample image (no table
# involved) and display the result as the cell output.
detection = yolov7.eval_fn(img)
detection

In [None]:
# Drive the tracker by hand for a single frame: create a fresh state, feed it
# the detections from above, then read out the tracker's current value.
# NOTE(review): update_fn's return value is discarded, so it presumably mutates
# `state` in place — confirm.
state = sort_track.init_fn()
sort_track.update_fn(state, detection)
sort_track.value_fn(state)

We're now creating database `videos` for our video data.

In [None]:
# Fetch the `videos` database, creating it the first time the notebook runs.
try:
    videos = cl.get_db('videos')
except Exception:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed and turned into a create_db() call.
    # NOTE(review): narrow this further to the specific "unknown database"
    # error raised by get_db() once that exception type is confirmed.
    videos = cl.create_db('videos')

The videos and their frames will be stored in a table `test_data`. Initially it only contains the path to the video file, the frame, and the frame sequence number (within the video).

In [None]:
# Remove any `test_data` table left over from an earlier run, then describe the
# schema: one non-nullable column each for the video, the frame image and the
# frame's sequence number.
videos.drop_table('test_data', ignore_errors=True)
cols = [
    pt.Column(col_name, col_type, nullable=False)
    for col_name, col_type in (
        ('video', pt.VideoType()),
        ('frame', pt.ImageType()),
        ('frame_idx', pt.IntType()),
    )
]

When creating the table, we supply parameters needed for automatic frame extraction during `insert_rows()`/`insert_pandas()` calls:
- The `extract_frames_from` argument is the name of the column of type `video` from which to extract frames.
- During an `insert_rows()` call, each input row, corresponding to one video, is expanded into one row per frame (subject to the requested frame rate, in this case 1 fps).
- Each frame is extracted to a JPEG file that is stored in the Pixeltable home directory.
- The columns `frame` and `frame_idx` receive the frame file path and frame sequence number, respectively.

In [None]:
# Create the table with automatic frame extraction configured.
t = videos.create_table(
    'test_data',
    cols,
    # Video column that frames are extracted from.
    extract_frames_from='video',
    # Columns that receive the extracted frame and its sequence number.
    extracted_frame_col='frame',
    extracted_frame_idx_col='frame_idx',
    # Sample at 1 frame per second.
    extracted_fps=1,
)

We're using a few short sample videos for the demo.

In [None]:
# Collect the demo videos as absolute paths.
# The pattern is a plain string (it has no placeholders, so no f-string is
# needed) and contains no `**`, so `recursive=True` had no effect and is
# dropped; only the top-level *.mp4 files are matched, as before.
# NOTE(review): glob makes no ordering guarantee; sort the list if later cells
# must pick the same video on every machine.
video_filepaths = glob.glob('/home/marcel/pixeltable-demo/videos/*.mp4')
video_filepaths = [os.path.abspath(p) for p in video_filepaths]
video_filepaths

In [None]:
# Insert only the second video found (index 1); frame extraction happens as part
# of this insert. NOTE(review): raises IndexError if fewer than two videos were
# found, and glob order is not guaranteed — confirm this picks the intended file.
t.insert_rows([[video_filepaths[1]]], columns=['video'])

We loaded frames of an intersection in Bangkok.

In [None]:
# Show one row: the frame, its sequence number, and the frame's width/height
# attributes.
t[t.frame, t.frame_idx, t.frame.width, t.frame.height].show(1)

The same with detections:

In [None]:
# Apply the detector to the `frame` column as part of the query and show one
# row; nothing is stored in the table yet.
t[t.frame, yolov7(t.frame)].show(1)

# Tracking

For tracking, we call the `sort_track` function with `group_by=t.video`, which states that we want to track per video; the first argument, `t.frame_idx`, supplies the frame index (chronological) order.

In [None]:
# Preview tracking on one row: the frame, its detections, and the tracker
# output computed from those detections, grouped per video.
tracking_preview = t[
    t.frame,
    yolov7(t.frame),
    sort_track(t.frame_idx, yolov7(t.frame), group_by=t.video),
]
tracking_preview.show(1)

We're happy with the result and decide to add detections and tracking results to the table. Running `yolov7()` is an expensive operation and adding it as a computed column caches the detections and makes them available as part of the stored table data.

In [None]:
# Store the detector output in a computed column `detections`, so later queries
# read it from the table instead of calling yolov7() again.
t.add_column(pt.Column('detections', computed_with=yolov7(t.frame)))

In [None]:
# Store the tracker output in a computed column `tracked_objs`; it is computed
# from the stored `detections` column, grouped per video.
t.add_column(pt.Column('tracked_objs', computed_with=sort_track(t.frame_idx, t.detections, group_by=t.video)))

In [None]:
# Show two rows with the frame next to both newly added columns.
t[t.frame, t.detections, t.tracked_objs].show(2)

The `tracked_objs` column contains JSON, and we can use standard JSON path expressions to select whichever elements are needed:

In [None]:
# JSON path on `tracked_objs`: `['*'].bbox` presumably selects the `bbox` field
# of every tracked object in the row — confirm against the tracker's output.
t[t.frame, t.detections, t.tracked_objs['*'].bbox].show(2)

# Visualization

We'll now use the previously created function `track_viz` to visualize the tracking data. As with the `sort_track` function, this is also a windowed function that creates a new image for every frame of a video, in chronological order.

In [None]:
# Preview the visualization on two rows: the original frame next to the image
# produced by `track_viz` from the tracked objects' bounding boxes and ids.
viz_preview = t[
    t.frame,
    track_viz(
        t.frame_idx,
        t.frame,
        t.tracked_objs['*'].bbox,
        t.tracked_objs['*'].id,
        group_by=t.video,
    ),
]
viz_preview.show(2)

We're happy with the result, and again we add it to the table:

In [None]:
# Define the `tracking_viz` computed column (the per-frame visualization,
# grouped per video) and add it to the table.
tracking_viz_col = pt.Column(
    'tracking_viz',
    computed_with=track_viz(
        t.frame_idx,
        t.frame,
        t.tracked_objs['*'].bbox,
        t.tracked_objs['*'].id,
        group_by=t.video,
    ),
)
t.add_column(tracking_viz_col)

In [None]:
# Show two rows: the original frame beside its stored visualization.
t[t.frame, t.tracking_viz].show(2)

# Reassembling frames into a video

Now that we've convinced ourselves that detection and visualization work as expected, let's apply them at the full frame rate and turn the visualizations back into a video.

In [None]:
# Drop the `data` table so the next cell rebuilds it from scratch.
# `ignore_errors=True` presumably keeps this from failing when the table does
# not exist yet — confirm. Skip this cell to reuse an existing table.
videos.drop_table('data', ignore_errors=True)

In [None]:
# Reuse the `data` table if it already exists; otherwise build it, add the
# computed columns, and load every demo video.
try:
    t = videos.get_table('data')
except Exception:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed and turned into a full table rebuild.
    # NOTE(review): narrow this further to the specific "unknown table" error
    # raised by get_table() once that exception type is confirmed.
    cols = [
        pt.Column('video', pt.VideoType(), nullable=False),
        # NOTE(review): indexed=True is presumably what enables the
        # `t.frame.matches(...)` query used later — confirm.
        pt.Column('frame', pt.ImageType(), nullable=False, indexed=True),
        pt.Column('frame_idx', pt.IntType(), nullable=False),
    ]
    # NOTE(review): extracted_fps=0 presumably means "extract at the video's
    # full frame rate" (the earlier test table used 1 fps) — confirm.
    t = videos.create_table(
        'data', cols,
        extract_frames_from='video', extracted_frame_col='frame', extracted_frame_idx_col='frame_idx',
        extracted_fps=0)

    # Same three computed columns as on the test table: detections, then
    # tracking over the detections, then the visualization of the tracks.
    t.add_column(pt.Column('detections', computed_with=yolov7(t.frame)))
    t.add_column(
        pt.Column(
            'tracked_objs',
            computed_with=sort_track(t.frame_idx, t.detections, group_by=t.video)))
    t.add_column(
        pt.Column(
            'tracking_viz',
            computed_with=track_viz(t.frame_idx, t.frame, t.tracked_objs['*'].bbox, t.tracked_objs['*'].id,
                                    group_by=t.video)))

    # Insert one row per video file; the computed columns are already defined
    # at this point, before any data is loaded.
    t.insert_rows([[p] for p in video_filepaths], columns=['video'])

In [None]:
# Display the table description as the cell output.
t.describe()

In [None]:
# Filter to rows whose frame index is 200, then show the frame and its
# visualization for up to 20 matching rows.
t[t.frame_idx == 200][t.frame, t.tracking_viz].show(20)

In [None]:
# Filter frames with `matches('cat')` and show two results.
# NOTE(review): presumably a text-to-image similarity search that relies on the
# `frame` column having been created with indexed=True — confirm.
t[t.frame.matches('cat')][t.frame_idx, t.frame].show(2)

In [None]:
# Reassemble the visualization frames with `pt.make_video`, grouped by source
# video, and keep the query result in `result`.
result = t[pt.make_video(t.frame_idx, t.tracking_viz)].group_by(t.video).show()

In [None]:
# Display the result of the make_video query as the cell output.
result