# Screen Tracking Notebook

<b>Importing Required Libraries</b>

In [14]:
import cv2
from pupil_apriltags import Detector
import numpy as np
import itertools
from scipy.spatial.distance import pdist
import pandas as pd
from tqdm import tqdm
import warnings
from datetime import datetime
import csv
warnings.filterwarnings('ignore')

<b>Initializing the AprilTag detector (tag36h11 family) and the list that collects the per-frame results</b>

In [15]:
# Rows collected by screen_track_single_img (one dict per processed frame);
# turned into a DataFrame and written to CSV further down the notebook.
# NOTE: nothing in this notebook clears the list, so re-running the processing
# cell without re-running this cell appends a second copy of every row.
csv_data = []

# Single shared AprilTag detector for the tag36h11 family; it is called once
# per frame from screen_track_single_img.
at_detector_36h11 = Detector(
    families="tag36h11",
    nthreads=4,
    quad_decimate=1.0,
    quad_sigma=0.0,
    refine_edges=1,
    decode_sharpening=0.25,
    debug=0
)

### Set the video location and the tobii data location

Example:<br>
video_loc = "C:\\Users\\ARL\\GroundingDINO\\app\\app_data_demo\\demo_video.mp4"<br>
tobii_data = pd.read_csv("C:\\Users\\ARL\\GroundingDINO\\app\\app_data_demo\\app_data.csv")

In [16]:
# Source video and the gaze export that goes with it. These are absolute,
# machine-specific paths: edit them before running on another machine.
video_loc = "C:\\Users\\ARL\\GroundingDINO\\app\\app_data_demo\\demo_video.mp4"
# The processing cells read the 'timestamp', 'gaze2d_x' and 'gaze2d_y' columns;
# the gaze values are multiplied by the frame width/height later on.
tobii_data = pd.read_csv("C:\\Users\\ARL\\GroundingDINO\\app\\app_data_demo\\app_data.csv")
# Show the loaded gaze samples.
tobii_data

Unnamed: 0,timestamp,gaze2d_x,gaze2d_y
0,30.636567,0.603551,0.376057
1,30.706718,0.556380,0.376542
2,30.776836,0.517973,0.377112
3,30.836929,0.497531,0.378401
4,30.907079,0.470546,0.384249
...,...,...,...
70,35.305028,0.486544,0.360548
71,35.375106,0.485883,0.357980
72,35.445266,0.488010,0.356730
73,35.505390,0.489435,0.355231


In [17]:
# Output Video Frame Rate - Editable
# Used as the frame rate of the rendered results video and as the step
# (1 / output_frame_rate seconds per processed frame) of 'Output_Video_Timestamp'.
output_frame_rate = 24

<b>Explanation of what the next 2 code cells do : </b>These code cells are used for the following tasks:

1. The `find_screen_containing_point` function takes in the screen results (centres, corners, etc) and the gaze point to check which screen is currently being looked at. It also leverages the `is_point_inside_rectangle` function as a helper function to check if the point is inside a specific bounding box or not. 

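2. Once the screen currently being looked at is found, the `plot_det_april` function, together with its helper functions (`select_tightest_pack`, `calculate_total_distance`), picks one corner from each detected April tag of that screen so that the chosen corners lie as close together as possible (minimum sum of pairwise distances). These are the tag corners closest to the screen (up to 4 points, one per detected tag).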

In [None]:
def calculate_total_distance(points):
    """Return the sum of the pairwise Euclidean distances between ``points``.

    ``points`` is any sequence of coordinate rows accepted by
    ``scipy.spatial.distance.pdist`` (one row per point).
    """
    return np.sum(pdist(points))

def select_tightest_pack(points_list):
    """Pick one point from each group so the chosen points are packed tightest.

    ``points_list`` is a sequence of groups of candidate points. Every
    combination that takes exactly one point per group is scored with
    ``calculate_total_distance``; the combination with the smallest score is
    returned as a tuple (the first one found wins ties). Returns ``None`` when
    no combination scores below infinity.
    """
    best_combo = None
    best_score = float('inf')

    for candidate in itertools.product(*points_list):
        score = calculate_total_distance(candidate)
        # Only a strictly smaller score replaces the current best.
        if not (score < best_score):
            continue
        best_score, best_combo = score, candidate

    return best_combo

def plot_det_april(image, results, label):
    """Select one corner per detected tag so the chosen corners are closest together.

    Parameters
    ----------
    image, label : accepted for interface compatibility; not used here.
    results : iterable of detections, each exposing a 4-element ``corners``
        attribute of (x, y) coordinates.

    Returns
    -------
    The value returned by ``select_tightest_pack``: a tuple holding one
    integer ``(x, y)`` corner per detection, in the order of ``results``.
    """
    def to_pixel(corner):
        # Truncate the sub-pixel corner position to integer pixel coordinates.
        return (int(corner[0]), int(corner[1]))

    # Candidate corners of every tag, kept in the order B, C, D, A.
    corners_per_tag = [
        [to_pixel(second), to_pixel(third), to_pixel(fourth), to_pixel(first)]
        for first, second, third, fourth in (detection.corners for detection in results)
    ]

    return select_tightest_pack(corners_per_tag)

In [19]:
def is_point_inside_rectangle(point, rectangle):
    """Return True when ``point`` lies within the axis-aligned bounds of ``rectangle``.

    ``point`` is an ``(x, y)`` pair and ``rectangle`` a 2-D NumPy array whose
    first two columns are x and y. The bounds are the min/max of each column
    and are inclusive.
    """
    px, py = point
    xs = rectangle[:, 0]
    ys = rectangle[:, 1]

    within_x = np.min(xs) <= px <= np.max(xs)
    within_y = np.min(ys) <= py <= np.max(ys)
    return bool(within_x and within_y)

def find_screen_containing_point(screens, point):
    """Return the index of the first screen whose tag centres enclose ``point``.

    ``screens`` is a sequence of detection lists (one list per screen); each
    detection exposes a ``center`` attribute. For every non-empty list, the
    axis-aligned bounding rectangle of the tag centres is built and tested
    with ``is_point_inside_rectangle``. Returns ``None`` when no screen matches.
    """
    for idx, detections in enumerate(screens):
        if len(detections) == 0:
            continue

        tag_centers = np.array([det.center for det in detections])
        xs, ys = tag_centers[:, 0], tag_centers[:, 1]
        left, right = np.min(xs), np.max(xs)
        top, bottom = np.min(ys), np.max(ys)
        bounding_rectangle = np.array(
            [[left, top], [right, top], [left, bottom], [right, bottom]]
        )

        if is_point_inside_rectangle(point, bounding_rectangle):
            return idx

    return None

<b>The `screen_track_single_img` function is the main per-frame function: it is called once for every frame extracted from the video.</b>

In [20]:
def screen_track_single_img(image, gaze2d, timestamp, op_ts):
    """Detect AprilTags in one frame and record which screen the gaze falls on.

    Parameters
    ----------
    image : colour frame (height x width x channels) as read by OpenCV.
    gaze2d : ``(x, y)`` gaze position as fractions of the frame width/height,
        or ``None`` to skip the frame without recording anything.
    timestamp : timestamp of the gaze sample; stored unchanged.
    op_ts : timestamp of this frame in the output video; stored unchanged.

    Side effects
    ------------
    Appends exactly one row (dict) to the module-level ``csv_data`` list with
    the keys ``timestamp``, ``Output_Video_Timestamp``, ``gaze2d_x``,
    ``gaze2d_y`` (gaze in pixels), ``Screen`` (1-based index or NaN) and
    ``BL``/``BR``/``TR``/``TL`` (selected tag corners or NaN).

    NOTE(review): the BL/BR/TR/TL slots are filled in the order the detector
    returns the tags, not by geometric position - confirm that this matches
    the physical tag layout.
    """
    if gaze2d is None:
        return

    image_height, image_width, _ = image.shape
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    detections = at_detector_36h11.detect(gray)

    # Tag IDs 1-4 mark screen 1, 5-8 screen 2, 9-12 screen 3 and 13-16 screen 4.
    # A screen keeps whatever subset of its tags was detected in this frame.
    screen_tag_ids = (range(1, 5), range(5, 9), range(9, 13), range(13, 17))
    screens = [
        [detection for detection in detections if detection.tag_id in tag_ids]
        for tag_ids in screen_tag_ids
    ]

    # Convert the normalised gaze position to pixel coordinates.
    point = (gaze2d[0] * image_width, gaze2d[1] * image_height)
    screen_index = find_screen_containing_point(screens, point)

    # Start from the "no screen" row; np.nan is used because the np.NaN alias
    # no longer exists in NumPy 2.0.
    row = {
        'timestamp': timestamp,
        'Output_Video_Timestamp': op_ts,
        'gaze2d_x': point[0],
        'gaze2d_y': point[1],
        'Screen': np.nan,
        'BL': np.nan,
        'BR': np.nan,
        'TR': np.nan,
        'TL': np.nan
    }

    if screen_index is not None:
        min_rectangle_points = plot_det_april(image, screens[screen_index], f"Screen {screen_index+1}")
        # Pad with NaN when fewer than four tags of the screen were detected.
        min_rectangle_points += (np.nan,) * (4 - len(min_rectangle_points))
        row['Screen'] = screen_index + 1
        row['BL'] = min_rectangle_points[0]
        row['BR'] = min_rectangle_points[1]
        row['TR'] = min_rectangle_points[2]
        row['TL'] = min_rectangle_points[3]

    csv_data.append(row)

In [21]:
# MAIN FUNCTION
# MAIN FUNCTION
def batch_process_screen_det(video_loc, matched_rows):
    """Run screen tracking on the video frame matching every gaze sample.

    Parameters
    ----------
    video_loc : path of the video file to read.
    matched_rows : DataFrame with the columns ``timestamp`` (seconds),
        ``gaze2d_x`` and ``gaze2d_y``; one frame is processed per row.

    Rows with a missing timestamp, or whose frame cannot be read, are skipped.
    Each processed frame is passed to ``screen_track_single_img`` together
    with an output-video timestamp that advances by ``1 / output_frame_rate``.

    Raises
    ------
    IOError
        If the video cannot be opened (previously this silently produced no
        results).
    """
    cap = cv2.VideoCapture(video_loc)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_loc}")

    total_frames = len(matched_rows)
    pbar = tqdm(total=total_frames, desc='Processing Frames')
    try:
        # The source frame rate does not change while iterating; query it once.
        fps = cap.get(cv2.CAP_PROP_FPS)
        ti = 1 / output_frame_rate
        op_ts = 0.0
        for _, row in matched_rows.iterrows():
            timestamp = row['timestamp']
            if pd.notna(timestamp):
                # Seek to the frame that contains this gaze timestamp.
                frame_number = int(timestamp * fps)
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
                ret, frame = cap.read()
                if ret:
                    gaze2d = (row['gaze2d_x'], row['gaze2d_y'])
                    screen_track_single_img(frame, gaze2d, timestamp, op_ts)
                    # Only successfully processed frames advance the output clock.
                    op_ts += ti
            pbar.update(1)
    finally:
        # Release the progress bar and the capture even if a frame fails.
        pbar.close()
        cap.release()

In [22]:
# Process one video frame per gaze sample; the results accumulate in csv_data.
batch_process_screen_det(video_loc, tobii_data)    # Processing started

Processing Frames:   0%|          | 0/75 [00:00<?, ?it/s]

Processing Frames: 100%|██████████| 75/75 [00:07<00:00, 10.33it/s]


In [23]:
# Results to CSV
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
csv_df = pd.DataFrame(csv_data)
csv_df.to_csv(f"screen_tracking_results_{current_datetime}.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

## Visualizer

In [24]:
# Show the per-frame results table.
csv_df

Unnamed: 0,timestamp,Output Video Timestamp,gaze2d_x,gaze2d_y,Screen,BL,BR,TR,TL
0,30.636567,0.000000,1158.817056,406.141949,2.0,"(829, 211)","(1392, 196)","(1385, 519)","(834, 517)"
1,30.706718,0.041667,1068.249408,406.665328,2.0,"(830, 211)","(1394, 196)","(1386, 519)","(835, 517)"
2,30.776836,0.083333,994.508851,407.281414,2.0,"(833, 212)","(1397, 196)","(1390, 519)","(839, 518)"
3,30.836929,0.125000,955.259232,408.673318,2.0,"(829, 212)","(1393, 197)","(1385, 519)","(834, 518)"
4,30.907079,0.166667,903.448397,414.988985,2.0,"(828, 212)","(1390, 197)","(1383, 519)","(832, 518)"
...,...,...,...,...,...,...,...,...,...
70,35.305028,2.916667,934.163520,389.391624,2.0,"(631, 286)","(1169, 301)","(1167, 600)","(628, 597)"
71,35.375106,2.958333,932.895072,386.618508,2.0,"(631, 285)","(1170, 301)","(1168, 599)","(628, 596)"
72,35.445266,3.000000,936.978950,385.267936,2.0,"(632, 285)","(1171, 301)","(1168, 599)","(629, 596)"
73,35.505390,3.041667,939.716083,383.649113,2.0,"(633, 285)","(1171, 301)","(1169, 599)","(629, 597)"


In [25]:
# Columns of csv_df that hold the selected tag corners; the visualizer reads
# them to draw the screen outline.
screen_coordinate_columns = ['BL', 'BR', 'TR', 'TL']

<b>The code cell below is used to visualize the screen detection results</b>

In [26]:
# Render the results video: one annotated frame per row of csv_df.
cap = cv2.VideoCapture(video_loc)
if not cap.isOpened():
    cap.release()
    raise IOError(f"Could not open video: {video_loc}")

# The writer must be created with the real frame size of the source video;
# frames of a different size than declared are not written correctly.
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
source_fps = cap.get(cv2.CAP_PROP_FPS)
output_video = cv2.VideoWriter(f"screen_tracking_results_video_{current_datetime}.mp4", cv2.VideoWriter_fourcc(*'mp4v'), output_frame_rate, frame_size)

total_frames = len(csv_df)
pbar = tqdm(total=total_frames, desc='Processing Frames')

try:
    for index, row in csv_df.iterrows():
        timestamp = row['timestamp']
        # Seek exactly as batch_process_screen_det does, so the overlay is
        # drawn on the same frame the tags were detected in.
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(timestamp * source_fps))
        ret, frame = cap.read()

        if not ret:
            break

        # Corner slots that were filled (NaN marks a tag that was not detected).
        screen_coordinates = [row[col] for col in screen_coordinate_columns if not pd.isna(row[col])]
        screen_coordinates = np.array(screen_coordinates, dtype=np.int32)
        screen_number = row['Screen']

        if pd.isna(row['gaze2d_x']) or pd.isna(row['gaze2d_y']):
            gaze_text = 'Gaze not found'
        else:
            gaze_text = 'Gaze Available'
            gaze_m = (int(row['gaze2d_x']), int(row['gaze2d_y']))
            cv2.circle(frame, gaze_m, 15, (255, 0, 0), -1)

            if pd.isna(screen_number):
                screen_text = 'Screen not detected'
            elif len(screen_coordinates) < 4:
                screen_text = f'Not all April tags were detected but predicted screen = {int(screen_number)}'
            else:
                screen_text = f'Screen {int(screen_number)}'
                # Outline the screen with the convex hull of the four corners.
                hull = cv2.convexHull(screen_coordinates, clockwise=True)
                cv2.drawContours(frame, [hull], -1, (0, 255, 0), 2)

            # The screen label is only drawn when a gaze point is available.
            cv2.putText(frame, screen_text, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.putText(frame, gaze_text, (20, 110), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0 ,0), 2)
        output_video.write(frame)
        pbar.update(1)
finally:
    # Always finalise the output file and release the capture. No window is
    # ever opened, so no waitKey/destroyAllWindows calls are needed.
    pbar.close()
    cap.release()
    output_video.release()

Processing Frames: 100%|██████████| 75/75 [00:09<00:00,  7.93it/s]
