In [1]:
! pip install deep-sort-realtime
! pip install ultralytics

Collecting deep-sort-realtime
  Downloading deep_sort_realtime-1.3.2-py3-none-any.whl.metadata (12 kB)
Downloading deep_sort_realtime-1.3.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-sort-realtime
Successfully installed deep-sort-realtime-1.3.2
Collecting ultralytics
  Downloading ultralytics-8.4.2-py3-none-any.whl.metadata (36 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.4.2-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.18-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.4.2 ultralytics-thop-2.0.18


In [2]:
import requests
import cv2
import torch
from ultralytics import YOLO
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import datetime
from deep_sort_realtime.deepsort_tracker import DeepSort # for tracking
import matplotlib.pyplot as plt
from PIL import ImageDraw
import numpy as np

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


Link to Best weights: https://drive.google.com/file/d/1nI8n3Q6yn7c_X6a_iB15ZlUvqq-aWyd_/view?usp=drive_link

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Use a zero shot model for tennis player detection
model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

preprocessor_config.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/689M [00:00<?, ?B/s]

In [5]:
# Use the customer trained model for tennis ball detection
weights = "best_instructor" # "best_bob"
yolo_model = YOLO (f"/content/drive/MyDrive/datasetyolo/weights/{weights}.pt")

In [6]:
def ball_detections (image):
  results = yolo_model (image, conf=0.01)
  results_for_image = results[0]
  num_detections = len(results_for_image.boxes)
  bbox = None
  if num_detections > 0:
    for i, detection in enumerate(results_for_image.boxes):
      bbox = detection.xyxy[0].cpu().numpy()
      break
  return bbox

In [7]:
def player_detections (frame):
  image = Image.fromarray(frame)

  text_labels = ["person"] # ["tennis player"]
  inputs = processor(
    images = image.convert("RGB"),
    text = text_labels,
    return_tensors = "pt").to(device)
  with torch.no_grad():
    outputs = model (**inputs)

  results = processor.post_process_grounded_object_detection (
    outputs,
    inputs.input_ids,
    text_threshold = 0.3,
    target_sizes = [image.size[::-1]]
  )

  result = results[0]
  preds = []
  for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
    if labels in text_labels:
      box = [int(x) for x in box.tolist()]
      preds.append([[box[0], box[1], box[2] - box[0], box[3] - box[1]], score.item(), 1])

  return preds


In [8]:
confidence = 0.01
weights = "best_instructor" # "best_bob"

yolo_model = YOLO (f"/content/drive/MyDrive/datasetyolo/weights/{weights}.pt")
input_video_path  = f"/content/drive/MyDrive/tennis-videos/tennis_video.mp4"
output_video_path = f"/content/drive/MyDrive/tennis-videos/output_video_s7_{weights}.mp4"

video_cap = cv2.VideoCapture (input_video_path)

tracker = DeepSort (max_age=100)

frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(video_cap.get(cv2.CAP_PROP_FPS))
print(f"total_frames = {total_frames}")
print(f"fps = {fps}")
print(f"frame width = {frame_width}")
print(f"frame height = {frame_height}")

fourcc = cv2.VideoWriter_fourcc (*'mp4v')
# fps = 1  # Slow the output video down for easier analysis
writer = cv2.VideoWriter (output_video_path, fourcc, fps, (frame_width, frame_height))


total_frames = 214
fps = 30
frame width = 1920
frame height = 1080


In [9]:
tracking = []

while video_cap.isOpened():

    start = datetime.datetime.now()
    ret, frame = video_cap.read()
    if not ret: break

    player_bbox = player_detections (frame)
    tracks = tracker.update_tracks(player_bbox, frame=frame)

    # used to determine which persons moved the most during the match (the two players)
    # this enables the removal of the persons who are not moving very much (line judge, etc)
    frame_tracking = []

    if len(player_bbox) > 0:

      for track in tracks:
        if not track.is_confirmed(): continue

        track_id = track.track_id
        ltrb = track.to_ltrb() # bbox coords
        xmin, ymin, xmax, ymax = int(ltrb[0]), int(ltrb[1]), int(ltrb[2]), int(ltrb[3])
        width = xmax - xmin
        frame_tracking.append([xmin + int(width / 2), ymax, track_id])

        # Player bboxes and labels
        cv2.rectangle (frame, (xmin, ymin), (xmax, ymax), (10, 255, 0), 2)
        cv2.putText (frame, "player_" + str(track_id), (xmin, ymin - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
        # Extra annotation
        cv2.putText (frame, f"MODEL WEIGHTS: {weights}", (300, 300), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 2)
        cv2.putText (frame, f"CONFIDENCE LEVEL: {confidence}", (300, 325), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 2)
        cv2.putText (frame, f"PLAYERS FOUND: {len(player_bbox)}", (300, 350), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 2)

      writer.write(frame)
      end = datetime.datetime.now()
      print(f"Time to process 1 frame: {(end - start).total_seconds() * 1000:.0f} milliseconds")

    tracking.append(frame_tracking)

video_cap.release()
writer.release()
# cv2.destroyAllWindows()




Time to process 1 frame: 6668 milliseconds
Time to process 1 frame: 733 milliseconds
Time to process 1 frame: 3287 milliseconds
Time to process 1 frame: 3317 milliseconds
Time to process 1 frame: 4629 milliseconds
Time to process 1 frame: 718 milliseconds
Time to process 1 frame: 642 milliseconds
Time to process 1 frame: 656 milliseconds
Time to process 1 frame: 649 milliseconds
Time to process 1 frame: 644 milliseconds
Time to process 1 frame: 646 milliseconds
Time to process 1 frame: 663 milliseconds
Time to process 1 frame: 651 milliseconds
Time to process 1 frame: 652 milliseconds
Time to process 1 frame: 648 milliseconds
Time to process 1 frame: 661 milliseconds
Time to process 1 frame: 655 milliseconds
Time to process 1 frame: 654 milliseconds
Time to process 1 frame: 688 milliseconds
Time to process 1 frame: 718 milliseconds
Time to process 1 frame: 755 milliseconds
Time to process 1 frame: 837 milliseconds
Time to process 1 frame: 3635 milliseconds
Time to process 1 frame: 762 

In [11]:
# tracking

# Filter out the non-player detections

In [12]:
def euclid_dist (p1, p2):
  # return np.sqrt((pos1[0] - pos2[0])**2 + (pos1[1] - pos2[1])**2)
  return np.linalg.norm(p1 - p2)

In [13]:
player_ids = []
player_pos = {}
player_dist = {}

for frame in tracking:
  for player in frame:
    player_id = int(player[2])
    if player_id not in player_ids:
      player_pos[player_id] = np.array([player[0], player[1]])
      player_dist[player_id] = 0
      player_ids.append(player_id)
    else:
      player_dist[player_id] = euclid_dist(player_pos[player_id], np.array([player[0], player[1]])) + player_dist[player_id]
      player_pos[player_id] = np.array([player[0], player[1]])

print(player_dist)

{1: np.float64(1540.472657794045), 2: np.float64(1018.820560213182), 3: np.float64(7.0), 4: np.float64(17.0), 5: np.float64(39.0), 6: np.float64(45.82842712474619), 7: np.float64(7.0), 8: np.float64(66.24264068711928), 9: np.float64(18.0), 10: np.float64(30.82842712474619), 11: np.float64(226.56024609923625), 12: np.float64(14.414213562373096), 17: np.float64(28.0), 19: np.float64(21.414213562373096), 23: np.float64(16.99070478491457), 28: np.float64(10.0), 29: np.float64(32.071067811865476), 31: np.float64(16.0), 32: np.float64(16.0), 49: np.float64(10.0), 51: np.float64(12.0), 52: np.float64(15.242640687119287), 53: np.float64(51.071067811865476), 55: np.float64(9.16227766016838), 56: np.float64(6.0), 57: np.float64(3.0), 59: np.float64(14.0), 60: np.float64(5.414213562373095), 62: np.float64(0.0), 63: np.float64(12.0), 64: np.float64(6.0), 65: np.float64(14.414213562373096), 66: np.float64(15.0), 67: np.float64(13.82842712474619), 68: np.float64(14.0), 69: np.float64(2.0), 70: np.fl

In [14]:
sorted_player_dist = dict(sorted(player_dist.items(), key=lambda item: item[1], reverse=True))
print(sorted_player_dist)

{1: np.float64(1540.472657794045), 2: np.float64(1018.820560213182), 11: np.float64(226.56024609923625), 86: np.float64(134.26496277221963), 8: np.float64(66.24264068711928), 53: np.float64(51.071067811865476), 6: np.float64(45.82842712474619), 5: np.float64(39.0), 29: np.float64(32.071067811865476), 10: np.float64(30.82842712474619), 17: np.float64(28.0), 81: np.float64(22.886349517372675), 19: np.float64(21.414213562373096), 9: np.float64(18.0), 74: np.float64(17.414213562373096), 4: np.float64(17.0), 23: np.float64(16.99070478491457), 95: np.float64(16.51323307596588), 31: np.float64(16.0), 32: np.float64(16.0), 75: np.float64(15.99070478491457), 52: np.float64(15.242640687119287), 66: np.float64(15.0), 12: np.float64(14.414213562373096), 65: np.float64(14.414213562373096), 59: np.float64(14.0), 68: np.float64(14.0), 67: np.float64(13.82842712474619), 88: np.float64(12.23606797749979), 51: np.float64(12.0), 63: np.float64(12.0), 28: np.float64(10.0), 49: np.float64(10.0), 55: np.flo

In [15]:
actual_players = [list(sorted_player_dist.items())[0][0],
                  list(sorted_player_dist.items())[1][0]]
print(actual_players)

[1, 2]


In [16]:
tracking_updated = []

for frame in tracking:
  tr = []
  for player in frame:
    if int(player[2]) in actual_players: # player_id
      tr.append(player)
  tracking_updated.append(tr)

print(tracking_updated)

[[], [], [[566, 930, '1'], [1069, 308, '2']], [[578, 929, '1'], [1070, 308, '2']], [[588, 929, '1'], [1069, 307, '2']], [[599, 928, '1'], [1069, 307, '2']], [[616, 928, '1'], [1067, 306, '2']], [[629, 928, '1'], [1067, 305, '2']], [[650, 928, '1'], [1067, 310, '2']], [[663, 928, '1'], [1068, 316, '2']], [[674, 928, '1'], [1070, 318, '2']], [[678, 927, '1'], [1072, 319, '2']], [[682, 926, '1'], [1072, 319, '2']], [[685, 923, '1'], [1073, 320, '2']], [[697, 918, '1'], [1073, 320, '2']], [[711, 910, '1'], [1073, 319, '2']], [[719, 907, '1'], [1074, 318, '2']], [[732, 906, '1'], [1075, 315, '2']], [[749, 906, '1'], [1076, 310, '2']], [[761, 905, '1'], [1076, 308, '2']], [[772, 909, '1'], [1077, 308, '2']], [[783, 911, '1'], [1078, 308, '2']], [[788, 912, '1'], [1078, 308, '2']], [[799, 911, '1'], [1077, 307, '2']], [[811, 913, '1'], [1074, 308, '2']], [[820, 912, '1'], [1072, 307, '2']], [[831, 913, '1'], [1065, 306, '2']], [[844, 915, '1'], [1058, 305, '2']], [[852, 917, '1'], [1054, 309,

#### Tennis court: 78 feet long, 36 feet wide

In [17]:
xmin_proj, ymin_proj, xmax_proj, ymax_proj = 1482, 125, 1865, 956
# xmin_pitch, ymin_pitch, xmax_pitch, ymax_pitch = 581, 184, 1745,991
xmin_pitch, ymin_pitch, xmax_pitch, ymax_pitch = 645, 254, 1582, 845

width_proj = xmax_proj - xmin_proj
height_proj = ymax_proj - ymin_proj

radius = 7
color = (0, 0, 0)
thickness = 3

# Define the tennis court 2D lines

######### vertical lines
line_11 = (xmin_proj, ymin_proj)
line_12 = (xmin_proj, ymax_proj)

line_21 = (xmin_proj + int((4.5 / 36) * width_proj), ymin_proj)
line_22 = (xmin_proj + int((4.5 / 36) * width_proj), ymax_proj)

line_31 = (xmin_proj + int((18 / 36) * width_proj), ymin_proj + int((18 / 78) * height_proj))
line_32 = (xmin_proj + int((18 / 36) * width_proj), ymax_proj - int((18 / 78)  *height_proj))

line_41 = (xmin_proj + int((31.5 / 36) * width_proj), ymin_proj)
line_42 = (xmin_proj + int((31.5 / 36) * width_proj), ymax_proj)

line_51 = (xmin_proj + width_proj, ymin_proj)
line_52 = (xmin_proj + width_proj, ymax_proj)

######### horizontal lines
line_61 = (xmin_proj, ymin_proj)
line_62 = (xmax_proj, ymin_proj)

line_71 = (xmin_proj + int(( 4.5 / 36) * width_proj), ymin_proj + int((18 / 78) * height_proj))
line_72 = (xmin_proj + int((31.5 / 36) * width_proj), ymin_proj + int((18 / 78) * height_proj))

line_81 = (xmin_proj, ymin_proj + int((39 / 78) * height_proj))
line_82 = (xmax_proj, ymin_proj + int((39 / 78) * height_proj))

line_91 = (xmin_proj + int(( 4.5 / 36) * width_proj), ymax_proj - int((18 / 78) * height_proj))
line_92 = (xmin_proj + int((31.5 / 36) * width_proj), ymax_proj - int((18 / 78) * height_proj))

line_101 = (xmin_proj, ymax_proj)
line_102 = (xmax_proj, ymax_proj)

# Define line color and thickness
line_color = (255, 255, 255)
line_thickness = 2
alpha = 0.79  # Transparency factor


In [18]:
def draw_board (image, player_pos_1, player_pos_2, ball_pos):

  overlay = image.copy()
  lamda_x = 50
  lamda_y = 200
  cv2.rectangle(overlay, (xmin_proj - lamda_x, ymin_proj - lamda_y), (xmax_proj + lamda_x, ymax_proj + lamda_y), (0, 219, 0), -1)

  # Draw the court lines on the image
  cv2.line(overlay, line_11, line_12, line_color, line_thickness)
  cv2.line(overlay, line_21, line_22, line_color, line_thickness)
  cv2.line(overlay, line_31, line_32, line_color, line_thickness)
  cv2.line(overlay, line_41, line_42, line_color, line_thickness)
  cv2.line(overlay, line_51, line_52, line_color, line_thickness)
  cv2.line(overlay, line_61, line_62, line_color, line_thickness)
  cv2.line(overlay, line_71, line_72, line_color, line_thickness)
  cv2.line(overlay, line_81, line_82, line_color, line_thickness)
  cv2.line(overlay, line_91, line_92, line_color, line_thickness)
  cv2.line(overlay, line_101, line_102, line_color, line_thickness)

  # Players
  cv2.circle(overlay, (player_pos_1[0], player_pos_1[1]), radius, color, thickness)
  cv2.circle(overlay, (player_pos_2[0], player_pos_2[1]), radius, color, thickness)

  # Ball
  cv2.circle(overlay, (ball_pos[0], ball_pos[1]), radius - 5, line_color, thickness + 2)

  # Following line overlays transparent rectangle over the image
  image_new = cv2.addWeighted (overlay, alpha, image, 1 - alpha, 0)
  return image_new


In [19]:
# Homography

In [20]:
# pitch = [[xmin_pitch, ymin_pitch], [1391, ymin_pitch], [xmax_pitch, ymax_pitch], [198, 1008]]
# projs = [[xmin_proj, ymin_proj], [xmax_proj, ymin_proj], [xmax_proj, ymax_proj], [xmin_proj, ymax_proj]]

pitch = [[xmin_pitch, ymin_pitch], [1267, ymin_pitch], [xmax_pitch, ymax_pitch], [333, ymax_pitch]]
projs = [[xmin_proj, ymin_proj], [xmax_proj, ymin_proj], [xmax_proj, ymax_proj], [xmin_proj, ymax_proj]]

In [21]:
matrix, _ = cv2.findHomography (np.float32(pitch), np.float32(projs), method=0)

In [22]:
print(matrix)

[[     1.0864      5.0336      635.56]
 [-5.0422e-16      5.3579     -1140.4]
 [-6.3777e-19   0.0030094           1]]


In [23]:
def get_projections (matrix, kps):
  trans_kps = cv2.perspectiveTransform(np.float32([kps]).reshape(-1, 1, 2), matrix)
  return trans_kps[0][0].astype(int).tolist()

In [24]:
get_projections (matrix, [1266, 850])

[1767, 959]

In [25]:
input_video_path = "/content/drive/MyDrive/tennis-videos/tennis_video.mp4" ## CHANGE THIS
output_video_path = f"/content/drive/MyDrive/tennis-videos/output_video_s7h_{weights}.mp4"

video_cap = cv2.VideoCapture (input_video_path)

tracker = DeepSort(max_age=100) # track to identify the real players

frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(video_cap.get(cv2.CAP_PROP_FPS))

fourcc = cv2.VideoWriter_fourcc (*'mp4v')
writer = cv2.VideoWriter (output_video_path, fourcc, fps, (frame_width, frame_height))


In [26]:
tracking = []
k = 0

while video_cap.isOpened():

    start = datetime.datetime.now()
    ret, frame = video_cap.read()
    if not ret: break
    player_pos = []

    if len(tracking_updated[k]) > 0:

      for pl in tracking_updated[k]:
        xmin, ymin = pl[0], pl[1]
        cv2.rectangle(frame, (xmin, ymin), (xmin+2, ymin+2), (10, 255, 0), 2)
        cv2.putText(frame, "player_" + pl[2], (xmin, ymin-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
        player_pos.append([xmin, ymin])

      ball_pos = [0, 0]
      ball_bbox = ball_detections (frame) # Call the ball detection model

      if ball_bbox is not None:
        xmin_ball = int(ball_bbox[0])
        ymin_ball = int(ball_bbox[1])
        xmax_ball = int(ball_bbox[2])
        ymax_ball = int(ball_bbox[3])
        width = xmax_ball - xmin_ball
        height = ymax_ball - ymin_ball

        ball_pos = [xmin_ball + int(width/2), ymin_ball + int(height/2)]
        cv2.rectangle(frame, (xmin_ball, ymin_ball), (xmax_ball, ymax_ball), (255, 0, 0), 2)
        cv2.putText(frame, "ball", (xmin_ball, ymin_ball-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

      # Draw the 2D homograph
      frame = draw_board (
          frame,
          get_projections(matrix, player_pos[0]),
          get_projections(matrix, player_pos[1]),
          get_projections(matrix, ball_pos))

      writer.write(frame)
      end = datetime.datetime.now()
      print(f"Time to process 1 frame: {(end - start).total_seconds() * 1000:.0f} miliseconds")

    k += 1

video_cap.release()
writer.release()
# cv2.destroyAllWindows()



0: 480x800 7 tennis balls, 79.0ms
Speed: 7.5ms preprocess, 79.0ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 800)
Time to process 1 frame: 1446 miliseconds

0: 480x800 8 tennis balls, 51.8ms
Speed: 5.0ms preprocess, 51.8ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 800)
Time to process 1 frame: 89 miliseconds

0: 480x800 7 tennis balls, 53.0ms
Speed: 4.7ms preprocess, 53.0ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 800)
Time to process 1 frame: 86 miliseconds

0: 480x800 8 tennis balls, 52.7ms
Speed: 4.4ms preprocess, 52.7ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 800)
Time to process 1 frame: 83 miliseconds

0: 480x800 8 tennis balls, 51.3ms
Speed: 4.8ms preprocess, 51.3ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 800)
Time to process 1 frame: 82 miliseconds

0: 480x800 9 tennis balls, 52.3ms
Speed: 5.8ms preprocess, 52.3ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 800)
Time t