1. Install the Vertex AI SDK: Open a terminal window and enter the command below. You can also [install it in a virtualenv](https://googleapis.dev/python/aiplatform/latest/index.html)

In [None]:
!pip install --upgrade google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.74.0-py2.py3-none-any.whl.metadata (31 kB)
Downloading google_cloud_aiplatform-1.74.0-py2.py3-none-any.whl (6.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-aiplatform
  Attempting uninstall: google-cloud-aiplatform
    Found existing installation: google-cloud-aiplatform 1.71.1
    Uninstalling google-cloud-aiplatform-1.71.1:
      Successfully uninstalled google-cloud-aiplatform-1.71.1
Successfully installed google-cloud-aiplatform-1.74.0


In [56]:
# Define ground truth from 10 minute segment
ground_truth = [
{"end_time": "00:05", "play_type": "non_play", "start_time": "00:00"},
{"end_time": "00:31", "play_type": "non_play", "start_time": "00:05"},
{"end_time": "00:42", "play_type": "non_play", "start_time": "00:31"},
{"end_time": "00:46", "play_type": "non_play", "start_time": "00:42"},
{"end_time": "01:16", "play_type": "non_play", "start_time": "00:46"},
{"end_time": "01:46", "play_type": "non_play", "start_time": "01:16"},
{"end_time": "02:16", "play_type": "non_play", "start_time": "01:46"},
{"end_time": "02:29", "play_type": "non_play", "start_time": "02:16"},
{"end_time": "02:55", "play_type": "non_play", "start_time": "02:29"},
{"end_time": "03:32", "play_type": "non_play", "start_time": "02:55"},
{"end_time": "03:53", "play_type": "kickoff", "start_time": "03:32"},
{"end_time": "03:58", "play_type": "kickoff_return", "start_time": "03:53"},
{"end_time": "04:56", "play_type": "penalty", "start_time": "03:58"},
{"end_time": "05:34", "play_type": "pass", "start_time": "04:56"},
{"end_time": "06:15", "play_type": "pass", "start_time": "05:34"},
{"end_time": "06:48", "play_type": "pass", "start_time": "06:15"},
{"end_time": "07:20", "play_type": "penalty", "start_time": "06:48"},
{"end_time": "07:29", "play_type": "punt", "start_time": "07:20"},
{"end_time": "07:35", "play_type": "punt_return", "start_time": "07:29"},
{"end_time": "07:59", "play_type": "penalty", "start_time": "07:35"},
{"end_time": "08:49", "play_type": "non_play", "start_time": "07:59"},
{"end_time": "09:30", "play_type": "handoff", "start_time": "08:49"},
{"end_time": "10:10", "play_type": "pass", "start_time": "09:30"},
{"end_time": "10:37", "play_type": "pass", "start_time": "10:10"}
]

In [66]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting


def generate():
    vertexai.init(project="cloud-llm-preview1", location="us-central1")
    model = GenerativeModel(
        "gemini-1.5-pro-002",
        system_instruction=[textsi_1]
    )
    response = model.generate_content(
        [video1, """Extract moments from the video."""],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )

    return response

video1 = Part.from_uri(
    mime_type="video/mp4",
    uri="gs://lukasgeiger-fubo-demo-data/LIVE_97161119_TEN_MINUTES.mp4",
)
textsi_1 = """
Your job is to extract segments from the sport video.
Analyze the entire video.
Be as precise and accurate as possible.

Sport Type: American Football

If you are unsure of a segment's play_type, choose 'unsure' as play_type.

**Non-Play Segments:**
   - Non-play segments are advertisements breaks, promotional content, and replays.

**Format the Output:**
The output should be a list of JSON objects where each object represents a key moment, with the following fields:
     - **start_time:** Timecode (MM:SS format) indicating when the event begins in the video.
     - **end_time:** Timecode (MM:SS format) indicating when the event ends in the video.
     - **play_type:** The type of play as per the external data, which should be one of the following:
       ['rush', 'pass', 'punt', 'handoff', 'field_goal', 'penalty', 'extra_point', 'kickoff', 'kickoff_return', 'punt_return', 'conversion', 'non_play', 'touchdown', 'unsure']

**Example Output Format:**
[
  {
    "start_time": "01:15",
    "end_time": "01:45",
    "play_type": "rush"
  },
  {
    "start_time": "02:30",
    "end_time": "03:00",
    "play_type": "pass"
  },
  {
    "start_time": "10:20",
    "end_time": "10:40",
    "play_type": "non_play"
  }
]"""

response_schema = {
    "type": "ARRAY",
    "items":{
        "type": "OBJECT",
        "properties": {
            "start_time": {
              "type": "STRING",
            },
            "end_time": {
              "type": "STRING",
            },
            "play_type": {
              "type": "STRING",
            }
          },
          "required": [
            "start_time",
            "end_time",
            "play_type"
          ],
    }
}

generation_config = {
    "max_output_tokens": 8192,
    "temperature": 0,
    "top_p": 0.95,
    "response_mime_type": "application/json",
    "response_schema": response_schema,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
]

response = generate()
print(response.text)

[{"end_time": "00:05", "play_type": "non_play", "start_time": "00:00"},
{"end_time": "00:31", "play_type": "non_play", "start_time": "00:05"},
{"end_time": "00:42", "play_type": "non_play", "start_time": "00:31"},
{"end_time": "00:46", "play_type": "non_play", "start_time": "00:42"},
{"end_time": "01:16", "play_type": "non_play", "start_time": "00:46"},
{"end_time": "01:46", "play_type": "non_play", "start_time": "01:16"},
{"end_time": "02:16", "play_type": "non_play", "start_time": "01:46"},
{"end_time": "02:29", "play_type": "non_play", "start_time": "02:16"},
{"end_time": "03:59", "play_type": "kickoff_return", "start_time": "02:57"},
{"end_time": "05:00", "play_type": "pass", "start_time": "04:55"},
{"end_time": "05:07", "play_type": "pass", "start_time": "05:00"},
{"end_time": "06:06", "play_type": "pass", "start_time": "05:32"},
{"end_time": "07:17", "play_type": "punt", "start_time": "06:06"},
{"end_time": "07:24", "play_type": "punt", "start_time": "07:17"},
{"end_time": "07:46

In [67]:
import pandas as pd
import json

def time_to_seconds(time_str):
    """Converts a time string in the format MM:SS to seconds."""
    if not time_str:
        return 0
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

def calculate_iou(start_time1, end_time1, start_time2, end_time2):
    """Calculates the Intersection over Union (IoU) of two time intervals."""
    # IoU = (Intersection of the two intervals) / (Union of the two intervals)
    # An IoU of 0 means there's no overlap at all.
    # An IoU of 1 means the two intervals are identical.

    start1 = time_to_seconds(start_time1)
    end1 = time_to_seconds(end_time1)
    start2 = time_to_seconds(start_time2)
    end2 = time_to_seconds(end_time2)

    # Determine the intersection
    intersection_start = max(start1, start2)
    intersection_end = min(end1, end2)

    if intersection_end <= intersection_start:
        return 0.0  # No overlap

    intersection = intersection_end - intersection_start

    # Determine the union
    union = (end1 - start1) + (end2 - start2) - intersection

    return intersection / union if union > 0 else 0.0

def evaluate_play_extraction(llm_response, grounded_truth):
    """
    Evaluates the performance of an LLM in extracting plays from American football videos.

    Args:
        llm_response: A list of dictionaries, where each dictionary represents a play
                      extracted by the LLM and contains 'start_time', 'end_time', and 'play_type'.
        grounded_truth: A list of dictionaries, where each dictionary represents the
                       ground truth play and contains 'start_time', 'end_time', and 'play_type'.

    Returns:
        A dictionary containing the accuracy, precision, recall, and tables of correctly
        and incorrectly identified plays.

    Note:
        Can be an incorrect play for two reasons:
          1. IOU is out of bounds
          2. play_type was incorrectly indentified
    """

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    correct_plays = []
    incorrect_plays = []

    matched_gt = [False] * len(grounded_truth)

    for llm_play in llm_response:
        best_iou = 0
        best_gt_match = -1

        for i, gt_play in enumerate(grounded_truth):
            iou = calculate_iou(llm_play['start_time'], llm_play['end_time'], gt_play['start_time'], gt_play['end_time'])

            if iou > best_iou:
                best_iou = iou
                best_gt_match = i

        if best_gt_match != -1 and best_iou >= 0.2:
            if llm_play['play_type'] == grounded_truth[best_gt_match]['play_type']:
                true_positives += 1
                correct_plays.append({
                    'LLM Start Time': llm_play['start_time'],
                    'LLM End Time': llm_play['end_time'],
                    'LLM Play Type': llm_play['play_type'],
                    'GT Start Time': grounded_truth[best_gt_match]['start_time'],
                    'GT End Time': grounded_truth[best_gt_match]['end_time'],
                    'GT Play Type': grounded_truth[best_gt_match]['play_type'],
                    'IoU': best_iou
                })
            else:
                false_positives += 1
                incorrect_plays.append({
                    'LLM Start Time': llm_play['start_time'],
                    'LLM End Time': llm_play['end_time'],
                    'LLM Play Type': llm_play['play_type'],
                    'GT Start Time': grounded_truth[best_gt_match]['start_time'],
                    'GT End Time': grounded_truth[best_gt_match]['end_time'],
                    'GT Play Type': grounded_truth[best_gt_match]['play_type'],
                    'IoU': best_iou
                })
            matched_gt[best_gt_match] = True

        else:
            false_positives +=1
            incorrect_plays.append({
                    'LLM Start Time': llm_play['start_time'],
                    'LLM End Time': llm_play['end_time'],
                    'LLM Play Type': llm_play['play_type'],
                    'GT Start Time': 'N/A',
                    'GT End Time': 'N/A',
                    'GT Play Type': 'N/A',
                    'IoU': 0
                })

    for i in range(len(grounded_truth)):
      if not matched_gt[i]:
        false_negatives += 1
        incorrect_plays.append({
            'LLM Start Time': 'N/A',
            'LLM End Time': 'N/A',
            'LLM Play Type': 'N/A',
            'GT Start Time': grounded_truth[i]['start_time'],
            'GT End Time': grounded_truth[i]['end_time'],
            'GT Play Type': grounded_truth[i]['play_type'],
            'IoU': 0
        })

    accuracy = (true_positives) / (len(llm_response) + len(grounded_truth) - true_positives) if (len(llm_response) + len(grounded_truth) - true_positives) > 0 else 0.0
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'true_positives': true_positives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'correct_plays': pd.DataFrame(correct_plays),
        'incorrect_plays': pd.DataFrame(incorrect_plays)
    }

results = evaluate_play_extraction(json.loads(response.text), ground_truth)
print(f"Accuracy: {results.get('accuracy')}")
print(f"Precision: {results.get('precision')}")
print(f"Recall: {results.get('recall')}")
print(results.get('correct_plays'))
print(results.get('incorrect_plays'))

Accuracy: 0.3333333333333333
Precision: 0.55
Recall: 0.55
   LLM Start Time LLM End Time LLM Play Type GT Start Time GT End Time  \
0           00:00        00:05      non_play         00:00       00:05   
1           00:05        00:31      non_play         00:05       00:31   
2           00:31        00:42      non_play         00:31       00:42   
3           00:42        00:46      non_play         00:42       00:46   
4           00:46        01:16      non_play         00:46       01:16   
5           01:16        01:46      non_play         01:16       01:46   
6           01:46        02:16      non_play         01:46       02:16   
7           02:16        02:29      non_play         02:16       02:29   
8           05:32        06:06          pass         05:34       06:15   
9           07:17        07:24          punt         07:20       07:29   
10          10:00        10:18          pass         10:10       10:37   

   GT Play Type       IoU  
0      non_play  1.000000