In [2]:
import torch
import cv2
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import subprocess
from ultralytics import YOLO, SAM
import matplotlib.pyplot as plt
import open_clip
from transformers import CLIPProcessor, CLIPModel

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
def get_frame_from_video(video_path, frame_number):
    """
    Captures a specific frame from a video file.

    Args:
        video_path (str): The path to the MP4 video file.
        frame_number (int): The 0-indexed number of the frame to capture.

    Returns:
        numpy.ndarray: The image data of the specified frame as a NumPy array (BGR format).
                       Returns None if the frame cannot be captured or video cannot be opened.
    """
    # Open the video file
    video_capture = cv2.VideoCapture(video_path)

    # Check if video opened successfully
    if not video_capture.isOpened():
        print(f"Error: Could not open video file: {video_path}")
        return None

    # Get total number of frames (optional, for validation)
    total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
    # print(f"Total frames in video: {total_frames}")

    if frame_number < 0 or frame_number >= total_frames:
        print(f"Error: Frame number {frame_number} is out of bounds (0-{total_frames - 1}).")
        video_capture.release()
        return None

    # Set the video capture to the desired frame
    # cv2.CAP_PROP_POS_FRAMES is 0-based index of the frame to be decoded/captured next.
    video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_number)

    # Read the frame
    success, frame = video_capture.read()

    # Release the video capture object
    video_capture.release()

    if success:
        # 'frame' is now a NumPy array containing the image data of the specified frame
        return frame
    else:
        print(f"Error: Could not read frame {frame_number} from video {video_path}.")
        return None

# if __name__ == '__main__':
#     # --- Example Usage ---
#     video_file_path = "path/to/your/video.mp4"  # Replace with your video file path
#     frame_to_capture = 100  # Capture the 101st frame (0-indexed)

#     # Get the specific frame
#     captured_frame_data = get_frame_from_video(video_file_path, frame_to_capture)

#     if captured_frame_data is not None:
#         print(f"Successfully captured frame {frame_to_capture}.")
#         print(f"Frame data type: {type(captured_frame_data)}")
#         print(f"Frame shape (Height, Width, Channels): {captured_frame_data.shape}")
#         print(f"Frame dtype: {captured_frame_data.dtype}")

#         # You can now process or display this frame
#         # For example, display it using OpenCV (requires a GUI environment)
#         # cv2.imshow(f"Frame {frame_to_capture}", captured_frame_data)
#         # cv2.waitKey(0)  # Wait for a key press
#         # cv2.destroyAllWindows()

#         # Or save it to a file
#         # cv2.imwrite(f"captured_frame_{frame_to_capture}.jpg", captured_frame_data)
#         # print(f"Frame {frame_to_capture} saved as captured_frame_{frame_to_capture}.jpg")
#     else:
#         print(f"Failed to capture frame {frame_to_capture}.")


In [4]:
def prepare_yolo_data_single_class(localization_data, base_dataset_dir, object_class_name="object"):
    """
    Converts structured localization data for a single object class to YOLO format.
    Assumes already normalized bounding boxes.

    Args:
        localization_data: A list of dictionaries, where each dict has:
                           'frame_num': int, the frame number for naming.
                           'frame': numpy.ndarray, the image data.
                           'boxes': a list of dicts, each with normalized 'x', 'y', 'width', 'height'.
                                    The 'class_name' field in boxes is ignored as it's a single class.
                                    'x', 'y' are top-left corner normalized.
                                    'width', 'height' are normalized dimensions.
        base_dataset_dir: Path to the root of your YOLO dataset.
        object_class_name (str): The name to use for your single class in data.yaml (e.g., "object").
    """
    images_train_dir = os.path.join(base_dataset_dir, "images", "train")
    labels_train_dir = os.path.join(base_dataset_dir, "labels", "train")
    os.makedirs(images_train_dir, exist_ok=True)
    os.makedirs(labels_train_dir, exist_ok=True)

    print(f"Starting YOLO data preparation for single class '{object_class_name}'. Outputting to: {base_dataset_dir}")

    # For a single class, the class index is always 0
    single_class_index = 0

    for data_point in localization_data:
        frame_num = data_point['frame_num']
        frame_image_data = data_point['frame']
        boxes_in_frame = data_point['boxes']

        image_filename = f"frame_{frame_num:05d}.jpg"
        label_filename = f"frame_{frame_num:05d}.txt"

        image_path = os.path.join(images_train_dir, image_filename)
        label_path = os.path.join(labels_train_dir, label_filename)

        if isinstance(frame_image_data, np.ndarray):
            cv2.imwrite(image_path, frame_image_data)
        else:
            print(f"Warning: Frame data for frame_num {frame_num} is not a NumPy array. Skipping.")
            continue

        yolo_labels_for_this_frame = []
        for box_info in boxes_in_frame:
            x_norm_top_left = box_info['x']
            y_norm_top_left = box_info['y']
            w_norm = box_info['width']
            h_norm = box_info['height']
            # 'class_name' from box_info is ignored, we use single_class_index

            x_center_norm = x_norm_top_left + (w_norm / 2)
            y_center_norm = y_norm_top_left + (h_norm / 2)

            yolo_labels_for_this_frame.append(
                f"{single_class_index} {x_center_norm:.6f} {y_center_norm:.6f} {w_norm:.6f} {h_norm:.6f}"
            )

        with open(label_path, 'w') as f:
            for line in yolo_labels_for_this_frame:
                f.write(line + "\n")

    print(f"Data preparation complete. Dataset at: {base_dataset_dir}")
    print(f"Remember to create a data.yaml file in '{base_dataset_dir}' with:")
    print("train: images/train")
    print("val: images/val  # Or remove if no validation set")
    print(f"nc: 1")
    print(f"names: ['{object_class_name}']")


# --- Example of how you might structure your input and call the function ---
# if __name__ == '__main__':
#     dummy_frame_image_1 = np.zeros((480, 640, 3), dtype=np.uint8)
#     cv2.putText(dummy_frame_image_1, "Frame 1 Objects", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

#     dummy_frame_image_2 = np.zeros((720, 1280, 3), dtype=np.uint8)
#     cv2.putText(dummy_frame_image_2, "Frame 2 Objects", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 3)

#     # Note: 'class_name' in boxes is optional here, as it will be ignored by the function.
#     # If your data source provides it, it's fine to include it.
#     sample_localization_data_single_class = [
#         {
#             'frame_num': 101,
#             'frame': dummy_frame_image_1,
#             'boxes': [
#                 {'x': 0.1, 'y': 0.2, 'width': 0.3, 'height': 0.25}, # No 'class_name' needed
#                 {'x': 0.6, 'y': 0.5, 'width': 0.15, 'height': 0.20, 'class_name': 'irrelevant_name'} # 'class_name' will be ignored
#             ]
#         },
#         {
#             'frame_num': 102,
#             'frame': dummy_frame_image_2,
#             'boxes': [
#                 {'x': 0.4, 'y': 0.3, 'width': 0.2, 'height': 0.2}
#             ]
#         }
#     ]

#     yolo_dataset_directory_single_class = "./my_single_class_yolo_dataset"
#     custom_object_name = "target_object" # You can name your single class whatever you like

#     prepare_yolo_data_single_class(
#         sample_localization_data_single_class,
#         yolo_dataset_directory_single_class,
#         object_class_name=custom_object_name
#     )

    # After running, your data.yaml in './my_single_class_yolo_dataset/' should look like:
    #
    # train: images/train
    # val: images/val # (if you create a validation set later)
    #
    # nc: 1
    # names: ['target_object'] # Or whatever 'custom_object_name' you provided
    #

In [5]:
def add_box(obj_list, frame_num, new_box):
    for item in obj_list:
        if item['frame_num'] == frame_num:
            item['boxes'].append(new_box)
            return
    obj_list.append({'frame_num': frame_num, 'boxes': [new_box]})

In [6]:
import tator
MY_TOKEN = '0d3d74dc1595a2811b694478f714cd3e68f21354'
api = tator.get_api(host='https://cloud.tator.io', token=MY_TOKEN)
PROJECT_ID = 70
# MEDIA_ID =    4291234
MEDIA_ID = 4348803
# MEDIA_ID = 4166376
# MEDIA_ID = 4286688

localizations = api.get_localization_list(PROJECT_ID, media_id=[MEDIA_ID])
obj_dicts = []
i = 0
for localization in localizations:
    # print(localization)
    if localization.width is not None and localization.height is not None:
        # if i < 10:
        # if localization.width > 0 and localization.height > 0:
            # print(localization)
            add_box(obj_dicts, localization.frame, {'x': localization.x, 
                                                        'y': localization.y, 
                                                        'width': localization.width, 
                                                        'height': localization.height, 
                                                       }) 

   
            # try:
            #     img_path = api.get_localization_graphic(localization.id)
            #     img_paths.append(img_path)
            # except:
            #     pass
        # i += 1
print(len(obj_dicts))
print(obj_dicts)

22
[{'frame_num': 2181, 'boxes': [{'x': 0.23076923076923078, 'y': 0.44146341463414634, 'width': 0.047876602564102554, 'height': 0.08816621499548326}, {'x': 0.27197802197802196, 'y': 0.3463414634146342, 'width': 0.03427197802197804, 'height': 0.06847335140018065}]}, {'frame_num': 2297, 'boxes': [{'x': 0.42763157894736836, 'y': 0.4836448598130841, 'width': 0.023930921052631622, 'height': 0.06079958463136037}]}, {'frame_num': 2575, 'boxes': [{'x': 0.5741758241758242, 'y': 0.573170731707317, 'width': 0.046657509157509124, 'height': 0.07775519421860887}, {'x': 0.4107142857142857, 'y': 0.3902439024390244, 'width': 0.027306547619047626, 'height': 0.03383017163504965}]}, {'frame_num': 2906, 'boxes': [{'x': 0.5521978021978022, 'y': 0.46829268292682924, 'width': 0.08790636446886448, 'height': 0.051151761517615184}]}, {'frame_num': 2956, 'boxes': [{'x': 0.5013736263736264, 'y': 0.3024390243902439, 'width': 0.04393887362637366, 'height': 0.08737579042457091}]}, {'frame_num': 3922, 'boxes': [{'x': 

In [6]:
val = 0 
for data in obj_dicts:
    if len(data['boxes']) > 2:
        val += 1
print(val)

40


In [3]:
import tator
MY_TOKEN = '0d3d74dc1595a2811b694478f714cd3e68f21354'
api = tator.get_api(host='https://cloud.tator.io', token=MY_TOKEN)
PROJECT_ID = 70
MEDIA_ID =  4291234
# MEDIA_ID = 9432747
# MEDIA_IDS = 9431643 9431644
# k = 0
# for MEDIA_ID in range(9431643,9432434):
try:
    media = api.get_media(MEDIA_ID)
    out_path = f"./data/Images/{media.name}"
    print(f"Downloading {media.name}...")
    for progress in tator.util.download_media(api, media, out_path):
        print(f"Download progress: {progress}%")
    print(f"Download complete. Find at {out_path}")
    # k += 1
except Exception as e:
    print(f"Error downloading media {MEDIA_ID}: {e}")
    # continue
print(f"Total successful downloads: {k}")

Downloading PS2222_20220601T175323Z_FWD_ROV01_HD.mp4...
Download progress: 0%
Download progress: 25.0%
Download progress: 50.0%
Download progress: 75.0%
Download progress: 100.0%
Download progress: 100%
Download complete. Find at ./data/Images/PS2222_20220601T175323Z_FWD_ROV01_HD.mp4


NameError: name 'k' is not defined

In [None]:
i = 0
for frame in obj_dicts:
    print(f'list count {i}')
    try:
        frame['frame'] = get_frame_from_video(out_path, frame['frame_num'])
    except:
        print('failed')
    i += 1

list count 0


list count 1
list count 2
list count 3
list count 4
list count 5
list count 6
list count 7
Error: Could not read frame 133696 from video /tmp/PS2208_20211005T141715Z_FWD_ROV01_HD.mp4.
list count 8
Error: Could not read frame 134551 from video /tmp/PS2208_20211005T141715Z_FWD_ROV01_HD.mp4.
list count 9
Error: Could not read frame 135326 from video /tmp/PS2208_20211005T141715Z_FWD_ROV01_HD.mp4.
list count 10
list count 11
list count 12
list count 13
list count 14
list count 15
list count 16
list count 17
list count 18
list count 19
list count 20
list count 21
list count 22
list count 23
list count 24
list count 25
list count 26
list count 27
list count 28
list count 29
list count 30
list count 31
list count 32
list count 33
list count 34
list count 35
list count 36
list count 37
list count 38
list count 39
list count 40
list count 41
list count 42
list count 43
list count 44
list count 45
list count 46
list count 47
list count 48
list count 49
list count 50
list count 51
list count 52
li

In [None]:
prepare_yolo_data_single_class(obj_dicts, './data', object_class_name="object")

In [None]:
from ultralytics import YOLO

def finetune_yolo_model(data_yaml_path, pretrained_model_name='yolov8n.pt', epochs=50, batch_size=16, img_size=640, project_name='yolo_finetune', run_name='exp'):
    """
    Fine-tunes a YOLO model on a custom dataset.

    Args:
        data_yaml_path (str): Path to your data.yaml file.
        pretrained_model_name (str): Name of the pretrained model to start from (e.g., 'yolov8n.pt', 'yolov8s.pt').
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        img_size (int): Input image size for the model.
        project_name (str): Name of the project directory where results will be saved.
        run_name (str): Name of the specific run/experiment.
    """
    # Load a pretrained YOLO model
    model = YOLO(pretrained_model_name)

    print(f"Starting fine-tuning with model: {pretrained_model_name}")
    print(f"Dataset configuration: {data_yaml_path}")
    print(f"Training for {epochs} epochs with batch size {batch_size} and image size {img_size}.")

    # Train the model
    results = model.train(
        data=data_yaml_path,
        epochs=epochs,
        batch=batch_size,
        imgsz=img_size,
        project=project_name, # Results will be saved in 'runs/detect/project_name/run_name'
        name=run_name,
        # device=0,  # Uncomment to specify GPU, e.g., 0 for the first GPU
        # workers=8, # Number of dataloader workers
        # patience=10, # Early stopping patience
        # exist_ok=True # if you want to overwrite existing run with the same name
    )

    print("Fine-tuning complete.")
    print(f"Results saved to: {results.save_dir}") # Ultralytics 8.0.x returns a Results object
    # For older versions, the path might be under model.trainer.save_dir or similar.
    # The best model weights are typically saved as 'best.pt' in the run directory.


dataset_yaml_file = "/path/to/your_dataset/data.yaml"

# Check if the YAML file exists
if not os.path.exists(dataset_yaml_file):
    print(f"Error: Dataset YAML file not found at {dataset_yaml_file}")
    print("Please create the data.yaml file and ensure paths are correct.")
else:
    finetune_yolo_model(
        data_yaml_path=dataset_yaml_file,
        pretrained_model_name='yolov8s.pt', # Start with a small model like yolov8n.pt or yolov8s.pt
        epochs=25, # Start with fewer epochs to test
        batch_size=8, # Adjust based on your GPU memory
        img_size=640,
        project_name='custom_object_detection',
        run_name='first_finetune_run'
    )