In [1]:

import argparse
import os
import shutil
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Tuple, Optional

import cv2
import fiftyone as fo
import numpy as np
import pandas as pd
import tator
import torch
from tqdm import tqdm
from ultralytics import YOLO
from ultralytics import RTDETR

from tator_tools.download_datasets import DatasetDownloader
from tator_tools.fiftyone_clustering import FiftyOneDatasetViewer

# Custom DataDownloader (for getting select frames from media)

In [2]:
class DataDownloader:
    """
    Download selected frames of Tator media into ``<output_dir>/frames``.

    Frames are written as ``<media_id>_<frame_id>.jpg``. A sibling ``.lock``
    file is used as a best-effort marker that a download of that frame is in
    progress; it is advisory only (creation is not atomic).
    """

    # Age in seconds after which a leftover lock file is treated as abandoned
    LOCK_STALE_SECONDS = 300
    # Number of one-second polls spent waiting while a lock file exists
    LOCK_WAIT_POLLS = 60

    def __init__(self, api_token: str, project_id: int, media_ids: List[int],
                 frame_ids_dict: Dict[int, List[int]], output_dir: str,
                 max_workers: int = 10, max_retries: int = 10,
                 force_scale: str = "1024x768"):
        """
        Initialize the DataDownloader with multiple media IDs and their corresponding frames.

        :param api_token: Tator API token for authentication
        :param project_id: Project ID in Tator (stored, not otherwise used by this class)
        :param media_ids: List of media IDs to process
        :param frame_ids_dict: Dictionary mapping media IDs to their frame IDs to download
        :param output_dir: Output directory for downloaded frames
        :param max_workers: Maximum number of concurrent download threads
        :param max_retries: Maximum number of attempts per frame
        :param force_scale: Value forwarded as ``force_scale`` to ``api.get_frame``
        """
        self.project_id = project_id
        self.media_ids = media_ids
        self.frames_dict = frame_ids_dict
        self.output_dir = output_dir
        self.max_workers = max_workers
        self.max_retries = max_retries
        self.force_scale = force_scale

        # Create a single API instance for all operations
        self.api = self._authenticate(api_token)

        # Set up directories
        self._setup_directories()

        # Cache for media objects. Not guarded by a lock, so concurrent workers
        # may fetch the same media more than once before it is cached.
        self.media_cache = {}

        # Populated by download_data()
        self.output_data = None

    @staticmethod
    def _authenticate(api_token: str):
        """
        Authenticate with the Tator API.

        :param api_token: API token for authentication
        :return: Authenticated API instance
        :raises Exception: if the API object cannot be created
        """
        try:
            return tator.get_api(host='https://cloud.tator.io', token=api_token)
        except Exception as e:
            raise Exception(f"ERROR: Could not authenticate with provided API Token\n{e}") from e

    def _setup_directories(self):
        """
        Create necessary directories for frame storage.
        """
        os.makedirs(os.path.join(self.output_dir, "frames"), exist_ok=True)

    def _get_media(self, media_id: int):
        """
        Get media object with caching to avoid redundant API calls.

        :param media_id: Media ID to retrieve
        :return: Media object
        """
        if media_id not in self.media_cache:
            self.media_cache[media_id] = self.api.get_media(id=int(media_id))
        return self.media_cache[media_id]

    @staticmethod
    def _remove_lock(lock_path: str) -> None:
        """Delete the lock file, ignoring the case where it is already gone."""
        try:
            os.remove(lock_path)
        except OSError:
            pass

    def _wait_for_lock(self, frame_path: str, lock_path: str) -> bool:
        """
        Handle a pre-existing lock file for a frame.

        A stale lock is removed; a fresh one is polled once per second until
        the frame appears, the lock disappears, or the poll budget runs out.

        :return: True if the frame file exists by the time we stop waiting
        """
        if not os.path.exists(lock_path):
            return False

        try:
            lock_age = time.time() - os.path.getmtime(lock_path)
        except OSError:
            # Lock vanished between the existence check and the stat call
            return os.path.exists(frame_path)

        if lock_age > self.LOCK_STALE_SECONDS:
            self._remove_lock(lock_path)
            return False

        for _ in range(self.LOCK_WAIT_POLLS):
            time.sleep(1)
            if os.path.exists(frame_path):
                return True
            if not os.path.exists(lock_path):
                break
        return False

    def download_frame(self, params: tuple) -> Tuple[int, int, Optional[str]]:
        """
        Download a single frame for a given media with retry logic.

        :param params: Tuple containing (media_id, frame_id)
        :return: Tuple of (media_id, frame_id, frame_path or None if failed)
        """
        media_id, frame_id = params

        # Absolute paths so results stay valid if the working directory changes
        frame_path = os.path.abspath(
            os.path.join(self.output_dir, "frames", f"{media_id}_{frame_id}.jpg")
        )
        lock_path = f"{frame_path}.lock"

        # Already downloaded on a previous run
        if os.path.exists(frame_path):
            return media_id, frame_id, frame_path

        # Someone else may be downloading it right now
        if self._wait_for_lock(frame_path, lock_path):
            return media_id, frame_id, frame_path

        try:
            with open(lock_path, 'w') as f:
                f.write(str(os.getpid()))
        except OSError:
            # Could not write the lock; give a concurrent writer a moment
            time.sleep(1)
            if os.path.exists(frame_path):
                return media_id, frame_id, frame_path

        try:
            for attempt in range(self.max_retries):
                try:
                    # Inside the retry loop so a transient media lookup failure is retried too
                    media = self._get_media(media_id)
                    temp = self.api.get_frame(
                        id=media.id,
                        tile=f"{media.width}x{media.height}",
                        force_scale=self.force_scale,
                        frames=[int(frame_id)]
                    )
                    shutil.move(temp, frame_path)
                    return media_id, frame_id, frame_path

                except Exception as e:
                    error_msg = f"Error downloading frame {frame_id} for media {media_id}: {e}"
                    if attempt < self.max_retries - 1:
                        print(f"{error_msg}, retrying...")
                        time.sleep(2 ** attempt)  # exponential backoff
                    else:
                        print(f"{error_msg}, giving up.")

            return media_id, frame_id, None
        finally:
            # Release the lock on every exit path (success, failure, or interrupt)
            self._remove_lock(lock_path)

    def download_data(self) -> Dict[int, List[str]]:
        """
        Download frames for all media IDs using a single thread pool.

        The result is also stored on ``self.output_data``.

        :return: Dictionary mapping media IDs to lists of frame paths
        """
        # One task per unique (media_id, frame_id); duplicates would make two
        # workers race on the same output file. Order of first appearance is kept.
        all_tasks = list(dict.fromkeys(
            (media_id, frame_id)
            for media_id in self.media_ids
            for frame_id in self.frames_dict.get(media_id, [])
        ))

        results_dict = {media_id: [] for media_id in self.media_ids}

        # Use a single thread pool for all downloads
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {
                executor.submit(self.download_frame, task): task
                for task in all_tasks
            }

            with tqdm(total=len(all_tasks), desc="Downloading frames") as pbar:
                for future in as_completed(futures):
                    media_id, frame_id = futures[future]
                    try:
                        _, _, frame_path = future.result()
                    except Exception as e:
                        # One unexpected failure must not abort the remaining downloads
                        print(f"Error downloading frame {frame_id} for media {media_id}: {e}")
                        frame_path = None
                    if frame_path:  # If download was successful
                        results_dict[media_id].append(frame_path)
                    pbar.update(1)

        self.output_data = results_dict
        return results_dict


In [3]:
# The Tator token is read from the environment so it never lands in the notebook
api_token = os.environ.get("TATOR_TOKEN")
project_id = 155

# Folder that receives the labeled frames
output_dir = "../Data/NCICS/Madeline_Data/"

# Read in CSV file

In [21]:
# Read the annotation export; rows missing TatorMediaID OR TatorFrame cannot be
# mapped to a frame, so they are dropped before the ID columns are cast to int.
df = (
    pd.read_csv("../Data/NCICS/MadelineIs_All_Annotations_20241113.csv")
    .dropna(subset=['TatorMediaID', 'TatorFrame'])
    .astype({'TatorMediaID': int, 'TatorFrame': int})
)

# Media IDs in order of first appearance
media_ids = df['TatorMediaID'].unique().tolist()

# Media ID -> frame list, built in one groupby pass instead of re-filtering the
# whole frame per media ID. Repeated frames (several annotations on the same
# frame) are collapsed so each frame is requested only once.
frame_ids_dict = {
    int(media_id): frames.drop_duplicates().tolist()
    for media_id, frames in df.groupby('TatorMediaID', sort=False)['TatorFrame']
}

# Download frames from CSV

In [None]:
# Settings for the frame downloader, gathered in one place
download_settings = dict(
    api_token=api_token,
    project_id=project_id,
    media_ids=media_ids,
    frame_ids_dict=frame_ids_dict,
    output_dir=output_dir,
    max_workers=10,
    max_retries=10,
)
downloader = DataDownloader(**download_settings)

# Fetch every requested frame; results are read back from downloader.output_data
downloader.download_data();

In [25]:
# Media ID -> list of downloaded frame paths, as stored on the downloader by download_data()
frame_paths_dict = downloader.output_data

In [26]:
def _frame_number_from_path(path):
    """Return the frame number encoded as the last '_'-separated token of a frame path."""
    return int(path.split('_')[-1].replace('.jpg', ''))


# Attach the downloaded image path to every annotation row, one media ID at a time
output_df = []
for media_id, group in df.groupby('TatorMediaID'):
    # Frame number -> path lookup for this media ID
    frame_to_path = {}
    for path in frame_paths_dict[media_id]:
        frame_to_path[_frame_number_from_path(path)] = path

    # Frames without a downloaded file map to NaN
    output_df.append(group.assign(Image_Path=group['TatorFrame'].map(frame_to_path)))

# Stitch the per-media pieces back into one dataframe
final_df = pd.concat(output_df, ignore_index=True)

In [None]:
# Preview the first three rows: IDs, the attached image path, and the three label columns
final_df[['TatorMediaID', 'TatorFrame', 'Image_Path', 'Sclass', 'Ssubclass', 'Sgroup']].head(3)

In [28]:
# Write the annotations, now including the Image_Path column, to CSV (row index omitted)
final_df.to_csv("../Data/NCICS/Madeline_Data/MadelineIs_Modified.csv", index=False)

# Download Unlabeled AUV Data

In [4]:
# Where the dataset is written, and the name it is stored under
output_dir = "../Data/NCICS/"
dataset_name = "Unlabeled_AUV_Data"

# Fraction of the query results to sample for this demo download
frac = 0.01

# Encoded query copied from Tator's Data Metadata Export utility
# NOTE(review): looks like base64-encoded JSON — confirm before editing by hand
search_string = "eyJtZXRob2QiOiJBTkQiLCJvcGVyYXRpb25zIjpbeyJhdHRyaWJ1dGUiOiJNaXNzaW9uTmFtZSIsIm9wZXJhdGlvbiI6Imljb250YWlucyIsImludmVyc2UiOmZhbHNlLCJ2YWx1ZSI6Ik1hZGVsaW5lIn0seyJtZXRob2QiOiJPUiIsIm9wZXJhdGlvbnMiOlt7ImF0dHJpYnV0ZSI6IiR0eXBlIiwib3BlcmF0aW9uIjoiZXEiLCJpbnZlcnNlIjpmYWxzZSwidmFsdWUiOjMzMX1dfV19"

In [5]:
# Build a DatasetDownloader for the "Unlabeled_AUV_Data" query configured above.
# label_field is left empty (presumably because these images carry no labels — TODO confirm).
# Note: this rebinds `downloader`, replacing the DataDownloader instance created earlier.
downloader = DatasetDownloader(api_token,
                               project_id=project_id,
                               search_string=search_string,
                               frac=frac,
                               output_dir=output_dir,
                               dataset_name=dataset_name,
                               label_field="",
                               download_width=1024)

NOTE: Authentication successful for jordan.pierce
NOTE: Search string saved to e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\search_string.txt


In [6]:
# Run the query, sample it, and download the images for the "Unlabeled_AUV_Data" dataset
downloader.download_data()

NOTE: Querying Tator for labeled data
NOTE: Found 27064 objects in query


Processing query: 100%|██████████| 27064/27064 [00:02<00:00, 12911.25it/s]


NOTE: Found 270 query objects after sampling
NOTE: Data saved to e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\data.csv
NOTE: Data saved to e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\data.json
NOTE: Downloading images to e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\images


Downloading images: 100%|██████████| 270/270 [01:29<00:00,  3.01it/s]

NOTE: Images downloaded to e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\images





In [11]:
# Export the downloaded dataset as a dataframe.
# Note: this rebinds `df`, which earlier held the annotation CSV.
df = downloader.as_dataframe()  # .as_dict()

In [12]:
# Persist the exported dataframe so the clustering section can reload it from disk
df.to_csv("../Data/NCICS/Unlabeled_AUV_Data/Unlabeled_AUV_Data.csv", index=False)

# Clustering

In [2]:
# Load the labeled annotations written by the export step
labeled_df = pd.read_csv("../Data/NCICS/Madeline_Data/MadelineIs_Modified.csv")

# The export step names the image column 'Image_Path', while the rest of this
# section expects 'Path'; accept either so the notebook runs top to bottom.
if 'Path' not in labeled_df.columns and 'Image_Path' in labeled_df.columns:
    labeled_df = labeled_df.rename(columns={'Image_Path': 'Path'})

# Keep only the image path and label columns; rows whose frame was never
# downloaded have no path and cannot be embedded, so they are dropped.
labeled_df = (
    labeled_df[['Path', 'Sclass', 'Ssubclass', 'Sgroup']]
    .dropna(subset=['Path'])
    .reset_index(drop=True)
)

labeled_df.head(3)

Unnamed: 0,Path,Sclass,Ssubclass,Sgroup
0,e:\tator-tools\Data\NCICS\Madeline_Data\frames...,Fine Unconsolidated Mineral Substrate,Sandy Substrate,Sand
1,e:\tator-tools\Data\NCICS\Madeline_Data\frames...,Fine Unconsolidated Mineral Substrate,Sandy Substrate,Sand
2,e:\tator-tools\Data\NCICS\Madeline_Data\frames...,Fine Unconsolidated Mineral Substrate,Sandy Substrate,Sand


In [3]:
# Load the unlabeled export and conform it to the labeled schema:
# keep only the image path (renamed to 'Path') and fill every label column
# with the placeholder "Unknown". Built as one chain so no column is assigned
# onto a slice of another dataframe and nothing is mutated in place.
unlabeled_df = (
    pd.read_csv("../Data/NCICS/Unlabeled_AUV_Data/Unlabeled_AUV_Data.csv")
    .loc[:, ['image_path']]
    .rename(columns={'image_path': 'Path'})
    .assign(Sclass="Unknown", Ssubclass="Unknown", Sgroup="Unknown")
)

unlabeled_df.head(3)

Unnamed: 0,Path,Sclass,Ssubclass,Sgroup
0,e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\i...,Unknown,Unknown,Unknown
1,e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\i...,Unknown,Unknown,Unknown
2,e:\tator-tools\Data\NCICS\Unlabeled_AUV_Data\i...,Unknown,Unknown,Unknown


In [19]:
# Stack labeled and unlabeled rows into one frame with a fresh index
combined_df = pd.concat([labeled_df, unlabeled_df], ignore_index=True)

# QA / QC: every label column must be populated, so NaN and empty strings
# are both replaced with "Unlabeled"
for label_column in ('Sclass', 'Ssubclass', 'Sgroup'):
    filled = combined_df[label_column].fillna("Unlabeled")
    combined_df[label_column] = filled.replace("", "Unlabeled")

combined_df.sample(3)

Unnamed: 0,Path,Sclass,Ssubclass,Sgroup
878,e:\tator-tools\Data\NCICS\Madeline_Data\frames...,Consolidated Mineral Substrate,Bedrock,Unlabeled
834,e:\tator-tools\Data\NCICS\Madeline_Data\frames...,Fine Unconsolidated Mineral Substrate,Muddy Substrate,Mud
67,e:\tator-tools\Data\NCICS\Madeline_Data\frames...,Fine Unconsolidated Mineral Substrate,Sandy Substrate,Sand


In [9]:
# Set to False to skip this step and leave `embeddings` as None
USE_CUSTOM_EMBEDDINGS = True

embeddings = None

if USE_CUSTOM_EMBEDDINGS:
    # Calculate custom embeddings with a trained detection model
    model_weights = "E:\\tator-tools\\Data\\Runs\\2024-06-26_20-47-34_detect_yolov10m\\weights\\best.pt"
    model = YOLO(model_weights)

    # Image size stored in the model's overrides
    imgsz = model.overrides['imgsz']

    # Get the device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"NOTE: Using device {device}")

    # Run a blank image through the model to load the weights
    model(np.zeros((imgsz, imgsz, 3), dtype=np.uint8), device=device)

    # One embedding per row of combined_df, in row order
    embeddings_list = []
    for path in tqdm(combined_df['Path'].tolist(), desc="Calculating embeddings"):
        image_embedding = model.embed(path, imgsz=imgsz, stream=False, device=device, verbose=False)
        embeddings_list.append(image_embedding[0].cpu().numpy())

    embeddings = np.array(embeddings_list)

    # Only relevant when the GPU was actually used
    if device == 'cuda':
        torch.cuda.empty_cache()

NOTE: Using device cuda

0: 640x640 (no detections), 50.0ms
Speed: 3.0ms preprocess, 50.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)


Calculating embeddings: 100%|██████████| 1179/1179 [00:36<00:00, 32.15it/s]


In [20]:
# Initialize the viewer from the combined dataframe: 'Path' holds the image
# locations and the three label columns are passed as feature columns
viewer = FiftyOneDatasetViewer(dataframe=combined_df,
                               image_path_column='Path',
                               feature_columns=['Sclass', 'Ssubclass', 'Sgroup'],
                               nickname='MadelineIs',
                               custom_embeddings=embeddings,  # Pass the embeddings, or None
                               clustering_method='tsne',      # umap, pca, tsne
                               num_dims=2)                    # Number of output dimensions (2 or 3) — presumably applies to any method; TODO confirm

In [21]:
# Build the FiftyOne dataset and compute the low-dimensional visualization
# (t-SNE here, per clustering_method above)
viewer.process_dataset()

Overwriting existing dataset: MadelineIs


Processing images: 100%|██████████| 1179/1179 [00:37<00:00, 31.66it/s]


 100% |███████████████| 1179/1179 [780.6ms elapsed, 0s remaining, 1.5K samples/s]      
Computing embeddings...
Using provided custom embeddings
Computing UMAP visualization...
Generating visualization...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1179 samples in 0.000s...
[t-SNE] Computed neighbors for 1179 samples in 0.067s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1179
[t-SNE] Computed conditional probabilities for sample 1179 / 1179
[t-SNE] Mean sigma: 1.026537
[t-SNE] Computed conditional probabilities in 0.023s




[t-SNE] Iteration 50: error = 64.1906509, gradient norm = 0.0474356 (50 iterations in 0.153s)
[t-SNE] Iteration 100: error = 61.9038925, gradient norm = 0.0344904 (50 iterations in 0.132s)
[t-SNE] Iteration 150: error = 61.6679840, gradient norm = 0.0288542 (50 iterations in 0.144s)
[t-SNE] Iteration 200: error = 61.6632118, gradient norm = 0.0295466 (50 iterations in 0.153s)
[t-SNE] Iteration 250: error = 61.6369820, gradient norm = 0.0286119 (50 iterations in 0.140s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.636982
[t-SNE] Iteration 300: error = 0.8862907, gradient norm = 0.0061396 (50 iterations in 0.130s)
[t-SNE] Iteration 350: error = 0.7760760, gradient norm = 0.0043927 (50 iterations in 0.129s)
[t-SNE] Iteration 400: error = 0.7472803, gradient norm = 0.0020962 (50 iterations in 0.159s)
[t-SNE] Iteration 450: error = 0.7370064, gradient norm = 0.0013734 (50 iterations in 0.154s)
[t-SNE] Iteration 500: error = 0.7308135, gradient norm = 0.0010996 (50 

In [22]:
# Launch the FiftyOne app. The first launch sometimes fails inside a notebook,
# so it is retried once. Only ordinary errors are caught, so an interrupt
# (Ctrl-C / kernel stop) is not swallowed and turned into a second launch.
try:
    session = fo.launch_app(viewer.dataset)
except Exception as e:
    print(f"NOTE: First attempt to launch FiftyOne failed ({e}); retrying once")
    session = fo.launch_app(viewer.dataset)