In [1]:
import fiftyone as fo

def load_mcity_fisheye_3_months_dev(dataset_info):
    """
    Build a fresh FiftyOne dataset for the Mcity Fisheye 3 months data.

    The dataset is always registered under the fixed name "mcity_3_months_dev".
    If a dataset with that name is already listed, it is deleted first. Each
    requested split is then imported from the same directory and tagged with
    its split name, after which `compute_metadata` is run with 8 workers and
    the dataset is marked as non-persistent.

    Args:
        dataset_info (dict): A dictionary providing the following keys:
            - "local_path" (str): The local path to the dataset directory.
            - "v51_type" (str): The name of an attribute of `fo.types` that
              identifies the dataset format.
            - "v51_splits" (list): The dataset splits to import.

    Returns:
        fo.Dataset: The newly created dataset object.

    Raises:
        KeyError: If "local_path", "v51_type" or "v51_splits" is missing in
            `dataset_info`.
        AttributeError: If `v51_type` does not name an attribute of `fo.types`.
    """
    name = "mcity_3_months_dev"

    # Resolve every input up front so a malformed `dataset_info` fails before
    # any existing dataset is deleted.
    source_dir = dataset_info["local_path"]
    importer_type = getattr(fo.types, dataset_info["v51_type"])
    splits = dataset_info["v51_splits"]  # Use all available splits

    # Start from a clean slate: drop a previous dataset of the same name.
    already_registered = name in fo.list_datasets()
    if already_registered:
        fo.delete_dataset(name)

    dataset = fo.Dataset(name)
    shared_import_args = {"dataset_dir": source_dir, "dataset_type": importer_type}
    for split_name in splits:
        # The split name doubles as the tag attached to the imported samples.
        dataset.add_dir(split=split_name, tags=split_name, **shared_import_args)
    dataset.compute_metadata(num_workers=8)

    # https://docs.voxel51.com/user_guide/using_datasets.html#dataset-persistence
    dataset.persistent = False
    return dataset

In [2]:
from tqdm import tqdm

def get_metadata(dataset):
    """
    Derive dataset-specific fields from each sample's filename and store them.

    For every sample in `dataset.view()`, the sample's "filepath" is parsed by
    `process_mcity_fisheye_3_months_filename_dev`, and the resulting
    "location", "name" and "timestamp" values are written to the sample, which
    is saved immediately afterwards.

    Args:
        dataset: Object whose `view()` yields samples supporting item access
            and `save()` (presumably a `fo.Dataset`; see
            https://docs.voxel51.com/api/fiftyone.core.sample.html).

    Returns:
        None. The samples are modified in place.
    """
    derived_fields = ("location", "name", "timestamp")
    samples_with_progress = tqdm(dataset.view(), desc="Deriving metadata from filenames")
    for sample in samples_with_progress:
        parsed = process_mcity_fisheye_3_months_filename_dev(sample["filepath"])
        for field in derived_fields:
            sample[field] = parsed[field]
        sample.save()

In [10]:
import re
import os
from datetime import datetime
import logging

def process_mcity_fisheye_3_months_filename_dev(filename):
    """
    Processes a given filename to extract metadata including location, name, and timestamp.

    Problems are reported through `logging.error`; the affected result entries
    are left at their fallback values instead of raising.

    Args:
        filename (str): The full path or name of the file to be processed.

    Returns:
        dict: A dictionary containing the following keys:
            - 'filename' (str): The base name of the file.
            - 'location' (str or None): The first known location that occurs in
              the filename (case-sensitive substring match), or None.
            - 'name' (str): The part of the filename before the first run of
              four digits, with trailing '-' and '_' removed. If the filename
              contains no such run, the filename without its extension is used.
            - 'timestamp' (datetime or None): The timestamp extracted from the
              filename, or None if none is found or it is not a valid date/time.

    The function performs the following steps:
        1. Extracts the base name of the file.
        2. Searches for a known location within the filename.
        3. Splits the filename into two parts based on the first occurrence of a 4-digit year.
        4. Cleans up the first part to derive the name.
        5. Extracts and parses the timestamp from the second part of the filename.
    """

    filename = os.path.basename(filename)
    # Examples of supported filenames:
    #   gridsmart_ne_stage_1_2021_05_02_labeled_2021-05-02_13-52-35-657986.jpg
    #   gs_Geddes_Huron1_2023-09-01 13-01-15-785124.jpg

    results = {"filename": filename, "location": None, "name": None, "timestamp": None}

    available_locations = [
        "beal",
        "bishop",
        "georgetown",
        "gridsmart_ne",
        "gridsmart_nw",
        "gridsmart_se",
        "gridsmart_sw",
        "Huron_Plymouth-Geddes",
        "Main_stadium",
        "gs_Geddes_Huron",
        "gs_Huron_Plymouth",
        "gs_Plymouth_Beal",
        "gs_Plymouth_Georgetown",
        "gs_Plymouth_Bishop",
        "gs_Plymouth_EPA"]

    # The first listed location that occurs in the filename wins.
    for location in available_locations:
        if location in filename:
            results["location"] = location
            break

    if results["location"] is None:
        logging.error(f"Filename {filename} could not be assigned to a known location")

    # Split string into first and second part based on first 4 digit year number
    match = re.search(r"\d{4}", filename)
    if match:
        year_index = match.start()
        part1 = filename[:year_index]
        part2 = filename[year_index:]
    else:
        # Without a year there is nothing to split on: treat the whole stem as
        # the name and leave no text to search for a timestamp.
        logging.error(f"Filename {filename} contains no 4-digit year")
        part1 = os.path.splitext(filename)[0]
        part2 = ""

    # Cleanup first part
    results["name"] = re.sub(r"[-_]+$", "", part1)

    # Extract timestamp from second part
    match = re.search(r"\d{8}T\d{6}|\d{4}-\d{2}-\d{2}[_ ]\d{2}-\d{2}-\d{2}", part2)
    if match:
        extracted_timestamp = match.group(0)

        # The search pattern admits exactly three layouts, told apart by the
        # character separating the date from the time.
        if "T" in extracted_timestamp:
            timestamp_format = "%Y%m%dT%H%M%S"
        elif "_" in extracted_timestamp:
            timestamp_format = "%Y-%m-%d_%H-%M-%S"
        else:
            timestamp_format = "%Y-%m-%d %H-%M-%S"

        try:
            results["timestamp"] = datetime.strptime(extracted_timestamp, timestamp_format)
        except ValueError:
            # The digits matched the layout but do not form a real date/time
            # (e.g. month 13).
            logging.error(
                f"Timestamp {extracted_timestamp} in filename {filename} is not a valid date/time"
            )
    else:
        logging.error("No valid timestamp found in string")

    return results

In [4]:
import os
import sys

# The repository root is assumed to be the parent of the notebook's working
# directory: it is where the `utils` package is imported from, and the dataset
# configuration is expected at `config/datasets.yaml` below it.
# NOTE(review): confirm this layout; it replaces a hard-coded absolute path
# into one user's home directory.
REPO_ROOT = os.path.abspath("..")
if REPO_ROOT not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(REPO_ROOT)

# NOTE(review): only `load_dataset_info` is used in the cells visible here;
# switch to an explicit import once no other cell relies on the star import.
from utils.dataset_loader import *

SELECTED_DATASET = "mcity_fisheye_3_months"
CONFIG_PATH = os.path.join(REPO_ROOT, "config", "datasets.yaml")
dataset_info = load_dataset_info(SELECTED_DATASET, config_path=CONFIG_PATH)
dataset = load_mcity_fisheye_3_months_dev(dataset_info)


 100% |█████████████| 48568/48568 [1.7m elapsed, 0s remaining, 860.2 samples/s]      


INFO:eta.core.utils: 100% |█████████████| 48568/48568 [1.7m elapsed, 0s remaining, 860.2 samples/s]      


 100% |█████████████████| 744/744 [1.7s elapsed, 0s remaining, 447.0 samples/s]      


INFO:eta.core.utils: 100% |█████████████████| 744/744 [1.7s elapsed, 0s remaining, 447.0 samples/s]      


Computing metadata...


INFO:fiftyone.core.metadata:Computing metadata...


 100% |█████████████| 49312/49312 [4.8s elapsed, 0s remaining, 10.2K samples/s]      


INFO:eta.core.utils: 100% |█████████████| 49312/49312 [4.8s elapsed, 0s remaining, 10.2K samples/s]      


In [11]:
get_metadata(dataset)

Deriving metadata from filenames:   0%|          | 0/49312 [00:00<?, ?it/s]

Deriving metadata from filenames: 100%|██████████| 49312/49312 [01:12<00:00, 681.38it/s] 
