In [8]:
# Load the TAAT annotation export for video 100898 into `data`; the next cell inspects it.
import json
with open('/Users/nelesuffo/Promotion/projects/TAAT/categories/human_actions/100898.json', 'r') as file:
    data = json.load(file)

In [21]:
# Display the first annotation record to see which keys a TAAT annotation carries.
data['annotations'][0]

{'categoryId': 'human_actions',
 'eventId': 'location',
 'fields': {'Type of Location': 'Livingroom'},
 'id': 'location_1745396774002',
 'time': 0,
 'type': 'start',
 'videoName': '100898.MP4'}

In [23]:
# Load the TAAT export again so this cell can be run on its own
import json

with open('/Users/nelesuffo/Promotion/projects/TAAT/categories/human_actions/100898.json', 'r') as file:
    data = json.load(file)

# Ids that annotations carry themselves vs. ids referenced through 'startAnnotationId'
ids = set(entry['id'] for entry in data['annotations'] if 'id' in entry)
start_annotation_ids = set(
    entry['startAnnotationId'] for entry in data['annotations'] if 'startAnnotationId' in entry
)

# Set difference in both directions: ids that are never referenced,
# and references that point to an id no annotation carries
only_in_ids = ids.difference(start_annotation_ids)
only_in_start_annotation_ids = start_annotation_ids.difference(ids)

# Report both sets (two empty sets mean every id is paired)
print("IDs only in 'id':", only_in_ids)
print("IDs only in 'startAnnotationId':", only_in_start_annotation_ids)

IDs only in 'id': set()
IDs only in 'startAnnotationId': set()


## Convert SuperAnnotate to TAAT format

In [4]:
import json
import time # For generating parts of unique IDs if needed
from pathlib import Path # Added for file operations

def get_taat_fields_for_event(event_id, sa_attributes):
    """
    Build the TAAT 'fields' object for one event.

    Args:
        event_id: TAAT event id; it selects which set of fields is emitted.
        sa_attributes: dict of attribute_name -> attribute_value. Anything that is
            not a dict is treated as "no attributes".

    Returns:
        dict of field name -> value, in a fixed key order:
        - "other_person_talking": age group and gender of the 1st person
        - "overheard_speech": no fields (empty dict)
        - "location": "Type of Location"
        - any other event id: "Alone?" followed by age group and gender for the
          1st to 6th person
        An attribute that is absent, or present with the value None, is written as "".
    """
    if not isinstance(sa_attributes, dict):
        sa_attributes = {}

    def attribute_or_blank(key):
        # dict.get(key, "") only covers absent keys; a key stored with an explicit
        # None would otherwise end up as null in the exported JSON.
        value = sa_attributes.get(key)
        return "" if value is None else value

    if event_id == "overheard_speech":
        field_names = []
    elif event_id == "other_person_talking":
        field_names = ["1st Person Age Group", "1st Person Gender"]
    elif event_id == "location":
        field_names = ["Type of Location"]
    else:
        # Default "big setup" for every other activity
        field_names = ["Alone?"]
        for ordinal in ("1st", "2nd", "3rd", "4th", "5th", "6th"):
            field_names.append(f"{ordinal} Person Age Group")
            field_names.append(f"{ordinal} Person Gender")

    return {name: attribute_or_blank(name) for name in field_names}

def convert_superannotate_to_taat_formats(superannotate_event_instances, video_name, category_id):
    """
    Convert preprocessed SuperAnnotate event instances into the detailed and the
    concise TAAT format.

    Args:
        superannotate_event_instances: list of dicts, one per event instance, with
            - "taat_eventId": str, the derived TAAT eventId
            - "startTimeSeconds": float
            - "endTimeSeconds": float
            - "attributes_for_taat": dict passed on to get_taat_fields_for_event
            - "instanceId_SA": str, optional; used as the start annotation id.
              When it is absent or empty/None an id of the form
              "<eventId>_<millisecond timestamp>_<position>" is generated instead.
        video_name: str, name of the video file (e.g. "711881.MP4").
        category_id: str, category id for these annotations (e.g. "human_actions").

    Returns:
        A tuple (detailed_taat_output_dict, concise_taat_output_dict).
        The detailed output holds one "start" and one "end" annotation per instance,
        linked through "id" / "startAnnotationId"; the concise output holds one
        "complete" annotation per instance with startTime, endTime and duration.
        Instances missing the event id or either time are skipped with a warning.
    """
    detailed_annotations_list = []
    concise_annotations_list = []
    # One timestamp per call is enough: the position in the input list makes each
    # generated id unique within this call.
    generated_id_stamp = int(time.time() * 1000)

    for position, sa_instance in enumerate(superannotate_event_instances):
        taat_event_id = sa_instance.get("taat_eventId")
        start_time = sa_instance.get("startTimeSeconds")
        end_time = sa_instance.get("endTimeSeconds")

        if taat_event_id is None or start_time is None or end_time is None:
            print(f"Warning: Skipping preprocessed instance due to missing critical data: {sa_instance}")
            continue

        taat_fields = get_taat_fields_for_event(
            taat_event_id, sa_instance.get("attributes_for_taat") or {}
        )

        # `.get(key, default)` would return None for a key that is present but None,
        # leaving the start annotation without an id and the end annotation unlinked.
        start_annotation_id = (
            sa_instance.get("instanceId_SA")
            or f"{taat_event_id}_{generated_id_stamp}_{position}"
        )

        # 1. Detailed TAAT format: a start/end pair.
        # Every annotation gets its own copy of the fields so that editing one
        # annotation later cannot silently change the others.
        detailed_annotations_list.append({
            "id": start_annotation_id,
            "time": start_time,
            "categoryId": category_id,
            "eventId": taat_event_id,
            "fields": dict(taat_fields),
            "type": "start",
            "videoName": video_name,
        })
        detailed_annotations_list.append({
            "time": end_time,
            "categoryId": category_id,
            "eventId": taat_event_id,
            "type": "end",
            "startAnnotationId": start_annotation_id,
            "fields": dict(taat_fields),
            "videoName": video_name,
        })

        # 2. Concise TAAT format: one record covering the whole interval
        concise_annotations_list.append({
            "time": start_time,
            "categoryId": category_id,
            "eventId": taat_event_id,
            "fields": dict(taat_fields),
            "type": "complete",
            "videoName": video_name,
            "startTime": start_time,
            "endTime": end_time,
            "duration": end_time - start_time,
        })

    detailed_taat_output = {
        "video_name": video_name,
        "category_id": category_id,
        "annotations": detailed_annotations_list,
        "activeAnnotations": {},
    }

    concise_taat_output = {
        "video_name": video_name,
        "category_id": category_id,
        "annotations": concise_annotations_list,
        "activeAnnotations": {},
    }

    return detailed_taat_output, concise_taat_output

def preprocess_superannotate_data(sa_json_data_filepath, category_id_for_taat, default_video_name="unknown.mp4"):
    """
    Load one SuperAnnotate JSON export and turn its event instances into the
    dictionaries expected by 'convert_superannotate_to_taat_formats'.

    Only instances whose meta "type" is "event" are used. For each of them the
    start/end values of the first "parameters" entry are divided by 1_000_000
    (microseconds -> seconds) and the attributes of that entry's first timestamp
    are collected as {groupName: name}.
    NOTE(review): further "parameters" entries and further timestamps of an instance
    are ignored — confirm that an event never spans more than one segment.

    The TAAT eventId is derived from the instance's className:
    - "Location" -> "location"
    - "Action"   -> the "Type of Action" attribute, lower-cased, spaces -> "_"
    - otherwise  -> the className itself, lower-cased, spaces -> "_"

    Args:
        sa_json_data_filepath: path of the SuperAnnotate JSON file.
        category_id_for_taat: accepted for interface compatibility; not used here.
        default_video_name: video name used when the export's metadata has no name.

    Returns:
        (video_name, preprocessed_instances). When the file cannot be read or parsed,
        or its top-level value is not a JSON object, (None, []) is returned.
        Instances lacking parameters, times, timestamps, a className or (for actions)
        a "Type of Action" are skipped with a printed message.
    """
    try:
        # JSON text is UTF-8 by specification; do not depend on the platform default.
        with open(sa_json_data_filepath, 'r', encoding='utf-8') as f:
            sa_data = json.load(f)
    except Exception as e:
        # Deliberately broad: a batch run should report a bad file and move on.
        print(f"Error loading SuperAnnotate JSON file '{sa_json_data_filepath}': {e}")
        return None, []  # None as video_name signals "file could not be loaded"

    if not isinstance(sa_data, dict):
        print(f"Error loading SuperAnnotate JSON file '{sa_json_data_filepath}': top-level JSON value is not an object")
        return None, []

    metadata = sa_data.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}
    # A null/empty name falls back to the default instead of yielding video_name=None,
    # which callers interpret as a load failure.
    video_name = metadata.get("name") or default_video_name
    if video_name == default_video_name:
        print(f"Warning: Using default video name '{default_video_name}' for file '{sa_json_data_filepath}'")

    preprocessed_instances = []

    for instance in sa_data.get("instances") or []:
        if not isinstance(instance, dict):
            continue
        # `or {}` also covers an explicit "meta": null
        meta = instance.get("meta") or {}
        if meta.get("type") != "event":
            continue

        sa_class_name = meta.get("className")
        instance_id_sa = meta.get("id")

        parameters = instance.get("parameters") or []
        if not parameters:
            print(f"Skipping instance (ID: {instance_id_sa}): 'parameters' list is missing or empty.")
            continue
        first_segment = parameters[0]

        start_time_microseconds = first_segment.get("start")
        end_time_microseconds = first_segment.get("end")
        if start_time_microseconds is None or end_time_microseconds is None:
            print(f"Skipping instance (ID: {instance_id_sa}): missing start or end time in parameters.")
            continue

        start_time_s = float(start_time_microseconds) / 1_000_000.0
        end_time_s = float(end_time_microseconds) / 1_000_000.0

        timestamps = first_segment.get("timestamps") or []
        if not timestamps:
            print(f"Skipping instance (ID: {instance_id_sa}): 'timestamps' list is missing or empty in parameters.")
            continue

        # {groupName: name}; "name" holds the selected value of the attribute group
        attributes_dict = {}
        for attr in timestamps[0].get("attributes") or []:
            group_name = attr.get("groupName")
            attr_name_val = attr.get("name")
            if group_name and attr_name_val is not None:
                attributes_dict[group_name] = attr_name_val

        if sa_class_name == "Location":
            taat_event_id = "location"
            # Default to "" so a missing attribute does not become null downstream
            attributes_for_taat_fields = {
                "Type of Location": attributes_dict.get("Type of Location", "")
            }
        elif sa_class_name == "Action":
            type_of_action = attributes_dict.get("Type of Action")
            if not type_of_action:
                print(f"Skipping 'Action' instance (ID: {instance_id_sa}): 'Type of Action' attribute missing.")
                continue
            taat_event_id = type_of_action.lower().replace(" ", "_")
            # Pass everything on; get_taat_fields_for_event picks what it needs
            attributes_for_taat_fields = attributes_dict.copy()
        elif sa_class_name:
            # Fallback for any other SuperAnnotate className
            taat_event_id = sa_class_name.lower().replace(" ", "_")
            attributes_for_taat_fields = attributes_dict.copy()
        else:
            print(f"Skipping instance (ID: {instance_id_sa}): 'className' is missing in meta.")
            continue

        preprocessed_instances.append({
            "taat_eventId": taat_event_id,
            "startTimeSeconds": start_time_s,
            "endTimeSeconds": end_time_s,
            "attributes_for_taat": attributes_for_taat_fields,
            "instanceId_SA": instance_id_sa,
        })

    return video_name, preprocessed_instances


def batch_process_superannotate_files(input_directory, output_directory_detailed, output_directory_concise, category_id_for_taat):
    """
    Convert every '*.json' SuperAnnotate export in a directory to both TAAT formats.

    For each input file the export is preprocessed, converted, and written as
    '<base>.json' into the detailed and the concise output directory, where
    <base> is the video name without a trailing '.mp4' (any letter case).
    Both output directories are created if they do not exist.

    Args:
        input_directory: directory containing the SuperAnnotate JSON files.
        output_directory_detailed: target directory for the detailed (start/end) output.
        output_directory_concise: target directory for the concise (complete) output.
        category_id_for_taat: category id written into every annotation.

    Returns:
        None. Progress, skipped files and a final summary are printed.
    """
    input_dir = Path(input_directory)
    output_dir_detailed = Path(output_directory_detailed)
    output_dir_concise = Path(output_directory_concise)

    output_dir_detailed.mkdir(parents=True, exist_ok=True)
    output_dir_concise.mkdir(parents=True, exist_ok=True)

    processed_count = 0
    error_count = 0

    # Sorted so that processing order and log output are reproducible between runs
    for file_path in sorted(input_dir.glob('*.json')):
        print(f"\nProcessing file: {file_path.name}")

        # Fall back to the input file's own stem when the export carries no video name.
        # A single shared placeholder would make every such file write to the same
        # output path and overwrite the previous result.
        video_name, preprocessed_instances = preprocess_superannotate_data(
            file_path,
            category_id_for_taat,
            default_video_name=file_path.stem,
        )

        # Covers both a failed load (video_name is None) and an export without
        # usable event instances.
        if video_name is None or not preprocessed_instances:
            print(f"Failed to preprocess or no instances found in {file_path.name}. Skipping.")
            error_count += 1
            continue

        detailed_output, concise_output = convert_superannotate_to_taat_formats(
            preprocessed_instances,
            video_name,
            category_id_for_taat,
        )

        # Output name: last path component of the video name, minus a trailing
        # '.mp4' in any letter case.
        base_filename = Path(video_name).name
        if base_filename.lower().endswith('.mp4'):
            base_filename = base_filename[:-len('.mp4')]
        if not base_filename:
            base_filename = file_path.stem

        output_path_detailed = output_dir_detailed / f"{base_filename}.json"
        output_path_concise = output_dir_concise / f"{base_filename}.json"

        try:
            with output_path_detailed.open('w', encoding='utf-8') as outfile:
                json.dump(detailed_output, outfile, indent=4)
            print(f"Saved detailed TAAT to: {output_path_detailed}")

            with output_path_concise.open('w', encoding='utf-8') as outfile:
                json.dump(concise_output, outfile, indent=4)
            print(f"Saved concise TAAT to: {output_path_concise}")
            processed_count += 1
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error saving output for {file_path.name}: {e}")
            error_count += 1

    print(f"\nBatch processing complete. Processed files: {processed_count}, Errors: {error_count}")

In [5]:
# Where the SuperAnnotate exports live, and one output folder per TAAT flavour
SA_INPUT_DIR = '/Users/nelesuffo/Promotion/ProcessedData/annotations_superannotate'
TAAT_OUTPUT_DIR_DETAILED = '/Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/platform_format'
TAAT_OUTPUT_DIR_CONCISE = '/Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/final'  # target format
TAAT_CATEGORY_ID = "human_actions"  # one category for every file in this batch

# Convert the whole input folder; progress is printed per file
batch_process_superannotate_files(
    input_directory=SA_INPUT_DIR,
    output_directory_detailed=TAAT_OUTPUT_DIR_DETAILED,
    output_directory_concise=TAAT_OUTPUT_DIR_CONCISE,
    category_id_for_taat=TAAT_CATEGORY_ID,
)

print("Processing finished. Check the output directories.")


Processing file: 410731.MP4.json
Saved detailed TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/platform_format/410731.json
Saved concise TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/final/410731.json

Processing file: 488644.MP4.json
Saved detailed TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/platform_format/488644.json
Saved concise TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/final/488644.json

Processing file: 183537.MP4.json
Saved detailed TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/platform_format/183537.json
Saved concise TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/final/183537.json

Processing file: 487661.MP4.json
Skipping 'Action' instance (ID: Mjg3OTAzMDAuMTkzNzQxNzM=): 'Type of Action' attribute missing.
Saved detailed TAAT to: /Users/nelesuffo/Promotion/ProcessedData/annotations_keeper/platform_format/487661.json
Saved concise TAAT