## Generate GT RTTM File for ChildLens_v2

In [None]:
import json
from pathlib import Path
from glob import glob
from collections import Counter
import pandas as pd

# Folder and output paths
# Directory that is globbed for per-video annotation JSON files.
input_folder = "/home/nele_pauline_suffo/ProcessedData/childlens_annotations/keeper/v1"
# Directory that receives the generated .rttm / .uem / .lst files.
output_dir = Path("/home/nele_pauline_suffo/ProcessedData/audio_cls_input")
# parents=True so a missing intermediate directory does not abort the run
# (matches how the pickle output directory is created further down).
output_dir.mkdir(parents=True, exist_ok=True)

# Updated valid event IDs based on new structure
# Annotations whose eventId is not in this set are ignored when writing RTTM.
valid_event_ids = {"child_talking", "other_person_talking", "overheard_speech", "singing/humming"}

# One dict per usable JSON file with keys "path", "uri" and "duration".
all_files = []
# Number of RTTM lines written per voice-type label.
speaker_counts = Counter()
# Number of JSON files opened while generating the RTTM output.
files_processed = 0

# Step 1: Load all JSON files and collect metadata
# glob() yields paths in arbitrary, filesystem-dependent order. Sorting them
# here makes the later (stable) duration sort, and therefore the
# train/dev/test assignment, reproducible when several files share a duration.
json_files = sorted(glob(f"{input_folder}/*.json"))
for json_file in json_files:
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Files without a video name cannot be referenced in RTTM/UEM output.
        video_name = data.get('video_name', '')
        if not video_name:
            print(f"Warning: No video_name found in {json_file}")
            continue

        # Files without annotations have no usable duration estimate.
        annotations = data.get('annotations', [])
        if not annotations:
            print(f"Warning: No annotations found in {json_file}")
            continue

        # The duration is approximated by the latest annotation end time;
        # annotations lacking 'endTime' count as 0.
        duration = max(ann.get('endTime', 0) for ann in annotations)

        all_files.append({
            "path": json_file,
            "uri": video_name,
            "duration": duration
        })
    except Exception as e:
        # Best effort: one unreadable or malformed file must not stop the run.
        print(f"Skipping file {json_file} due to error: {e}")

# Step 2: Sort and split files by total duration
# Files are visited longest-first. Each one goes to "train" until train holds
# at least 80% of the total duration, then to "dev" until dev holds at least
# 10%, and everything that remains goes to "test".
all_files.sort(key=lambda entry: entry["duration"], reverse=True)
total_duration = sum(entry["duration"] for entry in all_files)

splits = {"train": [], "dev": [], "test": []}
split_durations = {"train": 0, "dev": 0, "test": 0}

train_budget = 0.8 * total_duration
dev_budget = 0.1 * total_duration

for entry in all_files:
    if split_durations["train"] < train_budget:
        target = "train"
    elif split_durations["dev"] < dev_budget:
        target = "dev"
    else:
        target = "test"
    splits[target].append(entry)
    split_durations[target] += entry["duration"]

# Keep the individual per-split names available alongside the two dicts.
train_files, dev_files, test_files = (
    splits["train"], splits["dev"], splits["test"]
)
train_duration, dev_duration, test_duration = (
    split_durations["train"], split_durations["dev"], split_durations["test"]
)

# Step 3: Process each split and write RTTM
# Lookup from annotation eventId to the voice-type label used in the RTTM.
event_to_speaker = {
    "child_talking": "KCHI",
    "singing/humming": "KCHI",
    "other_person_talking": "CDS",
    "overheard_speech": "OHS",
}

all_df_rows = []  # One dict per emitted RTTM line; becomes the DataFrame later
all_rttm_lines_combined = []  # RTTM lines of every split, for complete.rttm

for split_name, files_in_split in splits.items():
    rttm_lines_split = []  # RTTM lines for the current split

    for f_info in files_in_split:
        try:
            with open(f_info["path"], "r", encoding="utf-8") as file_handle:
                data = json.load(file_handle)

            uri = data.get('video_name', '')
            files_processed += 1

            for annotation in data.get('annotations', []):
                try:
                    event_id = annotation.get('eventId', '')
                    if event_id not in valid_event_ids:
                        continue

                    # Get timing information
                    start_sec = annotation.get('startTime', 0)
                    end_sec = annotation.get('endTime', 0)
                    duration_sec = end_sec - start_sec

                    if duration_sec <= 0:
                        print(f"Warning: Non-positive duration {duration_sec:.3f}s for segment in {f_info['path']}. Skipping.")
                        continue

                    # Valid events without a voice-type mapping are dropped.
                    speaker_id = event_to_speaker.get(event_id)
                    if speaker_id is None:
                        continue

                    # Every segment is emitted twice: once under its own
                    # voice type and once under the generic SPEECH label.
                    for label in (speaker_id, "SPEECH"):
                        rttm_line = f"SPEAKER {uri} 1 {start_sec:.3f} {duration_sec:.3f} <NA> <NA> {label} <NA> <NA>"
                        rttm_lines_split.append(rttm_line)
                        all_rttm_lines_combined.append(rttm_line)
                        speaker_counts[label] += 1
                        all_df_rows.append({
                            "audio_file_name": uri,
                            "Utterance_Start": round(start_sec, 3),
                            "Utterance_Duration": round(duration_sec, 3),
                            "Voice_type": label,
                            "Utterance_End": round(end_sec, 3)
                        })

                except Exception as e:
                    # Best effort: a malformed annotation is skipped, not fatal.
                    print(f"Skipping annotation in {f_info['path']} due to error: {e}")

        except Exception as e:
            print(f"Error reading file {f_info['path']}: {e}")

    # Save to RTTM file for the current split
    rttm_path_split = output_dir / f"{split_name}.rttm"
    with open(rttm_path_split, "w", encoding="utf-8") as out_f:
        out_f.writelines(f"{line}\n" for line in rttm_lines_split)
    print(f"📝 RTTM file for {split_name} split saved to {rttm_path_split}")

# Save the complete RTTM file after processing all splits
complete_rttm_path = output_dir / "complete.rttm"
with open(complete_rttm_path, "w", encoding="utf-8") as out_f:
    out_f.writelines(f"{line}\n" for line in all_rttm_lines_combined)
print(f"📝 Complete RTTM file for all splits saved to {complete_rttm_path}")

# Create and save ONE COMBINED DataFrame after processing all splits
# The frame is built unconditionally so that `combined_df` always exists
# (possibly empty) for the summary step; previously it was left undefined
# when no rows had been collected, which raised a NameError later on.
df_columns = ["audio_file_name", "Utterance_Start", "Utterance_Duration", "Voice_type", "Utterance_End"]
combined_df = pd.DataFrame(all_df_rows, columns=df_columns)

if all_df_rows:
    # Define the single output path for the combined pickle file
    df_pkl_path = Path("/home/nele_pauline_suffo/ProcessedData/audio_cls_input/annotations_gt.pkl")
    # Ensure the directory exists
    df_pkl_path.parent.mkdir(parents=True, exist_ok=True)
    combined_df.to_pickle(df_pkl_path)
    print(f"✅ Combined DataFrame for all splits saved to {df_pkl_path} ({len(combined_df)} rows)")
else:
    print("ℹ️ No data to create combined DataFrame.")

# Step 3.5: Save a complete UEM file with all video information
# Each collected file contributes one line "<uri> 1 <start> <end>" that runs
# from 0 to the duration recorded for it in all_files.
start_time = 0.000
uem_lines = [
    f"{entry['uri']} 1 {start_time:.3f} {entry['duration']:.3f}"
    for entry in all_files
]

uem_path = output_dir / "complete.uem"
with open(uem_path, "w") as uem_file:
    uem_file.writelines(f"{uem_line}\n" for uem_line in uem_lines)
print(f"✅ Combined UEM file for all videos saved to {uem_path} ({len(uem_lines)} segments)")

# Step 4: Summary logs
print(f"\n✅ Total processed files: {files_processed}")
print("\n🎙️ Speaker total durations in all splits (in minutes):")
# `combined_df` is only guaranteed to exist when at least one row was
# collected, so test the row list first to avoid a NameError on empty input.
if all_df_rows and not combined_df.empty:
    speaker_durations = combined_df.groupby("Voice_type")["Utterance_Duration"].sum()
    for speaker_id in ['KCHI', 'CDS', 'OHS', 'SPEECH']:
        # Labels that never occurred are reported as 0 minutes.
        duration_sec = speaker_durations.get(speaker_id, 0.0)
        duration_min = duration_sec / 60
        print(f"  {speaker_id}: {duration_min:.2f} min")
else:
    print("No data available to compute durations.")

print("\n📊 RTTM split durations and video counts:")
for split_name in ["train", "dev", "test"]:
    dur = split_durations[split_name]
    # With no usable files the total is 0; report 0% instead of dividing by it.
    perc = (dur / total_duration) * 100 if total_duration else 0.0
    count = len(splits[split_name])
    print(f"  {split_name}: {dur:.2f} sec ({perc:.1f}%), {count} videos")

# Generate .lst files for train, development, and test splits
# Each <split>.lst lists one URI per line, in the order the files were
# assigned to that split.
for split_name, members in splits.items():
    lst_path = output_dir / f"{split_name}.lst"
    with open(lst_path, "w") as lst_file:
        lst_file.writelines(f"{entry['uri']}\n" for entry in members)

print("✅ .lst files created for train, development, and test splits.")

# Generate .uem files for train, development, and test splits
# Each <split>.uem holds one line per video, "<uri> 1 <start> <end>", where
# start is always 0 and end is the duration recorded for that video.
for split_name, members in splits.items():
    uem_path = output_dir / f"{split_name}.uem"
    with open(uem_path, "w") as uem_file:
        for entry in members:
            try:
                uem_file.write(
                    "{} 1 {:.3f} {:.3f}\n".format(entry["uri"], 0, entry["duration"])
                )
            except Exception as e:
                # A failing entry is reported and skipped; the rest still get written.
                print(f"Error processing file {entry['path']}: {e}")

print("✅ .uem files created for train, development, and test splits.")

