In [7]:
import json
from pathlib import Path
from typing import Dict, List, Any, Union
import os

In [8]:
class TranscriptDataManager:
    """
    Load, merge, and expose podcast transcript data stored in JSON files.

    Every loaded file is merged into a single in-memory dictionary
    (``self.data``) keyed by episode id, where each value is the list of
    utterance dictionaries for that episode. When the same episode id shows
    up in more than one file, the most recently loaded entry replaces the
    earlier one.
    """

    def __init__(self, base_path: str = ".."):
        """
        Initialize the manager.

        Args:
            base_path (str): Directory containing the JSON files, relative to
                the notebook. It is also the directory ``save_to_json``
                writes into.
        """
        self.base_path = Path(base_path)
        # episode id -> list of utterance dicts, accumulated across files
        self.data: Dict[str, List[Dict[str, Any]]] = {}

    def load_data(self, filenames: List[str]) -> None:
        """
        Load several JSON transcript files and merge them into ``self.data``.

        Loading is best-effort: a missing file produces a warning and a file
        that cannot be read, parsed, or merged produces an error message; in
        both cases the remaining files are still processed.

        Args:
            filenames (List[str]): File names to load, resolved against
                ``base_path``. Later files win on duplicate episode ids.
        """
        for filename in filenames:
            file_path = self.base_path / filename
            if not file_path.exists():
                print(f"Warning: File not found: {file_path.resolve()}")
                continue

            try:
                print(f"Loading {filename}...")
                with open(file_path, 'r', encoding='utf-8') as f:
                    chunk = json.load(f)
                # Merge after the handle is closed; the file is no longer needed.
                self._merge_chunk(chunk)
                print(f"Successfully loaded {filename}. Total episodes so far: {len(self.data)}")
            except Exception as e:
                # Deliberately broad: one bad file must not abort the whole batch.
                print(f"Error loading {filename}: {e}")

    def _merge_chunk(self, new_data: Dict[str, List[Dict[str, Any]]]) -> None:
        """
        Merge one parsed file into the main dataset.

        Args:
            new_data (Dict): Mapping of episode id to its list of utterances.

        Raises:
            TypeError: If ``new_data`` is not a mapping (for example when the
                top-level JSON value of a file is a list).
        """
        from collections.abc import Mapping

        if not isinstance(new_data, Mapping):
            # Fail with a readable message instead of an AttributeError on .keys()
            raise TypeError(
                "Expected a mapping of episode id to utterances, "
                f"got {type(new_data).__name__}"
            )

        # Episode ids present on both sides will be replaced by the new chunk.
        overlap = set(self.data.keys()).intersection(new_data.keys())
        if overlap:
            print(f"Warning: {len(overlap)} episodes overlap and will be overwritten.")

        self.data.update(new_data)

    def get_combined_data(self) -> Dict[str, List[Dict[str, Any]]]:
        """Return the combined dictionary itself (the live object, not a copy)."""
        return self.data

    def get_all_utterances(self) -> List[Dict[str, Any]]:
        """
        Flatten all episodes into a single list of utterances.

        Each returned utterance is a shallow copy of the stored one, carrying
        an ``episode_id`` key: the existing value is kept when the utterance
        already has one, otherwise the id of the containing episode is used.
        The records held in ``self.data`` are left untouched, so calling this
        method does not change what ``save_to_json`` writes.
        Useful for creating RAG chunks.

        Returns:
            List[Dict[str, Any]]: Utterances in episode order, then in their
            original order within each episode.
        """
        all_utterances: List[Dict[str, Any]] = []
        for episode_id, utterances in self.data.items():
            for utterance in utterances:
                # Copy first so tagging never leaks into the stored dataset.
                tagged = dict(utterance)
                tagged.setdefault('episode_id', episode_id)
                all_utterances.append(tagged)
        return all_utterances

    def get_stats(self) -> Dict[str, int]:
        """
        Return basic statistics about the loaded data.

        Returns:
            Dict[str, int]: ``total_episodes`` (number of episode ids) and
            ``total_utterances`` (sum of utterance-list lengths).
        """
        return {
            "total_episodes": len(self.data),
            "total_utterances": sum(len(u) for u in self.data.values())
        }

    def save_to_json(self, output_filename: str) -> None:
        """
        Save the combined data to a JSON file under ``base_path``.

        Failures are reported with a printed message rather than raised.

        Args:
            output_filename (str): Name of the file to write.
        """
        output_path = self.base_path / output_filename
        try:
            print(f"Saving data to {output_path}...")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, indent=4)
            print(f"Successfully saved data to {output_path}")
        except Exception as e:
            print(f"Error saving data: {e}")

In [9]:
# The transcript JSON files live one directory above this notebook.
manager = TranscriptDataManager(base_path="..")

# Dataset splits to merge, in load order (later files win on duplicate episodes).
files_to_load = [
    "train-transcripts-aligned.json",
    "valid-transcripts-aligned.json",
    "test-transcripts-aligned.json",
]
manager.load_data(files_to_load)

# Report how much data ended up in the merged dataset.
stats = manager.get_stats()
print("\nData Loading Complete.")
print(f"Total Episodes: {stats['total_episodes']}")
print(f"Total Utterances: {stats['total_utterances']}")

Loading train-transcripts-aligned.json...
Successfully loaded train-transcripts-aligned.json. Total episodes so far: 593
Loading valid-transcripts-aligned.json...
Successfully loaded valid-transcripts-aligned.json. Total episodes so far: 627
Loading test-transcripts-aligned.json...
Successfully loaded test-transcripts-aligned.json. Total episodes so far: 663

Data Loading Complete.
Total Episodes: 663
Total Utterances: 163808
Successfully loaded train-transcripts-aligned.json. Total episodes so far: 593
Loading valid-transcripts-aligned.json...
Successfully loaded valid-transcripts-aligned.json. Total episodes so far: 627
Loading test-transcripts-aligned.json...
Successfully loaded test-transcripts-aligned.json. Total episodes so far: 663

Data Loading Complete.
Total Episodes: 663
Total Utterances: 163808


In [10]:
# Write the merged transcripts to a single JSON file inside the manager's base_path.
output_filename = "combined_transcripts.json"
manager.save_to_json(output_filename)

Saving data to ../combined_transcripts.json...
Successfully saved data to ../combined_transcripts.json
Successfully saved data to ../combined_transcripts.json


In [11]:
# Dump the alignment triples of the first utterance in episode 'ep-1'.
ep1_data = manager.data['ep-1']
first_utterance = ep1_data[0]

print("Alignments for first utterance of ep-1:")
print(
    json.dumps(
        first_utterance['alignments'],
        indent=4,
    )
)

Alignments for first utterance of ep-1:
[
    [
        0.17,
        0.45000000000000007,
        0
    ],
    [
        0.45000000000000007,
        0.5700000000000001,
        1
    ]
]


In [12]:
# Scan episode 'ep-1' for utterances holding an alignment entry that contains
# both the values 8 and 5.45, and print that utterance's full alignment list.
for i, u in enumerate(manager.data['ep-1']):
    # any() stops at the first matching entry, so each utterance prints at most once.
    has_target = any(
        8 in entry and 5.45 in entry
        for entry in u.get('alignments', [])
    )
    if has_target:
        print(f"Found in utterance index {i}:")
        print(json.dumps(u['alignments'], indent=4))

Found in utterance index 4:
[
    [
        4.45,
        4.65,
        0
    ],
    [
        4.65,
        4.65,
        1
    ],
    [
        4.65,
        4.65,
        2
    ],
    [
        4.65,
        4.8500000000000005,
        3
    ],
    [
        4.8500000000000005,
        4.8500000000000005,
        4
    ],
    [
        4.8500000000000005,
        4.890000000000001,
        5
    ],
    [
        4.890000000000001,
        5.45,
        6
    ],
    [
        5.45,
        5.85,
        8
    ]
]
