THIS FILE DOES NOT NEED TO BE RUN BY GRADERS, AND IS ONLY FOR THE VIDEO WALKTHROUGH. 
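This notebook combines the original transcript datasets into a single JSON file, transcripts_full.json, which is the data used for the database. (Note: the save cell in this notebook writes the combined file as combined_transcripts.json.) The combined data can be accessed here: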

https://duke.box.com/s/to632zn4o0pdfezhubtj0kkoc5rfi8kj



In [None]:
import json
from pathlib import Path
from typing import Dict, List, Any


In [None]:
class TranscriptDataManager:
    """
    A class to manage loading, merging, and accessing podcast transcript data.

    The merged dataset lives in ``self.data``: a dictionary whose keys are
    episode ids and whose values are the lists of utterance dictionaries
    loaded for those episodes.
    """
    def __init__(self, base_path: str = ".."):
        """
        Initialize the manager with an empty dataset.

        Args:
            base_path (str): The directory where the JSON files are located
                relative to the notebook. It is also the directory that
                ``save_to_json`` writes into.
        """
        self.base_path = Path(base_path)
        self.data: Dict[str, List[Dict[str, Any]]] = {}

    def load_data(self, filenames: List[str]) -> None:
        """
        Loads multiple JSON transcript files and merges them into a single data source.

        Loading is best-effort: a missing, unreadable, or malformed file is
        reported on stdout and skipped so that the remaining files still load.

        Args:
            filenames (List[str]): List of filenames to load, resolved against
                ``base_path``.
        """
        for filename in filenames:
            file_path = self.base_path / filename
            if not file_path.exists():
                print(f"Warning: File not found: {file_path.resolve()}")
                continue

            try:
                print(f"Loading {filename}...")
                with open(file_path, 'r', encoding='utf-8') as f:
                    chunk = json.load(f)
                # Merge after the file handle is closed; nothing below needs it.
                self._merge_chunk(chunk)
                print(f"Successfully loaded {filename}. Total episodes so far: {len(self.data)}")
            except (OSError, ValueError, TypeError) as e:
                # OSError: the file cannot be read.
                # ValueError: invalid JSON or invalid UTF-8 (JSONDecodeError and
                #   UnicodeDecodeError are both ValueError subclasses).
                # TypeError: the payload is not an episode dictionary.
                # Anything else is a programming error and is left to propagate.
                print(f"Error loading {filename}: {e}")

    def _merge_chunk(self, new_data: Dict[str, List[Dict[str, Any]]]) -> None:
        """
        Merges a new chunk of data into the main dataset.

        Episodes whose id already exists are overwritten by the new chunk;
        the number of such collisions is reported on stdout first.

        Args:
            new_data (Dict): Dictionary of episodes to merge.

        Raises:
            TypeError: If ``new_data`` is not a dictionary (for example when a
                JSON file's top-level value is a list).
        """
        if not isinstance(new_data, dict):
            raise TypeError(
                "expected a dictionary mapping episode ids to utterance lists, "
                f"got {type(new_data).__name__}"
            )

        # Dictionary key views support set operations directly.
        overlap = self.data.keys() & new_data.keys()

        if overlap:
            print(f"Warning: {len(overlap)} episodes overlap and will be overwritten.")

        self.data.update(new_data)

    def get_combined_data(self) -> Dict[str, List[Dict[str, Any]]]:
        """Returns the raw combined dictionary (the live object, not a copy)."""
        return self.data

    def get_all_utterances(self) -> List[Dict[str, Any]]:
        """
        Flattens the data structure to return a list of all utterances across all episodes.
        Useful for creating RAG chunks.

        Every returned utterance carries an ``episode_id`` key: an existing
        value is kept, otherwise the id of the enclosing episode is filled in.
        The returned dictionaries are shallow copies, so tagging them does not
        alter ``self.data`` (and therefore does not leak into ``save_to_json``).

        Returns:
            List[Dict[str, Any]]: Utterances in episode order, then in their
            original order within each episode.
        """
        all_utterances = []
        for episode_id, utterances in self.data.items():
            for utterance in utterances:
                tagged = dict(utterance)
                # setdefault keeps an episode_id that the source data already had.
                tagged.setdefault('episode_id', episode_id)
                all_utterances.append(tagged)
        return all_utterances

    def get_stats(self) -> Dict[str, int]:
        """Returns basic statistics about the loaded data (episode and utterance counts)."""
        return {
            "total_episodes": len(self.data),
            "total_utterances": sum(len(u) for u in self.data.values())
        }

    def save_to_json(self, output_filename: str) -> None:
        """
        Saves the combined data to a JSON file inside ``base_path``.

        Failures are reported on stdout rather than raised.

        Args:
            output_filename (str): The name of the file to save.
        """
        output_path = self.base_path / output_filename
        try:
            print(f"Saving data to {output_path}...")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, indent=4)
            print(f"Successfully saved data to {output_path}")
        except (OSError, TypeError, ValueError) as e:
            # OSError: the file cannot be written.
            # TypeError / ValueError: json.dump met a value it cannot serialise
            #   or a circular reference.
            print(f"Error saving data: {e}")

In [None]:
# Source files to merge, one per dataset split.
files_to_load = [
    "train-transcripts-aligned.json",
    "valid-transcripts-aligned.json",
    "test-transcripts-aligned.json",
]

# base_path=".." makes the manager look for the files in the parent directory.
manager = TranscriptDataManager(base_path="..")
manager.load_data(files_to_load)

# Report how many episodes and utterances ended up in the merged dataset.
stats = manager.get_stats()
print(
    "\nData Loading Complete.",
    f"Total Episodes: {stats['total_episodes']}",
    f"Total Utterances: {stats['total_utterances']}",
    sep="\n",
)

Loading train-transcripts-aligned.json...
Successfully loaded train-transcripts-aligned.json. Total episodes so far: 593
Loading valid-transcripts-aligned.json...
Successfully loaded valid-transcripts-aligned.json. Total episodes so far: 627
Loading test-transcripts-aligned.json...
Successfully loaded test-transcripts-aligned.json. Total episodes so far: 663

Data Loading Complete.
Total Episodes: 663
Total Utterances: 163808
Successfully loaded train-transcripts-aligned.json. Total episodes so far: 593
Loading valid-transcripts-aligned.json...
Successfully loaded valid-transcripts-aligned.json. Total episodes so far: 627
Loading test-transcripts-aligned.json...
Successfully loaded test-transcripts-aligned.json. Total episodes so far: 663

Data Loading Complete.
Total Episodes: 663
Total Utterances: 163808


In [None]:
# Write the merged dataset to disk; save_to_json resolves the name against
# manager.base_path.
# NOTE(review): the notebook introduction refers to the final dataset as
# transcripts_full.json, while this cell writes combined_transcripts.json —
# confirm whether the file is renamed afterwards.
output_filename = "combined_transcripts.json"
manager.save_to_json(output_filename=output_filename)

Saving data to ../combined_transcripts.json...
Successfully saved data to ../combined_transcripts.json
Successfully saved data to ../combined_transcripts.json
