In [None]:
# Key used to look up this podcast's entry in PODCAST_SEGMENTED_DIRS.
PODCAST_NAME = 'lex_fridman'

# Range of episode indices to plot. END_INDEX is inclusive: the episode path
# list is built with range(START_INDEX, END_INDEX + 1).
START_INDEX = 0
END_INDEX = 9

In [1]:
import os
from google import genai
from dotenv import load_dotenv
from pydantic import BaseModel
import pandas as pd
import json
import plotly.express as px
import plotly.graph_objects as go

import sys
from pathlib import Path

# Put the directory three levels above the current working directory on
# sys.path so the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel's working directory is this notebook's
# folder, three levels below the project root - confirm.
project_root = Path.cwd().parent.parent.parent
sys.path.append(str(project_root))

from src.config import PODCAST_TANSCRIBED_PATHS, PODCAST_SEGMENTED_DIRS

Importing DEVELOPMENT config...


In [2]:
def _find_adjacent_overlaps(segments):
    """Collect the character ranges shared by each pair of consecutive segments.

    Args:
        segments: Sequence of segment dicts, each providing numeric
            'start_index' and 'end_index' values.

    Returns:
        A list with one dict per overlapping neighbouring pair, holding
        'segments' (the pair's positions in ``segments``, not their
        'segment_id'), plus the 'start', 'end' and 'length' of the shared range.
    """
    overlaps = []
    for i in range(len(segments) - 1):
        current, following = segments[i], segments[i + 1]
        # The shared range is the intersection of both spans. Using the max of
        # the two starts (rather than the follower's start alone) keeps the
        # result correct when the follower begins before the current segment.
        overlap_start = max(current['start_index'], following['start_index'])
        overlap_end = min(current['end_index'], following['end_index'])
        if overlap_start < overlap_end:
            overlaps.append({
                'segments': (i, i + 1),
                'start': overlap_start,
                'end': overlap_end,
                'length': overlap_end - overlap_start
            })
    return overlaps


def plot_episode_segments_timeline(episode_path):
    """Draw one episode's segments as coloured bars along a character-position axis.

    The JSON file at ``episode_path`` must contain a list of segment dicts with
    the keys 'segment_id', 'start_index', 'end_index', 'character_length',
    'topic_description_llm' and 'segment_text'. Each segment gets its own row;
    ranges shared by two consecutive segments are highlighted with a red band
    spanning every row.

    Args:
        episode_path: Path of the segmented-episode JSON file.

    Returns:
        A ``(fig, overlaps)`` tuple: the Plotly figure, and a list of dicts
        describing each overlap between consecutive segments ('segments' as a
        pair of list positions, 'start', 'end', 'length').
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(episode_path, 'r', encoding='utf-8') as f:
        segments = json.load(f)

    fig = go.Figure()
    n_segments = len(segments)

    y_positions = []
    for i, segment in enumerate(segments):
        y_pos = i
        y_positions.append(y_pos)
        midpoint = (segment['start_index'] + segment['end_index']) / 2

        # Bar covering the segment's character range; hues are spread evenly
        # around the colour wheel so neighbouring rows are distinguishable.
        fig.add_shape(
            type="rect",
            x0=segment['start_index'],
            x1=segment['end_index'],
            y0=y_pos - 0.3,
            y1=y_pos + 0.3,
            fillcolor=f"hsl({i * 360 / n_segments}, 70%, 50%)",
            opacity=0.8,
            line=dict(width=1, color="black")
        )

        # Shapes carry no hover information, so an invisible marker at the
        # centre of the bar provides the tooltip.
        fig.add_trace(go.Scatter(
            x=[midpoint],
            y=[y_pos],
            mode='markers',
            marker=dict(size=0.1, opacity=0),
            hovertemplate=(
                f"<b>{segment['topic_description_llm'][:50]}...</b><br>" +
                f"Start: {segment['start_index']:,}<br>" +
                f"End: {segment['end_index']:,}<br>" +
                f"Length: {segment['character_length']:,}<br>" +
                f"Text: {'<br>'.join(segment['segment_text'][:50].split('. '))}...<br>" +
                "<extra></extra>"
            ),
            showlegend=False
        ))

        # Segment id printed on top of the bar.
        fig.add_annotation(
            x=midpoint,
            y=y_pos,
            text=f"{segment['segment_id']}",
            showarrow=False,
            font=dict(color='black', size=12, family="Arial Black"),
            bgcolor='rgba(255,255,255,0.3)',
            borderpad=4
        )

    overlaps = _find_adjacent_overlaps(segments)
    for overlap in overlaps:
        # Translucent red band over all rows marks the doubly-covered range.
        fig.add_shape(
            type="rect",
            x0=overlap['start'],
            x1=overlap['end'],
            y0=-0.5,
            y1=n_segments - 0.5,
            fillcolor="red",
            opacity=0.2,
            line=dict(width=0)
        )

    fig.update_layout(
        # Plotly renders "<br>" as a line break in text; a bare "\n" is not.
        title=f"Episode Segments Timeline - Character Positions<br>{episode_path}",
        xaxis_title="Character Position",
        yaxis_title="Segment",
        height=max(400, n_segments * 80),
        showlegend=False,
        yaxis=dict(
            tickmode='array',
            tickvals=y_positions,
            ticktext=[f"Seg {s['segment_id']}: {s['topic_description_llm'][:40]}..." for s in segments],
            # Reversed so the first segment is drawn at the top.
            autorange='reversed'
        ),
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='LightGrey'
        ),
        hovermode='closest'
    )

    return fig, overlaps

def plot_episode_segments_multiple_timelines(episode_paths):
    """Show the segment timeline of every episode and report its overlaps.

    For each path the figure built by ``plot_episode_segments_timeline`` is
    displayed; when that call reports overlaps, a short text summary of them is
    printed right after the figure.

    Args:
        episode_paths: Iterable of segmented-episode JSON file paths.
    """
    for path in episode_paths:
        figure, found = plot_episode_segments_timeline(path)
        figure.show()

        # Nothing to report for this episode.
        if not found:
            continue

        print(f"\nFound {len(found)} overlapping segments:")
        for item in found:
            pair = item['segments']
            summary = (
                f"  Segments {pair[0]} and {pair[1]}: "
                f"{item['length']:,} characters overlap "
                f"(positions {item['start']:,} to {item['end']:,})"
            )
            print(summary)

In [None]:
# Look up this podcast's entry in PODCAST_SEGMENTED_DIRS; it is used below as
# the directory that holds the episode_<i>.json files.
input_dir = PODCAST_SEGMENTED_DIRS[PODCAST_NAME]
# One path per episode index; END_INDEX is inclusive, hence the + 1.
episode_paths = [f'{input_dir}/episode_{i}.json' for i in range(START_INDEX, END_INDEX + 1)]

# Display a timeline per episode and print any overlaps reported for it.
plot_episode_segments_multiple_timelines(episode_paths)


Found 1 overlapping segments:
  Segments 18 and 19: 34,361 characters overlap (positions 12,348 to 46,709)
