In [None]:
# Key used to look up this podcast's paths in the project config dictionaries.
PODCAST_NAME = 'gradient_perspectives'

# Episode positions to process; END_INDEX is inclusive (the code slices up to END_INDEX + 1).
START_INDEX, END_INDEX = 0, 146

# Transcripts are truncated to this many characters before being sent to the LLM.
CHARACTER_LIMIT = 100_000

# Inclusive bounds on segment length (in characters) applied by the length filter.
MIN_CHARACTERS, MAX_CHARACTERS = 300, 10_000

In [9]:
import os
import sys
import time
from pathlib import Path

# Put the directory three levels above the current working directory on sys.path
# so the `src` package imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory is exactly three levels
# below the project root — confirm when moving the notebook.
project_root = Path.cwd().parent.parent.parent
sys.path.append(str(project_root))

from src.config import PODCAST_TANSCRIBED_PATHS, PODCAST_SEGMENTED_DIRS
from google import genai
from dotenv import load_dotenv
from pydantic import BaseModel
import pandas as pd
import json
import plotly.express as px
import plotly.graph_objects as go

load_dotenv()

True

In [10]:
# First-pass prompt: asks the model for a chronological JSON list of topic labels.
# The prompt deliberately ends with an opening ''' — the caller appends the transcript
# and the closing ''' (see SegmentTopicsLLM._extract_topics).
# NOTE(review): the JSON example below contains a trailing comma before `}`, which is
# not valid strict JSON; the model may mirror it — confirm whether this is intended.
PROMPT_TOPIC_LIST = """
You are a state-of-the-art podcast transcript analyzer.
Your task is to divide the ENTIRE transcript into 5-40 consecutive NON-OVERLAPPING segments based on topics.
The segments should contain 10-20 sentences and represent self-contained topics.
Commercial breaks should be labeled as a seperate segment.
Your only output should be a chronological list of topic labels (3-5 words each).

For each of the topics provide the following:
[
{
"topic_description": "[topic_label]",
},
{
"topic_description": "[topic_label]",
}
]

PODCAST TRANSCRIPT: '''
"""


# Second-pass prompt: embeds the first prompt verbatim for context and asks the model to
# add the first and last five words of every topic so segments can be located in the text.
# This is an f-string, so the doubled braces {{ }} render as literal { } in the prompt.
# NOTE(review): the embedded PROMPT_TOPIC_LIST ends with an unclosed ''' inside the
# *** ... *** markers, and the JSON example again has trailing commas — verify that
# neither confuses the model.
PROMPT_ADD_FIRST_LAST_5_WORDS = f"""
You are adding two fields to the previous step in the process. 

The previous prompt was: *** {PROMPT_TOPIC_LIST} ***

YOUR GOAL: For each of the topics provide the following:
[
{{
"topic_description": "[topic_label]",
"first_5_words_topic": "[exact 5 words from transcript]",
"last_5_words_topic": "[exact 5 words from transcript]",
}},
{{
"topic_description": "[topic_label]",
"first_5_words_topic": "[exact 5 words from transcript]",
"last_5_words_topic": "[exact 5 words from transcript]",
}}
]
Note: 
- the first 5 words should be the beginning of a sentence and the last 5 words the ending of a sentence.
- do not change given topic labels.

"""

In [None]:
class SegmentTopicsLLM:
    """
    Creates formatted segmented topics and adds boundaries using an episode text as input.

    The work is done in two LLM calls: the first produces a list of topic labels,
    the second adds the first/last five words of every topic.
    Format of the final response text (JSON):
    [{
         "topic_description": "[topic_label]",
         "first_5_words_topic": "[exact 5 words from transcript]",
         "last_5_words_topic": "[exact 5 words from transcript]",
     }]
    """

    def __init__(self, api_key=None, client=None):
        """
        Args:
            api_key: API key for the Gemini API; falls back to the GOOGLE_API_KEY
                environment variable when omitted.
            client: optional pre-built client exposing `models.generate_content(...)`.
                When omitted, a `genai.Client` is created from the API key.
        """
        self.api_key = api_key or os.getenv("GOOGLE_API_KEY")
        self.model = "gemini-2.5-flash"
        # _call_api talks to self.client, so it has to exist before the first request;
        # previously it was never created and every call failed with AttributeError.
        self.client = client if client is not None else genai.Client(api_key=self.api_key)

    def _call_api(self, prompt):
        """Send `prompt` to the model and return the raw response object."""
        request_config = {
            # temperature 0 keeps the segmentation as repeatable as the API allows
            "temperature": 0.0,
            "response_mime_type": "application/json",
            "thinking_config": {
                "thinking_budget": 0
            },
        }
        return self.client.models.generate_content(
            model=self.model,
            contents=prompt,
            config=request_config,
        )

    def segment_episode(self, episode_text, print_diagnostics=False):
        """
        Run both LLM passes for one episode.

        Args:
            episode_text: transcript text of the episode.
            print_diagnostics: when true, print each raw response and its token usage.

        Returns:
            The response object of the second pass; its `.text` holds the JSON list
            of topics including their first/last five words.
        """
        topic_segments = self._extract_topics(episode_text)
        if print_diagnostics:
            self._print_results(topic_segments, "creating topic segments")

        topics_with_boundaries = self._add_boundaries(topic_segments.text, episode_text)
        if print_diagnostics:
            self._print_results(topics_with_boundaries, "addding first and last 5 words to segments")

        return topics_with_boundaries

    def _extract_topics(self, episode_text):
        """First pass: ask for the chronological list of topic labels."""
        # PROMPT_TOPIC_LIST ends with an opening ''' which is closed here after the transcript.
        prompt = f"{PROMPT_TOPIC_LIST}{episode_text}'''"
        return self._call_api(prompt)

    def _add_boundaries(self, topic_descriptions, episode_text):
        """Second pass: ask for the first/last five words of every topic from the first pass."""
        prompt = f"{PROMPT_ADD_FIRST_LAST_5_WORDS} TOPICS: {topic_descriptions} EPISODE TEXT: {episode_text}"
        return self._call_api(prompt)

    def _print_results(self, response, action_name):
        """Print the raw response, its total token count and its text for inspection."""
        print(response)
        print(f"Tokens used for {action_name}: {response.usage_metadata.total_token_count}")
        print(response.text)
        print("")


In [12]:
class SegmentProcessor:
    """Turns the LLM's JSON topic list into concrete text segments and stores them."""

    @staticmethod
    def _find_segment_end(episode_text, segments, segment_idx, search_from, last_5_words):
        """
        Return the exclusive end offset of segment `segment_idx`.

        Resolution order:
        1. the end of `last_5_words`, if it occurs at or after `search_from`;
        2. the start of the next segment's first five words, if present and found;
        3. the end of the episode text.
        """
        if last_5_words:
            marker_pos = episode_text.find(last_5_words, search_from)
            if marker_pos != -1:
                return marker_pos + len(last_5_words)

        if segment_idx < len(segments) - 1:
            next_first_5_words = segments[segment_idx + 1].get('first_5_words_topic', '')
            # An empty marker must not be searched for: str.find('') matches immediately
            # at `search_from`, which would cut the segment down to its first five words.
            if next_first_5_words:
                next_start = episode_text.find(next_first_5_words, search_from)
                if next_start != -1:
                    return next_start

        return len(episode_text)

    @staticmethod
    def process_segments(segments_json, episode_text, episode_data, episode_idx, print_diagnostics):
        """
        Locate every LLM-described topic in the transcript and build segment records.

        Args:
            segments_json: JSON text; expected to be a list of objects with the keys
                'topic_description', 'first_5_words_topic' and 'last_5_words_topic'.
            episode_text: the transcript text the markers are searched in.
            episode_data: mapping providing 'podcast_title', 'episode_title' and 'date'.
            episode_idx: identifier stored as 'episode_id' in every record.
            print_diagnostics: when true, print why an episode or segment was skipped.

        Returns:
            A list of dicts, one per segment whose first five words were found in the
            text. Segments without first-five-words are skipped silently; segments whose
            first five words cannot be found are skipped with a warning. Unparseable or
            unexpectedly shaped JSON yields an empty list.
        """
        try:
            segments = json.loads(segments_json)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a missing response text (None) as well as malformed JSON.
            if print_diagnostics:
                print(f"Skipping episode {episode_idx} due to JSON decode error")
            return []

        # Valid JSON that is not a list of objects cannot be interpreted as segments.
        if not isinstance(segments, list) or not all(isinstance(item, dict) for item in segments):
            if print_diagnostics:
                print(f"Skipping episode {episode_idx} due to unexpected JSON structure")
            return []

        processed_segments = []

        for segment_idx, segment in enumerate(segments):
            first_5_words = segment.get('first_5_words_topic', '')
            last_5_words = segment.get('last_5_words_topic', '')

            if not first_5_words:
                continue

            start_pos = episode_text.find(first_5_words)
            if start_pos == -1:
                if print_diagnostics:
                    print(f"WARNING: Could not find '{first_5_words}' for segment {segment_idx}: {segment.get('topic_description')}")
                continue

            # The end marker is only searched after the start marker so that a segment
            # can never end before it begins.
            end_pos = SegmentProcessor._find_segment_end(
                episode_text,
                segments,
                segment_idx,
                start_pos + len(first_5_words),
                last_5_words,
            )

            processed_segments.append({
                'episode_id': episode_idx,
                'podcast_title': episode_data['podcast_title'],
                'episode_title': episode_data['episode_title'],
                'date': pd.to_datetime(episode_data['date']).strftime('%Y-%m-%d'),
                'segment_id': segment_idx,
                'topic_description_llm': segment.get('topic_description', ''),
                'first_5_words': first_5_words,
                'start_index': start_pos,
                'end_index': end_pos,
                'character_length': end_pos - start_pos,
                'segment_text': episode_text[start_pos:end_pos]
            })

        return processed_segments

    @staticmethod
    def save_segments(segments, output_dir, episode_idx):
        """Write `segments` as JSON to `<output_dir>/episode_<episode_idx>.json`, creating the directory if needed."""
        os.makedirs(output_dir, exist_ok=True)
        with open(f'{output_dir}/episode_{episode_idx}.json', 'w') as f:
            json.dump(segments, f, indent=2)



In [None]:
def segment_episodes_batch(PODCAST_NAME, START_INDEX, END_INDEX, CHARACTER_LIMIT, print_diagnostics=True, trottle_for_free_api=True):
    """
    Segment a range of transcribed episodes with the LLM and save one JSON file per episode.

    Args:
        PODCAST_NAME: key into PODCAST_TANSCRIBED_PATHS / PODCAST_SEGMENTED_DIRS.
        START_INDEX: position of the first episode to process.
        END_INDEX: position of the last episode to process (inclusive).
        CHARACTER_LIMIT: transcripts are truncated to this many characters.
        print_diagnostics: when true, print the first input row, transcript lengths
            and the raw LLM responses.
        trottle_for_free_api: when true, wait 60 seconds between episodes that
            actually call the API.

    Episodes whose output file already exists are skipped, so an interrupted run can
    be resumed by calling the function again.
    """
    throttle_seconds = 60

    input_file_path = PODCAST_TANSCRIBED_PATHS[PODCAST_NAME]
    output_dir = PODCAST_SEGMENTED_DIRS[PODCAST_NAME]

    df = pd.read_parquet(input_file_path)
    # Slice once; the selection is reused for the counts and for the loop.
    selected_episodes = df.iloc[START_INDEX:END_INDEX + 1]
    total_episodes = len(selected_episodes)

    print(f"Starting segmentation of {total_episodes} podcast episodes...")

    # The header belongs to the diagnostic dump, so it is only printed together with it.
    if print_diagnostics and total_episodes > 0:
        print("Input data of first episode in selection:")
        print(selected_episodes.iloc[0])

    segmentor = SegmentTopicsLLM()
    processor = SegmentProcessor()
    api_called = False

    for position, (idx, episode) in enumerate(selected_episodes.iterrows(), start=1):

        # Output files are named after the DataFrame index label of the episode.
        if os.path.exists(f'{output_dir}/episode_{idx}.json'):
            print(f"Skipping already processed episode {idx}")
            continue

        # Throttle *between* API-calling episodes only: no wait before the first
        # request and no pointless wait after the last one.
        if trottle_for_free_api and api_called:
            time.sleep(throttle_seconds)

        episode_text = episode['transcript'][:CHARACTER_LIMIT]

        print(f"Processing episode {position} of {total_episodes}, #{idx} {episode['podcast_title']}, episode: {episode['episode_title']}...")
        if print_diagnostics:
            print(f'Character length of episode: {len(episode_text)}\n')

        api_called = True
        topics_with_boundaries = segmentor.segment_episode(episode_text, print_diagnostics).text

        processed_segments = processor.process_segments(topics_with_boundaries, episode_text, episode, idx, print_diagnostics)

        # NOTE(review): an episode whose response could not be parsed is saved as an
        # empty list and will therefore be skipped on the next run — confirm this is wanted.
        processor.save_segments(processed_segments, output_dir, idx)

    print("Segmentation completed.")


In [None]:
def filter_segments_by_length(PODCAST_NAME, START_INDEX, END_INDEX, MIN_CHARACTERS, MAX_CHARACTERS):
    """
    Drop segments whose length is outside [MIN_CHARACTERS, MAX_CHARACTERS] from saved episode files.

    For every episode number from START_INDEX to END_INDEX (inclusive) the matching
    `episode_<n>.json` in the podcast's segment directory is read, filtered on
    'character_length' and written back to the same path. Missing files are ignored.
    A one-line summary is printed per processed file.
    """
    segment_dir = PODCAST_SEGMENTED_DIRS[PODCAST_NAME]

    for episode_idx in range(START_INDEX, END_INDEX + 1):
        file_path = f'{segment_dir}/episode_{episode_idx}.json'

        # Nothing to filter for episodes that were never segmented.
        if not os.path.exists(file_path):
            continue

        with open(file_path, 'r') as source_file:
            all_segments = json.load(source_file)

        kept_segments = []
        for candidate in all_segments:
            if not (MIN_CHARACTERS <= candidate['character_length'] <= MAX_CHARACTERS):
                continue
            kept_segments.append(candidate)

        # The file is overwritten in place with the surviving segments.
        with open(file_path, 'w') as target_file:
            json.dump(kept_segments, target_file, indent=2)

        print(f"Episode {episode_idx}: Kept {len(kept_segments)} of {len(all_segments)} segments")

In [None]:
# Run the LLM segmentation for the configured episode range; already-saved episodes are skipped.
segment_episodes_batch(
    PODCAST_NAME, START_INDEX, END_INDEX, CHARACTER_LIMIT,
    print_diagnostics=False,
    trottle_for_free_api=True,
)


Starting segmentation of 147 podcast episodes...
Input data of first episode in selection:
Processing episode 1 of 147, #0 The_Gradient_Perspectives_on_AI, episode: Lt. General Jack Shanahan: AI in the DoD, Project Maven, and Bridging the Tech-DoD Gap...
Processing episode 2 of 147, #1 The_Gradient_Perspectives_on_AI, episode: Laura Weidinger: Ethical Risks, Harms, and Alignment of Large Language Models...
Processing episode 3 of 147, #2 The_Gradient_Perspectives_on_AI, episode: Richard Socher: Re-Imagining Search...
Processing episode 4 of 147, #3 The_Gradient_Perspectives_on_AI, episode: Joel Simon on AI art and Artbreeder...
Processing episode 5 of 147, #4 The_Gradient_Perspectives_on_AI, episode: Russ Maschmeyer: Spatial Commerce and AI in Retail...
Processing episode 6 of 147, #5 The_Gradient_Perspectives_on_AI, episode: Seth Lazar: Normative Philosophy of Computing...
Processing episode 7 of 147, #6 The_Gradient_Perspectives_on_AI, episode: Vivek Natarajan: Towards Biomedical AI.

In [None]:

# Apply the character-length filter to the saved JSON files (rewrites them in place).
filter_segments_by_length(
    PODCAST_NAME,
    START_INDEX,
    END_INDEX,
    MIN_CHARACTERS,
    MAX_CHARACTERS,
)

Episode 0: Kept 12 of 12 segments
Episode 1: Kept 17 of 17 segments
Episode 2: Kept 9 of 13 segments
Episode 3: Kept 13 of 14 segments
Episode 4: Kept 18 of 18 segments
Episode 5: Kept 17 of 19 segments
Episode 6: Kept 23 of 23 segments
Episode 7: Kept 14 of 17 segments
Episode 8: Kept 8 of 11 segments
Episode 9: Kept 10 of 11 segments
Episode 10: Kept 12 of 15 segments
Episode 11: Kept 12 of 13 segments
Episode 12: Kept 12 of 14 segments
Episode 13: Kept 14 of 17 segments
Episode 14: Kept 42 of 42 segments
Episode 15: Kept 19 of 20 segments
Episode 16: Kept 26 of 28 segments
Episode 17: Kept 8 of 9 segments
Episode 18: Kept 17 of 21 segments
Episode 19: Kept 20 of 20 segments
Episode 20: Kept 28 of 28 segments
Episode 21: Kept 21 of 24 segments
Episode 22: Kept 19 of 24 segments
Episode 23: Kept 13 of 14 segments
Episode 24: Kept 15 of 17 segments
Episode 25: Kept 12 of 13 segments
Episode 26: Kept 21 of 23 segments
Episode 27: Kept 12 of 14 segments
Episode 28: Kept 22 of 25 segments