In [None]:
import pandas as pd 
import math
import re
import json

# Load the filtered parquet file for this recording using the pyarrow engine.
df = pd.read_parquet("TV-20250105-2032-2400.webm.h264.mp4_filtered.parquet", engine="pyarrow")
# Bare trailing expression so the notebook renders the frame as the cell output.
df

In [None]:
# Load the transcript JSON for this recording.
# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
with open("TV-20250105-2032-2400.webm.h264.mp4.json", "r", encoding="utf-8") as f:
    transcript = json.load(f)

# Bare trailing expression so the notebook renders the loaded object.
transcript

In [None]:
# Exclude the Tagesschau intro: drop the first two transcript entries.
transcript_without_intro = transcript[2:]

In [17]:
def process_transcripts(transcripts, fps=25, frame_window=750):
    """
    Build sentence-aligned text windows from video transcript segments.

    A window is opened at the first segment and at every later segment whose
    text begins with an uppercase letter (treated as a sentence start). Each
    window collects the following segments whose start time lies within
    ``frame_window`` frames of the window's first segment, keeping only
    complete sentences: a trailing run of segments that has not reached a
    sentence-ending punctuation mark (``.``, ``!`` or ``?``) before the window
    closes, or before the transcripts run out, is left out.

    Args:
        transcripts (list): List of transcript dictionaries; each must provide
            a numeric ``'start'`` (seconds) and a ``'text'`` string.
        fps (int): Frames per second of the video
        frame_window (int): Number of frames to look ahead

    Returns:
        list: One dictionary per window that contains at least one complete
        sentence, with the keys ``start_frame``, ``segment_start_time``,
        ``window_end_time``, ``original_segment_text``, ``all_segments``
        (list of ``{'time', 'text'}`` where ``time`` is that segment's own
        start time) and ``combined_text`` (segment texts joined by spaces).
    """
    # Convert frame window to seconds
    time_window = frame_window / fps
    results = []

    # Helper function to check if text starts a sentence
    def is_sentence_start(text):
        # Strip leading whitespace and check if first char is uppercase
        cleaned_text = text.lstrip()
        return bool(cleaned_text and cleaned_text[0].isupper())

    # Helper function to check if text ends a sentence
    def is_sentence_end(text):
        # Check if text ends with common sentence endings
        return bool(re.search(r'[.!?]\s*$', text))

    for i, transcript in enumerate(transcripts):
        # Skip if this segment doesn't start a sentence and isn't the first segment
        if i > 0 and not is_sentence_start(transcript['text']):
            continue

        segment_start = transcript['start']
        start_frame = math.floor(segment_start * fps)
        window_end_time = segment_start + time_window

        # Segments belonging to sentences that were completed inside the window
        segments_in_window = []
        # Segments of the sentence currently being assembled; they are only
        # committed to segments_in_window once the sentence end is seen.
        pending_sentence = []

        # Walk forward by index (no list copy) from the current segment.
        for j in range(i, len(transcripts)):
            next_transcript = transcripts[j]

            # Stop at the first segment that starts after the window. Any
            # pending partial sentence was never committed, so there is
            # nothing to remove from the collected segments.
            if next_transcript['start'] > window_end_time:
                break

            # Record each segment with its own start time.
            pending_sentence.append({
                'time': next_transcript['start'],
                'text': next_transcript['text']
            })

            # If we hit the end of a sentence, commit the whole sentence
            if is_sentence_end(next_transcript['text']):
                segments_in_window.extend(pending_sentence)
                pending_sentence = []

        # Only create result if we have complete sentences
        if segments_in_window:
            results.append({
                'start_frame': start_frame,
                'segment_start_time': segment_start,
                'window_end_time': window_end_time,
                'original_segment_text': transcript['text'],
                'all_segments': segments_in_window,
                'combined_text': ' '.join(
                    segment['text'] for segment in segments_in_window
                )
            })

    return results

In [18]:
def process_transcripts_fixed_windows(transcripts, fps=25, frame_window=750):
    """
    Process video transcripts by splitting them into fixed frame windows.

    Windows are consecutive and non-overlapping: window ``k`` covers frames
    ``k * frame_window`` through ``(k + 1) * frame_window - 1``. A segment is
    assigned to the window whose time range ``[start_time, end_time)``
    contains the segment's ``'start'`` value. Windows without any segment are
    omitted from the result.

    The per-window scan stops at the first segment at or beyond the window's
    end, so transcripts must be ordered by ascending ``'start'`` for every
    segment to be picked up.

    Args:
        transcripts (list): List of transcript dictionaries; each must provide
            a numeric ``'start'`` (seconds) and a ``'text'`` string.
        fps (int): Frames per second of the video
        frame_window (int): Size of each frame window

    Returns:
        list: List of dictionaries containing all segments within each frame
        window (keys ``start_frame``, ``end_frame``, ``window_start_time``,
        ``window_end_time``, ``all_segments``, ``combined_text``). An empty
        list is returned when ``transcripts`` is empty.
    """
    results = []

    # max() raises ValueError on an empty sequence; with no transcripts there
    # are no windows to build.
    if not transcripts:
        return results

    # Find the last timestamp to determine total number of windows needed
    max_time = max(transcript['start'] for transcript in transcripts)
    max_frame = int(max_time * fps)

    # Number of windows needed so that the window containing max_frame is included
    num_windows = (max_frame // frame_window) + 1

    for window_idx in range(num_windows):
        # Calculate window boundaries in frames
        start_frame = window_idx * frame_window
        end_frame = start_frame + frame_window - 1  # -1 because frames are 0-indexed

        # Convert to timestamps
        start_time = start_frame / fps
        end_time = (end_frame + 1) / fps  # +1 to include the full last frame

        # Collect all segments that fall within this window
        segments_in_window = []
        current_text = []

        for transcript in transcripts:
            segment_time = transcript['start']

            # Skip segments before this window
            if segment_time < start_time:
                continue

            # Stop if we've gone beyond this window (relies on ascending order)
            if segment_time >= end_time:
                break

            segments_in_window.append({
                'time': segment_time,
                'text': transcript['text']
            })
            current_text.append(transcript['text'])

        # Only create result if we have segments in this window
        if segments_in_window:
            result = {
                'start_frame': start_frame,
                'end_frame': end_frame,
                'window_start_time': start_time,
                'window_end_time': end_time,
                'all_segments': segments_in_window,
                'combined_text': ' '.join(current_text)
            }
            results.append(result)

    return results

In [None]:
# Build sentence-aligned windows from the transcript without the intro,
# using the function defaults (fps=25, frame_window=750).
processed_transcript = process_transcripts(transcript_without_intro)
# Bare trailing expression so the notebook renders the result.
processed_transcript

In [None]:
# Alternative chunking: fixed, non-overlapping frame windows using the
# function defaults (fps=25, frame_window=750).
# NOTE(review): this rebinds `processed_transcript`, replacing the value
# assigned from process_transcripts() in the preceding cell — confirm which
# variant later cells are meant to use.
processed_transcript = process_transcripts_fixed_windows(transcript_without_intro)
# Bare trailing expression so the notebook renders the result.
processed_transcript