# Transcribing and processing Harris 2024 event speeches

## dependencies 
 


In [8]:
!pip install yt-dlp whisper-timestamped pandas tqdm



- ffmpeg installation on machine is required for yt-dlp

In [None]:
!pip install tensorflow==2.13.0 tf-keras

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Collecting transformers<5.0.0,>=4.41.0
  Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Collecting huggingface-hub>=0.20.0
  Using cached huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Installing collected packages: huggingface-hub, transformers, sentence-transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.16.4
    Uninstalling huggingface-hub-0.16.4:
      Successfully uninstalled huggingface-hub-0.16.4
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.0
    Uninstalling transformers-4.30.0:
      Successfully uninstalled transformers-4.30.0
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.2.2
    Uninstalling sentence-transformers-2.2.2:
      Successfully uninstalled sentence-transformers-2.2.2
Successfully installed huggingfa



## Script to transcribe and Chunk 

In [7]:
import yt_dlp
import whisper_timestamped
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer, util


def download_audio(youtube_url, start_time, duration=None):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'outtmpl': 'audio',
        'ffmpeg_location': r"C:\Users\nicho\Documents\ffmpeg-master-latest-win64-gpl-shared\bin"  
    }

    if duration:
        ydl_opts['download_ranges'] = lambda info: [[start_time, start_time + duration]]
        ydl_opts['force_keyframes_at_cuts'] = True

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

def transcribe_audio(start_time_seconds):
    model = whisper_timestamped.load_model("medium")
    audio_path = "audio.wav"

    result = whisper_timestamped.transcribe(model, audio_path)

    for segment in result['segments']:
        # Adjust timestamps relative to the start time
        segment['start'] += start_time_seconds
        segment['end'] += start_time_seconds
        print(f"[{segment['start']:.2f}s -> {segment['end']:.2f}s] {segment['text']}")


def new_transcribe_audio(start_time_seconds, video_id=None, event_date=None, location=None):
    """
    Transcribe audio and return formatted data ready for DataFrame insertion

    Parameters:
    - start_time_seconds: Offset to add to all timestamps
    - video_id: Optional YouTube video ID for reference
    - event_date: Optional date of the rally
    - location: Optional location of the rally

    Returns:
    - Dictionary with transcription data and metadata
    - List of segment dictionaries for adding to DataFrame rows
    """
    model = whisper_timestamped.load_model("small")
    audio_path = "audio.wav"

    result = whisper_timestamped.transcribe(model, audio_path)

    # Adjust timestamps relative to the start time
    for segment in result['segments']:
        segment['start'] += start_time_seconds
        segment['end'] += start_time_seconds

    # Create a clean formatted transcript with timestamps
    formatted_transcript = "\n".join([
        f"[{segment['start']:.2f}s -> {segment['end']:.2f}s] {segment['text']}"
        for segment in result['segments']
    ])

    # Create a clean text-only transcript
    full_text = " ".join([segment['text'] for segment in result['segments']])

    # Create metadata dictionary
    transcript_data = {
        "video_id": video_id,
        "event_date": event_date,
        "location": location,
        "start_time": start_time_seconds,
        "segments": result['segments'],
        "formatted_transcript": formatted_transcript,
        "full_text": full_text
    }

    # Create a list of segments for row-by-row DataFrame insertion
    segment_rows = []
    for segment in result['segments']:
        segment_rows.append({
            "video_id": video_id,
            "event_date": event_date,
            "location": location,
            "start_time": segment['start'],
            "end_time": segment['end'],
            "duration": segment['end'] - segment['start'],
            "text": segment['text']
        })

    return transcript_data, segment_rows


def clean_transcript(text, min_repeat_threshold=3, min_word_length=2):

    """
    Clean transcript text by removing repetitive chants, audience responses, and noise.

    Parameters:
    - text (str): The transcript text to clean
    - min_repeat_threshold (int): Minimum repeats to consider as a chant
    - min_word_length (int): Minimum word length to consider for repetition analysis

    Returns:
    - str: Cleaned transcript text
    """

    if not text or not isinstance(text, str):
        return ""

    # Split into lines for processing
    lines = text.split('\n')
    cleaned_lines = []

    # Process each line
    for line in lines:
        # Skip lines that are entirely timestamps
        if re.match(r'^\[\d+\.\d+s -> \d+\.\d+s\]\s*$', line):
            continue

        # Extract text from timestamped lines
        timestamp_match = re.match(r'^\[\d+\.\d+s -> \d+\.\d+s\]\s*(.*)', line)
        if timestamp_match:
            line = timestamp_match.group(1)

        # Skip empty lines
        if not line.strip():
            continue

        # Check for repetitive words in the line (chants)
        words = re.findall(r'\b\w+\b', line.lower())
        word_counts = Counter([w for w in words if len(w) >= min_word_length])

        # Skip lines that are repetitive chants
        if any(count >= min_repeat_threshold for word, count in word_counts.items()):
            most_common = word_counts.most_common(1)
            if most_common and most_common[0][1] / len(words) > 0.4:  # If >40% is the same word
                # It's likely a chant - check if it's audience response
                chant_word = most_common[0][0]
                audience_responses = ['thank', 'trump', 'yes', 'applause', 'clap', 'cheer']
                if chant_word in audience_responses:
                    continue

        # Remove sections of thank you repeats
        line = re.sub(r'(thank you,?\s*){3,}', 'thank you ', line, flags=re.IGNORECASE)

        # Add to cleaned lines if it has meaningful content
        if len(line.strip()) > 0:
            cleaned_lines.append(line.strip())

    # Final cleaning - consolidate sequential short audience responses
    final_lines = []
    skip_next = False

    for i in range(len(cleaned_lines)):
        if skip_next:
            skip_next = False
            continue

        # Check if this is a very short line followed by another short line (likely audience response)
        if i < len(cleaned_lines) - 1:
            curr_line = cleaned_lines[i]
            next_line = cleaned_lines[i + 1]

            if len(curr_line.split()) <= 3 and len(next_line.split()) <= 3:
                if re.search(r'\b(yes|no|applause|thank)\b', curr_line.lower()) and \
                   re.search(r'\b(yes|no|applause|thank)\b', next_line.lower()):
                    # Skip both lines - they're likely audience responses
                    skip_next = True
                    continue

        final_lines.append(cleaned_lines[i])

    # Join the cleaned lines
    cleaned_text = ' '.join(final_lines)

    # Final regex cleanups
    # Remove duplicate Trump mentions
    cleaned_text = re.sub(r'(Trump\s*){3,}', 'Trump ', cleaned_text)

    # Remove "we will" / "we are" repetitions
    cleaned_text = re.sub(r'(we will\s*){2,}', 'we will ', cleaned_text)
    cleaned_text = re.sub(r'(we are\s*){2,}', 'we are ', cleaned_text)

    # Remove "we are not going back" repetitions
    cleaned_text = re.sub(r'(we are not going back\.*\s*){2,}', 'we are not going back. ', cleaned_text)

    # Clean up any resulting double spaces
    cleaned_text = re.sub(r'\s{2,}', ' ', cleaned_text)

    return cleaned_text.strip()

def test_chunker (speech_text, threshold=0.6, min_sentences=5, target_chunk_size=2000, max_chunk_size=4000):
    """
    Create semantic chunks from campaign speech text for pentad analysis,
    optimized for LLM processing with larger chunk sizes.

    Args:
        speech_text (str): Full campaign speech text
        threshold (float): Cosine similarity threshold (0-1)
        min_sentences (int): Minimum sentences before considering a chunk complete
        target_chunk_size (int): Target character size for chunks (optimal for LLM)
        max_chunk_size (int): Maximum character size for chunks (LLM constraint)

    Returns:
        list: List of semantically coherent text chunks
    """
    # Simple sentence splitting
    sentences = [s.strip() for s in speech_text.replace('\n', ' ').split('. ')]
    sentences = [s + '.' if not s.endswith('.') else s for s in sentences if s]

    # Load model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for all sentences
    embeddings = model.encode(sentences)

    # Initialize variables
    chunks = []
    current_chunk = []
    current_chunk_embeddings = []
    current_chunk_chars = 0

    for i, sentence in enumerate(sentences):
        sentence_chars = len(sentence)

        if not current_chunk:
            # Start a new chunk
            current_chunk.append(sentence)
            current_chunk_embeddings.append(embeddings[i])
            current_chunk_chars += sentence_chars
        else:
            # Calculate similarity with the average of current chunk
            avg_embedding = np.mean(current_chunk_embeddings, axis=0)
            similarity = util.pytorch_cos_sim(embeddings[i], avg_embedding).item()

            # Logic for adding sentences to chunks based on multiple criteria:
            # 1. If very similar and below max size - add to current chunk
            # 2. If reached target size with enough sentences - start new chunk
            # 3. If would exceed max size - force start new chunk

            if (similarity >= threshold and
                current_chunk_chars + sentence_chars <= max_chunk_size):
                # Add to current chunk if similar enough and within size limits
                current_chunk.append(sentence)
                current_chunk_embeddings.append(embeddings[i])
                current_chunk_chars += sentence_chars
            elif (len(current_chunk) >= min_sentences and
                  current_chunk_chars >= target_chunk_size):
                # Start new chunk if current chunk is big enough
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_chunk_embeddings = [embeddings[i]]
                current_chunk_chars = sentence_chars
            elif current_chunk_chars + sentence_chars > max_chunk_size:
                # Force start new chunk if would exceed max size
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_chunk_embeddings = [embeddings[i]]
                current_chunk_chars = sentence_chars
            else:
                # Add to current chunk even if less similar
                current_chunk.append(sentence)
                current_chunk_embeddings.append(embeddings[i])
                current_chunk_chars += sentence_chars

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # Post-processing: Combine consecutive chunks if they're small
    i = 0
    while i < len(chunks) - 1:
        combined_size = len(chunks[i]) + len(chunks[i+1]) + 1  # +1 for space
        if combined_size <= target_chunk_size:
            chunks[i] = chunks[i] + ' ' + chunks[i+1]
            chunks.pop(i+1)
        else:
            i += 1

    return chunks

### script to transcribe single speaking event saved in dataframe

In [8]:
import pandas as pd

def process_event(row):
    """Process a single event from your DataFrame"""
    if pd.isna(row['Link']) or 'youtube.com' not in row['Link']:
        return pd.DataFrame()

    # Convert time to seconds
    start_seconds = pd.to_timedelta(row['Start Time']).total_seconds() if row['Start Time'] else 0
    end_seconds = pd.to_timedelta(row['End Time']).total_seconds() if row['End Time'] else None
    duration = end_seconds - start_seconds if end_seconds else None

    # Download and transcribe
    download_audio(row['Link'], start_seconds)
    transcript_data, _ = new_transcribe_audio(start_seconds)

    # Clean and chunk
    cleaned_text = clean_transcript(transcript_data['full_text'])
    chunks = test_chunker(cleaned_text)

    # Create rows for DataFrame
    data = []

    # Add full text
    data.append({
        'date': row['Date'],
        'type': row['Type'],
        'description': row['Description'],
        'text_type': 'full',
        'text': cleaned_text,
        'chunk_id': 'full',
        'words': len(cleaned_text.split())
    })

    # Add chunks
    for i, chunk in enumerate(chunks):
        data.append({
            'date': row['Date'],
            'type': row['Type'],
            'description': row['Description'],
            'text_type': 'chunk',
            'text': chunk,
            'chunk_id': f'chunk_{i+1}',
            'words': len(chunk.split())
        })

    return pd.DataFrame(data)


## Processing 

loading campaign event data and processing it 

In [9]:
import os
import pandas as pd

# Get the current directory where your script is located
current_dir = os.getcwd()

# Create the full path to your CSV file
path = os.path.join(current_dir, 'Kamala Acvtivity.csv')

events_df = pd.read_csv(path)

events_df = events_df.dropna(subset='Link')
events_df = events_df.rename(columns = {'Start Time ': 'Start Time', 'Description ': 'Description'})


In [None]:
#check df is loaded correctly 
len(events_df)

71

In [None]:
# Process multiple events
results = []

for idx, row in events_df.iterrows():
    try:
        result = process_event(row)
        if not result.empty:
            results.append(result)
            print(f"Processed: {row['Date']} - {row['Type']}")
    except Exception as e:
        print(f"Error: {e}")

# Combine all results
analysis_df = pd.concat(results, ignore_index=True)

# Save to CSV
analysis_df.to_csv('pentad_analysis_data.csv', index=False)

# Quick view
print(f"Total events processed: {len(analysis_df[analysis_df['text_type']=='full'])}")
print(f"Total chunks created: {len(analysis_df[analysis_df['text_type']=='chunk'])}")


[youtube] Extracting URL: https://www.youtube.com/watch?v=JPQpRRPT5BU
[youtube] JPQpRRPT5BU: Downloading webpage
[youtube] JPQpRRPT5BU: Downloading tv client config
[youtube] JPQpRRPT5BU: Downloading player 9a279502-main
[youtube] JPQpRRPT5BU: Downloading tv player API JSON
[youtube] JPQpRRPT5BU: Downloading ios player API JSON
[youtube] JPQpRRPT5BU: Downloading m3u8 information
[info] JPQpRRPT5BU: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   17.27MiB in 00:00:00 at 28.40MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 108952/108952 [01:22<00:00, 1319.54frames/s]


Processed: 2024-07-23 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=_lpYc-Ww8j4
[youtube] _lpYc-Ww8j4: Downloading webpage
[youtube] _lpYc-Ww8j4: Downloading tv client config
[youtube] _lpYc-Ww8j4: Downloading player 9a279502-main
[youtube] _lpYc-Ww8j4: Downloading tv player API JSON
[youtube] _lpYc-Ww8j4: Downloading ios player API JSON
[youtube] _lpYc-Ww8j4: Downloading m3u8 information
[info] _lpYc-Ww8j4: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   18.98MiB in 00:00:01 at 14.99MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 98%|█████████▊| 121000/123077 [01:11<00:01, 1148.23frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
100%|██████████| 123077/123077 [01:15<00:00, 1627.27frames/s]


Processed: 2024-07-30 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=_VNi5Pe7zZM&ab_channel=TheBidenWhiteHouse
[youtube] _VNi5Pe7zZM: Downloading webpage
[youtube] _VNi5Pe7zZM: Downloading tv client config
[youtube] _VNi5Pe7zZM: Downloading player 9a279502-main
[youtube] _VNi5Pe7zZM: Downloading tv player API JSON
[youtube] _VNi5Pe7zZM: Downloading ios player API JSON
[youtube] _VNi5Pe7zZM: Downloading m3u8 information
[info] _VNi5Pe7zZM: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   11.95MiB in 00:00:01 at 11.84MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 99256/99256 [01:00<00:00, 1634.91frames/s]


Processed: 2024-07-31 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=2dU9oEGJ4zM&ab_channel=WFAA
[youtube] 2dU9oEGJ4zM: Downloading webpage
[youtube] 2dU9oEGJ4zM: Downloading tv client config
[youtube] 2dU9oEGJ4zM: Downloading player 9a279502-main
[youtube] 2dU9oEGJ4zM: Downloading tv player API JSON
[youtube] 2dU9oEGJ4zM: Downloading ios player API JSON
[youtube] 2dU9oEGJ4zM: Downloading m3u8 information
[info] 2dU9oEGJ4zM: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   18.70MiB in 00:00:03 at 6.12MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 157574/157574 [04:34<00:00, 574.21frames/s]


Processed: 2024-08-07 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=5_xPGvP62-A
[youtube] 5_xPGvP62-A: Downloading webpage
[youtube] 5_xPGvP62-A: Downloading tv client config
[youtube] 5_xPGvP62-A: Downloading player 9a279502-main
[youtube] 5_xPGvP62-A: Downloading tv player API JSON
[youtube] 5_xPGvP62-A: Downloading ios player API JSON
[youtube] 5_xPGvP62-A: Downloading m3u8 information
[info] 5_xPGvP62-A: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   20.82MiB in 00:00:01 at 10.68MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|█████████▉| 137800/137957 [04:50<00:00, 451.24frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio bounda

Processed: 2024-08-07 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=07A6iRmiixk&ab_channel=WFAA
[youtube] 07A6iRmiixk: Downloading webpage
[youtube] 07A6iRmiixk: Downloading tv client config
[youtube] 07A6iRmiixk: Downloading player 9a279502-main
[youtube] 07A6iRmiixk: Downloading tv player API JSON
[youtube] 07A6iRmiixk: Downloading ios player API JSON
[youtube] 07A6iRmiixk: Downloading m3u8 information
[info] 07A6iRmiixk: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   18.87MiB in 00:00:03 at 6.05MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 147354/147354 [01:41<00:00, 1451.66frames/s]


Processed: 2024-08-08 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=H8rizxQqQR0&ab_channel=ABC15Arizona
[youtube] H8rizxQqQR0: Downloading webpage
[youtube] H8rizxQqQR0: Downloading tv client config
[youtube] H8rizxQqQR0: Downloading player 9a279502-main
[youtube] H8rizxQqQR0: Downloading tv player API JSON
[youtube] H8rizxQqQR0: Downloading ios player API JSON
[youtube] H8rizxQqQR0: Downloading m3u8 information
[info] H8rizxQqQR0: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of  114.54MiB in 00:00:20 at 5.72MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|█████████▉| 876407/879407 [07:46<00:01, 1880.60frames/s] 


Processed: 2024-08-09 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=GLYhhwf-4yo
[youtube] GLYhhwf-4yo: Downloading webpage
[youtube] GLYhhwf-4yo: Downloading tv client config
[youtube] GLYhhwf-4yo: Downloading player 9a279502-main
[youtube] GLYhhwf-4yo: Downloading tv player API JSON
[youtube] GLYhhwf-4yo: Downloading ios player API JSON
[youtube] GLYhhwf-4yo: Downloading m3u8 information
[info] GLYhhwf-4yo: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   25.27MiB in 00:00:01 at 16.78MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 169279/169279 [01:29<00:00, 1889.17frames/s]


Processed: 2024-08-10 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=RR9K55Tvg90&ab_channel=TheBidenWhiteHouse
[youtube] RR9K55Tvg90: Downloading webpage
[youtube] RR9K55Tvg90: Downloading tv client config
[youtube] RR9K55Tvg90: Downloading player 753b1819-main
[youtube] RR9K55Tvg90: Downloading tv player API JSON
[youtube] RR9K55Tvg90: Downloading ios player API JSON
[youtube] RR9K55Tvg90: Downloading m3u8 information
[info] RR9K55Tvg90: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   25.72MiB in 00:00:04 at 5.53MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 214732/214732 [04:08<00:00, 865.01frames/s] 


Processed: 2024-08-15 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=NaQ5KGwAox0&ab_channel=MilwaukeeJournalSentinel
[youtube] NaQ5KGwAox0: Downloading webpage
[youtube] NaQ5KGwAox0: Downloading tv client config
[youtube] NaQ5KGwAox0: Downloading player 9a279502-main
[youtube] NaQ5KGwAox0: Downloading tv player API JSON
[youtube] NaQ5KGwAox0: Downloading ios player API JSON
[youtube] NaQ5KGwAox0: Downloading m3u8 information
[info] NaQ5KGwAox0: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   46.65MiB in 00:00:12 at 3.87MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 432489/432489 [07:17<00:00, 988.12frames/s]  


Processed: 2024-08-20 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=Kn7EPojdKT0
[youtube] Kn7EPojdKT0: Downloading webpage
[youtube] Kn7EPojdKT0: Downloading tv client config
[youtube] Kn7EPojdKT0: Downloading player 9a279502-main
[youtube] Kn7EPojdKT0: Downloading tv player API JSON
[youtube] Kn7EPojdKT0: Downloading ios player API JSON
[youtube] Kn7EPojdKT0: Downloading m3u8 information
[info] Kn7EPojdKT0: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   47.42MiB in 00:00:02 at 15.81MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 313374/313374 [03:35<00:00, 1456.01frames/s] 


Processed: 2024-08-22 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=60sVKUwuF5M&ab_channel=FOX5Atlanta
[youtube] 60sVKUwuF5M: Downloading webpage
[youtube] 60sVKUwuF5M: Downloading tv client config
[youtube] 60sVKUwuF5M: Downloading player 9a279502-main
[youtube] 60sVKUwuF5M: Downloading tv player API JSON
[youtube] 60sVKUwuF5M: Downloading ios player API JSON
[youtube] 60sVKUwuF5M: Downloading m3u8 information
[info] 60sVKUwuF5M: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   19.88MiB in 00:00:03 at 6.10MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 124751/124751 [01:10<00:00, 1778.21frames/s]


Processed: 2024-08-29 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=vVW6efTCM8M&ab_channel=KamalaHarris
[youtube] vVW6efTCM8M: Downloading webpage
[youtube] vVW6efTCM8M: Downloading tv client config
[youtube] vVW6efTCM8M: Downloading player 9a279502-main
[youtube] vVW6efTCM8M: Downloading tv player API JSON
[youtube] vVW6efTCM8M: Downloading ios player API JSON
[youtube] vVW6efTCM8M: Downloading m3u8 information
[info] vVW6efTCM8M: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   19.60MiB in 00:00:02 at 6.86MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 91%|█████████▏| 157001/172001 [01:49<00:10, 1437.32frames/s]


Processed: 2024-09-02 - Rally 
[youtube] Extracting URL: https://www.youtube.com/watch?v=2M0fHti7DZI&ab_channel=KamalaHarris
[youtube] 2M0fHti7DZI: Downloading webpage
[youtube] 2M0fHti7DZI: Downloading tv client config
[youtube] 2M0fHti7DZI: Downloading player 9a279502-main
[youtube] 2M0fHti7DZI: Downloading tv player API JSON
[youtube] 2M0fHti7DZI: Downloading ios player API JSON
[youtube] 2M0fHti7DZI: Downloading m3u8 information
[info] 2M0fHti7DZI: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   35.90MiB in 00:00:03 at 10.29MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 99%|█████████▉| 297300/299001 [04:09<00:01, 1124.10frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
100%|██████████| 299001/299001 [04:14<00:00, 1176.39frames/s]


Processed: 2024-09-02 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=TAlEsoHtmGg&ab_channel=FOX26Houston
[youtube] TAlEsoHtmGg: Downloading webpage
[youtube] TAlEsoHtmGg: Downloading tv client config
[youtube] TAlEsoHtmGg: Downloading player 9a279502-main
[youtube] TAlEsoHtmGg: Downloading tv player API JSON
[youtube] TAlEsoHtmGg: Downloading ios player API JSON
[youtube] TAlEsoHtmGg: Downloading m3u8 information
[info] TAlEsoHtmGg: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   22.88MiB in 00:00:02 at 7.85MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 153192/153192 [01:48<00:00, 1416.26frames/s]


Processed: 2024-09-04 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=aX-uJgI3Pl8
[youtube] aX-uJgI3Pl8: Downloading webpage
[youtube] aX-uJgI3Pl8: Downloading tv client config
[youtube] aX-uJgI3Pl8: Downloading player 9a279502-main
[youtube] aX-uJgI3Pl8: Downloading tv player API JSON
[youtube] aX-uJgI3Pl8: Downloading ios player API JSON
[youtube] aX-uJgI3Pl8: Downloading m3u8 information
[info] aX-uJgI3Pl8: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   23.71MiB in 00:00:03 at 7.36MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 94%|█████████▎| 178508/190508 [01:38<00:06, 1813.46frames/s]


Processed: 2024-09-12 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=4P0JCqIjRT0&ab_channel=WFAA
[youtube] 4P0JCqIjRT0: Downloading webpage
[youtube] 4P0JCqIjRT0: Downloading tv client config
[youtube] 4P0JCqIjRT0: Downloading player 9a279502-main
[youtube] 4P0JCqIjRT0: Downloading tv player API JSON
[youtube] 4P0JCqIjRT0: Downloading ios player API JSON
[youtube] 4P0JCqIjRT0: Downloading m3u8 information
[info] 4P0JCqIjRT0: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   20.22MiB in 00:00:03 at 5.43MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 143218/143218 [01:59<00:00, 1202.47frames/s]


Processed: 2024-09-12 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=9yzHzGYUeTQ&ab_channel=FOX4Dallas-FortWorth
[youtube] 9yzHzGYUeTQ: Downloading webpage
[youtube] 9yzHzGYUeTQ: Downloading tv client config
[youtube] 9yzHzGYUeTQ: Downloading player 9a279502-main
[youtube] 9yzHzGYUeTQ: Downloading tv player API JSON
[youtube] 9yzHzGYUeTQ: Downloading ios player API JSON
[youtube] 9yzHzGYUeTQ: Downloading m3u8 information
[info] 9yzHzGYUeTQ: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   21.08MiB in 00:00:07 at 2.96MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 135589/135589 [02:22<00:00, 951.64frames/s]


Processed: 2024-09-13 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=9AunRg_V078&ab_channel=6abcPhiladelphia
[youtube] 9AunRg_V078: Downloading webpage
[youtube] 9AunRg_V078: Downloading tv client config
[youtube] 9AunRg_V078: Downloading player 9a279502-main
[youtube] 9AunRg_V078: Downloading tv player API JSON
[youtube] 9AunRg_V078: Downloading ios player API JSON
[youtube] 9AunRg_V078: Downloading m3u8 information
[info] 9AunRg_V078: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   11.17MiB in 00:00:00 at 16.60MiB/s  
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 67899/67899 [00:57<00:00, 1190.28frames/s]


Processed: 2024-09-13 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=1A8aumdgaHY&ab_channel=FOX29Philadelphiaa
[youtube] 1A8aumdgaHY: Downloading webpage
[youtube] 1A8aumdgaHY: Downloading tv client config
[youtube] 1A8aumdgaHY: Downloading player 9a279502-main
[youtube] 1A8aumdgaHY: Downloading tv player API JSON
[youtube] 1A8aumdgaHY: Downloading ios player API JSON
[youtube] 1A8aumdgaHY: Downloading m3u8 information
[info] 1A8aumdgaHY: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   42.23MiB in 00:00:03 at 11.00MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 282510/282510 [04:07<00:00, 1143.42frames/s]


Processed: 2024-09-17 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=kAlAsFvOGSg&ab_channel=PBSNewsHour
[youtube] kAlAsFvOGSg: Downloading webpage
[youtube] kAlAsFvOGSg: Downloading tv client config
[youtube] kAlAsFvOGSg: Downloading player 9a279502-main
[youtube] kAlAsFvOGSg: Downloading tv player API JSON
[youtube] kAlAsFvOGSg: Downloading ios player API JSON
[youtube] kAlAsFvOGSg: Downloading m3u8 information
[info] kAlAsFvOGSg: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   18.69MiB in 00:00:02 at 7.27MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 147994/147994 [01:51<00:00, 1328.61frames/s]


Processed: 2024-09-18 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=cIaRQPsgOzI
[youtube] cIaRQPsgOzI: Downloading webpage
[youtube] cIaRQPsgOzI: Downloading tv client config
[youtube] cIaRQPsgOzI: Downloading player 9a279502-main
[youtube] cIaRQPsgOzI: Downloading tv player API JSON
[youtube] cIaRQPsgOzI: Downloading ios player API JSON
[youtube] cIaRQPsgOzI: Downloading m3u8 information
[info] cIaRQPsgOzI: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   48.28MiB in 00:00:07 at 6.87MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 387522/387522 [06:58<00:00, 925.14frames/s] 


Processed: 2024-09-19 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=WLoxQ44epxI
[youtube] WLoxQ44epxI: Downloading webpage
[youtube] WLoxQ44epxI: Downloading tv client config
[youtube] WLoxQ44epxI: Downloading player 9a279502-main
[youtube] WLoxQ44epxI: Downloading tv player API JSON
[youtube] WLoxQ44epxI: Downloading ios player API JSON
[youtube] WLoxQ44epxI: Downloading m3u8 information
[info] WLoxQ44epxI: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   22.13MiB in 00:00:03 at 5.69MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 183952/183952 [02:41<00:00, 1140.90frames/s]


Processed: 2024-09-20 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=nRkoMZB3Oq4&ab_channel=WTVCNewsChannel9
[youtube] nRkoMZB3Oq4: Downloading webpage
[youtube] nRkoMZB3Oq4: Downloading tv client config
[youtube] nRkoMZB3Oq4: Downloading player 9a279502-main
[youtube] nRkoMZB3Oq4: Downloading tv player API JSON
[youtube] nRkoMZB3Oq4: Downloading ios player API JSON
[youtube] nRkoMZB3Oq4: Downloading m3u8 information
[info] nRkoMZB3Oq4: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   16.77MiB in 00:00:01 at 8.57MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 140567/140567 [01:38<00:00, 1426.29frames/s]


Processed: 2024-09-20 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=-i64tqglpmA&ab_channel=WFAA
[youtube] -i64tqglpmA: Downloading webpage
[youtube] -i64tqglpmA: Downloading tv client config
[youtube] -i64tqglpmA: Downloading player 9a279502-main
[youtube] -i64tqglpmA: Downloading tv player API JSON
[youtube] -i64tqglpmA: Downloading ios player API JSON
[youtube] -i64tqglpmA: Downloading m3u8 information
[info] -i64tqglpmA: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   33.32MiB in 00:00:06 at 5.39MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 237868/237868 [02:31<00:00, 1574.87frames/s]


Processed: 2024-09-25 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=QeGWT7ard5U&ab_channel=KamalaHarris
[youtube] QeGWT7ard5U: Downloading webpage
[youtube] QeGWT7ard5U: Downloading tv client config
[youtube] QeGWT7ard5U: Downloading player 9a279502-main
[youtube] QeGWT7ard5U: Downloading tv player API JSON
[youtube] QeGWT7ard5U: Downloading ios player API JSON
[youtube] QeGWT7ard5U: Downloading m3u8 information
[info] QeGWT7ard5U: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   22.33MiB in 00:00:02 at 8.27MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 180300/180300 [01:53<00:00, 1585.29frames/s]


Processed: 2024-09-27 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=bzThwqnQJDY&ab_channel=ALLTHESMOKE
[youtube] bzThwqnQJDY: Downloading webpage
[youtube] bzThwqnQJDY: Downloading tv client config
[youtube] bzThwqnQJDY: Downloading player 9a279502-main
[youtube] bzThwqnQJDY: Downloading tv player API JSON
[youtube] bzThwqnQJDY: Downloading ios player API JSON
[youtube] bzThwqnQJDY: Downloading m3u8 information
[info] bzThwqnQJDY: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   38.83MiB in 00:00:01 at 21.57MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 285514/285514 [05:42<00:00, 832.89frames/s]


Processed: 2024-09-30 - Podcast
[youtube] Extracting URL: https://www.youtube.com/watch?v=1JPBKKfc3Ok&ab_channel=KamalaHarris
[youtube] 1JPBKKfc3Ok: Downloading webpage
[youtube] 1JPBKKfc3Ok: Downloading tv client config
[youtube] 1JPBKKfc3Ok: Downloading player 9a279502-main
[youtube] 1JPBKKfc3Ok: Downloading tv player API JSON
[youtube] 1JPBKKfc3Ok: Downloading ios player API JSON
[youtube] 1JPBKKfc3Ok: Downloading m3u8 information
[info] 1JPBKKfc3Ok: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   37.10MiB in 00:00:02 at 14.22MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 98%|█████████▊| 285624/291624 [03:04<00:03, 1546.97frames/s]


Processed: 2024-10-03 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=gVzjoxj-ZQQ&ab_channel=KamalaHarris
[youtube] gVzjoxj-ZQQ: Downloading webpage
[youtube] gVzjoxj-ZQQ: Downloading tv client config
[youtube] gVzjoxj-ZQQ: Downloading player 753b1819-main
[youtube] gVzjoxj-ZQQ: Downloading tv player API JSON
[youtube] gVzjoxj-ZQQ: Downloading ios player API JSON
[youtube] gVzjoxj-ZQQ: Downloading m3u8 information
[info] gVzjoxj-ZQQ: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   45.29MiB in 00:00:03 at 14.80MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 99%|█████████▉| 361304/364304 [03:17<00:01, 1829.75frames/s]


Processed: 2024-10-04 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=a4mEKdROVOo&ab_channel=KamalaHarris
[youtube] a4mEKdROVOo: Downloading webpage
[youtube] a4mEKdROVOo: Downloading tv client config
[youtube] a4mEKdROVOo: Downloading player 753b1819-main
[youtube] a4mEKdROVOo: Downloading tv player API JSON
[youtube] a4mEKdROVOo: Downloading ios player API JSON
[youtube] a4mEKdROVOo: Downloading m3u8 information
[info] a4mEKdROVOo: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   17.48MiB in 00:00:01 at 12.40MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 132856/132856 [01:27<00:00, 1524.56frames/s]


Processed: 2024-10-04 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=dwf393HK5YQ&ab_channel=PBSNewsHour
[youtube] dwf393HK5YQ: Downloading webpage
[youtube] dwf393HK5YQ: Downloading tv client config
[youtube] dwf393HK5YQ: Downloading player 9a279502-main
[youtube] dwf393HK5YQ: Downloading tv player API JSON
[youtube] dwf393HK5YQ: Downloading ios player API JSON
[youtube] dwf393HK5YQ: Downloading m3u8 information
[info] dwf393HK5YQ: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of    5.29MiB in 00:00:01 at 3.35MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: Nynorsk


 92%|█████████▏| 33505/36505 [00:08<00:00, 3959.63frames/s] 


Processed: 2024-10-05 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=JRYG3AP2ARs&ab_channel=C-SPAN
[youtube] JRYG3AP2ARs: Downloading webpage
[youtube] JRYG3AP2ARs: Downloading tv client config
[youtube] JRYG3AP2ARs: Downloading player 753b1819-main
[youtube] JRYG3AP2ARs: Downloading tv player API JSON
[youtube] JRYG3AP2ARs: Downloading ios player API JSON
[youtube] JRYG3AP2ARs: Downloading m3u8 information
[info] JRYG3AP2ARs: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of    5.23MiB in 00:00:01 at 5.17MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 33188/33188 [00:21<00:00, 1573.00frames/s]


Processed: 2024-10-07 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=TJys7OVH24E&ab_channel=60Minutes
[youtube] TJys7OVH24E: Downloading webpage
[youtube] TJys7OVH24E: Downloading tv client config
[youtube] TJys7OVH24E: Downloading player 9a279502-main
[youtube] TJys7OVH24E: Downloading tv player API JSON
[youtube] TJys7OVH24E: Downloading ios player API JSON
[youtube] TJys7OVH24E: Downloading m3u8 information
[info] TJys7OVH24E: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   15.52MiB in 00:00:00 at 22.06MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|█████████▉| 124838/124931 [01:50<00:00, 1134.72frames/s]


Processed: 2024-10-07 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=hO08k6s4kpM&ab_channel=TheSun
[youtube] hO08k6s4kpM: Downloading webpage
[youtube] hO08k6s4kpM: Downloading tv client config
[youtube] hO08k6s4kpM: Downloading player 753b1819-main
[youtube] hO08k6s4kpM: Downloading tv player API JSON
[youtube] hO08k6s4kpM: Downloading ios player API JSON
[youtube] hO08k6s4kpM: Downloading m3u8 information
[info] hO08k6s4kpM: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   53.62MiB in 00:00:05 at 9.07MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 476877/476877 [02:31<00:00, 3144.04frames/s] 


Processed: 2024-10-10 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=T3KR-jtwKTM&ab_channel=PBSNewsHour
[youtube] T3KR-jtwKTM: Downloading webpage
[youtube] T3KR-jtwKTM: Downloading tv client config
[youtube] T3KR-jtwKTM: Downloading player 9a279502-main
[youtube] T3KR-jtwKTM: Downloading tv player API JSON
[youtube] T3KR-jtwKTM: Downloading ios player API JSON
[youtube] T3KR-jtwKTM: Downloading m3u8 information
[info] T3KR-jtwKTM: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   14.33MiB in 00:00:01 at 7.40MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 103505/103505 [01:20<00:00, 1293.12frames/s]


Processed: 2024-10-11 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=XoRviMMWXx4&ab_channel=KamalaHarris
[youtube] XoRviMMWXx4: Downloading webpage
[youtube] XoRviMMWXx4: Downloading tv client config
[youtube] XoRviMMWXx4: Downloading player 9a279502-main
[youtube] XoRviMMWXx4: Downloading tv player API JSON
[youtube] XoRviMMWXx4: Downloading ios player API JSON
[youtube] XoRviMMWXx4: Downloading m3u8 information
[info] XoRviMMWXx4: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   61.20MiB in 00:00:03 at 19.65MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: Khmer


100%|██████████| 462002/462002 [14:57<00:00, 514.79frames/s]


Processed: 2024-10-14 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=5WWnKBnHAP8&ab_channel=THESHADEROOM
[youtube] 5WWnKBnHAP8: Downloading webpage
[youtube] 5WWnKBnHAP8: Downloading tv client config
[youtube] 5WWnKBnHAP8: Downloading player 9a279502-main
[youtube] 5WWnKBnHAP8: Downloading tv player API JSON
[youtube] 5WWnKBnHAP8: Downloading ios player API JSON
[youtube] 5WWnKBnHAP8: Downloading m3u8 information
[info] 5WWnKBnHAP8: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   30.82MiB in 00:00:02 at 11.77MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 140484/140484 [02:18<00:00, 1017.01frames/s]


Processed: 2024-10-14 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=JyPSMeWU9Ms&ab_channel=PCNTV
[youtube] JyPSMeWU9Ms: Downloading webpage
[youtube] JyPSMeWU9Ms: Downloading tv client config
[youtube] JyPSMeWU9Ms: Downloading player 9a279502-main
[youtube] JyPSMeWU9Ms: Downloading tv player API JSON
[youtube] JyPSMeWU9Ms: Downloading ios player API JSON
[youtube] JyPSMeWU9Ms: Downloading m3u8 information
[info] JyPSMeWU9Ms: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   40.24MiB in 00:00:03 at 10.99MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 96%|█████████▌| 324633/339633 [03:10<00:08, 1702.96frames/s]


Processed: 2024-10-16 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=lCX48rn2WOA&ab_channel=WFAA
[youtube] lCX48rn2WOA: Downloading webpage
[youtube] lCX48rn2WOA: Downloading tv client config
[youtube] lCX48rn2WOA: Downloading player 9a279502-main
[youtube] lCX48rn2WOA: Downloading tv player API JSON
[youtube] lCX48rn2WOA: Downloading ios player API JSON
[youtube] lCX48rn2WOA: Downloading m3u8 information
[info] lCX48rn2WOA: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   25.21MiB in 00:00:04 at 5.38MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 163613/163613 [01:41<00:00, 1619.51frames/s]


Processed: 2024-10-17 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=EziLtK2KOdU&ab_channel=WFAA
[youtube] EziLtK2KOdU: Downloading webpage
[youtube] EziLtK2KOdU: Downloading tv client config
[youtube] EziLtK2KOdU: Downloading player 9a279502-main
[youtube] EziLtK2KOdU: Downloading tv player API JSON
[youtube] EziLtK2KOdU: Downloading ios player API JSON
[youtube] EziLtK2KOdU: Downloading m3u8 information
[info] EziLtK2KOdU: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   21.73MiB in 00:00:04 at 5.07MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 168079/168079 [01:39<00:00, 1684.82frames/s]


Processed: 2024-10-17 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=E1UtJv8ZFBk&ab_channel=PBSNewsHour
[youtube] E1UtJv8ZFBk: Downloading webpage
[youtube] E1UtJv8ZFBk: Downloading tv client config
[youtube] E1UtJv8ZFBk: Downloading player 9a279502-main
[youtube] E1UtJv8ZFBk: Downloading tv player API JSON
[youtube] E1UtJv8ZFBk: Downloading ios player API JSON
[youtube] E1UtJv8ZFBk: Downloading m3u8 information
[info] E1UtJv8ZFBk: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   18.69MiB in 00:00:03 at 5.77MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 165500/165500 [01:54<00:00, 1447.50frames/s]


Processed: 2024-10-18 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=V7Qq1mNIYrU&ab_channel=KamalaHarris
[youtube] V7Qq1mNIYrU: Downloading webpage
[youtube] V7Qq1mNIYrU: Downloading tv client config
[youtube] V7Qq1mNIYrU: Downloading player 9a279502-main
[youtube] V7Qq1mNIYrU: Downloading tv player API JSON
[youtube] V7Qq1mNIYrU: Downloading ios player API JSON
[youtube] V7Qq1mNIYrU: Downloading m3u8 information
[info] V7Qq1mNIYrU: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   29.43MiB in 00:00:03 at 7.41MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 99%|█████████▊| 231192/234192 [03:35<00:02, 1074.59frames/s]


Processed: 2024-10-18 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=rbvyoFnsz0A&ab_channel=KamalaHarris
[youtube] rbvyoFnsz0A: Downloading webpage
[youtube] rbvyoFnsz0A: Downloading tv client config
[youtube] rbvyoFnsz0A: Downloading player 9a279502-main
[youtube] rbvyoFnsz0A: Downloading tv player API JSON
[youtube] rbvyoFnsz0A: Downloading ios player API JSON
[youtube] rbvyoFnsz0A: Downloading m3u8 information
[info] rbvyoFnsz0A: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   63.67MiB in 00:00:09 at 6.39MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 465499/465499 [01:32<00:00, 5016.43frames/s]


Processed: 2024-10-19 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=DqvCQPIt1M8&ab_channel=WFAA
[youtube] DqvCQPIt1M8: Downloading webpage
[youtube] DqvCQPIt1M8: Downloading tv client config
[youtube] DqvCQPIt1M8: Downloading player 9a279502-main
[youtube] DqvCQPIt1M8: Downloading tv player API JSON
[youtube] DqvCQPIt1M8: Downloading ios player API JSON
[youtube] DqvCQPIt1M8: Downloading m3u8 information
[info] DqvCQPIt1M8: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of    4.76MiB in 00:00:00 at 8.40MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 40345/40345 [00:34<00:00, 1181.28frames/s]


Processed: 2024-10-19 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=rBKvZI0AB_I&ab_channel=WAAY31News
[youtube] rBKvZI0AB_I: Downloading webpage
[youtube] rBKvZI0AB_I: Downloading tv client config
[youtube] rBKvZI0AB_I: Downloading player 9a279502-main
[youtube] rBKvZI0AB_I: Downloading tv player API JSON
[youtube] rBKvZI0AB_I: Downloading ios player API JSON
[youtube] rBKvZI0AB_I: Downloading m3u8 information
[info] rBKvZI0AB_I: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of    6.74MiB in 00:00:01 at 4.32MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 49312/49312 [00:26<00:00, 1828.33frames/s]


Processed: 2024-10-20 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=KVYbcRhOXQQ&ab_channel=WFAA
[youtube] KVYbcRhOXQQ: Downloading webpage
[youtube] KVYbcRhOXQQ: Downloading tv client config
[youtube] KVYbcRhOXQQ: Downloading player 9a279502-main
[youtube] KVYbcRhOXQQ: Downloading tv player API JSON
[youtube] KVYbcRhOXQQ: Downloading ios player API JSON
[youtube] KVYbcRhOXQQ: Downloading m3u8 information
[info] KVYbcRhOXQQ: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   13.96MiB in 00:00:01 at 13.56MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 111178/111178 [01:10<00:00, 1571.62frames/s]


Processed: 2024-10-20 - Event
[youtube] Extracting URL: https://www.youtube.com/watch?v=XturwvZjc_E&ab_channel=KamalaHarris
[youtube] XturwvZjc_E: Downloading webpage
[youtube] XturwvZjc_E: Downloading tv client config
[youtube] XturwvZjc_E: Downloading player 9a279502-main
[youtube] XturwvZjc_E: Downloading tv player API JSON
[youtube] XturwvZjc_E: Downloading ios player API JSON
[youtube] XturwvZjc_E: Downloading m3u8 information
[info] XturwvZjc_E: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   47.41MiB in 00:00:04 at 10.71MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 84%|████████▎ | 291002/348002 [06:11<01:12, 782.57frames/s] 


Processed: 2024-10-21 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=LVgFrlwsUoI&ab_channel=KamalaHarris
[youtube] LVgFrlwsUoI: Downloading webpage
[youtube] LVgFrlwsUoI: Downloading tv client config
[youtube] LVgFrlwsUoI: Downloading player 9a279502-main
[youtube] LVgFrlwsUoI: Downloading tv player API JSON
[youtube] LVgFrlwsUoI: Downloading ios player API JSON
[youtube] LVgFrlwsUoI: Downloading m3u8 information
[info] LVgFrlwsUoI: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   43.69MiB in 00:00:01 at 33.26MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 334696/334696 [04:43<00:00, 1181.68frames/s]


Processed: 2024-10-21 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=pkM_QiEFNps&ab_channel=TheDemocrats
[youtube] pkM_QiEFNps: Downloading webpage
[youtube] pkM_QiEFNps: Downloading tv client config
[youtube] pkM_QiEFNps: Downloading player 9a279502-main
[youtube] pkM_QiEFNps: Downloading tv player API JSON
[youtube] pkM_QiEFNps: Downloading ios player API JSON
[youtube] pkM_QiEFNps: Downloading m3u8 information
[info] pkM_QiEFNps: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   40.53MiB in 00:00:04 at 8.52MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 97%|█████████▋| 307800/317498 [01:06<00:02, 4210.52frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
 97%|█████████▋| 308498/317498 [01:08<00:01, 4510.07frames/s]


Processed: 2024-10-21 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=iO6Ta86wCDI&ab_channel=NBCNews
[youtube] iO6Ta86wCDI: Downloading webpage
[youtube] iO6Ta86wCDI: Downloading tv client config
[youtube] iO6Ta86wCDI: Downloading player 9a279502-main
[youtube] iO6Ta86wCDI: Downloading tv player API JSON
[youtube] iO6Ta86wCDI: Downloading ios player API JSON
[youtube] iO6Ta86wCDI: Downloading m3u8 information
[info] iO6Ta86wCDI: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   17.09MiB in 00:00:00 at 25.78MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 131474/131474 [02:23<00:00, 917.34frames/s] 


Processed: 2024-10-22 - Interview
[youtube] Extracting URL: https://www.youtube.com/watch?v=U3yRr0xFZZs&ab_channel=CNN
[youtube] U3yRr0xFZZs: Downloading webpage
[youtube] U3yRr0xFZZs: Downloading tv client config
[youtube] U3yRr0xFZZs: Downloading player 9a279502-main
[youtube] U3yRr0xFZZs: Downloading tv player API JSON
[youtube] U3yRr0xFZZs: Downloading ios player API JSON
[youtube] U3yRr0xFZZs: Downloading m3u8 information
[info] U3yRr0xFZZs: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   30.49MiB in 00:00:01 at 19.97MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 224396/224396 [03:41<00:00, 1015.34frames/s]


Processed: 2024-10-23 - Town Hall
[youtube] Extracting URL: https://www.youtube.com/watch?v=4x1EXMyEGA8&ab_channel=KamalaHarris
[youtube] 4x1EXMyEGA8: Downloading webpage
[youtube] 4x1EXMyEGA8: Downloading tv client config
[youtube] 4x1EXMyEGA8: Downloading player 9a279502-main
[youtube] 4x1EXMyEGA8: Downloading tv player API JSON
[youtube] 4x1EXMyEGA8: Downloading ios player API JSON
[youtube] 4x1EXMyEGA8: Downloading m3u8 information
[info] 4x1EXMyEGA8: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of  126.04MiB in 00:00:14 at 8.71MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|█████████▉| 934444/937444 [11:58<00:02, 1300.13frames/s]


Processed: 2024-10-24 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=q8p2OvDQWaY&ab_channel=WFAA
[youtube] q8p2OvDQWaY: Downloading webpage
[youtube] q8p2OvDQWaY: Downloading tv client config
[youtube] q8p2OvDQWaY: Downloading player 753b1819-main
[youtube] q8p2OvDQWaY: Downloading tv player API JSON
[youtube] q8p2OvDQWaY: Downloading ios player API JSON
[youtube] q8p2OvDQWaY: Downloading m3u8 information
[info] q8p2OvDQWaY: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   19.88MiB in 00:00:04 at 4.91MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|█████████▉| 168400/168502 [11:51<00:00, 236.55frames/s] 


Processed: 2024-10-25 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=wnzjBjPhrHQ&ab_channel=WFAA
[youtube] wnzjBjPhrHQ: Downloading webpage
[youtube] wnzjBjPhrHQ: Downloading tv client config
[youtube] wnzjBjPhrHQ: Downloading player 753b1819-main
[youtube] wnzjBjPhrHQ: Downloading tv player API JSON
[youtube] wnzjBjPhrHQ: Downloading ios player API JSON
[youtube] wnzjBjPhrHQ: Downloading m3u8 information
[info] wnzjBjPhrHQ: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   24.06MiB in 00:00:05 at 4.47MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|█████████▉| 187100/187829 [02:21<00:00, 1245.63frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
100%|██████████| 187829/187829 [02:26<00:00, 1279.03frames/s]


Processed: 2024-10-26 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=U1Qov_Hu3Ao&ab_channel=SkyNews
[youtube] U1Qov_Hu3Ao: Downloading webpage
[youtube] U1Qov_Hu3Ao: Downloading tv client config
[youtube] U1Qov_Hu3Ao: Downloading player 753b1819-main
[youtube] U1Qov_Hu3Ao: Downloading tv player API JSON
[youtube] U1Qov_Hu3Ao: Downloading ios player API JSON
[youtube] U1Qov_Hu3Ao: Downloading m3u8 information
[info] U1Qov_Hu3Ao: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   13.29MiB in 00:00:02 at 4.75MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 97532/97532 [01:01<00:00, 1584.15frames/s]


Processed: 2024-10-27 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=H2aOx0WXNak&ab_channel=WFAA
[youtube] H2aOx0WXNak: Downloading webpage
[youtube] H2aOx0WXNak: Downloading tv client config
[youtube] H2aOx0WXNak: Downloading player 753b1819-main
[youtube] H2aOx0WXNak: Downloading tv player API JSON
[youtube] H2aOx0WXNak: Downloading ios player API JSON
[youtube] H2aOx0WXNak: Downloading m3u8 information
[info] H2aOx0WXNak: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   21.26MiB in 00:00:03 at 6.48MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 145561/145561 [01:40<00:00, 1446.69frames/s]


Processed: 2024-10-28 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=6G1Oou2dVdk&ab_channel=WFAA
[youtube] 6G1Oou2dVdk: Downloading webpage
[youtube] 6G1Oou2dVdk: Downloading tv client config
[youtube] 6G1Oou2dVdk: Downloading player 753b1819-main
[youtube] 6G1Oou2dVdk: Downloading tv player API JSON
[youtube] 6G1Oou2dVdk: Downloading ios player API JSON
[youtube] 6G1Oou2dVdk: Downloading m3u8 information
[info] 6G1Oou2dVdk: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   23.40MiB in 00:00:02 at 8.27MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 97%|█████████▋| 183042/189042 [01:48<00:03, 1693.99frames/s]


Processed: 2024-10-29 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=LnXPv7nVyx8&ab_channel=WFAA
[youtube] LnXPv7nVyx8: Downloading webpage
[youtube] LnXPv7nVyx8: Downloading tv client config
[youtube] LnXPv7nVyx8: Downloading player 753b1819-main
[youtube] LnXPv7nVyx8: Downloading tv player API JSON
[youtube] LnXPv7nVyx8: Downloading ios player API JSON
[youtube] LnXPv7nVyx8: Downloading m3u8 information
[info] LnXPv7nVyx8: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   15.77MiB in 00:00:02 at 7.83MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 131987/131987 [01:14<00:00, 1779.46frames/s]


Processed: 2024-10-30 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=GoXcbU-Xjbk&ab_channel=MilwaukeeJournalSentinel
[youtube] GoXcbU-Xjbk: Downloading webpage
[youtube] GoXcbU-Xjbk: Downloading tv client config
[youtube] GoXcbU-Xjbk: Downloading player 753b1819-main
[youtube] GoXcbU-Xjbk: Downloading tv player API JSON
[youtube] GoXcbU-Xjbk: Downloading ios player API JSON
[youtube] GoXcbU-Xjbk: Downloading m3u8 information
[info] GoXcbU-Xjbk: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   17.50MiB in 00:00:02 at 8.65MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 140501/140501 [01:24<00:00, 1668.88frames/s]


Processed: 2024-10-30 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=me1z4KmJqes&ab_channel=ABC33%2F40
[youtube] me1z4KmJqes: Downloading webpage
[youtube] me1z4KmJqes: Downloading tv client config
[youtube] me1z4KmJqes: Downloading player 753b1819-main
[youtube] me1z4KmJqes: Downloading tv player API JSON
[youtube] me1z4KmJqes: Downloading ios player API JSON
[youtube] me1z4KmJqes: Downloading m3u8 information
[info] me1z4KmJqes: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   58.33MiB in 00:00:10 at 5.40MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 97%|█████████▋| 515796/530796 [03:18<00:05, 2592.21frames/s]


Processed: 2024-10-30 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=TMJx0vnYY08&ab_channel=FOX29Philadelphia
[youtube] TMJx0vnYY08: Downloading webpage
[youtube] TMJx0vnYY08: Downloading tv client config
[youtube] TMJx0vnYY08: Downloading player 753b1819-main
[youtube] TMJx0vnYY08: Downloading tv player API JSON
[youtube] TMJx0vnYY08: Downloading ios player API JSON
[youtube] TMJx0vnYY08: Downloading m3u8 information
[info] TMJx0vnYY08: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   60.31MiB in 00:00:05 at 10.55MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: Spanish


 99%|█████████▉| 375501/378501 [02:02<00:00, 3076.01frames/s] 


Processed: 2024-10-31 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=_jwaT6FN1kc&ab_channel=KamalaHarris
[youtube] _jwaT6FN1kc: Downloading webpage
[youtube] _jwaT6FN1kc: Downloading tv client config
[youtube] _jwaT6FN1kc: Downloading player 9a279502-main
[youtube] _jwaT6FN1kc: Downloading tv player API JSON
[youtube] _jwaT6FN1kc: Downloading ios player API JSON
[youtube] _jwaT6FN1kc: Downloading m3u8 information
[info] _jwaT6FN1kc: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of  120.16MiB in 00:00:14 at 8.43MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 869004/869004 [03:10<00:00, 4570.02frames/s] 


Processed: 2024-10-31 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=5YEM-fLqJV0&ab_channel=WFAA
[youtube] 5YEM-fLqJV0: Downloading webpage
[youtube] 5YEM-fLqJV0: Downloading tv client config
[youtube] 5YEM-fLqJV0: Downloading player 9a279502-main
[youtube] 5YEM-fLqJV0: Downloading tv player API JSON
[youtube] 5YEM-fLqJV0: Downloading ios player API JSON
[youtube] 5YEM-fLqJV0: Downloading m3u8 information
[info] 5YEM-fLqJV0: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   18.58MiB in 00:00:01 at 10.76MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 149960/149960 [01:41<00:00, 1482.09frames/s]


Processed: 2024-10-31 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=Y3JnjDn_cwI&ab_channel=KamalaHarris
[youtube] Y3JnjDn_cwI: Downloading webpage
[youtube] Y3JnjDn_cwI: Downloading tv client config
[youtube] Y3JnjDn_cwI: Downloading player 9a279502-main
[youtube] Y3JnjDn_cwI: Downloading tv player API JSON
[youtube] Y3JnjDn_cwI: Downloading ios player API JSON
[youtube] Y3JnjDn_cwI: Downloading m3u8 information
[info] Y3JnjDn_cwI: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   31.43MiB in 00:00:08 at 3.83MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 229503/229503 [04:08<00:00, 921.92frames/s]


Processed: 2024-11-01 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=dCsk2aFv9A4&ab_channel=KamalaHarris
[youtube] dCsk2aFv9A4: Downloading webpage
[youtube] dCsk2aFv9A4: Downloading tv client config
[youtube] dCsk2aFv9A4: Downloading player 9a279502-main
[youtube] dCsk2aFv9A4: Downloading tv player API JSON
[youtube] dCsk2aFv9A4: Downloading ios player API JSON
[youtube] dCsk2aFv9A4: Downloading m3u8 information
[info] dCsk2aFv9A4: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of  111.50MiB in 00:00:16 at 6.62MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 98%|█████████▊| 814005/832005 [04:38<00:06, 2924.10frames/s] 


Processed: 2024-11-01 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=soG-Q0f0bdY&ab_channel=TheDemocrats
[youtube] soG-Q0f0bdY: Downloading webpage
[youtube] soG-Q0f0bdY: Downloading tv client config
[youtube] soG-Q0f0bdY: Downloading player 6450230e-main
[youtube] soG-Q0f0bdY: Downloading tv player API JSON
[youtube] soG-Q0f0bdY: Downloading ios player API JSON
[youtube] soG-Q0f0bdY: Downloading m3u8 information
[info] soG-Q0f0bdY: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   39.60MiB in 00:00:03 at 10.14MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 97%|█████████▋| 285600/293500 [04:22<00:07, 1092.00frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
 98%|█████████▊| 287500/293500 [04:26<00:05, 1078.42frames/s]


Processed: 2024-11-01 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=IBWsggOdGOA&ab_channel=WFAA
[youtube] IBWsggOdGOA: Downloading webpage
[youtube] IBWsggOdGOA: Downloading tv client config
[youtube] IBWsggOdGOA: Downloading player 9a279502-main
[youtube] IBWsggOdGOA: Downloading tv player API JSON
[youtube] IBWsggOdGOA: Downloading ios player API JSON
[youtube] IBWsggOdGOA: Downloading m3u8 information
[info] IBWsggOdGOA: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   19.27MiB in 00:00:01 at 11.40MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 156855/156855 [01:51<00:00, 1408.52frames/s]


Processed: 2024-11-02 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=CUZ5J_t7hec&ab_channel=ABC33%2F40
[youtube] CUZ5J_t7hec: Downloading webpage
[youtube] CUZ5J_t7hec: Downloading tv client config
[youtube] CUZ5J_t7hec: Downloading player 9a279502-main
[youtube] CUZ5J_t7hec: Downloading tv player API JSON
[youtube] CUZ5J_t7hec: Downloading ios player API JSON
[youtube] CUZ5J_t7hec: Downloading m3u8 information
[info] CUZ5J_t7hec: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   32.28MiB in 00:00:02 at 13.52MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 97%|█████████▋| 235800/242940 [02:29<00:07, 987.60frames/s] Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
 98%|█████████▊| 236940/242940 [02:34<00:03, 1530.34frames/s]


Processed: 2024-11-02 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=WRcl7Yd4gCA&ab_channel=ABC33%2F40
[youtube] WRcl7Yd4gCA: Downloading webpage
[youtube] WRcl7Yd4gCA: Downloading tv client config
[youtube] WRcl7Yd4gCA: Downloading player 9a279502-main
[youtube] WRcl7Yd4gCA: Downloading tv player API JSON
[youtube] WRcl7Yd4gCA: Downloading ios player API JSON
[youtube] WRcl7Yd4gCA: Downloading m3u8 information
[info] WRcl7Yd4gCA: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   38.49MiB in 00:00:07 at 5.35MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 95%|█████████▌| 323084/339648 [01:44<00:06, 2608.77frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
 96%|█████████▌| 324648/339648 [01:47<00:04, 3022.90frames/s]


Processed: 2024-11-03 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=k57CWwBQ-BA&ab_channel=MilwaukeeJournalSentinel
[youtube] k57CWwBQ-BA: Downloading webpage
[youtube] k57CWwBQ-BA: Downloading tv client config
[youtube] k57CWwBQ-BA: Downloading player 9a279502-main
[youtube] k57CWwBQ-BA: Downloading tv player API JSON
[youtube] k57CWwBQ-BA: Downloading ios player API JSON
[youtube] k57CWwBQ-BA: Downloading m3u8 information
[info] k57CWwBQ-BA: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   61.78MiB in 00:00:11 at 5.45MiB/s     
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


100%|██████████| 507777/507777 [04:20<00:00, 1952.07frames/s] 


Processed: 2024-11-04 - Rally
[youtube] Extracting URL: https://www.youtube.com/watch?v=jHWM5pZGwIY&ab_channel=PBSNewsHour
[youtube] jHWM5pZGwIY: Downloading webpage
[youtube] jHWM5pZGwIY: Downloading tv client config
[youtube] jHWM5pZGwIY: Downloading player 9a279502-main
[youtube] jHWM5pZGwIY: Downloading tv player API JSON
[youtube] jHWM5pZGwIY: Downloading ios player API JSON
[youtube] jHWM5pZGwIY: Downloading m3u8 information
[info] jHWM5pZGwIY: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   93.02MiB in 00:00:08 at 10.99MiB/s    
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


  checkpoint = torch.load(fp, map_location=device)


Detected language: English


 99%|█████████▉| 751200/755003 [03:27<00:00, 5268.33frames/s]Got start time outside of audio boundary
Got start time outside of audio boundary
Got start time outside of audio boundary
100%|█████████▉| 752003/755003 [03:29<00:00, 3589.78frames/s]


Processed: 2024-11-04 - Rally
Total events processed: 71
Total chunks created: 553


## results 

In [48]:
full = analysis_df[analysis_df['text_type'] == 'full']
test = clean_transcript(full.iloc[13].text)
full.iloc[46]

date                                                  2024-10-21
type                                                   Interview
description    Kamala Harris Conversation in Pennsylvania wit...
text_type                                                   full
text           I'm going to show you how to make a simple, si...
chunk_id                                                    full
words                                                       9042
Name: 409, dtype: object

In [49]:
test

"Joe Biden and Kamala Harris were walking our picket lines when Donald Trump was crossing them. Joe Biden and Kamala Harris have led the most pro-union White House in American history. And you remember that Kamala Harris cast those deciding votes in the Senate to save millions of workers' pensions and lower costs for seniors on their prescription drugs and make the biggest investment in modern history to rebuild this country. And now Kamala Harris and our union brother Tim Walz, yes, they are ready to go even further because they believe in that future that we all want to see. Now the road to the White House goes right through our union halls. That's right, one in five voters right here in Pennsylvania is a union voter. So we are the difference maker. We have the people. We have the trust. We know how to organize. And we are ready to send Kamala Harris and Tim Walz to the White House. Are we ready? All right, I got to hear you. Are you going to get your friends registered to vote? Yes.

"Joe Biden and Kamala Harris were walking our picket lines when Donald Trump was crossing them. Joe Biden and Kamala Harris have led the most pro-union White House in American history. And you remember that Kamala Harris cast those deciding votes in the Senate to save millions of workers' pensions and lower costs for seniors on their prescription drugs and make the biggest investment in modern history to rebuild this country. And now Kamala Harris and our union brother Tim Walz, yes, they are ready to go even further because they believe in that future that we all want to see. Now the road to the White House goes right through our union halls. That's right, one in five voters right here in Pennsylvania is a union voter. So we are the difference maker. We have the people. We have the trust. We know how to organize. And we are ready to send Kamala Harris and Tim Walz to the White House. Are we ready? All right, I got to hear you. Are you going to get your friends registered to vote? Yes. Are you going to knock on doors? Yes. Are you going to fight every day until November 5th? Yes. That's what I thought. Because when we fight, we fight. When we fight, we fight. Yes. Thank you. Now I have the great pleasure of introducing my friend, my brother, one of the great labor leaders in this country, IBEW International President, Kenny Cooper. Let's go, Cooper. Good afternoon, brothers and sisters. Good afternoon. Thank you, brothers and sisters. Thank you, guests. And most of all, thank you, family, for being here on Labor Day. You know, it's a great honor to be in Pittsburgh, a good union town, but for me, it's really an honor to be in the House of Labor. IBEW Local 5, thank you so much for everything you do. As Liz said, I'm the international president of the IBEW, the oldest and largest electrical union in the entire world. And on behalf of 840,000 members of the IBEW, it is my true honor to get to introduce tonight the strongest, most pro-union administration ever in history in the United States, President Biden and Vice President Harris. Joe Biden made a lot of promises when he run for president in 2022 or 2020. He promised to save Americans' pension system that was a serious risk for us. He promised to help American workers that were depending on him to help save that pension. Workers who went to work every day lived by the rules, who deferred their own wages so they could retire one day with dignity and respect. He passed the Butch Lewis Act, saving the American's multi-floor pension plan. And when the Senate was all tied up, let's think about this a minute, on the rescue of America's pension plans, they were only tied up for one reason. We couldn't find a Republican senator that would vote for American workers. Why would? The vice president and Harris showed us her real colors. She stepped to the plate, casted a deciding vote to save our pension plans across America. Showing working families that Biden and Harris administration has our backs, promises made, promises kept. He pledged to rebuild our country's infrastructure, the only way to make sense was good union jobs that pays well and has health care for them and their families. Once again, promises made, promises kept. They worked together to pass the Chips and Science Act, bringing back manufacturing jobs to communities that really need them the most. Promises made, promises kept. They pledged to be true friends of labor because they understand the necessity of unions to the American worker and to the middle class. Well, Joe Biden and Kamala Harris kept every single one of those promises. Pension plan relief, the Butch Lewis Act went above and beyond our expectations. His infrastructure bills are bringing good union jobs back, much needed infrastructure across our nation, in every corner of our nation. And he's never been afraid to say the word union. Joe Biden is more than just the pro union, the most pro union president in our entire history. When he passed the torch to Kamala Harris, he ensured his pro union, pro labor legacy would continue with the next administration. Kamala Harris understands just as Joe did and Joe does now. The only way to meet the future of this great nation is by giving workers a real voice at every table, especially the most important table to every worker and every worker's family, the kitchen table. Yeah. Brothers and sisters, please join me to welcome our true friends, the president and vice president of the United States, Joe Biden and Kamala Harris. Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe Thank you, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe, Joe Yui, Yui went to the national president IW Abel and he said he wanted me he wanted me to get endorsed everybody thought it was crazy I literally wasn't old enough to be sworn in today I'd like to but guess what? I was unable to stuck with me they got stuck with me for the rest of my career Eric Dean the iron workers Tim Disco the bricklayer April Bowles the SEIU Lee Saunders the Rastman the great friend Jimmy Williams the painter's ally trade and his dad his dad is a really good man Mike Coleman the sheet metal worker also got some of the best elected officials America Governor Josh Shapiro is doing an incredible job the great lieutenant governor Austin Davis and one of America's best words Ed Ganey Ed you are doing a hell of a job and sir in the motto county executive I think is the hardest job in American politics everybody knows where you live and I think you can solve all the problems you don't have enough money well I tell you one thing I'm the first president ever I used to be a county official when I was 26 years old and always bothered me the federal government would send money to the state to be distributed to the county and what the hell the state going to send to the county for all state reps need the money but guess what under my administration to go straight to the county one of my best friends my name is Joe Bodnard from Scranton Pennsylvania Bobby Casey has been a great friend his dad was a great friend as well and by the way we grew up three blocks from one another three blocks and they still worry about it it's not showing up guess what I was on North Washington Avenue he was in Adams guess what they renamed North Washington Avenue down where I live Biden Way Bobby Bobby Bobby make sure we get reelected again and while we couldn't be here want to thank his partner in the Senate John Federman if you're in a foxhole you want Federman in there with you he couldn't be here today but guess what he sent the best part of the family just so she's here let me just say I mean so much to be with a true friend a true friend the vice president and the next great president of the United States of America I I come from two neighborhoods where it's not hard to say the word Union but you know what the fact of the matter is an awful lot of politicians have a trouble saying Union like they're working people guess what I'm not one of them in this common we know the simple truth Wall Street did not build America the middle class built America and unions built the middle class that's a fact by the way that is not a slogan that's a fact I asked the Treasury Department to do a study and it shows that when unions do well all workers in America do better that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that's a fact that provides major incentives for companies to build new battery factories, wind turbines, and more to create high-paying jobs in those coal and natural gas communities and on top of that there's over four billion dollars in private companies who have committed to invest in clean energy and advanced manufacturing here in Pennsylvania four billion dollars in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States in the United States so folks, we got one more job to do together let me ask you, are you ready to fight? Yes! Are you ready to win? Yes! Are you ready to let Kamala Harris our next President of the United States in the United States? Yes! Applause And in the process are you ready to make Donald Trump a loser again? Yes! I've never been more optimistic about America because we have to remember who we are we're the United States of America there's nothing, nothing, I mean this in the bottom of my heart there's nothing beyond our capacity nothing when we do it together and that means, like my friend our great Vice President of the United States Kamala Harris Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Good afternoon Pittsburgh Applause Applause Thank you Joe Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause It is good to be in the House of Labor Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Applause Now I don't have to tell the brothers and sisters of labor that you really get to know somebody when you're in the middle of a fight That's really great When times are hard When the forces are mighty When people don't believe something can get done and they have a thousand excuses for why it can't get done And I have spent more time with this extraordinary human being When the cameras were not in the room When the stakes were high When the heat was high When the heat was intense And Joe Biden has always stood with the workers of America and labor unions of America Always Always Applause Applause Applause I've been with him when he'll bring folks into the Oval Office and you know how Joe can get sometimes he doesn't spare words that sometimes the cameras are not in the room when he has those conversations Because the thing about the Joe Biden I know and I know you know because he has been a friend of labor for so long for his whole life Joe Biden can be quite impatient and that's a good thing for that kind of leader Quite impatient and I say to all of the friends here the press that's in the room history will show what we here know Joe Biden has been one of the most transformative presidents in the United States that we have ever witnessed Applause Applause Applause And it comes from his heart Applause Applause Applause Applause And you know Joe and I talk a lot about the fact that we are so proud to be the most pro-Union administration in America's history Applause Applause Applause Applause And as we know Joe still got a lot of work to do so let's also understand that So I want to thank all the incredible leaders who are here today including the Governor Shapiro Lieutenant Governor Davis Senator Casey who we will re-elect this November Mayor Gaining President Scholar President Cooper all the leaders of labor who are here all the Union members who are here so I'll just get right to a few points I love Labor Day I love celebrating Labor Day And Pittsburgh of course is a cradle of the American Labor movement It is the birthplace of the AFL headquarters of the steel workers home to firefighters local one and of course the historic IBEW Local 5 Applause Applause Applause For more than 150 years the brothers and sisters of labor have helped lead the fight for fair pay better benefits and safe working conditions and every person in our nation has benefited from that work you know everywhere I go I tell people you may not be a union member but you better thank unions for that 5 day work week Applause Thank unions for sick leave Thank unions for pay family leave Applause Thank unions for your vacation time Applause Applause Because when union wages go up everybody's wages go up when union workplaces are safer all workplaces are safer when unions are strong America is strong Applause Applause Applause Applause Applause Applause And we are clear not only has Pittsburgh shaped the history of America's labor movement today you are also shaping its future In 2021 with my dear friend the secretary Marty Walsh who the president appointed to be secretary of labor he and I hosted a meeting right here in this local and it was part of the White House Labor Task Force that I lead today we met with a group of computer programmers who were working to form a union one month later they signed their contract and became the first one of the first technology unions in our nation Applause Standing on the shoulder of all those who have been here and fought the good fight so Pittsburgh I remind us of that to say together we are fighting to build an economy that works for all working people Applause And that has always been the vision of the labor movement and that is the vision of our campaign you know in this election there are two very different visions for our nation one hours focused on the future the other focused on the past And the other focused on the future And the other focused on the future We fight for the future We fight for a future of dignity respect and opportunity for all people Applause Applause Applause We fight knowing it's some backward thinking for those folks who have been suggesting for years that the measure of the strength of a leader is based on who you beat down You know that's the stuff they're pushing That the measure of the strength of a leader is based on who you beat down when we know the true measure of the strength of a leader is based on who you lift up Applause Applause Who you lift up Applause Do you fight for workers? Do you fight for families? Do you fight for those who must be seen and heard and deserve the dignity that comes with hard work Applause That's what we fight for Applause And when you know what to stand for you know what to fight for Applause Applause So we're 64 days out from this election Laughter Laughter Ballots in Pennsylvania will start dropping in 14 days Applause 14 days Applause And this election is as much as anything else a fight for the promise of America Applause For the promise of America We love our country Applause And we know it is one of the highest forms of socialism to fight for the ideals of our country And that's what this election is about And about the promise of America And I don't need to tell unions what the promise looks like It's what you do every day Applause Applause Applause Applause But as we fight to move forward Donald Trump is trying to pull us backward Applause Including back to a time before workers had the freedom to organize Applause Well, the courts will handle that And we will handle November Applause Applause Applause We'll handle November, let the courts handle that Applause But we're not going back Applause Applause And one of the ways One of the ways we're going to guarantee we don't go back is that we remember Right? It is important to remember what that was and what it is Remember, as president Donald Trump blocked overtime benefits for millions of workers Applause He opposed efforts to raise the minimum wage Applause As the president said, he appointed union busters to the National Labor Relations Board Applause And don't forget, he supported so-called right to work laws Applause And if Donald Trump were to be re-elected he intends to give more tax cuts to billionaires and big corporations Applause He intends to cut Social Security and Medicare Applause He wants to impose what in effect would be a national sales tag I call it the Trump's national sales tag Applause On everyday products and basic necessities that would cost a typical American family the economist have said this Almost $4,000 a year Applause He intends to repeal the Affordable Care Act Applause And take us back to what we remember because it wasn't that long ago was a time when insurance companies could deny people pre-existing conditions Do you remember what that was? Children with asthma, breast cancer survivors grandparents with diabetes Applause Well look America has tried those failed policies before and we are not going back Applause Applause Applause We are not going back Applause We are not going back Applause And instead we fight for a future where no person has to go broke just because they get sick Applause And so building on the work of President Joe Biden and I and the work we have done in the White House we will continue to strengthen the Affordable Care Act and make prescription drugs affordable for all Americans Applause We We fight for a future where every worker has the freedom to organize and we will pass the Pro Act Applause And end union busting once and for all and Bob Casey will help us do that Applause Applause Applause Applause We see and know and fight for a future where every person has the opportunity not just to get by but to get ahead Applause And so we will continue to build what I call an opportunity economy so that every American has an opportunity to buy a home and start a business or build intergenerational wealth and have a future that matches their dreams and ambitions and aspirations Applause Because of course that's the nature of who we are as Americans We have dreams we can see what is possible unburdened by what has been We have aspirations We have ambitions and the system that is a good system is one that supports that and allows people the opportunity to go where they can see and imagine themselves to be That's what I'm talking about when I talk about an opportunity economy we fight for a future where every senior can retire with dignity and so we will continue to defend Social Security and Medicare And pensions And pensions And pensions Applause And we will continue to strengthen America's manufacturing sector and on that point the President mentioned it U.S. Steel is an historic American company and it is vital for our nation to maintain strong American Steel companies and I couldn't agree more with President Biden U.S. Steel should remain American owned and American operated Applause And I will always have the back of America's Steel workers Applause And all of America's workers Applause So friends 64 days until the most election of our lives and probably one of the most important in the life of our nation Truly And we know this is going to be a tight race to the very end It's going to be a tight race to the very end so let's not pay too much attention to those polls Because as unions and labor knows best we know what it's like to be the underdog And we are the underdog in this race And we have some hard work then ahead of us But here's the beauty of us in this room We like hard work Hard work is good work Hard work is joyful work Applause And so in this fight Applause I will continue to count on the strength, the determination and the hard work of the leaders in this room to knock on doors to get folks to the polls and bluntly put because the people in here do it to help us win Pennsylvania Applause So today I ask Are you ready to make your voices heard? Applause Do we believe in freedom? Applause Do we believe in opportunity? Applause Do we believe in the promise of America? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause Are you ready to make your voice heard? Applause"


# Function Testing 

## Single link test

In [None]:
# Example usage
youtube_url = "https://www.youtube.com/watch?v=6G1Oou2dVdk&ab_channel=WFAA"
start_time = 58  # Start time in seconds
duration = 60     # Optional: Duration to transcribe in seconds

#process
download_audio(youtube_url, start_time)
test,row = new_transcribe_audio(start_time)
out = clean_transcript(test['full_text'])

# Clean up
os.remove("audio.wav")

[youtube] Extracting URL: https://www.youtube.com/watch?v=6G1Oou2dVdk&ab_channel=WFAA
[youtube] 6G1Oou2dVdk: Downloading webpage
[youtube] 6G1Oou2dVdk: Downloading tv client config
[youtube] 6G1Oou2dVdk: Downloading player 236fc64d-main
[youtube] 6G1Oou2dVdk: Downloading tv player API JSON
[youtube] 6G1Oou2dVdk: Downloading ios player API JSON
[youtube] 6G1Oou2dVdk: Downloading m3u8 information
[info] 6G1Oou2dVdk: Downloading 1 format(s): 251
[download] Destination: audio
[download] 100% of   23.40MiB in 00:00:09 at 2.43MiB/s   
[ExtractAudio] Destination: audio.wav
Deleting original file audio (pass -k to keep)


100%|███████████████████████████████████████| 461M/461M [00:19<00:00, 24.9MiB/s]


Detected language: English


 97%|█████████▋| 183042/189042 [01:52<00:03, 1625.43frames/s]


In [None]:
out

" Good evening, everyone. Good evening. And thank you for taking the time out of your  busy lives. Thank you. Thank you. Thank you. Thank you, everyone. So listen, one week from  today, you will have the chance to make a decision that directly impacts your life, the life  of your family, and the future of this country we love. And it will probably be the most  important vote you ever cast. And this election is more than just a choice between two parties  and two different candidates. It is a choice about whether we have a country rooted in  freedom for every American or ruled by chaos and division. Many of you watching have probably  already cast your ballots. But I know many others are still considering who to vote for  or whether you'll vote at all. So tonight, I will speak to everyone about the choice  and the stakes in this election. Look, we know who Donald Trump is. He is the person who stood  at this very spot nearly four years ago and sent an armed mob to the United States Capi

## Chunking full text


https://www.restack.io/p/text-chunking-answer-python-file-chunking-cat-ai

https://www.pinecone.io/learn/chunking-strategies/



In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.4.1
    Uninstalling sentence-transformers-3.4.1:
      Successfully uninstalled sentence-transformers-3.4.1
Successfully installed sentence-transformers-4.1.0


In [None]:
'''
500-1000 words

keep rhetorical info within chunk:
  -use transition markers
'''

chunks = semantic_chunk_for_pentad(out)

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

def semantic_chunk_for_pentad(speech_text, threshold=0.65, min_chunk_size=3, max_chunk_size=15):
    """
    Create semantic chunks from campaign speech text for pentad analysis.

    Args:
        speech_text (str): Full campaign speech text
        threshold (float): Cosine similarity threshold (0-1)
        min_chunk_size (int): Minimum sentences per chunk
        max_chunk_size (int): Maximum sentences per chunk

    Returns:
        list: List of semantically coherent text chunks
    """
    # Simple sentence splitting without NLTK
    # Split on periods followed by whitespace or end of string
    sentences = [s.strip() for s in speech_text.replace('\n', ' ').split('. ')]
    sentences = [s + '.' if not s.endswith('.') else s for s in sentences if s]

    # Load model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for all sentences
    embeddings = model.encode(sentences)

    # Initialize variables
    chunks = []
    current_chunk = []
    current_chunk_embeddings = []

    for i, sentence in enumerate(sentences):
        if not current_chunk:
            # Start a new chunk
            current_chunk.append(sentence)
            current_chunk_embeddings.append(embeddings[i])
        else:
            # Calculate similarity with the average of current chunk
            avg_embedding = np.mean(current_chunk_embeddings, axis=0)
            similarity = util.pytorch_cos_sim(embeddings[i], avg_embedding).item()

            # Check if sentence is similar enough to current chunk
            if similarity >= threshold and len(current_chunk) < max_chunk_size:
                current_chunk.append(sentence)
                current_chunk_embeddings.append(embeddings[i])
            else:
                # Check if current chunk is large enough
                if len(current_chunk) >= min_chunk_size:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = [sentence]
                    current_chunk_embeddings = [embeddings[i]]
                else:
                    # If chunk is too small, continue adding to it despite lower similarity
                    current_chunk.append(sentence)
                    current_chunk_embeddings.append(embeddings[i])

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [None]:
chunks[2]

'Thank you, everyone. So listen, one week from today, you will have the chance to make a decision that directly impacts your life, the life of your family, and the future of this country we love. And it will probably be the most important vote you ever cast.'

In [None]:
chunks[3]

'And this election is more than just a choice between two parties and two different candidates. It is a choice about whether we have a country rooted in freedom for every American or ruled by chaos and division. Many of you watching have probably already cast your ballots.'

In [None]:
chunks[4]

"But I know many others are still considering who to vote for or whether you'll vote at all. So tonight, I will speak to everyone about the choice and the stakes in this election. Look, we know who Donald Trump is."

### larger chunking test

In [None]:

def test_chunker (speech_text, threshold=0.6, min_sentences=5, target_chunk_size=2000, max_chunk_size=4000):
    """
    Create semantic chunks from campaign speech text for pentad analysis,
    optimized for LLM processing with larger chunk sizes.

    Args:
        speech_text (str): Full campaign speech text
        threshold (float): Cosine similarity threshold (0-1)
        min_sentences (int): Minimum sentences before considering a chunk complete
        target_chunk_size (int): Target character size for chunks (optimal for LLM)
        max_chunk_size (int): Maximum character size for chunks (LLM constraint)

    Returns:
        list: List of semantically coherent text chunks
    """
    # Simple sentence splitting
    sentences = [s.strip() for s in speech_text.replace('\n', ' ').split('. ')]
    sentences = [s + '.' if not s.endswith('.') else s for s in sentences if s]

    # Load model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for all sentences
    embeddings = model.encode(sentences)

    # Initialize variables
    chunks = []
    current_chunk = []
    current_chunk_embeddings = []
    current_chunk_chars = 0

    for i, sentence in enumerate(sentences):
        sentence_chars = len(sentence)

        if not current_chunk:
            # Start a new chunk
            current_chunk.append(sentence)
            current_chunk_embeddings.append(embeddings[i])
            current_chunk_chars += sentence_chars
        else:
            # Calculate similarity with the average of current chunk
            avg_embedding = np.mean(current_chunk_embeddings, axis=0)
            similarity = util.pytorch_cos_sim(embeddings[i], avg_embedding).item()

            # Logic for adding sentences to chunks based on multiple criteria:
            # 1. If very similar and below max size - add to current chunk
            # 2. If reached target size with enough sentences - start new chunk
            # 3. If would exceed max size - force start new chunk

            if (similarity >= threshold and
                current_chunk_chars + sentence_chars <= max_chunk_size):
                # Add to current chunk if similar enough and within size limits
                current_chunk.append(sentence)
                current_chunk_embeddings.append(embeddings[i])
                current_chunk_chars += sentence_chars
            elif (len(current_chunk) >= min_sentences and
                  current_chunk_chars >= target_chunk_size):
                # Start new chunk if current chunk is big enough
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_chunk_embeddings = [embeddings[i]]
                current_chunk_chars = sentence_chars
            elif current_chunk_chars + sentence_chars > max_chunk_size:
                # Force start new chunk if would exceed max size
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_chunk_embeddings = [embeddings[i]]
                current_chunk_chars = sentence_chars
            else:
                # Add to current chunk even if less similar
                current_chunk.append(sentence)
                current_chunk_embeddings.append(embeddings[i])
                current_chunk_chars += sentence_chars

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # Post-processing: Combine consecutive chunks if they're small
    i = 0
    while i < len(chunks) - 1:
        combined_size = len(chunks[i]) + len(chunks[i+1]) + 1  # +1 for space
        if combined_size <= target_chunk_size:
            chunks[i] = chunks[i] + ' ' + chunks[i+1]
            chunks.pop(i+1)
        else:
            i += 1

    return chunks

In [None]:
test = test_chunker(out)

In [None]:
test[0]

"Good evening, everyone. Good evening. And thank you for taking the time out of your busy lives. Thank you. Thank you. Thank you. Thank you, everyone. So listen, one week from today, you will have the chance to make a decision that directly impacts your life, the life of your family, and the future of this country we love. And it will probably be the most important vote you ever cast. And this election is more than just a choice between two parties and two different candidates. It is a choice about whether we have a country rooted in freedom for every American or ruled by chaos and division. Many of you watching have probably already cast your ballots. But I know many others are still considering who to vote for or whether you'll vote at all. So tonight, I will speak to everyone about the choice and the stakes in this election. Look, we know who Donald Trump is. He is the person who stood at this very spot nearly four years ago and sent an armed mob to the United States Capitol to over

In [None]:
test[1]

"This is someone who is unstable, obsessed with revenge, consumed with grievance, and out for unchecked power. Donald Trump has spent a decade trying to keep the American people divided and afraid of each other. That is who he is. But America, I am here tonight to say that is not who we are. That is not who we are. You see, what Donald Trump has never understood is that e pluribus unum out of many one isn't just a phrase on a dollar bill. It is a living truth about the heart of our nation, our democracy, doesn't require us to agree on everything. In fact, we like good arguments from time to time. Just think of your own family, right? It is not the American way to not have disagreements. We don't shy away from robust debate. We like a good debate, don't we? We like a good debate. And the fact that someone disagrees with us does not make them the enemy within. They are family, neighbors, classmates, coworkers. They are fellow Americans. And as Americans, we rise and fall together. Americ

In [None]:
len(test[1])

2332