<a href="https://colab.research.google.com/github/machinelearnear/nelson-openai-master-plan/blob/main/orchestrator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
# !pip install -r requirements.txt

In [1]:
import os
import subprocess
import argparse
import json
import re
import yt_dlp
import unicodedata

In [15]:
def parse_transcript(file_path, guest_name=None, host_name="Tomas Rebord", save_to_disk=True):
    """
    Parses an SRT transcript file and converts it into a JSON format.
    
    This function takes an SRT file, reads its content, and converts it into
    a JSON format where each entry has the speaker's name, start time, and text.
    Optionally, the JSON output can be saved to disk.
    
    :param file_path: str, path to the input SRT file.
    :param guest_name: str, name of the guest speaker.
    :param host_name: str, optional, name of the host speaker (default: "Tomas Rebord").
    :param save_to_disk: bool, optional, whether to save the JSON output to disk (default: True).
    
    :return: str, JSON-formatted transcript.
    """
    if guest_name is None:
        guest_name = " ".join(file_path.split('/')[1].split("_")[2:]).title()
    
    def read_srt_file(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            srt_content = file.read()
        return srt_content

    def parse_srt(input):
        entries = input.strip().split("\n\n")
        output_lines = []
        last_speaker = None
        
        for entry in entries:
            lines = entry.strip().split("\n")
            time_start = lines[1].split(" --> ")[0]
            speaker, text = re.match(r'\[(.+)\]:\s*(.+)', lines[2]).groups()
            
            if last_speaker and speaker != last_speaker:
                output_lines.append("")
            
            if output_lines and speaker == last_speaker:
                output_lines[-1] += " " + text
            else:
                output_lines.append(f"[{speaker}, {time_start}] {text}")
            
            last_speaker = speaker

        return "\n".join(output_lines)

    def convert_to_seconds(timestamp):
        hours, minutes, seconds_and_milliseconds = timestamp.split(':')
        seconds, milliseconds = seconds_and_milliseconds.split(',')
        return int(hours) * 3600 + int(minutes) * 60 + int(seconds)

    srt_string = read_srt_file(file_path)
    transcript = parse_srt(srt_string)
    pattern = r'\[(SPEAKER_\d{2}),\s(\d{2}:\d{2}:\d{2},\d{3})\]\s(.*?)\n\n'
    matches = re.findall(pattern, transcript, re.DOTALL)
    print()

    result = []
    for match in matches:
        speaker, timestamp, text = match
        speaker_name = host_name if speaker == "SPEAKER_01" else guest_name
        formatted_text = "{}: {}".format(speaker_name, text.replace('\n', ' '))
        result.append({
            "text": formatted_text,
            "start": convert_to_seconds(timestamp)
        })

    json_output = json.dumps(result, ensure_ascii=False)

    if save_to_disk:
        output_file_path = os.path.splitext(file_path)[0] + ".json"
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(json_output)

    return json_output

In [16]:
parsed_json = parse_transcript("../output/episode_050_alberto_fernandez/episode_050_alberto_fernandez.word.srt", guest_name="AF")
print(parsed_json[:250])

SPEAKER_01
[{"text": "Tomas Rebord: cuando digas 3, 2, 1, estamos en principio, nuevamente, en otro episodio del método bienvenidos y bienvenidas. Voy a arrancar de la manera que suelo hacer ahora, que es agradeciendo primero. Eso es un gran recurso para no olv


In [49]:
from helper import convert_youtube_title, parse_transcript

ImportError: cannot import name 'parse_transcript' from 'helper' (/notebooks/transcribe-podcast-using-whisperx/helper.py)

In [3]:
url = 'https://www.youtube.com/watch?v=OA1biHKSyTw' # El Método Rebord #48 - Alejandro Dolina

In [4]:
youtube_video_name = convert_youtube_title(url)
output_dir = f'output/{youtube_video_name}'
output_filename = f'{output_dir}/{youtube_video_name}.wav'

[youtube] Extracting URL: https://www.youtube.com/watch?v=OA1biHKSyTw
[youtube] OA1biHKSyTw: Downloading webpage
[youtube] OA1biHKSyTw: Downloading android player API JSON


In [5]:
!python scripts/0_download_wav_from_youtube.py $url --output-dir $output_dir

[youtube] Extracting URL: https://www.youtube.com/watch?v=OA1biHKSyTw
[youtube] OA1biHKSyTw: Downloading webpage
[youtube] OA1biHKSyTw: Downloading android player API JSON
[youtube] Extracting URL: https://www.youtube.com/watch?v=OA1biHKSyTw
[youtube] OA1biHKSyTw: Downloading webpage
[youtube] OA1biHKSyTw: Downloading android player API JSON
[info] OA1biHKSyTw: Downloading 1 format(s): 140
[dashsegments] Total fragments: 12
[download] Destination: output/episode_048_alejandro_dolina/episode_048_alejandro_dolina
[K[download] 100% of  114.12MiB in [1;37m00:00:05[0m at [0;32m21.60MiB/s[0m0;33m00:00[0m (frag 12/12)2)
[FixupM4a] Correcting container of "output/episode_048_alejandro_dolina/episode_048_alejandro_dolina"
[ExtractAudio] Destination: output/episode_048_alejandro_dolina/episode_048_alejandro_dolina.wav
Deleting original file output/episode_048_alejandro_dolina/episode_048_alejandro_dolina (pass -k to keep)


In [6]:
hf_token = "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" # https://huggingface.co/settings/tokens
num_speakers = 2

In [55]:
!whisperx $output_filename --hf_token $hf_token \
        --model small --language es --align_model WAV2VEC2_ASR_LARGE_LV60K_960H \
        --diarize --min_speakers $num_speakers --max_speakers $num_speakers \
        --output_dir $output_dir

In [None]:
srt_filepath = f"{output_dir}/{youtube_video_name}.word.srt"

In [None]:
!python scripts/1_parse_whisperx_output.py $srt_filepath