<a href="https://colab.research.google.com/github/machinelearnear/transcribe-podcast-using-whisperx/blob/main/notebooks/whisperx_transcribir_podcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Transcribir un video de YouTube identificando speakers y timestamps

In [None]:
import os

# Hugging Face access token, later passed to the whisperx CLI via --hf_token.
# Prefer the HF_TOKEN environment variable so the secret does not have to be pasted into
# (and saved with) the notebook; the placeholder is only a fallback to be replaced by hand.
hf_token = os.environ.get("HF_TOKEN") or "<aca-pone-tu-hf-token>" # https://huggingface.co/settings/tokens
# Video whose audio will be downloaded and transcribed.
URL = 'https://www.youtube.com/watch?v=OA1biHKSyTw' # El Método Rebord #48 - Alejandro Dolina

### instalar todo

In [1]:
!pip install git+https://github.com/m-bain/whisperx.git;
!python3 -m pip install -U yt-dlp;

### bajarse el podcast de youtube a un `wav`

In [None]:
import json
import yt_dlp

# Fetch only the video's metadata (download=False); nothing is written to disk here.
# The resulting `info` dict is used afterwards to read the video title.
# ℹ️ See help(yt_dlp.YoutubeDL) for a list of available options and public functions
ydl_opts = {}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(URL, download=False)

    # Optional debugging aid, left disabled:
    # # ℹ️ ydl.sanitize_info makes the info json-serializable
    # print(json.dumps(ydl.sanitize_info(info)))

In [None]:
import re

def convert_title(title):
    """Build a filesystem-friendly episode slug from a video title.

    The title is expected to look like ``"<show> #<number> - <guest name>"``.

    Args:
        title: Video title containing ``#<digits>`` and a ``-`` separator.

    Returns:
        ``"episode_<NNN>_<guest>"`` in lower case, where ``<NNN>`` is the episode
        number zero-padded to three digits and ``<guest>`` is the text after the
        first ``-`` reduced to its words joined by underscores.

    Raises:
        ValueError: If the title has no ``#<digits>`` episode number or no ``-``.
    """
    # Extract the episode number using a regex pattern
    episode_match = re.search(r'#(\d+)', title)
    if episode_match is None:
        raise ValueError("Episode number not found in the input string")
    episode_number = episode_match.group(1).zfill(3)

    # Everything after the first hyphen is the guest part of the title
    _, separator, names = title.partition('-')
    if not separator:
        raise ValueError("Separator (-) not found in the input string")

    # Keep only word characters: the slug becomes a file name and is interpolated into a
    # shell command, so punctuation such as "/" or "(" must not leak into it.
    words = re.findall(r'\w+', names)

    # Combine the episode number and names into the desired format
    return f"episode_{episode_number}_{'_'.join(words)}".lower()

# Derive the output names from the title found in the extracted video info.
input_title = ydl.sanitize_info(info)["title"]
output_title = convert_title(input_title)
# Path of the wav file under output/.
# NOTE(review): presumably matches what the download cell produces — confirm.
output_filename = f'output/{output_title}.wav'
print(output_title)  # Output: episode_048_alejandro_dolina

In [None]:
# Download options: store the result under output/<output_title> and extract the audio as wav.
ydl_opts = {
    # No extension in the template.
    # NOTE(review): presumably the postprocessor appends ".wav" — confirm.
    'outtmpl': f'output/{output_title}',
    # Format preference string handed to yt-dlp.
    'format': 'm4a/bestaudio/best',
    # ℹ️ See help(yt_dlp.postprocessor) for a list of available Postprocessors and their arguments
    'postprocessors': [{  # Extract audio using ffmpeg
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }]
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # NOTE(review): error_code is stored but never checked by the visible cells.
    error_code = ydl.download(URL)

### correr `whisperx` (basado en `openai/whisper`)

In [None]:
def extract_guest_name(title):
    """Return the text that follows the first ``-`` in ``title``, stripped.

    Args:
        title: Episode title of the form ``"<show> - <guest>"``.

    Returns:
        The part after the first hyphen without surrounding whitespace.

    Raises:
        ValueError: If ``title`` contains no ``-``.
    """
    _, hyphen, after_hyphen = title.partition('-')

    # partition() yields an empty separator when the hyphen is absent
    if not hyphen:
        raise ValueError("Separator (-) not found in the input string")

    return after_hyphen.strip()

# Use the title already read from the video info (input_title) rather than a hard-coded
# copy, so that changing URL cannot leave a stale guest name behind.
guest_name = extract_guest_name(input_title)
# The host is the same for every episode of the show.
host_name = "Tomás Rebord"
print(guest_name)  # Output: Alejandro Dolina

In [None]:
!whisperx $output_filename --hf_token $hf_token --model small --language es --vad_filter --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --diarize --min_speakers 2 --max_speakers 2

### detectar distintos speakers y generar archivos

In [None]:
import re

def parse_srt(input):
    """Collapse a speaker-tagged SRT transcript into one line per speaker turn.

    Each cue is expected to consist of an index line, a ``start --> end`` line
    and a text line of the form ``[SPEAKER]: text``. Consecutive cues from the
    same speaker are merged into a single line prefixed with
    ``[speaker, start-of-first-cue]``; turns are separated by a blank line.

    Args:
        input: Full SRT content as a string (LF or CRLF line endings).

    Returns:
        The merged transcript, or an empty string when there are no usable cues.
    """
    output_lines = []
    last_speaker = None

    # Normalise Windows line endings so cues are still separated by a blank line.
    entries = input.replace("\r\n", "\n").strip().split("\n\n")

    for entry in entries:
        lines = entry.strip().split("\n")
        if len(lines) < 3:
            # Empty input or a truncated cue: nothing to attribute.
            continue

        time_start = lines[1].split(" --> ")[0]
        # A cue's text may span several lines; fold them into one.
        cue_text = " ".join(part.strip() for part in lines[2:]).strip()

        # Non-greedy speaker group so a later "]:" inside the text is not swallowed.
        tagged = re.match(r'\[(.+?)\]:\s*(.*)', cue_text)
        if tagged is None:
            # Cue without a speaker tag: keep its words with the current turn instead
            # of crashing; it is dropped only if no turn has started yet.
            if output_lines and cue_text:
                output_lines[-1] += " " + cue_text
            continue

        speaker, text = tagged.groups()
        if not text:
            # A tag with no words carries nothing to add to the transcript.
            continue

        if output_lines and speaker == last_speaker:
            # Same speaker keeps talking: extend the current turn.
            output_lines[-1] += " " + text
        else:
            if last_speaker:
                # Blank line between two different speakers' turns.
                output_lines.append("")
            output_lines.append(f"[{speaker}, {time_start}] {text}")

        last_speaker = speaker

    return "\n".join(output_lines)

In [None]:
def read_srt_file(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as srt_file:
        return srt_file.read()

In [None]:
# Read "<output_filename>.word.srt" and print the transcript merged per speaker turn.
# NOTE(review): assumes the whisperx run wrote that file under output/ — confirm.
srt_string = read_srt_file(f"{output_filename}.word.srt")
print(parse_srt(srt_string))

In [None]:
import re

def process_transcript(input_str):
    """Rewrite ``[SPEAKER_xx, hh:mm:ss,mmm] text`` turns as ``SPEAKER_xx ha dicho: text``.

    The timestamp is discarded; each turn ends with a newline. Any text before
    the first speaker marker is ignored.

    Args:
        input_str: Transcript containing ``[SPEAKER_<n>, <timestamp>]`` markers.

    Returns:
        The concatenated turns, or an empty string if no marker is present.
    """
    marker = r'\[(SPEAKER_\d+), \d{2}:\d{2}:\d{2},\d{3}\]'

    # Splitting on a pattern with one capture group yields
    # [leading text, speaker, text, speaker, text, ...]; skip the leading text.
    pieces = re.split(marker, input_str)[1:]

    speakers = pieces[0::2]
    utterances = pieces[1::2]
    return ''.join(
        f"{who} ha dicho: {said.strip()}\n"
        for who, said in zip(speakers, utterances)
    )

def replace_speaker_names(input_str, host_name=host_name, guest_name=guest_name):
    """Substitute the diarization labels in ``input_str`` with real names.

    ``SPEAKER_00`` becomes ``host_name`` and ``SPEAKER_01`` becomes
    ``guest_name``, in that order, everywhere they occur in the text.

    Args:
        input_str: Transcript containing ``SPEAKER_00`` / ``SPEAKER_01`` labels.
        host_name: Replacement for ``SPEAKER_00``. The default is the value the
            global ``host_name`` had when this ``def`` was executed.
        guest_name: Replacement for ``SPEAKER_01``. The default is the value the
            global ``guest_name`` had when this ``def`` was executed.

    Returns:
        The text with both labels replaced.
    """
    label_to_person = (
        ("SPEAKER_00", host_name),
        ("SPEAKER_01", guest_name),
        # Extend with further (label, name) pairs if needed
    )

    renamed = input_str
    for label, person in label_to_person:
        renamed = renamed.replace(label, person)
    return renamed

In [None]:
# Full pipeline: merge the cues per speaker turn, rewrite each turn as
# "<speaker> ha dicho: <text>", then replace the SPEAKER_xx labels with the host/guest names.
output_str = replace_speaker_names(process_transcript(parse_srt(srt_string)))
print(output_str)

In [None]:
def save_to_txt_file(input_str, filename="output.txt"):
    """Write ``input_str`` to ``filename`` as UTF-8 text, overwriting any existing file."""
    with open(filename, mode="w", encoding="utf-8") as txt_file:
        txt_file.write(input_str)

In [None]:
# Persist the named transcript to "<output_title>_with_speakers_timestamped.txt" (relative path).
# NOTE(review): process_transcript discards the timestamps, so despite the file name the
# saved text contains speaker names only.
save_to_txt_file(output_str, f"{output_title}_with_speakers_timestamped.txt")

In [1]:
!pip install langflow

Collecting langflow
  Downloading langflow-0.0.46.tar.gz (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 5.9 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting google-api-python-client<3.0.0,>=2.79.0
  Downloading google_api_python_client-2.83.0-py2.py3-none-any.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 15.9 MB/s eta 0:00:01
[?25hCollecting types-pyyaml<7.0.0.0,>=6.0.12.8
  Downloading types_PyYAML-6.0.12.9-py3-none-any.whl (14 kB)
Collecting gunicorn<21.0.0,>=20.1.0
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 11.2 MB/s eta 0:00:01
[?25hCollecting beautifulsoup4<5.0.0,>=4.11.2
  Downloading beautifulsoup4-4.12.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 43.7 MB/s eta 0:00:01
[?25hCollecting google-search-results<3.

In [None]:
# Launch the langflow CLI with host 0.0.0.0 and port 6006.
# NOTE(review): 0.0.0.0 presumably exposes the server on every network interface, and the
# command appears to keep running (server logs follow) — confirm both are intended.
!langflow --host 0.0.0.0 --port 6006

[2023-04-01 23:58:15 +0000] [262] [INFO] Starting gunicorn 20.1.0
[2023-04-01 23:58:15 +0000] [262] [INFO] Listening at: http://0.0.0.0:6006 (262)
[2023-04-01 23:58:15 +0000] [262] [INFO] Using worker: uvicorn.workers.UvicornWorker
[2023-04-01 23:58:15 +0000] [266] [INFO] Booting worker with pid: 266
[2023-04-01 23:58:15 +0000] [266] [INFO] Started server process [266]
[2023-04-01 23:58:15 +0000] [266] [INFO] Waiting for application startup.
[2023-04-01 23:58:15 +0000] [266] [INFO] Application startup complete.
