In [None]:
import json

import httpx

from sgu_tool.main import (
    ensure_directories,
    get_podcast_episodes,
    get_rss_feed_entries,
    load_models,
    merge_transcript_and_diarization,
)

In [None]:
print("Starting...")
ensure_directories()
whisper_model, pipeline = load_models()

async with httpx.AsyncClient(follow_redirects=True) as client:
    feed_entries = await get_rss_feed_entries(client)
    episodes = get_podcast_episodes(feed_entries)

    for episode in episodes:
        audio_file = await episode.get_audio_file(client)

        transcription = episode.get_transcription(audio_file, whisper_model)
        episode.transcription_file.write_text(json.dumps(transcription))
        print("Transcription saved.")

        # TODO: Get some stats from the transcription to feed to diarization (ex. max number of speakers)

        diarization = episode.get_diarization(audio_file, pipeline)
        episode.diarization_file.write_text(json.dumps(diarization))
        print("Diarization saved.")

        diarized_transcript = merge_transcript_and_diarization(transcription, diarization)
        episode.diarized_transcript_file.write_text(json.dumps(diarized_transcript))
        print("Diarized transcript saved.")

        # Maybe upload it somewhere or something?
        break

In [None]:
from pathlib import Path
import json
from sgu_tool.main import Transcription

# Read one previously saved transcription back from disk.
transcription_path = Path("../data/transcriptions/0889.json")
transcription: Transcription = json.loads(transcription_path.read_text("utf-8"))

# Keep only the text of the first 100 segments, joined with single spaces.
intro_segments = transcription["segments"][:100]
intro_text = " ".join(segment["text"] for segment in intro_segments)

In [None]:
import spacy_transformers
import en_core_web_trf
import spacy

# Load the `en_core_web_trf` pipeline once; the resulting `nlp` callable is applied to the
# transcript text in a later cell to obtain named entities.
# NOTE(review): loading is presumably expensive, which would explain the dedicated cell — confirm.
nlp = en_core_web_trf.load()

In [None]:
# Run the pipeline over the intro text and keep the text of every entity labelled PERSON,
# in the order the entities appear in the document.
doc = nlp(intro_text)

names = [entity.text for entity in doc.ents if entity.label_ == "PERSON"]

# Collapse the mentions so that only the most specific form of each name survives.
# "More specific" here means plain string-prefix extension: one kept name starts with the other.
valid_names: list[str] = []

for candidate in names:
    # A kept name already equals or extends this candidate: nothing new to record.
    if any(kept.startswith(candidate) for kept in valid_names):
        continue

    for position, kept in enumerate(valid_names):
        if candidate.startswith(kept):
            # The candidate extends a kept name: swap it in at the same position.
            valid_names[position] = candidate
            break
    else:
        # No kept name is a prefix of the candidate: treat it as a new, distinct name.
        valid_names.append(candidate)

valid_names