In [1]:
from kfp import dsl, compiler, components

In [2]:
# Remote locations of the four component definitions (YAML) that make up the
# pipeline. All of them track the `main` branch of the same repository.
downloader_component_url = "https://raw.githubusercontent.com/dpoulopoulos/kpf-discovery/refs/heads/main/_components/downloader.yaml"
transformer_component_url = "https://raw.githubusercontent.com/dpoulopoulos/kpf-discovery/refs/heads/main/_components/transformer.yaml"
scriptwriter_component_url = "https://raw.githubusercontent.com/dpoulopoulos/kpf-discovery/refs/heads/main/_components/scriptwriter.yaml"
performer_component_url = "https://raw.githubusercontent.com/dpoulopoulos/kpf-discovery/refs/heads/main/_components/performer.yaml"

# Load each definition into a callable component, in pipeline order:
# download -> transform -> write script -> perform.
download_document = components.load_component_from_url(downloader_component_url)
transform_document = components.load_component_from_url(transformer_component_url)
scriptwriter = components.load_component_from_url(scriptwriter_component_url)
performer = components.load_component_from_url(performer_component_url)

In [3]:
@dsl.pipeline
def document_to_podcast(
    document_url: str,
    podcast_output_path: str,
    file_type: str | None = None,
    audio_format: str | None = None,
    host_name: str | None = None,
    cohost_name: str | None = None,
    host_voice_profile: str | None = None,
    cohost_voice_profile: str | None = None,
) -> None:
    """Convert a document to a podcast.

    This pipeline downloads a document, processes it, converts it to a script,
    and finally converts the script to speech (podcast).

    Args:
        document_url: URL of the input document to download.
        podcast_output_path: Path to the generated podcast.
        file_type: The file type of the input document, e.g. .html, .txt, .pdf.
        audio_format: Output podcast file type, e.g. WAV, MP3.
        host_name: Name of the host.
        cohost_name: Name of the co-host.
        host_voice_profile: Voice profile for the host.
        cohost_voice_profile: Voice profile for the co-host.
    """
    # NOTE(review): `podcast_output_path` is accepted but not forwarded to any
    # step below. Confirm which component input (if any) should receive it.

    # Each step consumes an output of the previous one; that data dependency
    # already fixes the execution order, so no explicit `.after()` is needed.
    download_document_step = download_document(url=document_url)

    process_data_step = transform_document(
        html_file=download_document_step.outputs['html_file'],
        file_type=file_type,
    )

    scriptwriter_step = scriptwriter(
        processed_document=process_data_step.outputs['processed_document'],
        host_name=host_name,
        cohost_name=cohost_name,
    )

    performer_step = performer(
        podcast_script=scriptwriter_step.outputs['podcast_script'],
        host_voice_profile=host_voice_profile,
        cohost_voice_profile=cohost_voice_profile,
        file_type=audio_format,
    )

    # The script-writing and performing steps each request one NVIDIA GPU.
    for gpu_step in (scriptwriter_step, performer_step):
        gpu_step.set_accelerator_type("nvidia.com/gpu")
        gpu_step.set_accelerator_limit(1)

    # Caching is disabled on every step so each run re-executes all of them.
    for step in (
        download_document_step,
        process_data_step,
        scriptwriter_step,
        performer_step,
    ):
        step.set_caching_options(False)

In [4]:
# Compile the pipeline function into a YAML package; the relative path is
# resolved against the notebook's working directory.
compiler.Compiler().compile(
    pipeline_func=document_to_podcast,
    package_path='document_to_podcast.yaml',
)