# Build an App for Diarized Transcription Using Aana SDK

This notebook provides an example of getting a diarized transcription from a video. Please note that the pyannote diarization model is a gated model. Follow the [speaker diarization deployment docs](./../docs/pages/model_hub/speaker_recognition.md) to get access to the model.

As a first step, set the environment variables and make sure the Aana SDK is installed.

In [1]:
import os

# Access token for the gated pyannote model and the GPU index to expose
# to this process; both are set in a single update of the environment.
os.environ.update(
    {
        "HF_TOKEN": "<YOUR_HF_TOKEN_GOES_HERE>",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

Define the Whisper and PyannoteSpeakerDiarization deployments and the TranscribeVideoWithDiarEndpoint for diarized transcription, then register the deployments and the endpoint.

In [7]:
from aana.api.api_generation import Endpoint
from aana.core.models.speaker import PyannoteSpeakerDiarizationParams
from aana.core.models.video import VideoInput
from aana.core.models.whisper import WhisperParams
from aana.deployments.aana_deployment_handle import AanaDeploymentHandle
from aana.deployments.pyannote_speaker_diarization_deployment import (
    PyannoteSpeakerDiarizationConfig,
    PyannoteSpeakerDiarizationDeployment,
)
from aana.deployments.whisper_deployment import (
    WhisperComputeType,
    WhisperConfig,
    WhisperDeployment,
    WhisperModelSize,
    WhisperOutput,
)
from aana.integrations.external.yt_dlp import download_video
from aana.processors.remote import run_remote
from aana.processors.speaker import PostProcessingForDiarizedAsr
from aana.processors.video import extract_audio
from aana.sdk import AanaSDK

# Model deployments: Whisper for speech recognition and pyannote for
# speaker diarization.
whisper_config = WhisperConfig(
    model_size=WhisperModelSize.MEDIUM,
    compute_type=WhisperComputeType.FLOAT16,
)
asr_deployment = WhisperDeployment.options(
    num_replicas=1,
    # Remove ray_actor_options to run Whisper on a CPU; in that case also
    # change the compute type to float32.
    ray_actor_options={"num_gpus": 0.25},
    user_config=whisper_config.model_dump(mode="json"),
)

diarization_config = PyannoteSpeakerDiarizationConfig(
    model_id="pyannote/speaker-diarization-3.1"
)
diarization_deployment = PyannoteSpeakerDiarizationDeployment.options(
    num_replicas=1,
    # Remove ray_actor_options to run the diarization model on a CPU.
    ray_actor_options={"num_gpus": 0.1},
    user_config=diarization_config.model_dump(mode="json"),
)

# Name/instance pairs in the form expected by AanaSDK.register_deployment.
deployments = [
    {"name": name, "instance": instance}
    for name, instance in (
        ("asr_deployment", asr_deployment),
        ("diarization_deployment", diarization_deployment),
    )
]


# Define the endpoint to transcribe the video with diarization.
class TranscribeVideoWithDiarEndpoint(Endpoint):
    """Endpoint that returns a transcript whose segments carry speaker labels."""

    async def initialize(self):
        """Create handles to the ASR and diarization deployments, then finish setup."""
        self.asr_handle = await AanaDeploymentHandle.create("asr_deployment")
        self.diar_handle = await AanaDeploymentHandle.create("diarization_deployment")
        await super().initialize()

    async def run(
        self,
        video: VideoInput,
        whisper_params: WhisperParams,
        diar_params: PyannoteSpeakerDiarizationParams,
    ) -> WhisperOutput:
        """Download the video, transcribe and diarize its audio, and merge the results.

        Args:
            video: The video to process.
            whisper_params: Parameters passed to the ASR deployment; its
                ``word_timestamps`` field is overwritten with ``True``.
            diar_params: Parameters passed to the diarization deployment.

        Returns:
            A dict with a single ``"segments"`` key holding one dict per
            merged segment, limited to ``text``, ``time_interval`` and
            ``speaker``.
        """
        downloaded = await run_remote(download_video)(video_input=video)
        audio_track = extract_audio(video=downloaded)

        # Word-level timestamps from ASR are required to build the diarized transcript.
        whisper_params.word_timestamps = True

        asr_result = await self.asr_handle.transcribe(
            audio=audio_track, params=whisper_params
        )
        diar_result = await self.diar_handle.diarize(
            audio=audio_track, params=diar_params
        )

        # Combine the speaker segments with the transcription segments.
        merged_segments = PostProcessingForDiarizedAsr.process(
            diarized_segments=diar_result["segments"],
            transcription_segments=asr_result["segments"],
        )

        # Keep only the fields exposed in the response.
        output_segments = []
        for segment in merged_segments:
            output_segments.append(
                segment.model_dump(include=["text", "time_interval", "speaker"])
            )

        return {"segments": output_segments}


aana_app = AanaSDK(name="transcribe_video_app")

# Endpoint specifications in the form expected by AanaSDK.register_endpoint.
endpoints = [
    {
        "name": "transcribe_video",
        "path": "/video/transcribe",
        "summary": "Transcribe a video",
        "endpoint_cls": TranscribeVideoWithDiarEndpoint,
    },
]

# Register every deployment first, then every endpoint.
for deployment_spec in deployments:
    aana_app.register_deployment(**deployment_spec)

for endpoint_spec in endpoints:
    aana_app.register_endpoint(**endpoint_spec)

# Connects to the Ray cluster or starts a new one.
aana_app.connect(
    host="127.0.0.1",
    port=8000,
    show_logs=False,
)
# Runs the migrations to create the database tables.
aana_app.migrate()

INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


Start the App!

In [8]:
# Start the application.
aana_app.deploy()

Now that we have the app running, let's provide an example audio with multiple speakers for transcription.

In [9]:
import json

import requests

# Address of the transcription endpoint registered under /video/transcribe.
url = "http://127.0.0.1:8000/video/transcribe"

video = {
    # Video URL/path: Aana SDK supports URLs (including YouTube), file paths or even raw video data.
    "path": "../aana/tests/files/audios/sd_sample.wav",
    # Identifier attached to this media item.
    "media_id": "sd_sample",
}

data = {
    "whisper_params": {
        # Enable word-level timestamps.
        "word_timestamps": True,
    },
    "video": video,
}

# The request payload is sent as a JSON string in the "body" form field.
payload = {"body": json.dumps(data)}

# No streaming support is possible for diarized transcription, as it needs
# the complete ASR output beforehand.
response = requests.post(url, data=payload)

print(response.json())

{'segments': [{'text': ' Hello? Hello.', 'time_interval': {'start': 6.9, 'end': 8.14}, 'speaker': 'SPEAKER_01'}, {'text': " Oh, hello. I didn't know you were there.", 'time_interval': {'start': 8.4, 'end': 9.9}, 'speaker': 'SPEAKER_02'}, {'text': ' Neither did I.', 'time_interval': {'start': 10.22, 'end': 10.88}, 'speaker': 'SPEAKER_01'}, {'text': ' Okay. I thought, you know, I heard a beep. This is Diane in New Jersey.', 'time_interval': {'start': 10.9, 'end': 14.16}, 'speaker': 'SPEAKER_02'}, {'text': " And I'm Sheila in Texas, originally from Chicago.", 'time_interval': {'start': 14.4, 'end': 17.74}, 'speaker': 'SPEAKER_00'}, {'text': " Oh, I'm originally from Chicago also. I'm in New Jersey now, though.", 'time_interval': {'start': 18.16, 'end': 21.48}, 'speaker': 'SPEAKER_02'}, {'text': " Well, there isn't that much difference. At least, you know, they all call me a Yankee down here, so what can I say?", 'time_interval': {'start': 21.9, 'end': 28.38}, 'speaker': 'SPEAKER_00'}, {'t

Each transcribed segment comes with a corresponding speaker label as well!