In [1]:
import os
! cd .. && python -m pip install --user .

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Processing /home/jsteinbauer/ondewo/ondewo-s2t-client-python
Collecting grpcio==1.42.0
  Downloading grpcio-1.42.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 1.4 MB/s eta 0:00:01
[?25hCollecting grpcio-reflection==1.42.0
  Downloading grpcio_reflection-1.42.0-py3-none-any.whl (15 kB)
Collecting grpcio-tools==1.42.0
  Downloading grpcio_tools-1.42.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 2.2 MB/s eta 0:00:01
[?25hCollecting mypy-protobuf==3.00
  Downloading mypy_protobuf-3.0.0-py3-none-any.whl (15 kB)
Collecting ondewo-client-utils>=0.1.0
  Downloading ondewo_client_utils-0.1.0-py2.py3-none-any.whl (11 kB)
Collecting protobuf>=3.6.0
  Downloading protobuf-3.20.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 

In [3]:
import argparse
import wave

from ondewo.s2t import speech_to_text_pb2
from ondewo.s2t.client.client import Client
from ondewo.s2t.client.client_config import ClientConfig
from ondewo.s2t.client.services.speech_to_text import Speech2Text
from ondewo.s2t.speech_to_text_pb2 import ListS2tPipelinesRequest, Speech2TextConfig
from ondewo.s2t.speech_to_text_pb2 import S2tPipelineId

## Creating a client object
The example below shows how to create a speech to text client object from a client config. 
When setting *use_secure_channel=True*, a grpc certificate *grpc_cert* is required.

In [14]:
from typing import Optional

# NOTE(review): the later cells in this notebook read "audiofiles/sample_1.wav" directly and
# do not reference AUDIO_FILE — confirm which path is intended.
AUDIO_FILE: str = "examples/audiofiles/sample_1.wav"

# Connection settings of the speech-to-text server.
MAX_MESSAGE_LENGTH: int = 60000000
GRPC_HOST: str = "dgxstation"
GRPC_PORT: str = "50658"
CHANNEL: str = f"{GRPC_HOST}:{GRPC_PORT}"
# No certificate is configured by default, so the annotation has to allow None.
grpc_cert: Optional[str] = None

# gRPC channel options raising the send/receive message size limits.
# They are only needed when a channel is built by hand, e.g.
#   credentials = grpc.ssl_channel_credentials(root_certificates=cert)
#   channel = grpc.secure_channel(CHANNEL, credentials, options=options)
# and are not passed to the Client created below.
options = [
    ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
    ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
]

config: ClientConfig = ClientConfig(
    host=GRPC_HOST,
    port=GRPC_PORT,
    grpc_cert=grpc_cert,
)

print(config)

# A secure channel requires a certificate, so one is only requested when grpc_cert is set.
# With grpc_cert = None this evaluates to False, i.e. an insecure channel.
client: Client = Client(config=config, use_secure_channel=grpc_cert is not None)



ClientConfig(host='dgxstation', port='50658', grpc_cert=None)


## List all existing speech to text pipelines
All relevant configurations of the speech to text server are defined in a speech to text pipeline. 
A running server can store several such configs at the same time, so the client can choose which one to 
use when sending a request to transcribe an audio file or stream.

The example below shows how to list all available pipelines by calling the **s2t_service.list_s2t_pipelines()** function, which takes a **ListS2tPipelinesRequest()** as an argument.


In [5]:
# Handle to the speech-to-text service exposed by the client.
s2t_service: Speech2Text = client.services.speech_to_text

# Send an empty ListS2tPipelinesRequest and keep the configs carried by the response.
list_response = s2t_service.list_s2t_pipelines(request=ListS2tPipelinesRequest())
pipelines = list_response.pipeline_configs

# Print every pipeline config in full.
for pipeline in pipelines:
    print(pipeline)

id: "gwt001"
description {
  language: "de"
  pipeline_owner: "ondewo"
  domain: "logistics"
}
active: true
inference {
  ctc_acoustic_models {
    type: "quartznet"
    quartznet {
      config_path: "acoustic_models/de/quartznet_telephone/quartznet_telephone.yaml"
      load_type: "pt_files"
      pt_files {
        path: "acoustic_models/de/quartznet_telephone/"
        step: "2500"
      }
      ckpt_file {
        path: "acoustic_models/de/ckpt_test/test.ckpt"
      }
      use_gpu: true
    }
    quartznet_triton {
      config_path: "acoustic_models/model_configs/de/quartznet15x5_de.yaml"
      triton_url: "localhost:8001"
      triton_model: "quartznet"
    }
    wav2vec {
      model_path: "acoustic_models/en/wav2vec"
      use_gpu: true
    }
  }
  language_models {
    path: "language_models/gwt/"
    beam_size: 64
    default_lm: "gwt.binary"
    beam_search_scorer_alpha: 2.0
    beam_search_scorer_beta: 1.0
  }
}
streaming_server {
  output_style: "simple"
  streaming_spee

In [15]:
from typing import List, Optional

def find_pipeline_for_language(pipelines: List[Speech2TextConfig], language: str) -> Optional[Speech2TextConfig]:
    """
    Return the first speech to text pipeline for the requested language.

    Args:
        pipelines: Pipeline configs to search; they are checked in the given order.
        language: Value compared for equality against ``pipeline.description.language``.

    Returns:
        The first pipeline whose description language equals ``language``, or
        ``None`` if no pipeline matches (including when ``pipelines`` is empty).
    """
    matching_pipelines = (
        candidate for candidate in pipelines
        if candidate.description.language == language
    )
    # The explicit default makes the "no match" result visible instead of relying on
    # the function falling off its end.
    return next(matching_pipelines, None)

# Look up one pipeline per language; either name is None when the server has no
# pipeline for that language.
english_pipeline, german_pipeline = (
    find_pipeline_for_language(pipelines=pipelines, language=language_code)
    for language_code in ('en', 'de')
)

In [16]:
# Show the ids of all pipelines whose description language is English.
for pipeline in pipelines:
    if pipeline.description.language != 'en':
        continue
    print(pipeline.id)

general_english
wav2vec_base_all_atc
wav2vec_base_cz_atc
wav2vec_large_czech_atc_augmented
wav2vec_large_all_atc
wav2vec_large_cz_atc


## Load in example audio file
This audio file will be used in the following transcription examples.

In [17]:
import wave 

# Location of the sample recording, relative to the notebook's working directory.
audio_file_path = "audiofiles/sample_1.wav"

# Read all frames of the WAV file into memory as raw bytes.
with wave.open(audio_file_path, "rb") as wav_reader:
    frame_count = wav_reader.getnframes()
    audio: bytes = wav_reader.readframes(frame_count)

## Send a transcribe file request to the server
In general, there are two different endpoints for audio transcriptions:
1. **Transcribe an audio file** 
2. **Transcribe an audio stream**

### Transcribe an audio file
In this example, we create a **TranscribeFileRequest**, including the audio file (as bytes) and a **TranscribeRequestConfig** message, including the speech to text pipeline id, as well as optional additional parameters.
The request message is then used as an argument to the **s2t_service.transcribe_file()**, which calls the corresponding endpoint.

### TranscribeRequestConfig
The TranscribeRequestConfig gives you maximal control in configuring the s2t server. 

It contains the following fields:

1. **s2t_pipeline_id** (string): The pipeline id
2. **ctc_decoding** (speech_to_text_pb2.CTCDecoding): The CTC decoding type - options are BEAM_SEARCH_WITH_LM, GREEDY
3. **language_model_name** (string): The name of the language model
4. **post_processing** (speech_to_text_pb2.PostProcessingOptions): Specifies options for post processing
5. **utterance_detection** (speech_to_text_pb2.UtteranceDetectionOptions)
6. **voice_activity_detection**: One of speech_to_text_pb2.Pyannote or speech_to_text_pb2.Matchbox
7. **return_options** (speech_to_text_pb2.TranscriptionReturnOptions): The options on how to return responses

In [19]:
import time
# The English pipeline found earlier is used for the transcription requests.
pipeline: Speech2TextConfig = english_pipeline

# Per-request settings: which pipeline to use and greedy CTC decoding.
file_request_config = speech_to_text_pb2.TranscribeRequestConfig(
    s2t_pipeline_id=pipeline.id,
    ctc_decoding=speech_to_text_pb2.CTCDecoding.GREEDY,
)

# The request carries the audio bytes together with the config.
request = speech_to_text_pb2.TranscribeFileRequest(audio_file=audio, config=file_request_config)

# Call the transcribe-file endpoint and print every returned transcription.
transcribe_response = s2t_service.transcribe_file(request=request)

for transcription_message in transcribe_response.transcriptions:
    print(f"File transcript: {transcription_message.transcription}")

File transcript: hello i would like to order one large bitza with ham and cheese no mushrooms please 


### Transcribe an audio stream
In this example, we transcribe an audio stream by streaming a **TranscribeStreamRequest**, including audio chunks (as bytes) and a **TranscribeRequestConfig** message, including the speech to text pipeline id, as well as optional additional parameters.
The request message generator is then used as an argument to the **s2t_service.transcribe_stream()**, which calls the corresponding endpoint.

**Important**: After the TranscribeRequestConfig has been set once, it does not have to be sent with each new streaming request (this can help to save bandwidth). The old TranscribeRequestConfig remains in effect until a new one is sent.

### Option 1.: Transcribe full utterances only (default)
In this mode, audio chunks are concatenated until a full utterance is accumulated (an utterance is considered "finished" if no voice is detected in the audio signal for `end_of_utterance_threshold` seconds).

In [None]:
from typing import Iterator
from streaming_example import get_streaming_audio, create_streaming_request

# Iterator over the audio chunks of the sample recording.
audio_stream: Iterator[bytes] = get_streaming_audio("audiofiles/sample_1.wav")

# Iterator of streaming requests for the selected pipeline; transcribe_not_final=False
# selects the "full utterances only" mode described above.
streaming_request: Iterator[speech_to_text_pb2.TranscribeStreamRequest] = create_streaming_request(
    audio_stream=audio_stream, pipeline_id=pipeline.id, transcribe_not_final=False
)

# Stream the requests to the server; the responses arrive as an iterator as well.
response_gen: Iterator[speech_to_text_pb2.TranscribeStreamResponse] = s2t_service.transcribe_stream(
    request_iterator=streaming_request
)

# Flatten the responses into (response index, transcription text) pairs and print them
# as they arrive.
numbered_transcriptions = (
    (chunk_index, message.transcription)
    for chunk_index, chunk in enumerate(response_gen)
    for message in chunk.transcriptions
)
for chunk_index, text in numbered_transcriptions:
    print(f"{chunk_index}. response_chunk: {text}")

### Option 2.: Transcribe not final
In this mode, audio chunks are transcribed as soon as a minimal length of voice signal is accumulated.

In [None]:
from typing import Iterator
from streaming_example import get_streaming_audio, create_streaming_request

# Iterator over the audio chunks of the sample recording.
audio_stream: Iterator[bytes] = get_streaming_audio("audiofiles/sample_1.wav")

# Iterator of streaming requests for the selected pipeline; transcribe_not_final=True
# selects the "transcribe not final" mode described above.
streaming_request: Iterator[speech_to_text_pb2.TranscribeStreamRequest] = create_streaming_request(
    audio_stream=audio_stream, pipeline_id=pipeline.id, transcribe_not_final=True
)

# Stream the requests to the server; the responses arrive as an iterator as well.
response_gen: Iterator[speech_to_text_pb2.TranscribeStreamResponse] = s2t_service.transcribe_stream(
    request_iterator=streaming_request
)

# Flatten the responses into (response index, transcription text) pairs and print them
# as they arrive.
numbered_transcriptions = (
    (chunk_index, message.transcription)
    for chunk_index, chunk in enumerate(response_gen)
    for message in chunk.transcriptions
)
for chunk_index, text in numbered_transcriptions:
    print(f"{chunk_index}. response_chunk: {text}")

## Pipeline CRUD
In the following, we demonstrate how to perform CRUD (Create, Retrieve, Update, Delete) operations on pipelines.

## GetS2TPipeline

The example below shows how to get a pipeline by calling the **s2t_service.get_s2t_pipeline()** function, which takes a **S2tPipelineId** as an argument.

In [None]:
# Fetch a single pipeline config by its id through the GetS2TPipeline endpoint.
# NOTE(review): 'quarznet_en' is not among the English pipeline ids printed earlier in
# this notebook — confirm that this id exists on the target server.
pipeline_id = 'quarznet_en'
pipeline_request = S2tPipelineId(id=pipeline_id)
pipeline = s2t_service.get_s2t_pipeline(request=pipeline_request)

# List the pipelines again, this time with registered_only=True, and report how many there are.
registered_request = ListS2tPipelinesRequest(registered_only=True)
pipelines = s2t_service.list_s2t_pipelines(request=registered_request).pipeline_configs
print(f"Number of pipelines: {len(pipelines)}")

## DeleteS2TPipeline

The example below shows how to delete a pipeline by calling the **s2t_service.delete_s2t_pipeline()** function, which takes a **S2tPipelineId** as an argument.

In [None]:
# Delete the pipeline identified by pipeline_id through the DeleteS2TPipeline endpoint.
delete_request = S2tPipelineId(id=pipeline_id)
deleted_pipeline = s2t_service.delete_s2t_pipeline(request=delete_request)

# List the registered pipelines again to show the effect of the deletion.
registered_request = ListS2tPipelinesRequest(registered_only=True)
pipelines = s2t_service.list_s2t_pipelines(request=registered_request).pipeline_configs
print(f"Number of pipelines after pipeline deletion: {len(pipelines)}")


## CreateS2TPipeline

The example below shows how to create a pipeline by calling the **s2t_service.create_s2t_pipeline()** function, which takes a **pipeline** as an argument.

In [None]:
# Create a pipeline through the CreateS2TPipeline endpoint, sending the config held in `pipeline`.
# The response is stored under its own name so that `pipeline` keeps referring to the config
# that was sent instead of being replaced by the endpoint's response.
# NOTE(review): presumably the response is an id message rather than a config — confirm
# against the service definition.
create_response = s2t_service.create_s2t_pipeline(request=pipeline)

# List the registered pipelines again to show the effect of the creation.
pipelines = s2t_service.list_s2t_pipelines(request=ListS2tPipelinesRequest(registered_only=True)).pipeline_configs
print(f"Number of pipelines after pipeline creation: {len(pipelines)}")

In [None]:
# Change one setting of an existing pipeline through the UpdateS2TPipeline endpoint.
pipeline_id = 'quarznet_en'
pipeline = s2t_service.get_s2t_pipeline(request=S2tPipelineId(id=pipeline_id))

# Show the value before the update.
old_threshold = pipeline.streaming_server.streaming_speech_recognition.end_of_utterance_threshold
print(f"Old end_of_utterance_threshold: {old_threshold}")

# Modify the fetched config locally, then send the whole config back to the server.
pipeline.streaming_server.streaming_speech_recognition.end_of_utterance_threshold = 1.5
s2t_service.update_s2t_pipeline(request=pipeline)

# Fetch the pipeline again and show the value the server now reports.
new_pipeline = s2t_service.get_s2t_pipeline(request=S2tPipelineId(id=pipeline_id))
new_threshold = new_pipeline.streaming_server.streaming_speech_recognition.end_of_utterance_threshold
print(f"New end_of_utterance_threshold: {new_threshold}")