In [None]:
# GCP project ID used to call the Speech-to-Text API (the API must be enabled).
project_id = ""

# GCS URI of the audio files, in the form gs://bucket_name/path. Files must be
# stored in subdirectories named after the language code of the video.
audio_files_directory = ""

# GCS URI where generated transcripts are stored, in the form gs://bucket_name/path.
gcs_transcripts_directory = ""

# GCP model to use for transcription (e.g. chirp, long).
model = "chirp"

# Region to use for Chirp (e.g. europe-west4, us-central1 or asia-southeast1).
chirp_region = "us-central1"

In [None]:
!(pip install google-cloud-speech)

In [None]:
import argparse
import re
from typing import List
from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2.types import cloud_speech
from google.cloud import speech_v2

async def transcribe_batch_multiple_files_v2(
    project_id: str,
    file_array: List[dict],
    gcs_output_path: str,
    model: str,
):
    """Submit one Speech-to-Text v2 BatchRecognize request and wait for it to finish.

    Args:
        project_id: GCP project used to build the recognizer resource name.
        file_array: One dict per audio file, each with the keys ``"gcs_uri"``
            and ``"language_code"``.
        gcs_output_path: GCS URI prefix; results are written under
            ``{gcs_output_path}/{model}/json``.
        model: ``"chirp"`` (served from the regional endpoint named by the
            notebook-level ``chirp_region``) or ``"long"`` (served from the
            default endpoint with the ``global`` location).

    Returns:
        None. The operation response is printed once it completes.

    Raises:
        ValueError: If ``model`` is neither ``"chirp"`` nor ``"long"``.
    """
    # Fail fast on an unknown model: previously neither branch ran, leaving
    # `client` and `location` unbound and surfacing as an UnboundLocalError.
    if model == "chirp":
        client = speech_v2.SpeechAsyncClient(
            client_options=ClientOptions(
                api_endpoint=f"{chirp_region}-speech.googleapis.com",
            )
        )
        location = chirp_region
    elif model == "long":
        client = speech_v2.SpeechAsyncClient()
        location = "global"
    else:
        raise ValueError(
            f"Unsupported model {model!r}: expected 'chirp' or 'long'"
        )

    # Request-level configuration shared by every file in the batch.
    default_config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model=model,
        features=cloud_speech.RecognitionFeatures(
            enable_automatic_punctuation=True,
        ),
    )

    # Each file carries a config that sets only its own language code.
    # NOTE(review): presumably this overrides just `language_codes` of the
    # request-level config -- confirm against the BatchRecognizeFileMetadata docs.
    files = [
        cloud_speech.BatchRecognizeFileMetadata(
            uri=file["gcs_uri"],
            config=cloud_speech.RecognitionConfig(
                language_codes=[file["language_code"]],
            ),
        )
        for file in file_array
    ]

    request = speech_v2.BatchRecognizeRequest(
        recognizer=f"projects/{project_id}/locations/{location}/recognizers/_",
        config=default_config,
        files=files,
        recognition_output_config=cloud_speech.RecognitionOutputConfig(
            gcs_output_config=cloud_speech.GcsOutputConfig(
                uri=gcs_output_path+"/"+model+"/json",
            ),
        ),
    )
    # With the async client, batch_recognize returns an awaitable that yields
    # the operation; the same 530 s timeout is applied to the request and to
    # the wait for the operation result.
    operation = client.batch_recognize(request=request,timeout=530)
    print("Waiting for operation to complete...")
    response = await (await operation).result(timeout=530)
    print(response)

In [None]:
import re

language_directories=!(gsutil ls {audio_files_directory})
file_array=[]

for language_directory in language_directories:
    
    language_code=re.search("/([^/]+)/$",language_directory).group(1)
    files=!(gsutil ls {language_directory})
    for uri in files:
        
        file_array.append({"gcs_uri": uri,"language_code": language_code})

print(file_array)
number_of_files=len(file_array)
print(f"Number of files: {number_of_files}")

In [None]:
import time
import asyncio

def split_array_into_chunks(array, chunk_size):
    """
    Break a sequence into consecutive slices of at most ``chunk_size`` items.

    Args:
        array: The sequence to slice up.
        chunk_size: Maximum number of items per slice.

    Returns:
        A list of slices in their original order; only the last slice may be
        shorter than ``chunk_size``. An empty input gives an empty list.
    """
    total = len(array)
    return [
        array[start : start + chunk_size]
        for start in range(0, total, chunk_size)
    ]

arrays=split_array_into_chunks(file_array, 5)
start_time = time.time()
await asyncio.gather(*[transcribe_batch_multiple_files_v2(project_id, array, gcs_transcripts_directory, model) for array in arrays])

duration = time.time() - start_time
print(f"Took {duration} to execute")

In [None]:
import json
import re
from google.cloud import storage

transcript_array=!(gsutil ls {gcs_transcripts_directory}/{model}/json)

for transcript in transcript_array:
    print(f"Fetching results from {transcript}...")
    output_bucket, output_object = re.match(
        r"gs://([^/]+)/(.*)", transcript
    ).group(1, 2)

    # Instantiates a Cloud Storage client
    storage_client = storage.Client()

    # Fetch results from Cloud Storage
    bucket = storage_client.bucket(output_bucket)
    blob_json = bucket.blob(output_object)
    data = json.loads(blob_json.download_as_string(client=None))
    raw_transcript=""
    
    if "results" in data:
        for alternative in data["results"]:
            if "alternatives" in alternative:
                raw_transcript+=alternative["alternatives"][0]["transcript"]

    print(f"Transcript for {transcript}: {raw_transcript}")
    
    transcript_id=re.search(f"{gcs_transcripts_directory}/{model}/json/(.*)_transcript.*",transcript).group(1)
    transcript_path=re.search("gs://[^/]+/(.*)",gcs_transcripts_directory).group(1)
   
    blob_txt = bucket.blob(transcript_path+"/"+model+"/txt/"+transcript_id+".txt")
    blob_txt.upload_from_string(raw_transcript)