In [3]:
import boto3
import os
import time
import urllib
import json
from io import BytesIO
from reportlab.pdfgen import canvas

def transcribe_and_create_pdf(language_code='en-US',
                               job_name_prefix='My-transcription', pdf_filename_prefix='Transcription_output',
                               s3_bucket_name='my-s3-doc-loader'):
    """Transcribe every MP3 and MP4 object in an S3 bucket and upload one PDF per file.

    For each media format the bucket is listed, an Amazon Transcribe job is
    started for every matching object, the finished transcript is rendered
    into an in-memory PDF, and that PDF is uploaded back to the same bucket.

    Args:
        language_code: Language code passed to ``start_transcription_job``.
        job_name_prefix: Prefix of every transcription job name.
        pdf_filename_prefix: Prefix of every uploaded PDF object key.
        s3_bucket_name: Bucket that holds the media files and receives the PDFs.

    Returns:
        A 4-tuple ``(text_mp3, uris_mp3, text_mp4, uris_mp4)``. Each ``text_*``
        is the concatenated transcript of all files of that format and each
        ``uris_*`` is the list of ``s3://`` URIs of the PDFs that were
        uploaded (one per media file; empty when no file matched).

    Raises:
        RuntimeError: A transcription job finished with status ``FAILED``.
        TimeoutError: A transcription job did not finish within the polling budget.
    """
    # Function-scope imports keep this block self-contained: a bare
    # `import urllib` does not guarantee that `urllib.request` is loaded.
    import textwrap
    from urllib.request import urlopen

    max_polls = 60
    poll_interval_seconds = 10

    # Page layout (points). The header sits at `top_y`; body text starts 20
    # points below it and moves down by `line_height` per line.
    left_margin = 10
    top_y = 800
    bottom_y = 40
    line_height = 15
    # Character budget per line; a conservative estimate for Helvetica 12
    # on the canvas' default page width.
    wrap_width = 90

    transcribe_client = boto3.client('transcribe')
    s3_client = boto3.client('s3')

    def transcribe_file(job_name, file_uri, media_format):
        """Run one transcription job to completion and return its transcript text."""
        transcribe_client.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': file_uri},
            MediaFormat=media_format,
            LanguageCode=language_code
        )
        for _ in range(max_polls):
            job = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
            details = job['TranscriptionJob']
            job_status = details['TranscriptionJobStatus']
            if job_status == 'COMPLETED':
                print(f"Job {job_name} is {job_status}.")
                # Close the HTTP response deterministically.
                with urlopen(details['Transcript']['TranscriptFileUri']) as response:
                    data = json.loads(response.read())
                return data['results']['transcripts'][0]['transcript']
            if job_status == 'FAILED':
                print(f"Job {job_name} is {job_status}.")
                reason = details.get('FailureReason', 'no failure reason reported')
                raise RuntimeError(f"Transcription job {job_name} failed: {reason}")
            print(f"Waiting for {job_name}. Current status is {job_status}.")
            time.sleep(poll_interval_seconds)
        raise TimeoutError(
            f"Transcription job {job_name} did not finish after "
            f"{max_polls * poll_interval_seconds} seconds."
        )

    def render_pdf(title, text):
        """Render `title` and the wrapped `text` into a PDF; return a rewound buffer."""
        pdf_buffer = BytesIO()
        pdf = canvas.Canvas(pdf_buffer)
        pdf.setFont("Helvetica", 12)
        pdf.drawString(left_margin, top_y, title)
        y = top_y - 20
        for raw_line in text.split('\n'):
            # textwrap.wrap('') returns []; keep blank lines as blank rows.
            for line in textwrap.wrap(raw_line, wrap_width) or ['']:
                if y < bottom_y:
                    pdf.showPage()
                    # The font is re-applied for every new page.
                    pdf.setFont("Helvetica", 12)
                    y = top_y
                pdf.drawString(left_margin, y, line)
                y -= line_height
        pdf.save()
        # Rewind so the upload reads the PDF from its first byte.
        pdf_buffer.seek(0)
        return pdf_buffer

    def process_media_format(media_format):
        """Transcribe all bucket objects with this extension; return (text, pdf_uris)."""
        timestamp = str(int(time.time()))
        job_name = f'{job_name_prefix}_{media_format}_{timestamp}'

        # Paginate: a single list call returns at most 1000 keys.
        suffix = f'.{media_format}'
        paginator = s3_client.get_paginator('list_objects_v2')
        file_uris = [
            f's3://{s3_bucket_name}/{obj["Key"]}'
            for page in paginator.paginate(Bucket=s3_bucket_name)
            for obj in page.get('Contents', [])
            if obj['Key'].lower().endswith(suffix)
        ]

        if not file_uris:
            return "", []

        sections = []
        pdf_uris = []
        for i, file_uri in enumerate(file_uris):
            text = transcribe_file(f"{job_name}_{i}", file_uri, media_format)
            header = f"Transcribed Text ({media_format.upper()}) - File {i + 1}:"
            sections.append(f"{header}\n{text}\n\n")

            # The file index is part of the key so PDFs of the same format
            # do not overwrite one another.
            s3_key = f'{pdf_filename_prefix}_{media_format}_{timestamp}_{i}.pdf'
            s3_client.upload_fileobj(render_pdf(header, text), s3_bucket_name, s3_key)
            pdf_uris.append(f's3://{s3_bucket_name}/{s3_key}')

        return "".join(sections), pdf_uris

    # Process MP3 files
    transcribed_text_mp3, s3_uris_mp3 = process_media_format('mp3')

    # Process MP4 files
    transcribed_text_mp4, s3_uris_mp4 = process_media_format('mp4')

    return transcribed_text_mp3, s3_uris_mp3, transcribed_text_mp4, s3_uris_mp4

# Run the full pipeline once with default settings; keep each format's
# combined transcript and the S3 URIs of the uploaded PDFs.
(
    transcribed_text_mp3,
    s3_uris_mp3,
    transcribed_text_mp4,
    s3_uris_mp4,
) = transcribe_and_create_pdf()


Waiting for My-transcription_mp3_1700824711_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp3_1700824711_0. Current status is IN_PROGRESS.
Job My-transcription_mp3_1700824711_0 is COMPLETED.
Waiting for My-transcription_mp4_1700824739_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_0. Current status is IN_PROGRESS.
Job My-transcription_mp4_1700824739_0 is COMPLETED.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700824739_1. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_17008