# In [None]:
import boto3
import os
import time
import urllib
import json
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms.bedrock import Bedrock
from reportlab.pdfgen import canvas
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import S3FileLoader

def _fetch_transcript_text(transcript_uri):
    """Download the Transcribe result JSON at *transcript_uri* and return its transcript text."""
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    from urllib.request import urlopen

    # The context manager closes the HTTP response once the body is read.
    with urlopen(transcript_uri) as response:
        data = json.loads(response.read())
    return data['results']['transcripts'][0]['transcript']


def _wait_for_transcript(transcribe_client, job_name, max_tries=60, poll_seconds=10):
    """Poll a transcription job until it finishes and return the transcript text.

    Raises:
        RuntimeError: the job ended with status ``FAILED``.
        TimeoutError: the job was still running after ``max_tries`` polls.
    """
    for attempt in range(max_tries):
        response = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
        job = response['TranscriptionJob']
        job_status = job['TranscriptionJobStatus']

        if job_status == 'COMPLETED':
            print(f"Job {job_name} is {job_status}.")
            return _fetch_transcript_text(job['Transcript']['TranscriptFileUri'])

        if job_status == 'FAILED':
            print(f"Job {job_name} is {job_status}.")
            reason = job.get('FailureReason', 'no failure reason reported')
            raise RuntimeError(f"Transcription job {job_name} failed: {reason}")

        print(f"Waiting for {job_name}. Current status is {job_status}.")
        # No point sleeping after the final poll; we are about to give up.
        if attempt < max_tries - 1:
            time.sleep(poll_seconds)

    raise TimeoutError(
        f"Transcription job {job_name} did not finish after {max_tries} status checks."
    )


def _write_transcript_pdf(text, pdf_filename):
    """Write *text* under a "Transcribed Text:" heading to *pdf_filename*.

    Long lines are wrapped to the page width and a new page is started when
    the current one is full, so the whole transcript stays visible.
    """
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    left_margin = 10
    top_y = 800          # baseline of the first line on every page
    bottom_y = 40        # lowest baseline allowed before a page break
    line_height = 15
    max_line_width = A4[0] - 2 * left_margin

    pdf = canvas.Canvas(pdf_filename, pagesize=A4)
    pdf.setFont(font_name, font_size)
    pdf.drawString(left_margin, top_y, "Transcribed Text:")

    y = top_y - 20
    for paragraph in text.split('\n'):
        # ``or ['']`` keeps blank source lines as blank rows in the PDF.
        wrapped = simpleSplit(paragraph, font_name, font_size, max_line_width) or ['']
        for line in wrapped:
            if y < bottom_y:
                pdf.showPage()
                # showPage() resets the graphics state, including the font.
                pdf.setFont(font_name, font_size)
                y = top_y
            pdf.drawString(left_margin, y, line)
            y -= line_height

    pdf.save()


def transcribe_and_create_pdf(file_uri, media_format='mp3', language_code='en-US',
                               job_name_prefix='My-transcription', pdf_filename_prefix='Transcription_output',
                               s3_bucket_name='my-s3-doc-loader'):
    """Transcribe a media file with Amazon Transcribe, save the text as a PDF and upload it to S3.

    Args:
        file_uri: URI of the media file, passed to Transcribe as ``MediaFileUri``.
        media_format: Value passed as ``MediaFormat`` (e.g. ``'mp3'``, ``'mp4'``).
        language_code: Value passed as ``LanguageCode``.
        job_name_prefix: Prefix of the transcription job name; a Unix
            timestamp is appended to it.
        pdf_filename_prefix: Prefix of the PDF filename; the same timestamp
            is appended to it. The PDF is written to the current working
            directory and is also used as the S3 object key.
        s3_bucket_name: Bucket the PDF is uploaded to.

    Returns:
        A ``(text, s3_uri)`` tuple: the transcript text and the
        ``s3://bucket/key`` URI of the uploaded PDF.

    Raises:
        RuntimeError: the transcription job finished with status ``FAILED``.
        TimeoutError: the job did not finish within the polling budget.
    """
    transcribe_client = boto3.client('transcribe')

    # Generate unique job names and pdf filenames
    timestamp = str(int(time.time()))
    job_name = f'{job_name_prefix}_{timestamp}'
    pdf_filename = f'{pdf_filename_prefix}_{timestamp}.pdf'

    transcribe_client.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_uri},
        MediaFormat=media_format,
        LanguageCode=language_code
    )
    text = _wait_for_transcript(transcribe_client, job_name)

    _write_transcript_pdf(text, pdf_filename)

    s3_key = pdf_filename
    s3_client = boto3.client('s3')
    with open(pdf_filename, 'rb') as pdf_file:
        s3_client.upload_fileobj(pdf_file, s3_bucket_name, s3_key)

    return text, f's3://{s3_bucket_name}/{s3_key}'

# Audio example: run the transcription pipeline on an MP3 object.
# The call returns the transcript text and the S3 URI of the generated PDF.
audio_file_uri = 's3://my-s3-doc-loader/Cloud_computing.mp3'
transcribed_text_audio, s3_uri_audio = transcribe_and_create_pdf(
    audio_file_uri,
    media_format='mp3',
    language_code='en-US',
)

# Video example: same pipeline, this time on an MP4 object.
video_file_uri = 's3://my-s3-doc-loader/video.mp4'
transcribed_text_video, s3_uri_video = transcribe_and_create_pdf(
    video_file_uri,
    media_format='mp4',
    language_code='en-US',
)

