In [2]:
import boto3
import os
import time
import urllib
import json
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms.bedrock import Bedrock
from reportlab.pdfgen import canvas
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import S3FileLoader

def transcribe_and_create_pdf(language_code='en-US',
                              job_name_prefix='My-transcription',
                              pdf_filename_prefix='Transcription_output',
                              s3_bucket_name='my-s3-doc-loader'):
    """Transcribe one media file, write the transcript to a PDF and upload it to S3.

    The media format ('mp3' or 'mp4') and the media file URI are read
    interactively with ``input()``.

    Args:
        language_code: Language code passed to the transcription job.
        job_name_prefix: Prefix of the transcription job name; the current
            Unix timestamp is appended to make the name unique.
        pdf_filename_prefix: Prefix of the local PDF file name, which is also
            used as the S3 object key.
        s3_bucket_name: Name of the bucket the PDF is uploaded to.

    Returns:
        A ``(text, s3_uri)`` tuple: the transcript string and the
        ``s3://bucket/key`` URI of the uploaded PDF.

    Raises:
        ValueError: If the media format is not 'mp3' or 'mp4', or the file
            URI is empty.
        RuntimeError: If the transcription job ends in the FAILED state.
        TimeoutError: If the job has not finished after the last poll.
    """
    # Function-scope imports: a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule is loaded, and `textwrap` is needed for the PDF.
    import textwrap
    import urllib.request

    transcribe_client = boto3.client('transcribe')

    def transcribe_file(job_name, file_uri, transcribe_client, media_format, language_code):
        """Start a transcription job, poll it until it finishes and return the transcript."""
        transcribe_client.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': file_uri},
            MediaFormat=media_format,
            LanguageCode=language_code
        )
        max_tries = 60
        poll_seconds = 10
        for _ in range(max_tries):
            job = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
            job_status = job['TranscriptionJob']['TranscriptionJobStatus']
            if job_status == 'COMPLETED':
                print(f"Job {job_name} is {job_status}.")
                transcript_uri = job['TranscriptionJob']['Transcript']['TranscriptFileUri']
                # Context manager closes the HTTP response once the body is read.
                with urllib.request.urlopen(transcript_uri) as response:
                    data = json.loads(response.read())
                return data['results']['transcripts'][0]['transcript']
            if job_status == 'FAILED':
                print(f"Job {job_name} is {job_status}.")
                reason = job['TranscriptionJob'].get('FailureReason', 'no failure reason reported')
                # Fail loudly instead of falling through to an unbound result.
                raise RuntimeError(f"Transcription job {job_name} failed: {reason}")
            print(f"Waiting for {job_name}. Current status is {job_status}.")
            time.sleep(poll_seconds)
        raise TimeoutError(
            f"Transcription job {job_name} did not finish within {max_tries * poll_seconds} seconds."
        )

    def _write_transcript_pdf(path, heading, body):
        """Draw `heading` followed by the word-wrapped `body` into a PDF at `path`."""
        font_name, font_size = "Helvetica", 12
        left, top, bottom, line_height = 10, 800, 40, 15
        # Conservative character budget per line for 12pt Helvetica on the
        # default page width; an approximation, not a measured width.
        max_chars = 90

        pdf = canvas.Canvas(path)
        pdf.setFont(font_name, font_size)
        pdf.drawString(left, top, heading)
        y = top - 20
        for paragraph in body.split('\n'):
            # textwrap.wrap() returns [] for an empty paragraph; keep it as a blank line.
            for line in textwrap.wrap(paragraph, width=max_chars) or ['']:
                if y < bottom:
                    # Start a new page rather than drawing below the page edge.
                    pdf.showPage()
                    pdf.setFont(font_name, font_size)  # re-apply the font on the new page
                    y = top
                pdf.drawString(left, y, line)
                y -= line_height
        pdf.save()

    # Generate unique job names and pdf filenames
    timestamp = str(int(time.time()))
    job_name = f'{job_name_prefix}_{timestamp}'
    pdf_filename = f'{pdf_filename_prefix}_{timestamp}.pdf'

    # Prompt the user for media format
    media_format = input("Enter media format (mp3 or mp4): ").strip().lower()
    if media_format not in ['mp3', 'mp4']:
        raise ValueError("Unsupported media format. Please enter 'mp3' or 'mp4'.")

    # Prompt the user for the appropriate file URI based on the chosen media format
    file_uri = input(f"Enter {'audio' if media_format == 'mp3' else 'video'} file URI: ").strip()
    if not file_uri:
        raise ValueError("No file URI was entered.")

    text = transcribe_file(job_name, file_uri, transcribe_client, media_format, language_code)

    _write_transcript_pdf(pdf_filename, "Transcribed Text:", text)

    # The local file name doubles as the S3 object key.
    s3_key = pdf_filename
    s3_client = boto3.client('s3')
    with open(pdf_filename, 'rb') as pdf_file:
        s3_client.upload_fileobj(pdf_file, s3_bucket_name, s3_key)

    return text, f's3://{s3_bucket_name}/{s3_key}'

# Run the interactive workflow (it prompts for the media format and file URI)
# and keep the transcript together with the S3 location of the generated PDF.
transcription_result = transcribe_and_create_pdf()
transcribed_text, s3_uri = transcription_result

# Show the transcript and where the PDF was uploaded.
print("Transcribed Text:", transcribed_text)
print("PDF saved to:", s3_uri)


Waiting for My-transcription_1700743946. Current status is IN_PROGRESS.
Waiting for My-transcription_1700743946. Current status is IN_PROGRESS.
Job My-transcription_1700743946 is COMPLETED.
Transcribed Text: Aws is the world's most comprehensive and broadly adopted cloud platform. Millions of customers trust Aws to power their infrastructure and applications. Organizations of every type and size are using Aws to lower costs become more agile and innovate faster. Aws provides on demand delivery of technology services via the internet. With pay as you go pricing, you can use these services to build and run virtually any type of application without upfront costs or ongoing commitments. You only pay for what you use. Aws gives you more services and more features within those services than any other cloud provider. This makes it faster, easier and more cost effective to move your existing applications to the cloud and to build anything you can imagine from infrastructure technologies like c

In [1]:
import boto3
import os
import time
import urllib
import json
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms.bedrock import Bedrock
from reportlab.pdfgen import canvas
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import S3FileLoader

def transcribe_and_create_pdf(language_code='en-US',
                              job_name_prefix='My-transcription',
                              pdf_filename_prefix='Transcription_output',
                              s3_bucket_name='my-s3-doc-loader'):
    """Transcribe batches of MP3 and MP4 files, writing one PDF per file to S3.

    For each media format (first 'mp3', then 'mp4') the user is prompted for
    file URIs until they enter 'done'. Every file is transcribed, its
    transcript is written to its own PDF, and the PDFs are uploaded to S3.

    Args:
        language_code: Language code passed to each transcription job.
        job_name_prefix: Prefix of the transcription job names; the media
            format, a Unix timestamp and the file index are appended.
        pdf_filename_prefix: Prefix of the local PDF file names, which are
            also used as the S3 object keys.
        s3_bucket_name: Name of the bucket the PDFs are uploaded to.

    Returns:
        A 4-tuple ``(transcribed_text_mp3, s3_uris_mp3, transcribed_text_mp4,
        s3_uris_mp4)``. Each text is the concatenation of the labelled
        transcripts for that format; each URI list holds one
        ``s3://bucket/key`` entry per file.

    Raises:
        RuntimeError: If a transcription job ends in the FAILED state.
        TimeoutError: If a job has not finished after the last poll.
    """
    # Function-scope imports: a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule is loaded, and `textwrap` is needed for the PDFs.
    import textwrap
    import urllib.request

    transcribe_client = boto3.client('transcribe')

    def transcribe_file(job_name, file_uri, transcribe_client, media_format, language_code):
        """Start a transcription job, poll it until it finishes and return the transcript."""
        transcribe_client.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': file_uri},
            MediaFormat=media_format,
            LanguageCode=language_code
        )
        max_tries = 60
        poll_seconds = 10
        for _ in range(max_tries):
            job = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
            job_status = job['TranscriptionJob']['TranscriptionJobStatus']
            if job_status == 'COMPLETED':
                print(f"Job {job_name} is {job_status}.")
                transcript_uri = job['TranscriptionJob']['Transcript']['TranscriptFileUri']
                # Context manager closes the HTTP response once the body is read.
                with urllib.request.urlopen(transcript_uri) as response:
                    data = json.loads(response.read())
                return data['results']['transcripts'][0]['transcript']
            if job_status == 'FAILED':
                print(f"Job {job_name} is {job_status}.")
                reason = job['TranscriptionJob'].get('FailureReason', 'no failure reason reported')
                # Fail loudly instead of falling through to an unbound result.
                raise RuntimeError(f"Transcription job {job_name} failed: {reason}")
            print(f"Waiting for {job_name}. Current status is {job_status}.")
            time.sleep(poll_seconds)
        raise TimeoutError(
            f"Transcription job {job_name} did not finish within {max_tries * poll_seconds} seconds."
        )

    def _write_transcript_pdf(path, heading, body):
        """Draw `heading` followed by the word-wrapped `body` into a PDF at `path`."""
        font_name, font_size = "Helvetica", 12
        left, top, bottom, line_height = 10, 800, 40, 15
        # Conservative character budget per line for 12pt Helvetica on the
        # default page width; an approximation, not a measured width.
        max_chars = 90

        pdf = canvas.Canvas(path)
        pdf.setFont(font_name, font_size)
        pdf.drawString(left, top, heading)
        y = top - 20
        for paragraph in body.split('\n'):
            # textwrap.wrap() returns [] for an empty paragraph; keep it as a blank line.
            for line in textwrap.wrap(paragraph, width=max_chars) or ['']:
                if y < bottom:
                    # Start a new page rather than drawing below the page edge.
                    pdf.showPage()
                    pdf.setFont(font_name, font_size)  # re-apply the font on the new page
                    y = top
                pdf.drawString(left, y, line)
                y -= line_height
        pdf.save()

    def process_media_format(media_format):
        """Prompt for URIs of one media format, then transcribe, render and upload each file."""
        timestamp = str(int(time.time()))
        job_name = f'{job_name_prefix}_{media_format}_{timestamp}'
        # The file index and the '.pdf' extension are appended per file below,
        # so every generated name ends in '.pdf'.
        pdf_basename = f'{pdf_filename_prefix}_{media_format}_{timestamp}'

        file_uris = []
        while True:
            file_uri = input(f"Enter {media_format} file URI (enter 'done' when finished): ").strip()
            if file_uri.lower() == 'done':
                break
            if file_uri:  # ignore blank lines instead of submitting an empty URI
                file_uris.append(file_uri)

        sections = []
        pdf_filenames = []
        for i, file_uri in enumerate(file_uris):
            text = transcribe_file(f"{job_name}_{i}", file_uri, transcribe_client, media_format, language_code)
            heading = f"Transcribed Text ({media_format.upper()}) - File {i + 1}:"
            sections.append(f"{heading}\n{text}\n\n")

            # Create PDF
            pdf_filename = f"{pdf_basename}_{i}.pdf"
            _write_transcript_pdf(pdf_filename, heading, text)
            pdf_filenames.append(pdf_filename)

        # Upload PDFs to S3; each local file name doubles as its object key.
        s3_client = boto3.client('s3')
        for pdf_filename in pdf_filenames:
            with open(pdf_filename, 'rb') as pdf_file:
                s3_client.upload_fileobj(pdf_file, s3_bucket_name, pdf_filename)

        return ''.join(sections), [f's3://{s3_bucket_name}/{name}' for name in pdf_filenames]

    # Process MP3
    transcribed_text_mp3, s3_uris_mp3 = process_media_format('mp3')

    # Process MP4
    transcribed_text_mp4, s3_uris_mp4 = process_media_format('mp4')

    return transcribed_text_mp3, s3_uris_mp3, transcribed_text_mp4, s3_uris_mp4

# Run the interactive workflow (it prompts for MP3 URIs, then MP4 URIs) and
# unpack the combined transcript and the list of PDF S3 URIs for each format.
batch_result = transcribe_and_create_pdf()
(transcribed_text_mp3, s3_uris_mp3,
 transcribed_text_mp4, s3_uris_mp4) = batch_result

# Report the MP3 results, a blank separator, then the MP4 results.
mp3_uri_listing = "\n".join(s3_uris_mp3)
mp4_uri_listing = "\n".join(s3_uris_mp4)
print("Transcribed Text (MP3):\n", transcribed_text_mp3)
print("PDFs saved to (MP3):\n", mp3_uri_listing)
print("\n")
print("Transcribed Text (MP4):\n", transcribed_text_mp4)
print("PDFs saved to (MP4):\n", mp4_uri_listing)


Waiting for My-transcription_mp3_1700749900_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp3_1700749900_0. Current status is IN_PROGRESS.
Job My-transcription_mp3_1700749900_0 is COMPLETED.
Waiting for My-transcription_mp4_1700749936_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1700749936_0. Current status is IN_PROGRESS.
Job My-transcription_mp4_1700749936_0 is COMPLETED.
Transcribed Text (MP3):
 Transcribed Text (MP3) - File 1:
Cloud computing is the on demand delivery of it resources via the internet. With pay as you go pricing. Instead of buying, owning and maintaining physical data centers and servers, you can access technology services such as computing, power storage and databases on an as needed basis. From a cloud provider like Amazon web services organizations of every type size and industry are using the cloud for a wide variety of use cases such as data backup, disaster recovery, email, virtual desktops, software development and testi