In [6]:
import boto3
import os
import time
import urllib
import json
from io import BytesIO
from reportlab.pdfgen import canvas
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms.bedrock import Bedrock
from langchain.document_loaders import S3FileLoader
from bedrock import get_bedrock_client
bedrock_client = get_bedrock_client(region='us-east-1', runtime=True)
buckets3 = 'my-s3-doc-loader'
s3 = boto3.client('s3')
def transcribe_and_create_pdf(s3_client, s3_bucket_name,language_code='en-US',
                               job_name_prefix='My-transcription', pdf_filename_prefix='Transcription_output'):
    transcribe_client = boto3.client('transcribe')

    def transcribe_file(job_name, file_uri, transcribe_client, media_format, language_code):
        transcribe_client.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': file_uri},
            MediaFormat=media_format,
            LanguageCode=language_code
        )
        max_tries = 60
        while max_tries > 0:
            max_tries -= 1
            job = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
            job_status = job['TranscriptionJob']['TranscriptionJobStatus']
            if job_status in ['COMPLETED', 'FAILED']:
                print(f"Job {job_name} is {job_status}.")
                if job_status == 'COMPLETED':
                    response = urllib.request.urlopen(job['TranscriptionJob']['Transcript']['TranscriptFileUri'])
                    data = json.loads(response.read())
                    text = data['results']['transcripts'][0]['transcript']
                break
            else:
                print(f"Waiting for {job_name}. Current status is {job_status}.")
            time.sleep(10)
        return text

    def process_media_format(media_format, s3_bucket_name):
        timestamp = str(int(time.time()))
        job_name = f'{job_name_prefix}_{media_format}_{timestamp}'

        s3_client = boto3.client('s3')
        s3_objects = s3_client.list_objects(Bucket=s3_bucket_name)

        file_uris = []
        for obj in s3_objects.get('Contents', []):
            if obj['Key'].lower().endswith(f'.{media_format}'):
                file_uris.append(f's3://{s3_bucket_name}/{obj["Key"]}')

        if not file_uris:
            return "", []

        transcribed_text = ""
        s3_uris = []  # Store the S3 URIs for all processed files
        for i, file_uri in enumerate(file_uris):
            text = transcribe_file(f"{job_name}_{i}", file_uri, transcribe_client, media_format, language_code)
            transcribed_text += f"Transcribed Text ({media_format.upper()}) - File {i + 1}:\n{text}\n\n"

            # Create PDF in memory
            pdf_buffer = BytesIO()
            pdf = canvas.Canvas(pdf_buffer)
            pdf.setFont("Helvetica", 12)
            pdf.drawString(10, 800, f"Transcribed Text ({media_format.upper()}) - File {i + 1}:")
            text_lines = text.split('\n')
            for j, line in enumerate(text_lines):
                pdf.drawString(10, 780 - j * 15, line)
            pdf.save()

            # Upload PDFs to S3 with unique filenames
            pdf_filename = f'{pdf_filename_prefix}_{media_format}_{timestamp}_file_{i + 1}.pdf'
            s3_key = pdf_filename

            # Reset the buffer position to the beginning
            pdf_buffer.seek(0)

            # Upload the PDF directly from the in-memory buffer
            s3_client.upload_fileobj(pdf_buffer, s3_bucket_name, s3_key)

            # Append the S3 URI to the list
            s3_uris.append(f's3://{s3_bucket_name}/{s3_key}')

        return transcribed_text, s3_uris

    s3_bucket_name = buckets3

    # Check if the bucket contains MP3 or MP4 files
    mp3_files = any(obj['Key'].lower().endswith('.mp3') for obj in s3.list_objects_v2(Bucket=s3_bucket_name).get('Contents', []))
    mp4_files = any(obj['Key'].lower().endswith('.mp4') for obj in s3.list_objects_v2(Bucket=s3_bucket_name).get('Contents', []))

    # Check if the bucket contains PNG or JPEG files
    image_files = any(obj['Key'].lower().endswith(('.png', '.jpeg')) for obj in s3.list_objects_v2(Bucket=s3_bucket_name).get('Contents', []))

    # Process MP3 and MP4 files
    if mp3_files or mp4_files:
        transcribed_text_mp3, s3_uris_mp3 = process_media_format('mp3', s3_bucket_name)
        transcribed_text_mp4, s3_uris_mp4 = process_media_format('mp4', s3_bucket_name)
        
        # Do something with transcribed_text_mp3, s3_uris_mp3, transcribed_text_mp4, s3_uris_mp4

    # Process PNG and JPEG files
    if image_files:
        # Step 1: Get the list of files in the S3 bucket with the specified formats
        s3 = boto3.client('s3')

        response = s3.list_objects_v2(Bucket=buckets3)
        files_to_process = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].lower().endswith(('.png', '.jpeg'))]

        # Step 2: Process each file
        for file_name in files_to_process:
            # Amazon Textract client
            textract = boto3.client('textract')

            # Call Textract to detect text
            response = textract.detect_document_text(
                Document={
                    'S3Object': {
                        'Bucket': buckets3,
                        'Name': file_name
                    }
                }
            )

            lines = []
            for item in response["Blocks"]:
                if item["BlockType"] == "LINE":
                    lines.append(item["Text"])

            result = '\n'.join(lines)

            # Create a PDF
            pdf_buffer = BytesIO()
            pdf = canvas.Canvas(pdf_buffer, pagesize=letter)
            pdf.setFont("Helvetica", 12)

            # Adjust the coordinates as needed
            text_lines = result.split('\n')
            for j, line in enumerate(text_lines):
                pdf.drawString(10, 780 - j * 15, line)

            pdf.save()
            pdf_buffer.seek(0)

            # Upload the PDF to S3
            pdf_key = f"{file_name.split('.')[0]}.pdf"  # Assuming you want to use the same file name with a '.pdf' extension
            s3.upload_fileobj(pdf_buffer, buckets3, pdf_key)

            # Optionally, you can print a message indicating that the PDF has been uploaded
            print(f"PDF created and uploaded for {file_name} as {pdf_key}")

# Third part of the code remains the same

# Call the function and get the transcribed text and S3 URIs for both MP3 and MP4
s3_client = boto3.client('s3')
transcribed_text_mp3, s3_uris_mp3, transcribed_text_mp4, s3_uris_mp4 = transcribe_and_create_pdf(s3_client, buckets3)
s3 = boto3.client('s3')

response = s3.list_objects_v2(Bucket=buckets3)

file_names = []
allowed_formats = ['.txt', '.pdf', '.doc', '.docx']

for obj in response['Contents']:
    key = obj['Key']
    
    # Check if the file has an allowed format
    if any(key.lower().endswith(format) for format in allowed_formats):
        file_names.append(key)

for context_key in file_names:
    loader = S3FileLoader(bucket=buckets3, key=context_key)
    context_content = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000)
context_texts = text_splitter.split_documents(context_content)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(documents=context_texts, embedding=embeddings)

retriever = db.as_retriever(search_type='mmr', search_kwargs={"k": 3})

template = """
Human: Answer truthfully based on the given question, fetch the answer only from the given text documents
Instruction:
1.If multiple files are there, read the all the files each and every lines accurately for to generate answer
2.If there is no text found in the text document about the asked question ,"print no result found" do not print any results if answer not found,do not search the answers from outside
3.Generate answer whatever available related to the question
4.Must complete the sentence in the result fully, do not leave results incomplete format in the end.
text:{context}
question:{question}
Assistant:"""
qa_prompt = PromptTemplate(template=template, input_variables=["context","question"])
chain_type_kwargs = { "prompt": qa_prompt}
llm = Bedrock(model_id="anthropic.claude-v2",client=bedrock_client)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    verbose=False
)

UnboundLocalError: cannot access local variable 's3' where it is not associated with a value

In [None]:
question="Explain about cloud computing?"
result = qa.run(question)
print(result)