In [1]:
import boto3
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms.bedrock import Bedrock
from langchain.document_loaders import S3FileLoader
from bedrock import get_bedrock_client
import urllib
import json

def create_and_upload_pdf(text, bucket_name, file_key):
    pdf_buffer = BytesIO()
    pdf = canvas.Canvas(pdf_buffer, pagesize=letter)
    pdf.setFont("Helvetica", 12)

    text_lines = text.split('\n')
    for j, line in enumerate(text_lines):
        pdf.drawString(10, 780 - j * 15, line)

    pdf.save()
    pdf_buffer.seek(0)

    s3 = boto3.client('s3')
    s3.upload_fileobj(pdf_buffer, bucket_name, file_key)

    print(f"PDF created and uploaded as {file_key}")

s3 = boto3.client('s3')
transcribe_client = boto3.client('transcribe')

def process_media_format(media_format, bucket_name):
    language_code='en-US'
    job_name_prefix='My-transcription'
    pdf_filename_prefix='Transcription_output'

    timestamp = str(int(time.time()))
    job_name = f'{job_name_prefix}_{media_format}_{timestamp}'

    s3_objects = s3.list_objects_v2(Bucket=bucket_name)

    file_uris = []
    for obj in s3_objects.get('Contents', []):
        if obj['Key'].lower().endswith(f'.{media_format}'):
            file_uris.append(f's3://{bucket_name}/{obj["Key"]}')

    if not file_uris:
        return "", []

    transcribed_text = ""
    s3_uris = []
    for i, file_uri in enumerate(file_uris):
        transcribe_client.start_transcription_job(
            TranscriptionJobName=f"{job_name}_{i}",
            Media={'MediaFileUri': file_uri},
            MediaFormat=media_format,
            LanguageCode=language_code
        )

        max_tries = 60
        while max_tries > 0:
            max_tries -= 1
            job = transcribe_client.get_transcription_job(TranscriptionJobName=f"{job_name}_{i}")
            job_status = job['TranscriptionJob']['TranscriptionJobStatus']
            if job_status in ['COMPLETED', 'FAILED']:
                print(f"Job {job_name}_{i} is {job_status}.")
                if job_status == 'COMPLETED':
                    response = urllib.request.urlopen(job['TranscriptionJob']['Transcript']['TranscriptFileUri'])
                    data = json.loads(response.read())
                    text = data['results']['transcripts'][0]['transcript']
                    transcribed_text += f"Transcribed Text ({media_format.upper()}) - File {i + 1}:\n{text}\n\n"

                    pdf_filename = f'{pdf_filename_prefix}_{media_format}_{timestamp}_file_{i + 1}.pdf'
                    s3_key = pdf_filename

                    create_and_upload_pdf(text, bucket_name, s3_key)

                    s3_uris.append(f's3://{bucket_name}/{s3_key}')
                break
            else:
                print(f"Waiting for {job_name}_{i}. Current status is {job_status}.")
            time.sleep(10)

        return transcribed_text, s3_uris

    return process_media_format, s3

def process_images_and_create_pdf():
    s3 = boto3.client('s3')
    bucket_name = 'my-s3-doc-loader'
    response = s3.list_objects_v2(Bucket=bucket_name)
    image_files_in_s3 = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].lower().endswith(('.png', '.jpeg'))]

    for image_file_name in image_files_in_s3:
        textract = boto3.client('textract')

        response = textract.detect_document_text(
            Document={
                'S3Object': {
                    'Bucket': bucket_name,
                    'Name': image_file_name
                }
            }
        )

        lines = []
        for item in response["Blocks"]:
            if item["BlockType"] == "LINE":
                lines.append(item["Text"])

        result = '\n'.join(lines)

        pdf_key = f"{image_file_name.split('.')[0]}.pdf"
        create_and_upload_pdf(result, bucket_name, pdf_key)

bucket_name = 'my-s3-doc-loader'
s3 = boto3.client('s3')

response = s3.list_objects_v2(Bucket=bucket_name)
file_names_for_audio_video = [obj['Key'] for obj in response.get('Contents', [])]

audio_video_formats = any(file_name.lower().endswith(('.mp3', '.mp4')) for file_name in file_names_for_audio_video)
image_formats = any(file_name.lower().endswith(('.png', '.jpeg')) for file_name in file_names_for_audio_video)

if audio_video_formats:
    transcribed_text_audio, s3_uris_audio = process_media_format('mp3', bucket_name)
    transcribed_text_video, s3_uris_video = process_media_format('mp4', bucket_name)

if image_formats:
    process_images_and_create_pdf()

Waiting for My-transcription_mp3_1701666578_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp3_1701666578_0. Current status is IN_PROGRESS.
Job My-transcription_mp3_1701666578_0 is COMPLETED.
PDF created and uploaded as Transcription_output_mp3_1701666578_file_1.pdf
Waiting for My-transcription_mp4_1701666605_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1701666605_0. Current status is IN_PROGRESS.
Waiting for My-transcription_mp4_1701666605_0. Current status is IN_PROGRESS.
Job My-transcription_mp4_1701666605_0 is COMPLETED.
PDF created and uploaded as Transcription_output_mp4_1701666605_file_1.pdf
PDF created and uploaded as Screenshot (5).pdf


In [2]:
file_formats = ['.pdf', '.doc', '.txt', '.docx']
response = s3.list_objects(Bucket=bucket_name)
file_keys = [obj['Key'] for obj in response.get('Contents', []) if any(obj['Key'].endswith(format) for format in file_formats)]

In [3]:
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}



combined_content = ""

for key in file_keys:
    loader = S3FileLoader(bucket=bucket_name, key=key)
    context_content = loader.load()
    context_content = [str(item) for item in context_content]
    combined_content += '\n'.join(context_content)

documents_list = [Document(page_content=combined_content)]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500)
context_texts = text_splitter.split_documents(documents_list)



In [4]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(documents=context_texts, embedding=embeddings)

retriever = db.as_retriever(search_type='mmr', search_kwargs={"k": 3})

template = """
Human: Answer truthfully based on the given question, fetch the answer only from the given text documents
Instruction:
1. If multiple files are there, read all the files each and every line accurately to generate the answer.
2. If there is no text found in the text document about the asked question, print "no result found." Do not print any results if the answer is not found, do not search the answers from outside.
3. Generate an answer whatever is available related to the question.
4. Must complete the sentence in the result fully, do not leave results incomplete format in the end.
5. If the question about a particular topic , give answer in simple about that topic only
text:{context}
question:{question}
Assistant:"""
bedrock_client = get_bedrock_client(region='us-east-1', runtime=True)
qa_prompt = PromptTemplate(template=template, input_variables=["context","question"])
chain_type_kwargs = {"prompt": qa_prompt}
llm = Bedrock(model_id="anthropic.claude-v2", client=bedrock_client)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    verbose=False
    )

In [5]:
question="what is the benefits of using azure?"
result = qa.run(question)
print(result)

 Based on the given text, some of the key benefits of using Azure are:

1. Speed and flexibility: Azure provides fast and flexible cloud computing services.

2. Affordability: Azure offers affordable pricing for its services. 

3. Features: Azure provides various features like analytics, AI, etc to assist organizations.

4. Supports open source technologies: Azure works with open source tools and frameworks.

5. Supports many industries: Azure supports various industries like e-commerce, finance, manufacturing, healthcare, etc.

6. Infrastructure as a Service (IaaS): Azure provides IaaS for computing infrastructure. 

7. Platform as a Service (PaaS): Azure offers PaaS for application development platforms.

8. Enables predictive analytics: Azure's analytics capabilities enable predictive modeling for better marketing and distribution.

9. Secure distribution: Azure allows secure and timely distribution of digital assets like albums. 

10. Drives recommendations and personalization: Azu