# Generative AI with Amazon Bedrock
- Amazon Bedrock Hands-on 기반 LLM 활용법

In [2]:
import json
import os

import boto3

In [3]:
# AWS settings.
# Leave a value blank to fall back to boto3's default resolution (environment
# variables, shared config/credentials files, ...). Avoid committing real keys
# to the notebook.
aws_access_key_id = ""
aws_secret_access_key = ""
region_name = ""

# Blank placeholders are normalised to None: passing "" through would make boto3
# treat the empty strings as explicit credentials / region instead of falling
# back to its default chain.
session = boto3.Session(
    aws_access_key_id=aws_access_key_id or None,
    aws_secret_access_key=aws_secret_access_key or None,
    region_name=region_name or None,
)

# Control-plane client (used for listing models) and runtime client (used for
# invoke_model and embeddings); both inherit the session's credentials and region.
bedrock_client = session.client(service_name="bedrock")
bedrock_runtime = session.client(service_name="bedrock-runtime")

In [4]:
# Fetch the foundation-model (FM) summaries returned by Bedrock and preview the first five ids.
fm_list = bedrock_client.list_foundation_models()
all_model_ids = [summary['modelId'] for summary in fm_list['modelSummaries']]
print(all_model_ids[:5])

# Keep only the Claude model ids; this is the cell's last expression, so the list is displayed.
[model_id for model_id in all_model_ids if 'claude' in model_id]

['amazon.titan-tg1-large', 'amazon.titan-image-generator-v1:0', 'amazon.titan-image-generator-v1', 'amazon.titan-embed-g1-text-02', 'amazon.titan-text-lite-v1:0:4k']


['anthropic.claude-instant-v1:2:100k',
 'anthropic.claude-instant-v1',
 'anthropic.claude-v1',
 'anthropic.claude-v2:0:18k',
 'anthropic.claude-v2:0:100k',
 'anthropic.claude-v2:1:18k',
 'anthropic.claude-v2:1:200k',
 'anthropic.claude-v2:1',
 'anthropic.claude-v2']

## LLM 모델을 활용한 질의응답

In [8]:
# If you'd like to try your own prompt, edit this parameter!
# Claude text-completion prompts must open with "\n\nHuman:" and close with
# "\n\nAssistant:" (no trailing whitespace after the final marker).
prompt_data = "\n\nHuman: what is llm and rag\n\nAssistant:"

# Request payload for the Claude text-completion API.
body = json.dumps({
    "prompt": prompt_data,
    "max_tokens_to_sample": 2048,
})
modelId = "anthropic.claude-v2"  # change this to use a different version from the model provider
accept = "application/json"
contentType = "application/json"

response = bedrock_runtime.invoke_model(
    body=body, modelId=modelId, accept=accept, contentType=contentType
)
# The response body is a stream; read it once and decode the JSON document.
response_body = json.loads(response["body"].read())

print(response_body.get("completion"))

 LLMs and RAGs are types of artificial intelligence systems:

- LLMs stands for Large Language Models. These are AI systems trained on huge amounts of text data to generate human-like text and engage in natural conversation. Some examples of LLMs are GPT-3, ChatGPT, Claude, and Anthropic's Claude.

- RAG stands for Retrieval-Augmented Generation. RAG systems combine large language models with a retrieval mechanism to provide more accurate and knowledgeable responses. The retrieval mechanism allows the system to look up relevant information from a knowledge base when answering questions, rather than relying solely on the language model. Some examples of RAG systems are Anthropic's Constitutional AI and Meta's BlenderBot. 

The main difference is that RAG systems have an additional retrieval component that allows them to supplement the knowledge within the language model, while basic LLMs generate responses using only their trained parameters. RAG systems aim to have more factual groundi

## VectorDB를 활용한 Document 기반 응답(Retrieval) - 영어

In [10]:
# 참고할 Document 생성 AWS Hands-on Document (Eng)
from urllib.request import urlretrieve
from pypdf import PdfReader, PdfWriter
import glob
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Source URLs of the shareholder letters; urls, filenames and metadata are
# parallel lists (same position = same document).
urls = [
    'https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/ar/Amazon-2020-Shareholder-Letter-and-1997-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/ar/2019-Shareholder-Letter.pdf',
]

# Local file names the downloads are stored under.
filenames = [
    'AMZN-2022-Shareholder-Letter.pdf',
    'AMZN-2021-Shareholder-Letter.pdf',
    'AMZN-2020-Shareholder-Letter.pdf',
    'AMZN-2019-Shareholder-Letter.pdf',
]

# Metadata attached to every page loaded from the corresponding file.
metadata = [
    dict(year=2022, source=filenames[0]),
    dict(year=2021, source=filenames[1]),
    dict(year=2020, source=filenames[2]),
    dict(year=2019, source=filenames[3]),
]

data_root = "./data/"

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(data_root, exist_ok=True)

for url, filename in zip(urls, filenames):
    file_path = data_root + filename
    urlretrieve(url, file_path)

# Only the letters listed in `filenames` are rewritten. Globbing data_root would
# also pick up any other PDF stored in that folder and strip pages from it again
# on every run of this cell.
local_pdfs = [data_root + filename for filename in filenames]

# Number of pages dropped from the end of each letter.
TRAILING_PAGES_TO_DROP = 3

for local_pdf in local_pdfs:
    pdf_reader = PdfReader(local_pdf)
    pdf_writer = PdfWriter()

    # Copy every page except the trailing ones; a file with too few pages keeps none.
    pages_to_keep = max(len(pdf_reader.pages) - TRAILING_PAGES_TO_DROP, 0)
    for pagenum in range(pages_to_keep):
        pdf_writer.add_page(pdf_reader.pages[pagenum])

    # Mode 'wb' already truncates the file and starts at offset 0, so no
    # explicit seek()/truncate() is needed.
    with open(local_pdf, 'wb') as new_file:
        pdf_writer.write(new_file)

# Load every downloaded letter page by page and tag each page with its file's metadata.
documents = []

for filename, file_metadata in zip(filenames, metadata):
    loader = PyPDFLoader(data_root + filename)
    document = loader.load()
    for document_fragment in document:
        # Each page gets its own copy: assigning the same dict object to all
        # pages would make an in-place edit on one page show up on every page.
        document_fragment.metadata = dict(file_metadata)

    print(f'{len(document)} {document}\n')
    documents.extend(document)

# Chunk the loaded pages before embedding.
# (Author's note: in testing, character splitting worked better with this PDF data set.)
splitter_options = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

docs = text_splitter.split_documents(documents)


7 [Document(page_content='Dear shareholders:\nAs I sit down to write my second annual shareholder letter as CEO, I find myself optimistic and energized\nby what lies ahead for Amazon. Despite 2022 being one of the harder macroeconomic years in recent memory,and with some of our own operating challenges to boot, we still found a way to grow demand (on top ofthe unprecedented growth we experienced in the first half of the pandemic). We innovated in our largestbusinesses to meaningfully improve customer experience short and long term. And, we made importantadjustments in our investment decisions and the way in which we’ll invent moving forward, while stillpreserving the long-term investments that we believe can change the future of Amazon for customers,\nshareholders, and employees.\nWhile there were an unusual number of simultaneous challenges this past year, the reality is that if you\noperate in large, dynamic, global market segments with many capable and well-funded competitors (theco

In [6]:
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import FAISS

# Show the ids of the Bedrock models whose id contains "embed".
embedding_model_ids = [
    summary['modelId']
    for summary in fm_list['modelSummaries']
    if 'embed' in summary['modelId']
]
embedding_model_ids


['amazon.titan-embed-g1-text-02',
 'amazon.titan-embed-text-v1:2:8k',
 'amazon.titan-embed-text-v1',
 'amazon.titan-embed-image-v1:0',
 'amazon.titan-embed-image-v1',
 'cohere.embed-english-v3',
 'cohere.embed-multilingual-v3']

In [15]:
# Model ids of the two Bedrock embedding models being compared.
titan = 'amazon.titan-embed-text-v1'
cohere = 'cohere.embed-multilingual-v3'

# One embeddings wrapper per model id, both using the bedrock-runtime client.
titan_embeddings, cohere_embeddings = (
    BedrockEmbeddings(model_id=embedding_model_id, client=bedrock_runtime)
    for embedding_model_id in (titan, cohere)
)

In [16]:
def _index_with_retriever(embeddings):
    """Build a FAISS index over ``docs`` and a retriever on it with search_kwargs k=4."""
    index = FAISS.from_documents(docs, embeddings)
    return index, index.as_retriever(search_kwargs={"k": 4})


titan_db, titan_retriever = _index_with_retriever(titan_embeddings)
cohere_db, cohere_retriever = _index_with_retriever(cohere_embeddings)

In [17]:
# Persist each vector index to its own directory under ./indexes/.
index_targets = (
    ('./indexes/titan_faiss', titan_db),
    ('./indexes/cohere_faiss', cohere_db),
)
for index_dir, vector_index in index_targets:
    vector_index.save_local(index_dir)

In [None]:
# Reload the vector indexes from the directories written by save_local().
# Each index must be loaded with the same embedding model that built it, so
# the two models cannot share one index directory.
# NOTE(review): newer LangChain releases may additionally require
# allow_dangerous_deserialization=True here — confirm against the installed version.
titan_db = FAISS.load_local('./indexes/titan_faiss', titan_embeddings)
cohere_db = FAISS.load_local('./indexes/cohere_faiss', cohere_embeddings)

In [22]:
# Compare the embedding models: run the same question against the Titan index first.
question = "Why is Amazon successful?"
titan_hits = titan_db.similarity_search(question)
titan_hits

[Document(page_content='This growth also created short-term logistics and cost challenges. We spent Amazon’s first 25 years building', metadata={'year': 2021, 'source': 'AMZN-2021-Shareholder-Letter.pdf'}),
 Document(page_content='position us well to pursue this large market segment. Amazon Business allows businesses, municipalities,and organizations to procure products like office supplies and other bulk items easily and at great savings.While some areas of the economy have struggled over the past few years, Amazon Business has thrived. Why?Because the team has translated what it means to deliver selection, value, and convenience into a businessprocurement setting, constantly listening to and learning from customers, and innovating on their behalf.Some people have never heard of Amazon Business, but, our business customers love it. Amazon Businesslaunched in 2015 and today drives roughly $35B in annualized gross sales. More than six million activecustomers, including 96 of the global 

In [23]:
# Same question against the Cohere index, for comparison.
cohere_hits = cohere_db.similarity_search(question)
cohere_hits

[Document(page_content='through the pandemic the same way without the dedication and extraordinary efforts shown by our teams\nduring this period, and I’m eternally grateful.\nIt’s not normal for a company of any size to be able to respond to something as discontinuous and\nunpredictable as this pandemic turned out to be. What is it about Amazon that made it possible for us to doso? It’s because we weren’t starting from a standing start. We had been iterating on and remaking ourfulfillment capabilities for nearly two decades. In every business we pursue, we’re constantly experimentingand inventing. We’re divinely discontented with customer experiences, whether they’re our own or not. Webelieve these customer experiences can always be better, and we strive to make customers’ lives better andeasier every day. The beauty of this mission is that you never run out of runway; customers always want better,and our job is both to listen to their feedback and to imagine what else is possible and

In [24]:
# Retrieve the chunks most similar to the question and send them to the LLM
# together with the question.

question = "Why is Amazon successful?"
results = cohere_db.similarity_search(question)

# Number each retrieved chunk, one chunk per line.
contexts = "".join(
    f"{rank}. {document.page_content} \n"
    for rank, document in enumerate(results, start=1)
)

# If you'd like to try your own prompt, edit this parameter!
# Claude text-completion prompts must open with "\n\nHuman:" and close with
# "\n\nAssistant:" (no trailing whitespace after the final marker).
prompt_data = (
    f"\n\nHuman: {question}\n"
    "<context>\n"
    f"{contexts}\n"
    "</context>\n"
    "Answer the question based on the <context></context> provided. "
    "If the text doesn't contain the answer, reply that the answer is not available."
    "\n\nAssistant:"
)

# Request payload for the Claude text-completion API.
body = json.dumps({
    "prompt": prompt_data,
    "max_tokens_to_sample": 2048,
})
modelId = "anthropic.claude-v2"  # change this to use a different version from the model provider
accept = "application/json"
contentType = "application/json"

response = bedrock_runtime.invoke_model(
    body=body, modelId=modelId, accept=accept, contentType=contentType
)
# The response body is a stream; read it once and decode the JSON document.
response_body = json.loads(response["body"].read())

print(response_body.get("completion"))

 Based on the context, there are a few key reasons why Amazon has been successful:

1. Amazon has constantly iterated and improved its fulfillment and logistics capabilities over many years. This enabled it to rapidly scale up during the pandemic when demand surged.

2. Amazon diversified beyond just selling books online to become a large marketplace with extensive product selection and expanded globally. 

3. Amazon has repeatedly innovated and invented new products and services like Kindle, Alexa, and AWS that created new business opportunities.

4. Amazon is focused on improving the customer experience and making customers' lives easier. This customer obsession drives constant improvement. 

5. Amazon has invested heavily in expanding infrastructure like fulfillment centers and transportation networks to enable fast delivery at scale.

So in summary, key factors seem to be long-term investment in logistics/infrastructure, constant innovation, customer obsession, and rapid expansion 

## VectorDB를 활용한 Document 기반 응답(Retrieval) - 한국어

- 기본 프로세스는 위와 동일하나, 한국어로 학습된 Embedding 모델을 활용해야 함.
- Amazon Titan과 Cohere 모두 한국어 Embedding을 지원하나, HuggingFace의 KoSimCSE-BERT, ko-sroberta 모델도 함께 테스트한다.

In [7]:
# 참고할 Document 생성 위키피디아 방탄소년단 (Kor)
# https://ko.wikipedia.org/wiki/%EB%B0%A9%ED%83%84%EC%86%8C%EB%85%84%EB%8B%A8
from urllib.request import urlretrieve
from pypdf import PdfReader, PdfWriter
import glob
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

data_root = ""
file_nm = "./data/방탄소년단.pdf"
pdf_path = data_root + file_nm

# NOTE(review): pdf_reader / pdf_writer are created here but not used by any
# code visible in this notebook — confirm before removing.
pdf_reader = PdfReader(pdf_path)
pdf_writer = PdfWriter()

documents = []

# Load the PDF, print the loaded Document list, and collect it in `documents`.
loader = PyPDFLoader(pdf_path)
document = loader.load()

print(f'{len(document)} {document}\n')
documents.extend(document)

# Chunk the pages before embedding.
# (Author's note: in testing, character splitting worked better with this PDF data set.)
splitter_options = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

docs = text_splitter.split_documents(documents)


41 [Document(page_content="2022년  백악관에서의  방탄소년단\n왼쪽에서부터  뷔, 정국, 지민, RM, 진, 제이홉, 슈가\n기본 정보\n결성 지역대한민국 서울특별시\n장르 랩 · 힙합 · 록 · EDM\n활동 시기2013년  6 월  13 일  ~ 현재\n레이블\n  하이브  레이블\n 유니버설  뮤직\n 컬럼비아  레코드\n소속사빅히트  뮤직\n웹사이트공식 홈페이지 (http://bts.ibighit.com)\n구성원\nRM\n진\n슈가\n제이홉\n지민\n뷔\n정국방탄소년단\n방탄소년단(防彈少年團 , 영어: Bangtansonyunda n, 약칭:\nBTS, 방탄 ) 은  2013년 6월 13일에 데뷔한  빅히트  뮤직\n소속  대한민국 7인조  보이  그룹이다 .[1][2] 팬 클럽 이름\n은 아미다 .\n현재  멤버  전원이  병역을  이행  중에  있다 .\n방탄복이  총알을 막아내는  것처럼 , 살아가는  동안  힘든\n일을  겪는  10 대 , 20 대가  겪는  힘든  일과  편견을  막아내\n고 자신들의  음악적  가치를  당당히  지켜내겠다는  의미\n를 담고  있다 . 방탄소년단을  지칭하는  'BTS' 는  본래  이\n름인  'BangTan Sonye unda n' 혹은  'Bulletproof Boys' 의\n준말이다 .[3] 2017년  빅히트  뮤직은 방탄소년단의  공식\n로고를  교체하면서  과거와  미래를  아우르는  개념으로\n의미를  확장시키고 , 'Beyond The Scene' 의  준말로  의미\n를 추가했다 . 이는  매  순간마다  청춘의  장면들을  뛰어넘\n는다는  의미를  가지고  있다 .[4][5][6]\n2013년  방탄소년단은  《2 COOL 4 SKOOL》을  발매\n하며  데뷔하였고 , 그  해  신인상을  수상했다 . 이후  2015\n년에  《화양연화  pt.1》을  발매해  첫  음악방송 1위를  수\n상하였고 , 2016 년에  《WINGS》를  발매해  첫  대상을\n

In [8]:
def avg_doc_length(documents):
    """Return the average ``page_content`` length of *documents*, floored to an int.

    Args:
        documents: Sequence of objects exposing a ``page_content`` string.

    Returns:
        Total characters floor-divided by the number of documents, or 0 when
        *documents* is empty (instead of raising ZeroDivisionError).
    """
    if not documents:
        return 0
    return sum(len(doc.page_content) for doc in documents) // len(documents)
# Report document counts and average lengths before and after splitting.
loaded_count, split_count = len(documents), len(docs)
print(f'Average length among {loaded_count} documents loaded is {avg_doc_length(documents)} characters.')
print(f'After the split we have {split_count} documents as opposed to the original {loaded_count}.')
print(f'Average length among {split_count} documents (after split) is {avg_doc_length(docs)} characters.')

Average length among 41 documents loaded is 2568 characters.
After the split we have 131 documents as opposed to the original 41.
Average length among 131 documents (after split) is 857 characters.


In [9]:
# There is no need to feed in everything; continue with only the first 20 chunks.
MAX_DOCS = 20
docs = docs[:MAX_DOCS]

In [10]:
# Bedrock embedding model ids.
titan = "amazon.titan-embed-text-v1"
cohere = "cohere.embed-multilingual-v3"

# HuggingFace model names.
ksbert = "BM-K/KoSimCSE-BERT-multitask"
ksroberta = "jhgan/ko-sroberta-multitask"

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

# Options forwarded to both HuggingFaceEmbeddings instances below.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Bedrock-hosted embedding models, both using the bedrock-runtime client.
titan_embeddings = BedrockEmbeddings(client=bedrock_runtime, model_id=titan)
cohere_embeddings = BedrockEmbeddings(client=bedrock_runtime, model_id=cohere)


def _hf_embeddings(model_name):
    """Create a HuggingFaceEmbeddings for *model_name* with the shared model/encode options."""
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )


ksbert_embeddings = _hf_embeddings(ksbert)
ksroberta_embeddings = _hf_embeddings(ksroberta)


No sentence-transformers model found with name /Users/mzc01-kimsangwook/.cache/torch/sentence_transformers/BM-K_KoSimCSE-BERT-multitask. Creating a new one with MEAN pooling.


In [33]:
# Embedding model test: one FAISS index and retriever per embedding model.
def _build_retriever(embeddings):
    """Index ``docs`` with *embeddings* and return a retriever using search_kwargs k=4."""
    return FAISS.from_documents(docs, embeddings).as_retriever(search_kwargs={"k": 4})


titan_retriever = _build_retriever(titan_embeddings)
cohere_retriever = _build_retriever(cohere_embeddings)
ksbert_retriever = _build_retriever(ksbert_embeddings)
ksroberta_retriever = _build_retriever(ksroberta_embeddings)

# Parallel lists: the display name and the retriever at the same position belong together.
retriever_nm_list = ["titan", "cohere", 'ksbert', 'ksroberta']
retriever_list = [titan_retriever, cohere_retriever, ksbert_retriever, ksroberta_retriever]

In [35]:
question = "BTS의 의미는?"
divider = '-' * 100

# Print every retriever's hits for the same question so they can be compared by eye.
for retriever_nm, retriever in zip(retriever_nm_list, retriever_list):
    doc_list = retriever.get_relevant_documents(question)
    print(f"==== {retriever_nm} ====")
    for doc in doc_list:
        # Chunk text followed by a divider line.
        print(doc.page_content, divider, sep='\n')

    print()

==== titan ====
정부 화관문화훈장 최연소  수여자이다 . 또한 , 그래미  어워드 후보에 오른  최초이자  유일한  대한민국의
음악  그룹이다 .
방탄소년단은  SNS를 통한  팬들과의  소통이  활발하여  2019 년과  2020 년  전  세계에서  가장  많은  리트윗
을 기록한  연예인이자  트위터 최다  활동  음악  그룹으로  기네스  세계  기록에 오르기도  했다 . 또한  2017 년
라인프렌즈와 협업하며  직접  창작한  캐릭터  《BT21》을  선보였고 , 현재  다양한  캐릭터  상품을  출시하
고 있다 . 방탄소년단은  그룹명의  의미에  맞게  사회  활동  및  자선  활동에  활발히  참여하고  있으며 , 유니세
프와 함께  LOVE MYSELF 캠페인을 진행하면서  소속사  빅히트  뮤직과 함께  5 억  원을  기부하였다 . 또
한 유엔 총회에서  두  차례  연설하였고[7][8], 이를  계기로  타임지  표지를  장식하기도  했다 . 방탄소년단의
팬덤은  아미(영어: ARMY)[9]이며  '청춘을  위한  사랑스러운  대표자 ' 라는  의미와  방탄복과 군대처럼  방탄
소년단도  팬클럽과  항상  함께라는  의미를  가지고  있다 .[10]
빅히트  엔터테인먼트의 대표이자  프로듀서인  방시혁은 2010년  9 월  2 일 , 힙합  그룹  방탄소년단의  새  멤
버를  모집하는  전국  오디션을  개최한다고  밝혔다 . 이  오디션은  포털사이트  다음과 함께  ‘ 힛잇 (HIT
IT)’이라는  타이틀로  진행되었다 . 이미  방탄소년단의  멤버로  발탁된  김남준에  대해  방시혁은  “ 언더  힙
합신에서도  실력을  인정받은  고등학생  래퍼로  랩  메이킹에  탁월하며  프로  못지  않은  실력을  지니고  있
다”고  소개했다 .[11][12][13] 연습생  시절에는  임정희와 2AM 등의  회사  선배  가수들의  앨범에  참여하기
-------------------------------------

In [37]:
""" 방탄소년단(防彈少年團 , 영어: Bangtansonyunda n, 약칭: BTS, 방탄 ) 은  2013년 6월 13일에 데뷔한  빅히트  뮤직
소속  대한민국 7인조  보이  그룹이다 .[1][2] 팬 클럽 이름은 아미다 .
현재  멤버  전원이  병역을  이행  중에  있다 . 방탄복이  총알을 막아내는  것처럼 , 살아가는  동안  힘든
일을  겪는  10 대 , 20 대가  겪는  힘든  일과  편견을  막아내고 자신들의  음악적  가치를  당당히  지켜내겠다는  의미
를 담고  있다 . 방탄소년단을  지칭하는  'BTS' 는  본래  이름인  'BangTan Sonye unda n' 혹은  'Bulletproof Boys' 의
준말이다 .[3] 2017년  빅히트  뮤직은 방탄소년단의  공식로고를  교체하면서  과거와  미래를  아우르는  개념으로
의미를  확장시키고 , 'Beyond The Scene' 의  준말로  의미를 추가했다 . 이는  매  순간마다  청춘의  장면들을  뛰어넘
는다는  의미를  가지고  있다 . """

# Of cohere and ksroberta — the two models that ranked the desired passage above
# first — go with ksroberta for now, since it can be used free of charge.
retrieval = ksroberta_retriever

In [40]:
# Retrieve the chunks most similar to the question and send them to the LLM
# together with the question.

question = "BTS의 의미는?"
results = retrieval.get_relevant_documents(question)

# Number each retrieved chunk, one chunk per line.
contexts = "".join(
    f"{rank}. {document.page_content} \n"
    for rank, document in enumerate(results, start=1)
)

# If you'd like to try your own prompt, edit this parameter!
# Claude text-completion prompts must open with "\n\nHuman:" and close with
# "\n\nAssistant:" (no trailing whitespace after the final marker).
prompt_data = (
    "\n\nHuman: <context>\n"
    f"{contexts}\n"
    "</context>\n"
    "Answer the question based on the <context></context> provided. "
    "If the text doesn't contain the answer, reply that the answer is not available.\n"
    "\n"
    f"{question}"
    "\n\nAssistant:"
)

# Request payload for the Claude text-completion API.
body = json.dumps({
    "prompt": prompt_data,
    "max_tokens_to_sample": 2048,
})
modelId = "anthropic.claude-v2"  # change this to use a different version from the model provider
accept = "application/json"
contentType = "application/json"

response = bedrock_runtime.invoke_model(
    body=body, modelId=modelId, accept=accept, contentType=contentType
)
# The response body is a stream; read it once and decode the JSON document.
response_body = json.loads(response["body"].read())

print(response_body.get("completion"))

 <context>에 따르면, BTS는 Bangtan Sonyeondan의 약자로, 방탄소년단의 영문 약자입니다. 

본문에서는 다음과 같이 설명하고 있습니다:

방탄소년단(防彈少年團, 영어: Bangtan Sonyeondan, 약칭: BTS, 방탄)은 2013년 6월 13일에 데뷔한 빅히트 뮤직 소속 대한민국 7인조 보이 그룹이다.

따라서 BTS의 의미는 Bangtan Sonyeondan의 약자로, 방탄소년단을 나타냅니다.
