# Conversational Search with GenAI

## Step 1: Initialize

Install the required libraries, such as the [OpenSearch client library](https://opensearch.org/docs/1.0/clients/python/), [LangChain](https://python.langchain.com/docs/get_started), etc.

In [None]:
%pip install --upgrade sagemaker==2.186.0 
%pip install opensearch-py==2.3.1
%pip install wikipedia unstructured transformers==4.33.2
%pip install langchain==0.0.308 #0.0.293
%pip install boto3==1.28.59
%pip install selenium

Initialize SageMaker and Boto3

In [None]:
import sagemaker, boto3, json
from sagemaker.session import Session

# SageMaker session used to look up the identity this notebook runs as
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()
# Region configured for the default boto3 session
aws_region = boto3.Session().region_name
# AWS account ID reported by STS for the current caller
aws_account = boto3.client('sts').get_caller_identity().get('Account')
# NOTE(review): second SageMaker session object, separate from `sagemaker_session` above
sess = sagemaker.Session()

Get CloudFormation stack output variables

 

In [None]:
import json
region = aws_region

cfn = boto3.client('cloudformation')
kms = boto3.client('secretsmanager')

def get_cfn_outputs(stackname):
    """Return the outputs of a CloudFormation stack as a plain dict.

    Args:
        stackname: Name passed to ``describe_stacks`` as ``StackName``.

    Returns:
        A dict mapping each ``OutputKey`` to its ``OutputValue`` for the first
        stack in the ``describe_stacks`` response.
    """
    # Only the first stack in the response is inspected.
    stack = cfn.describe_stacks(StackName=stackname)['Stacks'][0]
    return {entry['OutputKey']: entry['OutputValue'] for entry in stack['Outputs']}

## Setup variables to use for the rest of the demo
cloudformation_stack_name = "semantic-search"

# Read every output of the stack once; later cells look values up by output key
outputs = get_cfn_outputs(cloudformation_stack_name)
aos_host = outputs['OpenSearchDomainEndpoint']
# NOTE: despite its name, `kms` is a Secrets Manager client; the secret string is parsed as JSON
aos_credentials = json.loads(kms.get_secret_value(SecretId=outputs['OpenSearchSecret'])['SecretString'])

# Display the outputs dict as the cell result
outputs

## Step 2: Verify deployed endpoint for embedding and content generation model

### Get endpoint for embedding

---
This is a SageMaker endpoint hosting the GPT-J 6B-parameter model, which converts text into vectors.


In [None]:
embedding_endpoint_name=outputs['EmbeddingEndpointName']
print(embedding_endpoint_name)

Endpoint should be in `InService` status to be able to serve requests. 

In [None]:
import time

# SageMaker control-plane client, used here only to read the endpoint status
sm_client = boto3.client("sagemaker", aws_region)

# Re-check every 15 seconds for as long as the embedding endpoint reports
# 'Creating', printing a dot per wait; any other status ends the loop.
while True:
    describe_embedding_endpoint_response = sm_client.describe_endpoint(EndpointName=embedding_endpoint_name)
    if describe_embedding_endpoint_response["EndpointStatus"] != 'Creating':
        break
    time.sleep(15)
    print('.', end='')
print(describe_embedding_endpoint_response["EndpointStatus"])

### Get endpoint for content generation



In [None]:
llm_endpoint_name=outputs['LLMEndpointName']
print(llm_endpoint_name)

Verify the LLM endpoint is ready (it should show the status as `InService`)

In [None]:
# SageMaker control-plane client, used here only to read the endpoint status
sm_client = boto3.client("sagemaker", aws_region)

describe_llm_endpoint_response = sm_client.describe_endpoint(EndpointName=llm_endpoint_name)

# Poll every 15 seconds while the LLM endpoint is still being created
while describe_llm_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_llm_endpoint_response = sm_client.describe_endpoint(EndpointName=llm_endpoint_name)
# Report the status of the LLM endpoint that was just polled
# (previously this printed the embedding endpoint's status by mistake)
print(describe_llm_endpoint_response["EndpointStatus"])

### Test embedding endpoint


In [None]:
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.embeddings import SagemakerEndpointEmbeddings


class TestContentHandler(EmbeddingsContentHandler):
    """JSON request/response adapter used to smoke-test the embedding endpoint."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
        """Encode the prompt and any extra model kwargs as a UTF-8 JSON body.

        The prompt is sent under the ``text_inputs`` key; ``model_kwargs`` are
        merged into the same top-level JSON object.
        """
        request_body = {"text_inputs": prompt, **model_kwargs}
        return json.dumps(request_body).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Read the response stream and return the value of its ``embedding`` key.

        ``output`` is consumed via ``.read()``; a single-element result is
        returned re-wrapped in a new one-element list.
        """
        raw_text = output.read().decode("utf-8")
        vectors = json.loads(raw_text)["embedding"]
        return [vectors[0]] if len(vectors) == 1 else vectors


test_content_handler = TestContentHandler()

test_embeddings = SagemakerEndpointEmbeddings(
    endpoint_name=embedding_endpoint_name,
    region_name=aws_region,
    content_handler=test_content_handler,
)

In [None]:
# print the first five elements of the embedding
print(test_embeddings.embed_documents(["Hello World"])[0][:5])

### Test LLM endpoint
Alternatively, we can use the AWS Boto3 library to query the SageMaker endpoint directly, as shown below.

In [None]:
def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json"):
    """Invoke a SageMaker endpoint with an already-encoded request body.

    Args:
        encoded_json: Request body passed to ``invoke_endpoint`` as ``Body``.
        endpoint_name: Name of the endpoint to invoke.
        content_type: Value sent as ``ContentType``.

    Returns:
        The raw ``invoke_endpoint`` response.
    """
    runtime_client = boto3.client("runtime.sagemaker")
    return runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=encoded_json,
    )

def parse_response_model(query_response):
    """Extract the generated texts from an endpoint invocation response.

    Args:
        query_response: Mapping whose ``"Body"`` entry exposes ``read()`` and
            yields a JSON array of objects carrying a ``generated_text`` key.

    Returns:
        A list with the ``generated_text`` of every prediction, in order.
    """
    predictions = json.loads(query_response["Body"].read())
    texts = []
    for prediction in predictions:
        texts.append(prediction["generated_text"])
    return texts


In [None]:
question = "Hello OpenSearch"

In [None]:
payload = {
    "inputs": question,
    "parameters":{
        "max_new_tokens": 1024,
        "num_return_sequences": 1,
        "top_k": 100,
        "top_p": 0.5,
        "do_sample": False,
        "return_full_text": True,
        "temperature": 0.9
    }
}



In [None]:
query_response = query_endpoint_with_json_payload(
    json.dumps(payload).encode("utf-8"), endpoint_name=llm_endpoint_name
)

generated_texts = parse_response_model(query_response)

print(f"The generated output is: {generated_texts[0]}\n")

## Step 3: Test LLM without context information

### 3.1 Test SageMaker

In [None]:
from uuid import uuid4
from typing import Dict
from langchain.memory import ConversationBufferMemory
from langchain.memory import DynamoDBChatMessageHistory
from langchain.memory import ConversationBufferWindowMemory
from langchain import PromptTemplate, SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain.chains import RetrievalQA


class ContentHandler(LLMContentHandler):
    """JSON request/response adapter for the text-generation endpoint."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
        """Encode the prompt as ``inputs`` and the kwargs as ``parameters`` (UTF-8 JSON)."""
        request_body = {"inputs": prompt, "parameters": model_kwargs}
        return json.dumps(request_body).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Read the response stream and return the first item's ``generated_text``."""
        generations = json.loads(output.read().decode("utf-8"))
        first_generation = generations[0]
        return first_generation["generated_text"]
    

content_handler = ContentHandler()


In [None]:
params = {
        "max_new_tokens": 1024,
        "num_return_sequences": 1,
        "top_k": 100,
        "top_p": 0.95,
        "do_sample": False,
        "return_full_text": False,
        "temperature": 0.9
        }

In [None]:
llm_hallucination=SagemakerEndpoint(
        endpoint_name=llm_endpoint_name,
        region_name=aws_region,
        model_kwargs=params,
        content_handler=content_handler,
)

In [None]:
question = "How to determine shard and data node counts for OpenSearch?"

In [None]:
print("Question is:" + question)
generated_result = llm_hallucination(question)
print(generated_result)

In [None]:
print("Question is:" + question)
generated_result = llm_hallucination(question)
print(generated_result)

### 3.2 Test Bedrock


In [None]:
is_bedrock_available=True

In [None]:
import json
import os
import sys
import boto3
from botocore.config import Config

bedrock_region="us-west-2"

#boto3_bedrock = boto3.client(service_name="bedrock-runtime", endpoint_url=f"https://bedrock-runtime.{bedrock_region}.amazonaws.com")
boto3_bedrock = boto3.client(service_name="bedrock-runtime", config=Config(region_name=bedrock_region))


In [None]:
from langchain.chains import ConversationChain
from langchain.llms.bedrock import Bedrock
from langchain.memory import ConversationBufferMemory

bedrock_llm_hallucination = Bedrock(model_id="anthropic.claude-instant-v1", client=boto3_bedrock)
# The stop sequence must contain real newline characters: the previous value
# "\\n\\nHuman:" was a literal backslash-n string, which never appears in the
# model output and therefore never stopped generation.
bedrock_llm_hallucination.model_kwargs = {"max_tokens_to_sample":1204,"temperature":0.9,"top_k":250,"top_p":1,"stop_sequences":["\n\nHuman:"]}

# Only call Bedrock when the flag set earlier in the notebook says it is available
if is_bedrock_available:
    bedrock_result = bedrock_llm_hallucination(question)
    print(bedrock_result)
else:
    print("Bedrock is unavailable")

## Step 4: Test LLM with context information

### 4.1 Test SageMaker with context

In [None]:
with_context_params = {
        "max_new_tokens": 1024,
        "num_return_sequences": 1,
        "top_k": 100,
        "top_p": 0.95,
        "do_sample": False,
        "return_full_text": False,
        "temperature": 0.01
        }

In [None]:
llm_with_context=SagemakerEndpoint(
        endpoint_name=llm_endpoint_name,
        region_name=aws_region,
        model_kwargs=with_context_params,
        content_handler=content_handler,
)

In [None]:
context_and_question="""
Answer the question with with the content in the Context.
Context: 
By default in OpenSearch Service, each index is divided into five primary shards and one replica (total of 10 shards). This behavior differs from open source OpenSearch, which defaults to one primary and one replica shard. Because you can't easily change the number of primary shards for an existing index, you should decide about shard count before indexing your first document.
The overall goal of choosing a number of shards is to distribute an index evenly across all data nodes in the cluster. However, these shards shouldn't be too large or too numerous. A general guideline is to try to keep shard size between 10–30 GiB for workloads where search latency is a key performance objective, and 30–50 GiB for write-heavy workloads such as log analytics.
Large shards can make it difficult for OpenSearch to recover from failure, but because each shard uses some amount of CPU and memory, having too many small shards can cause performance issues and out of memory errors. In other words, shards should be small enough that the underlying OpenSearch Service instance can handle them, but not so small that they place needless strain on the hardware.
For example, suppose you have 66 GiB of data. You don't expect that number to increase over time, and you want to keep your shards around 30 GiB each. Your number of shards therefore should be approximately 66 * 1.1 / 30 = 3. 

Question:How to determine shard and data node counts for OpenSearch?

Answer:
"""

generated_result = llm_with_context(context_and_question)
print(generated_result)

## Step 5: Select SageMaker or Bedrock used for embedding and content generation

In [None]:
from ipywidgets import Dropdown

llm_selection = [
    "SageMaker",
    "Bedrock",
]

llm_dropdown = Dropdown(
    options=llm_selection,
    value="SageMaker",
    description="Select a LLM",
    style={"description_width": "initial"},
    layout={"width": "max-content"},
)
display(llm_dropdown)

In [None]:
llm_category = llm_dropdown.value

if not is_bedrock_available:
    llm_category = "SageMaker"

In [None]:
print("You selected {0} as LLM".format(llm_category))

## Step 6: Load documents with LangChain document loader and store vector into OpenSearch

In [None]:
from langchain.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import SeleniumURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)

urls = ["https://docs.aws.amazon.com/opensearch-service/latest/developerguide/bp.html",
        "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/sizing-domains.html", 
        "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/petabyte-scale.html",
        "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/managedomains-dedicatedmasternodes.html",
        "https://docs.aws.amazon.com/opensearch-service/latest/developerguide/cloudwatch-alarms.html"]
url_loader = UnstructuredURLLoader(urls=urls)
#url_loader = SeleniumURLLoader(urls=urls)
url_texts = url_loader.load_and_split(text_splitter=text_splitter)


In [None]:
all_splits = url_texts
print(all_splits[0])

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection

auth = (aos_credentials['username'], aos_credentials['password'])
aos_client = OpenSearch(
    hosts = [{'host': aos_host, 'port': 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

### LangChain embedding endpoint

In [None]:
from typing import Any, Dict, Iterable, List, Optional, Tuple, Callable
import json
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.schema import Document

class BulkSagemakerEndpointEmbeddings(SagemakerEndpointEmbeddings):
    """SageMaker embeddings client that sends documents to the endpoint in small batches."""

    def embed_documents(
        self, texts: List[str], chunk_size: int = 5
    ) -> List[List[float]]:
        """Embed ``texts`` by calling the endpoint once per batch of ``chunk_size``.

        Args:
            texts: Documents to embed.
            chunk_size: Maximum number of documents sent per endpoint call.

        Returns:
            The embeddings returned for every batch, concatenated in input
            order. An empty ``texts`` yields an empty list without calling the
            endpoint.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        # A zero step makes range() fail and a negative step silently skips
        # every document, so reject both up front with a clear message.
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        # Previously an empty input produced range(0, 0, 0) and raised ValueError.
        if not texts:
            return []

        results = []
        # Slicing past the end is safe, so no clamping of chunk_size is needed.
        for i in range(0, len(texts), chunk_size):
            results.extend(self._embedding_func(texts[i:i + chunk_size]))
        return results
        
class EmbeddingContentHandler(EmbeddingsContentHandler):
    """JSON request/response adapter for the embedding endpoint used during ingestion."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
        """Build the UTF-8 JSON body: prompt under ``text_inputs`` plus any model kwargs."""
        payload = {"text_inputs": prompt, **model_kwargs}
        serialized = json.dumps(payload)
        return serialized.encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        """Read the response stream and return the value of its ``embedding`` key.

        A single-element result is returned re-wrapped in a new one-element list.
        """
        decoded = output.read().decode("utf-8")
        embeddings = json.loads(decoded)["embedding"]
        if len(embeddings) != 1:
            return embeddings
        return [embeddings[0]]

print(embedding_endpoint_name)
sagemaker_embeddings = BulkSagemakerEndpointEmbeddings( 
            endpoint_name=embedding_endpoint_name,
            region_name=aws_region, 
            content_handler=EmbeddingContentHandler())


### Bedrock embedding

In [None]:
from langchain.embeddings import BedrockEmbeddings

bedrock_embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v1',client=boto3_bedrock)

### OpenSearch vector store

In [None]:
# Use the embedding client that belongs to the selected provider;
# any other value leaves `embeddings` untouched.
if llm_category == "SageMaker":
    embeddings = sagemaker_embeddings
elif llm_category == "Bedrock":
    embeddings = bedrock_embeddings


In [None]:
embeddings

Initialize OpenSearch index name 

In [None]:
embedding_index_name = 'opensearch_kb_vector'

In [None]:
#aos_client.indices.delete(index=embedding_index_name)

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch

os_domain_ep = 'https://'+aos_host

# Maximum number of documents sent to OpenSearch per from_documents() call
ingest_batch_size = 500

if len(all_splits) > ingest_batch_size:
    for start in range(0, len(all_splits), ingest_batch_size):
        # Slice ends are exclusive, so the last batch must end at len(all_splits);
        # the previous `len(all_splits)-1` silently dropped the final document.
        end = min(start + ingest_batch_size, len(all_splits))
        docs = all_splits[start:end]
        OpenSearchVectorSearch.from_documents(
            index_name = embedding_index_name,
            documents=docs,
            embedding=embeddings,
            opensearch_url=os_domain_ep,
            http_auth=auth
        )
        # The f-string already contains both bounds; no extra print arguments needed
        print(f"ingest documents from {start} to {end}")
else:
    OpenSearchVectorSearch.from_documents(
            index_name = embedding_index_name,
            documents=all_splits,
            embedding=embeddings,
            opensearch_url=os_domain_ep,
            http_auth=auth
        )
    print(" completed documents ingestion")

Get the OpenSearch index's detailed information. Pay attention to the `dimension` field value when you choose a different LLM. For SageMaker, we use the GPT-J embedding model, whose dimension is 4096. For Bedrock, we use the Titan Embeddings model, whose dimension is 1536.

In [None]:
aos_client.indices.get(index=embedding_index_name)

Initialize OpenSearch index name whose settings are customized

In [None]:
customized_embedding_index_name = 'customized_opensearch_kb_vector'

In [None]:
#aos_client.indices.delete(index=customized_embedding_index_name)

In [None]:
OpenSearchVectorSearch.from_documents(
            index_name = customized_embedding_index_name,
            documents=all_splits,
            embedding=embeddings,
            opensearch_url=os_domain_ep,
            http_auth=auth,
            engine="faiss",
            space_type="innerproduct",
            ef_construction=256,
            m=48,
        )
print(f"ingest documents into customized knn index")

Get OpenSearch index detailed information

In [None]:
aos_client.indices.get(index=customized_embedding_index_name)

In [None]:
class SimiliarOpenSearchVectorSearch(OpenSearchVectorSearch):
    """OpenSearch vector store that uses the raw score as the relevance score.

    NOTE(review): presumably the base class calls `_select_relevance_score_fn`
    to pick the function used for score-threshold retrieval -- confirm against
    the LangChain version in use.
    """
    
    def relevance_score(self, distance: float) -> float:
        """Return `distance` unchanged (identity mapping)."""
        return distance
    
    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """Return `relevance_score` as the score-conversion function."""
        return self.relevance_score
    

open_search_vector_store = SimiliarOpenSearchVectorSearch(
                                    index_name=embedding_index_name,
                                    embedding_function=embeddings,
                                    opensearch_url=os_domain_ep,
                                    http_auth=auth
                                    ) 

In [None]:
docs_ = open_search_vector_store.similarity_search_with_score(question, k=5)

print("found document number:" + str(len(docs_)))

print("opensearch results:\n")
for doc in docs_:
    print(doc)
    print("\n-----------------")

## Step 7: Retrieval Augmented Generation

In [None]:
sagemaker_retriever = open_search_vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        'k': 5,
        'score_threshold': 0.62
    }
)

bedrock_retriever = open_search_vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        'k': 5,
        'score_threshold': 0.005
    }
)

In [None]:
# Use the retriever configured for the selected provider;
# any other value leaves `retriever` untouched.
if llm_category == "SageMaker":
    retriever = sagemaker_retriever
elif llm_category == "Bedrock":
    retriever = bedrock_retriever

In [None]:
# Generation settings for RAG: near-zero temperature with sampling disabled
sagemaker_params = {
        "max_new_tokens": 1024,
        "num_return_sequences": 1,
        "top_k": 200,
        "top_p": 0.9,
        "do_sample": False,
        "return_full_text": False,
        "temperature": 0.0001
        }

# Pass the settings defined just above. The endpoint was previously built with
# the older `params` dict (temperature 0.9), leaving `sagemaker_params` unused.
sagemaker_llm=SagemakerEndpoint(
        endpoint_name=llm_endpoint_name,
        region_name=aws_region,
        model_kwargs=sagemaker_params,
        content_handler=content_handler,
)

In [None]:
# Generation settings for RAG with Bedrock. The stop sequence must contain real
# newline characters: the previous value "\\n\\nHuman:" was a literal
# backslash-n string that never appears in the model output.
bedrock_params = {
    "max_tokens_to_sample":2048,
    "temperature":0.0001,
    "top_k":250,
    "top_p":1,
    "stop_sequences":["\n\nHuman:"]
}

# NOTE: despite the variable name, the model used here is Anthropic Claude Instant
bedrock_titan_llm = Bedrock(model_id="anthropic.claude-instant-v1", client=boto3_bedrock)
bedrock_titan_llm.model_kwargs = bedrock_params

### Select content generation LLM, SageMaker or Bedrock

In [None]:
# Use the content-generation LLM of the selected provider;
# any other value leaves `llm` untouched.
if llm_category == "SageMaker":
    llm = sagemaker_llm
elif llm_category == "Bedrock":
    llm = bedrock_titan_llm


Show the LLM to be used for content generation (SageMaker or Bedrock, as selected above)

---

In [None]:
llm

Define `RetrievalQA` Chain with SageMaker or Bedrock LLM

---

In [None]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff" #stuff, refine, map_reduce, and map_rerank
)

Use RAG to generate an answer to the same question as before. Compare the content generated with RAG against the content generated by the LLM without context.

---

In [None]:
print("Question is:" + question)

#langchain.debug=False
result = qa({"query": question})

print("result:" + result["result"])
  

### Use customized prompt for RAG

In [None]:
template2 = """Answer the question as truthfully as possible by using the provided informaiton in >>CONTEXT<<. If the answer is not contained within the >>CONTEXT<<, respond with "I can't answer that".

>>CONTEXT<<:
{context}

>>QUESTION<<:
{question}

>>Answer<<:
"""


prompt_template2 = PromptTemplate(
    input_variables=["question", "context"],
    template=template2
)

qa_with_prompt = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt_template2}
)


In [None]:
import langchain

#langchain.debug = True
langchain.debug = False

print("Question is:" + question)
result = qa_with_prompt({"query": question})

print("\n### Generated result:" + result["result"])


### Return source documents

In [None]:
qa_with_source = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template2}
)

In [None]:
print("Question is:" + question)
result = qa_with_source({"query": question})

print("result:" + result["result"])
print("\n\n===========================")
print("\nsource documents:")
for doc in result["source_documents"]:
    print(doc)
    print("---------------------------\n")

## Step 8: Conversational search by memorizing the history 

In [None]:
dynamo = boto3.client('dynamodb')

history_table_name = 'conversation-history-memory'

# Probe for the conversation-history table; create it (keyed by SessionId)
# and wait until it leaves the CREATING state when it is missing.
try:
    response = dynamo.describe_table(TableName=history_table_name)
except dynamo.exceptions.ResourceNotFoundException:
    print("The table "+history_table_name+" does not exist")

    dynamo.create_table(
        TableName=history_table_name,
        AttributeDefinitions=[{'AttributeName': 'SessionId', 'AttributeType': 'S'}],
        KeySchema=[{'AttributeName': 'SessionId', 'KeyType': 'HASH'}],
        ProvisionedThroughput={'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5},
    )

    # Re-check once per second, printing a dot per wait
    while True:
        response = dynamo.describe_table(TableName=history_table_name)
        if response["Table"]["TableStatus"] != 'CREATING':
            break
        time.sleep(1)
        print('.', end='')

    print("\ndynamo DB Table, '"+response['Table']['TableName']+"' is created")
else:
    print("The table "+history_table_name+" exists")




In [None]:
# Needed by the session-copy branch below; it is not imported anywhere else in
# the visible notebook, so the branch used to fail with a NameError that the
# broad `except` swallowed.
from langchain.schema import messages_to_dict

ddb_table_name = "conversation-history-memory"
session_id = str(uuid4())
chat_memory = DynamoDBChatMessageHistory(
        table_name=ddb_table_name,
        session_id=session_id
    )

messages = chat_memory.messages

# Maintains immutable sessions
# If previous session was present, create
# a new session and copy messages, and 
# generate a new session_id 
if messages:
    session_id = str(uuid4())
    chat_memory = DynamoDBChatMessageHistory(
        table_name=ddb_table_name,
        session_id=session_id
    )
    # This is a workaround at the moment. Ideally, this should
    # be added to the DynamoDBChatMessageHistory class
    try:
        messages = messages_to_dict(messages)
        chat_memory.table.put_item(
            Item={"SessionId": session_id, "History": messages}
        )
    except Exception as e:
        # Best effort: a failed copy is reported but does not stop the notebook
        print(e)

# Conversation memory backed by the DynamoDB history, exposed to chains as `chat_history`
memory = ConversationBufferMemory(chat_memory=chat_memory, memory_key="chat_history", return_messages=True)


In [None]:
from langchain.chains import ConversationalRetrievalChain

params = {
        "max_length": 2048,
        "max_new_tokens": 1024,
        "num_return_sequences": 1,
        "top_k": 200,
        "top_p": 0.9,
        "do_sample": False,
        "return_full_text": False,
        "temperature": 0.0001
        }

sagemaker_llm=SagemakerEndpoint(
        endpoint_name=llm_endpoint_name,
        region_name=aws_region,
        model_kwargs=params,
        content_handler=content_handler,
)

condense_template = """system: generate one standalone question.
Given the following conversation between <chat-history> and </chat-history> 
and follow up question between <follow-up-question> and </follow-up-question>, 
rephrase the follow up question to be a standalone question in its original language. 
The standalone question will only contains one sentence and it must end with '?'

<chat-history>
{chat_history}
</chat-history>

<follow-up-question>
{question}
</follow-up-question>

standalone question:
"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(condense_template)




In [None]:
# Re-select the content-generation LLM for the selected provider;
# any other value leaves `llm` untouched.
if llm_category == "SageMaker":
    llm = sagemaker_llm
elif llm_category == "Bedrock":
    llm = bedrock_titan_llm

In [None]:
qa_with_memory = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    combine_docs_chain_kwargs={"prompt": prompt_template2},
    verbose=True)

In [None]:
result = qa_with_memory(question)


In [None]:
#print("result:" + str(result))
print("\nAnswer:\n" + str(result["answer"]))

### Second question
Try asking one more question. `ConversationalRetrievalChain` will use the first question, the first question's answer, and the second question as the prompt for the LLM to generate a new standalone question. The prompt sent to the LLM looks like the following:

```python

_template = """system: generate one standalone question.
Given the following conversation between <chat-history> and </chat-history> 
and follow up question between <follow-up-question> and </follow-up-question>, 
rephrase the follow up question to be a standalone question in its original language. 
The standalone question will only contains one sentence and it must end with '?'

<chat-history>
{chat_history}
</chat-history>

<follow-up-question>
{question}
</follow-up-question>

standalone question:
"""
```

After getting the new question from the LLM, the chain searches the OpenSearch vector store for relevant documents, then combines the new question and those documents into a prompt that goes through the RAG process. The prompt sent to the LLM looks like the following:

```python
prompt_template = """Answer the question as truthfully as possible by using the provided informaiton in >>CONTEXT<<. If the answer is not contained within the >>CONTEXT<<, respond with "I can't answer that".

>>CONTEXT<<:
{context}

>>QUESTION<<:
{question}

>>Answer<<:
"""
```

In summary, `ConversationalRetrievalChain` will call LLM twice:
1. Use history question, history answer and latest question as prompt to generate new question
2. Use new question generated in the first step, query relevant documents. Combine relevant documents and new question as prompt to LLM to generate answer.

You can also see the verbose message like following:

---

### First call to LLM:

![generate new question](../image/module8/conversation-new-question.png)

---

### Second call to LLM:

![generate final answer](../image/module8/conversation-final-answer.png)

---


In [None]:
second_following_question = 'if my data growth is very fast'
second_result = qa_with_memory(second_following_question)


In [None]:
print("second answer:" + str(second_result["answer"]))

### Return source document

---

We can also include source documents so that we know where the content comes from.

In [None]:
# Start a fresh conversation: new session id and a new DynamoDB-backed history
session_id = str(uuid4())
chat_memory = DynamoDBChatMessageHistory(
        table_name=ddb_table_name,
        session_id=session_id
    )

# NOTE(review): `memory_for_source` is built here but is not passed to the chain below
memory_for_source = ConversationBufferMemory(chat_memory=chat_memory,memory_key="chat_history")

# Chain built with CONDENSE_QUESTION_PROMPT and prompt_template2, with
# return_source_documents=True and no `memory` argument
qa_with_memory_and_source = ConversationalRetrievalChain.from_llm(
    llm=llm, 
    retriever=retriever,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    combine_docs_chain_kwargs={"prompt": prompt_template2},
    return_source_documents=True, 
    verbose=True)


In [None]:
chat_history = []
result = qa_with_memory_and_source({"question": question,"chat_history": chat_history})

In [None]:
print("\nAnswer:\n" + str(result["answer"]))
print("\nSource Documents:\n")
for doc in result["source_documents"]:
    print(str(doc))
    print("--------------------------------")

In [None]:
chat_history = [(question, result["answer"])]
second_query = second_following_question
second_result = qa_with_memory_and_source({"question": second_query, "chat_history": chat_history})

In [None]:
print("\nAnswer:\n" + str(second_result["answer"]))
print("\nSource Documents:\n")
for doc in second_result["source_documents"]:
    print(str(doc))
    print("--------------------------------")