# Create embeddings using Amazon Titan Text Embeddings model and run similarity tests

> *This notebook should work well with the **`Data Science 3.0`** kernel in SageMaker Studio*

## Setup

In [None]:
import json
import os
import sys

import boto3
import botocore

import numpy as np
from numpy.linalg import norm

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from labutils import bedrock

# Bedrock runtime client shared by every model invocation in this notebook.
boto3_bedrock = boto3.client("bedrock-runtime")

# Identifier of the text embeddings model passed to invoke_model.
modelId = "amazon.titan-embed-text-v1"

## Define a reusable Python function to generate the embeddings for each sentence

In [None]:
def sen2vec(sentence):
    """Convert a sentence to a vector using the Bedrock text embedding model.

    Sends ``sentence`` as the ``inputText`` of an ``invoke_model`` request on
    the module-level ``boto3_bedrock`` client, using the module-level
    ``modelId``.

    Args:
        sentence: Text to embed.

    Returns:
        A NumPy array built from the ``embedding`` field of the response, or
        ``None`` when the request fails or the response carries no
        ``embedding``. Failures are printed, not raised, so callers must
        check for ``None`` before using the result.
    """
    input_body = {"inputText": sentence}
    try:
        response = boto3_bedrock.invoke_model(
            body=json.dumps(input_body),
            modelId=modelId,
            accept="application/json",
            contentType="application/json",
        )
        response_body = json.loads(response.get("body").read())
        embedding = response_body.get("embedding")
        if embedding is None:
            # np.array(None) would produce a 0-d object array that looks like
            # a successful result; report the problem and signal failure.
            print("No embedding returned for input: " + repr(sentence))
            return None
        return np.array(embedding)
    except Exception as e:
        # Best-effort: report the error and let the caller decide what to do.
        print(e)
        return None

## Generate and store the embeddings in an array

In [None]:
# Embedding width the store is sized for; only needed to give an empty store
# a well-formed 2-D shape.
EMBEDDING_DIM = 1536

# Read the file once; the handle gets its own name so it is never shadowed.
with open("documents.txt", encoding="utf-8") as doc_file:
    documents = [line.strip('\n') for line in doc_file]

kept_documents = []
vectors = []
for line_number, document in enumerate(documents, 1):
    if not document.strip():
        # Nothing to embed on a blank line.
        continue
    vector = sen2vec(document)
    if vector is None or getattr(vector, "ndim", 0) != 1:
        # Skip documents without a usable embedding so the text and vector
        # arrays stay aligned and no NaN row leaks into the similarity search.
        print("Skipping line " + str(line_number) + ": no embedding was generated")
        continue
    # Stored as UTF-8 bytes: the "S" dtype below sizes itself to the longest
    # entry, so nothing is truncated and non-ASCII text is preserved.
    kept_documents.append(document.encode("utf-8"))
    vectors.append(vector)

doc_array = np.array(kept_documents, dtype="S")
embed_array = np.vstack(vectors) if vectors else np.zeros(shape=(0, EMBEDDING_DIM))
num_records = len(doc_array)

print(embed_array.shape)
print(embed_array)

## Interactively query using similarity search

Create an interactive query response loop that uses similarity search to pull relevant documents from the in-memory vector store. We use 0.5 as the similarity threshold to limit hallucination. Queries that relate to the course names in the document store will lead to successful results. If none of the results exceed the 0.5 threshold, "I don't know" will be the response.

In [None]:
# Minimum cosine similarity a document needs to be offered as an answer.
SIMILARITY_THRESHOLD = 0.5

# Document norms do not depend on the query, so compute them once.
doc_norms = norm(embed_array, axis=1)

while True:
    query = input("\nEnter your query or say quit to exit: ")
    if query.strip().lower() == "quit":
        break

    embed_query = sen2vec(query)
    if embed_query is None or embed_query.shape != embed_array.shape[1:]:
        # sen2vec reports failures by returning None; a vector of the wrong
        # shape cannot be compared against the store either.
        print("\nCould not generate an embedding for the query; please try again.")
        continue

    denominator = doc_norms * norm(embed_query)
    dot_products = embed_array.dot(embed_query)
    # Cosine similarity; rows whose denominator is zero or NaN score 0 instead
    # of producing NaN, which would otherwise win argmax.
    similarity = np.divide(
        dot_products,
        denominator,
        out=np.zeros_like(dot_products, dtype=float),
        where=denominator > 0,
    )

    print("\nQuery: " + query)
    # The size check keeps an empty document store from raising in max().
    if similarity.size and similarity.max() > SIMILARITY_THRESHOLD:
        max_value_index = int(similarity.argmax())
        print("Document that can answer the question: " + doc_array[max_value_index].decode('UTF-8'))
    else:
        print("Answer: " + "I don't know")
    print("\nSimilarity vector used for document selection")
    print(similarity)