# Understanding Embeddings using Amazon Bedrock

Embeddings enable deep-learning models to understand real-world data domains more effectively. They simplify how real-world data is represented while retaining its semantic and syntactic relationships. This allows machine learning algorithms to extract and process complex data types, and enables innovative AI applications. The following sections show how to generate text embeddings with Amazon Bedrock and how to visualize them.

In [2]:
!pip install umap-learn altair

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import boto3
import json
import pandas as pd
import umap
from utils import umap_plot

## Create embeddings using Amazon Bedrock's `Amazon Titan Embeddings G1 - Text` model

For more information about Titan Embeddings, see: https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html

In [4]:
# Bedrock runtime client used for the model invocations below; the region is
# hard-coded to us-east-1.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')

In [5]:
prompt = "Hello my name is Noel"

In [6]:
# Keyword arguments for `invoke_model`: the model to call, the request content
# type, the accepted response type, and the JSON-encoded payload carrying the
# text to embed.
kwargs = dict(
    modelId="amazon.titan-embed-text-v1",
    contentType="application/json",
    accept="*/*",
    body=json.dumps({"inputText": prompt}),
)

In [7]:
response = bedrock_runtime.invoke_model(**kwargs)

In [8]:
response

{'ResponseMetadata': {'RequestId': '92d5d87a-b9b9-4c2d-8117-b49b8090e04b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 29 Feb 2024 19:28:19 GMT',
   'content-type': 'application/json',
   'content-length': '17249',
   'connection': 'keep-alive',
   'x-amzn-requestid': '92d5d87a-b9b9-4c2d-8117-b49b8090e04b',
   'x-amzn-bedrock-invocation-latency': '137',
   'x-amzn-bedrock-input-token-count': '5'},
  'RetryAttempts': 0},
 'contentType': 'application/json',
 'body': <botocore.response.StreamingBody at 0x7f0d43134c70>}

In [9]:
# Read the response body and parse it as JSON.
# NOTE(review): the body is a botocore StreamingBody, so read() presumably
# consumes it — re-running this cell without re-invoking the model would then
# fail to parse. Confirm.
response_body = json.loads(response.get('body').read())

In [21]:
response_body['embedding']

[-0.44140625,
 -0.13085938,
 0.46679688,
 -0.015197754,
 -0.30078125,
 0.33789062,
 0.16015625,
 1.8239021e-05,
 -0.79296875,
 -0.060546875,
 0.35351562,
 0.50390625,
 0.096191406,
 -0.27929688,
 0.45898438,
 0.06689453,
 -0.13769531,
 -0.15917969,
 -0.22851562,
 0.37695312,
 -0.119140625,
 -0.40625,
 -0.012573242,
 -0.45898438,
 -0.10253906,
 -0.24414062,
 -0.25976562,
 -1.6484375,
 0.11376953,
 -0.28710938,
 0.37890625,
 1.7734375,
 0.32421875,
 0.73828125,
 0.19238281,
 0.06542969,
 0.20214844,
 -0.15332031,
 0.41992188,
 0.44140625,
 0.18261719,
 -0.23144531,
 0.13574219,
 0.05029297,
 0.640625,
 0.67578125,
 0.32421875,
 0.10986328,
 0.16308594,
 0.46289062,
 0.14257812,
 0.64453125,
 0.515625,
 -0.03564453,
 -0.140625,
 -0.19140625,
 -0.19726562,
 -0.19628906,
 -0.037597656,
 -0.5703125,
 0.18261719,
 -0.32421875,
 -0.12060547,
 0.22070312,
 0.15820312,
 -0.30859375,
 -0.4609375,
 -0.0025024414,
 -0.2109375,
 -0.44335938,
 -0.19824219,
 0.16992188,
 0.07421875,
 0.6015625,
 -0.00

In [10]:
len(response_body['embedding'])

1536

### Wrap the embedding call in a function

In [11]:
def generate_embeddings(body: str, *, model_id: str = "amazon.titan-embed-text-v1", client=None) -> list:
    """Return the embedding vector for a piece of text.

    Sends ``body`` to an embeddings model through ``invoke_model`` and returns
    the ``embedding`` field of the JSON response.

    Args:
        body: The text to embed. It is sent as the ``inputText`` field of the
            request payload.
        model_id: Identifier of the model to invoke. Defaults to the Titan
            text embeddings model used throughout this notebook. The request
            and response shapes (``inputText`` in, ``embedding`` out) are the
            ones this notebook uses with that model; other models may expect
            a different payload.
        client: Object exposing ``invoke_model(**kwargs)``. When omitted, the
            notebook-level ``bedrock_runtime`` client is used.

    Returns:
        The value of the ``embedding`` key in the parsed response.

    Raises:
        KeyError: If the response has no ``body`` or the parsed body has no
            ``embedding`` key.
    """
    # Resolve the client at call time so the notebook-level client can be
    # re-created (e.g. for another region) without redefining this function.
    if client is None:
        client = bedrock_runtime

    response = client.invoke_model(
        modelId=model_id,
        contentType="application/json",
        accept="*/*",
        body=json.dumps({"inputText": body}),
    )

    # The response body is a stream; read it fully, then decode the JSON.
    response_body = json.loads(response["body"].read())

    return response_body["embedding"]

In [12]:
generate_embeddings('this is a test')

[0.10595703,
 -0.3515625,
 -0.076171875,
 -0.016845703,
 -0.25390625,
 0.2890625,
 0.37109375,
 -0.00064086914,
 -0.31054688,
 -0.003540039,
 0.64453125,
 -0.00091552734,
 0.40234375,
 0.40234375,
 -0.57421875,
 -0.3359375,
 0.18261719,
 -0.125,
 -0.31445312,
 0.359375,
 -0.84765625,
 0.08105469,
 0.11035156,
 0.78515625,
 0.35351562,
 -0.28125,
 0.55078125,
 -0.23828125,
 -0.44335938,
 -0.08886719,
 0.37304688,
 0.88671875,
 0.859375,
 -1.046875,
 0.44140625,
 -0.19628906,
 0.33789062,
 0.1875,
 0.08642578,
 0.21386719,
 -0.13085938,
 -0.18164062,
 0.58984375,
 0.640625,
 -0.03466797,
 0.034179688,
 0.14550781,
 0.13476562,
 0.59375,
 0.90625,
 -0.05859375,
 0.5546875,
 0.31835938,
 -0.1875,
 0.24902344,
 0.53515625,
 -0.7109375,
 0.32421875,
 0.50390625,
 -0.41796875,
 -0.16210938,
 -0.4375,
 -0.14160156,
 -0.53125,
 0.296875,
 -0.08642578,
 -0.46289062,
 0.09082031,
 -0.890625,
 0.32421875,
 0.20410156,
 -0.51171875,
 -0.37304688,
 0.23339844,
 -0.13671875,
 0.38085938,
 -0.01342773

### Let's create a list of related sentences and turn them into Embeddings

In [13]:
# Four question/answer pairs on unrelated topics; each question is immediately
# followed by its answer. Stored as a single-column DataFrame named 'text'.
sentences = pd.DataFrame(
    [
        "Where is the world cup?",
        "The world cup is in Qatar",
        "What color is the sky?",
        "The sky is blue",
        "Where does the bear live?",
        "The bear lives in the woods",
        "What is an apple?",
        "An apple is a fruit",
    ],
    columns=["text"],
)

In [14]:
sentences

Unnamed: 0,text
0,Where is the world cup?
1,The world cup is in Qatar
2,What color is the sky?
3,The sky is blue
4,Where does the bear live?
5,The bear lives in the woods
6,What is an apple?
7,An apple is a fruit


In [15]:
# Embed every sentence in row order, so emb[k] is the vector for the k-th
# entry of sentences['text']. Each element triggers one generate_embeddings call.
emb = [generate_embeddings(text) for text in sentences['text']]

In [16]:
len(emb)

8

In [17]:
emb

[[-0.52734375,
  -0.055419922,
  -0.29492188,
  0.27734375,
  0.18359375,
  0.115722656,
  0.37890625,
  -0.0006599426,
  1.421875,
  0.375,
  0.84375,
  -0.51171875,
  -0.09667969,
  -0.2265625,
  0.33203125,
  0.3203125,
  0.09765625,
  0.390625,
  0.34375,
  -0.09667969,
  0.45117188,
  0.40234375,
  -0.39257812,
  -0.8046875,
  0.27148438,
  0.55078125,
  0.32421875,
  -0.9453125,
  -0.41015625,
  0.4921875,
  -0.8828125,
  -0.076171875,
  -0.28125,
  -0.40820312,
  -0.1015625,
  -0.59375,
  0.69140625,
  0.15332031,
  -0.75390625,
  -0.35546875,
  -0.13574219,
  0.26367188,
  0.061279297,
  0.734375,
  -0.19824219,
  -0.16699219,
  0.091308594,
  0.15429688,
  0.15625,
  -0.0075683594,
  -0.5390625,
  -0.45703125,
  0.41015625,
  0.91015625,
  -0.50390625,
  -0.6015625,
  0.75,
  -0.11621094,
  -0.057128906,
  0.265625,
  0.19726562,
  -0.15820312,
  -0.328125,
  0.7890625,
  0.45898438,
  -0.2265625,
  0.17578125,
  -0.296875,
  0.5546875,
  -0.5859375,
  0.29492188,
  0.30078125

In [18]:
# Explore the first 3 entries of each embedding vector

for e in emb:
    print(e[:3])

[-0.52734375, -0.055419922, -0.29492188]
[-1.015625, 0.44726562, -0.26953125]
[0.30078125, 0.3359375, 0.953125]
[1.4921875, 0.28515625, 0.421875]
[-0.10888672, -0.31835938, -0.15332031]
[0.52734375, -0.45703125, 0.0019454956]
[-0.7890625, -0.45898438, -0.18066406]
[0.07421875, -0.24609375, -0.2265625]


## Visualize the embeddings

In [20]:
# `umap_plot` is imported from the local `utils` module (not shown here); it is
# given the sentences DataFrame and the list of embedding vectors.
# NOTE(review): presumably it projects the embeddings to 2-D with UMAP and
# returns an Altair chart — confirm against utils.py.
chart = umap_plot(sentences, emb)
# Last expression in the cell, so its value is what the notebook renders.
chart.interactive()