In [1]:
!pip install -qU \
    sagemaker==2.173.0 \
    pinecone-client==3.1.0 \
    ipywidgets==7.0.0

[33m  DEPRECATION: Building 'sagemaker' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'sagemaker'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-ai 2.31.6 requires faiss-cpu!=1.8.0.post0,<2.0.0,>=1.8.0, which is not installed.
dask 2025.7.0 requires cloudpickle>=3.0.0, but you have cloudpickle 2.2.1 which is incompatible.
distributed 2025.7.0 requires cloudpickle>=3.0.0, but you have cloudpickle 2.2.1 which is incompatible.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1,

In [2]:
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

role = sagemaker.get_execution_role()
# Define Hugging Face Model configuration
hub_config = {
    'HF_MODEL_ID':'google/flan-t5-small',
    'HF_TASK':'text2text-generation'
}

# Define the model
huggingface_model = HuggingFaceModel(
    transformers_version="4.28",
    pytorch_version="2.0",
    py_version="py310",
    env=hub_config,
    role=role
)

In [3]:
llm = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name="flan-t5-small-v4"
)

---------!

In [4]:
question = "Which instances can I use with Managed Spot Training in SageMaker?"

out = llm.predict({"inputs": question})
out

[{'generated_text': 'SageMaker'}]

In [5]:
context = """Managed Spot Training can be used with all instances
supported in Amazon SageMaker. Managed Spot Training is supported
in all AWS Regions where Amazon SageMaker is currently available."""

In [6]:
prompt_template = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

text_input = prompt_template.replace("{context}", context).replace("{question}", question)

out = llm.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {question}\n[Output]: {generated_text}")

[Input]: Which instances can I use with Managed Spot Training in SageMaker?
[Output]: I don't know


In [7]:
unanswerable_question = "What color is my desk?"

text_input = prompt_template.replace("{context}", context).replace("{question}", unanswerable_question)

out = llm.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {unanswerable_question}\n[Output]: {generated_text}")

[Input]: What color is my desk?
[Output]: I don't know


In [8]:
hub_config = {
    'HF_MODEL_ID': 'sentence-transformers/all-MiniLM-L6-v2', # model_id from hf.co/models
    'HF_TASK': 'feature-extraction'
}

huggingface_model = HuggingFaceModel(
    env=hub_config,
    role=role,
    transformers_version="4.28",  # or latest supported by SageMaker
    pytorch_version="2.0",
    py_version="py310"
)

In [9]:
encoder = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.large",
    endpoint_name="minilm-v1a"
)

--------------!

In [10]:
out = encoder.predict({"inputs": ["some text here", "some more text goes here too"]})

In [15]:
docs = ["hello world", "some more text goes here too"]

out = encoder.predict({"inputs": docs})

# 3. Inspect the output (optional debugging)
print(len(out))        # number of sentences
print(len(out[0]))     # number of tokens in sentence 1
print(len(out[0][0]))  # should be 384 (embedding dim)

# 4. Average token embeddings -> sentence embeddings
import numpy as np

sentence_embeddings = [np.mean(np.array(sent), axis=0) for sent in out]
embeddings = np.vstack(sentence_embeddings)
print(embeddings.shape)   # (num_sentences, 384)

# 5. Wrap into a helper function
from typing import List

def embed_docs(docs: List[str]) -> List[List[float]]:
    out = encoder.predict({"inputs": docs})
    sentence_embeddings = [np.mean(np.array(sent), axis=0) for sent in out]
    return np.vstack(sentence_embeddings).tolist()

2
1
4
(12, 384)


In [16]:
s3_path = "s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv"


In [17]:
!aws s3 cp $s3_path Amazon_SageMaker_FAQs.csv

download: s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv to ./Amazon_SageMaker_FAQs.csv


In [18]:
import pandas as pd

df_knowledge = pd.read_csv("Amazon_SageMaker_FAQs.csv", header=None, names=["Question", "Answer"])
df_knowledge.head()

Unnamed: 0,Question,Answer
0,What is Amazon SageMaker?,Amazon SageMaker is a fully managed service to...
1,In which Regions is Amazon SageMaker available...,For a list of the supported Amazon SageMaker A...
2,What is the service availability of Amazon Sag...,Amazon SageMaker is designed for high availabi...
3,How does Amazon SageMaker secure my code?,Amazon SageMaker stores code in ML storage vol...
4,What security measures does Amazon SageMaker h...,Amazon SageMaker ensures that ML model artifac...


In [19]:
df_knowledge.drop(["Question"], axis=1, inplace=True)
df_knowledge.head()

Unnamed: 0,Answer
0,Amazon SageMaker is a fully managed service to...
1,For a list of the supported Amazon SageMaker A...
2,Amazon SageMaker is designed for high availabi...
3,Amazon SageMaker stores code in ML storage vol...
4,Amazon SageMaker ensures that ML model artifac...


In [None]:
import os
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'

# configure client
pc = Pinecone(api_key=api_key)

In [23]:
import os
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Retrieve your Pinecone API key
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise ValueError("PINECONE_API_KEY not found. Check your .env file!")

print("Pinecone API key loaded successfully")

Pinecone API key loaded successfully


In [24]:
from pinecone import Pinecone

# Connect to Pinecone
pc = Pinecone(api_key=api_key)

# Test connection by listing indexes
print(pc.list_indexes())

{'indexes': [{'dimension': 1024,
              'host': 'ragindex-rap39mq.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'ragindex',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 384,
              'host': 'rag-first-rap39mq.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'rag-first',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


  from tqdm.autonotebook import tqdm


In [25]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [26]:
index_name = 'retrieval-augmentation-aws'

In [27]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=embeddings.shape[1],
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [28]:
from tqdm.auto import tqdm

batch_size = 2  # can increase but needs larger instance size otherwise instance runs out of memory
vector_limit = 1000

answers = df_knowledge[:vector_limit]
index = pc.Index(index_name)

for i in tqdm(range(0, len(answers), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(answers))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in answers["Answer"][i:i_end]]
    # create embeddings
    texts = answers["Answer"][i:i_end].tolist()
    embeddings = embed_docs(texts)
    # create records list for upsert
    records = zip(ids, embeddings, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

100%|██████████| 77/77 [00:23<00:00,  3.28it/s]


In [29]:
# check number of records in the index
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 154}},
 'total_vector_count': 154}

In [30]:
question

'Which instances can I use with Managed Spot Training in SageMaker?'

In [31]:
# extract embeddings for the questions
query_vec = embed_docs(question)[0]

# query pinecone
res = index.query(vector=query_vec, top_k=5, include_metadata=True)

# show the results
res

{'matches': [{'id': '89',
              'metadata': {'text': 'Once a Managed Spot Training job is '
                                   'completed, you can see the savings in the '
                                   'AWS Management Console and also calculate '
                                   'the cost savings as the percentage '
                                   'difference between the duration for which '
                                   'the training job ran and the duration for '
                                   'which you were billed. Regardless of how '
                                   'many times your Managed Spot Training jobs '
                                   'are interrupted, you are charged only once '
                                   'for the duration for which the data was '
                                   'downloaded.'},
              'score': 0.601714075,
              'values': []},
             {'id': '17',
              'metadata': {'text': 'Amazon Sag

In [32]:
contexts = [match.metadata['text'] for match in res.matches]

In [33]:
max_section_len = 1000
separator = "\n"

def construct_context(contexts: List[str]) -> str:
    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # Add contexts until we run out of space.
        chosen_sections_len += len(text) + 2
        if chosen_sections_len > max_section_len:
            break
        chosen_sections.append(text)
    concatenated_doc = separator.join(chosen_sections)
    print(
        f"With maximum sequence length {max_section_len}, selected top {len(chosen_sections)} document sections: \n{concatenated_doc}"
    )
    return concatenated_doc

In [34]:
context_str = construct_context(contexts=contexts)

With maximum sequence length 1000, selected top 3 document sections: 
Once a Managed Spot Training job is completed, you can see the savings in the AWS Management Console and also calculate the cost savings as the percentage difference between the duration for which the training job ran and the duration for which you were billed. Regardless of how many times your Managed Spot Training jobs are interrupted, you are charged only once for the duration for which the data was downloaded.
Amazon SageMaker provides purpose-built ML governance tools across the ML lifecycle. With SageMaker Role Manager, administrators can define minimum permissions in minutes. SageMaker Model Cards makes it easier to capture, retrieve, and share essential model information from conception to deployment, and SageMaker Model Dashboard keeps you informed on production model behavior, all in one place. View more details.
Managed Spot Training can be used with all instances supported in Amazon SageMaker.


In [35]:
text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)

out = llm.predict({"inputs": text_input})
generated_text = out[0]["generated_text"]
print(f"[Input]: {question}\n[Output]: {generated_text}")

[Input]: Which instances can I use with Managed Spot Training in SageMaker?
[Output]: All instances supported in Amazon SageMaker.


In [36]:
def rag_query(question: str) -> str:
    # create query vec
    query_vec = embed_docs(question)[0]
    # query pinecone
    res = index.query(vector=query_vec, top_k=5, include_metadata=True)
    # get contexts
    contexts = [match.metadata['text'] for match in res.matches]
    # build the multiple contexts string
    context_str = construct_context(contexts=contexts)
    # create our retrieval augmented prompt
    text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)
    # make prediction
    out = llm.predict({"inputs": text_input})
    return out[0]["generated_text"]

In [37]:
rag_query("Which instances can I use with Managed Spot Training in SageMaker?")

With maximum sequence length 1000, selected top 3 document sections: 
Once a Managed Spot Training job is completed, you can see the savings in the AWS Management Console and also calculate the cost savings as the percentage difference between the duration for which the training job ran and the duration for which you were billed. Regardless of how many times your Managed Spot Training jobs are interrupted, you are charged only once for the duration for which the data was downloaded.
Amazon SageMaker provides purpose-built ML governance tools across the ML lifecycle. With SageMaker Role Manager, administrators can define minimum permissions in minutes. SageMaker Model Cards makes it easier to capture, retrieve, and share essential model information from conception to deployment, and SageMaker Model Dashboard keeps you informed on production model behavior, all in one place. View more details.
Managed Spot Training can be used with all instances supported in Amazon SageMaker.


'All instances supported in Amazon SageMaker.'

In [38]:
rag_query("How do I create a Hugging Face instance on Sagemaker?")

With maximum sequence length 1000, selected top 1 document sections: 
Amazon SageMaker provides purpose-built ML governance tools across the ML lifecycle. With SageMaker Role Manager, administrators can define minimum permissions in minutes. SageMaker Model Cards makes it easier to capture, retrieve, and share essential model information from conception to deployment, and SageMaker Model Dashboard keeps you informed on production model behavior, all in one place. View more details.


"I don't know"