# SageMaker Demo
This notebook is intended to be used with a [SageMaker notebook instance](https://docs.aws.amazon.com/sagemaker/latest/dg/nbi.html) launched using the following [CloudFormation](https://docs.aws.amazon.com/cloudformation/) template:

- [sagemaker-notebook-cloudformation.yml](https://github.com/managedkaos/jupyter-environment-details/blob/main/sagemaker-notebook-cloudformation.yml)

Together the CloudFormation template and this notebook demonstrate:

- Attaching an IAM role to a SageMaker instance with policies that allow the instance to use other AWS services
- Using the [Boto3 Python library](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) to create clients for accessing AWS services
- Using boto3 clients to read from [Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html) and write to an [S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html)



## Grab some metadata from the local system's `/opt/ml/metadata/resource-metadata.json`

In [None]:
import json

# Path to the JSON metadata file on the local system
file_path = "/opt/ml/metadata/resource-metadata.json"

# Parse the whole file into a Python object; the `with` block closes the handle
with open(file_path, "r") as file:
    metadata = json.load(file)

# The instance name is stored under the "ResourceName" key. Later cells use it
# as the prefix of the Parameter Store names they read.
notebook_instance_name = metadata["ResourceName"]

print("\tNotebook instance name:", notebook_instance_name)

## Install Boto3 and the other required libraries, then initialize clients for S3 and SSM

In [None]:
%pip install --quiet boto3 pandas plotly scikit-learn

In [None]:
import boto3

# Module-level clients shared by the helper functions and later cells.
# No region or credentials are passed here, so boto3 resolves them itself
# (presumably from the instance's attached IAM role — confirm).
s3_client = boto3.client("s3")
ssm_client = boto3.client("ssm")

## Create helper functions
- read_from_parameter_store(name)
- write_to_s3(bucket, key, body)

In [None]:
# Fetch a single value from Parameter Store
def read_from_parameter_store(name):
    """Return the value of the Parameter Store entry called `name`.

    The lookup goes through the module-level `ssm_client` and asks for
    decryption (`WithDecryption=True`).

    Args:
        name: Name of the parameter to fetch.

    Returns:
        The "Value" field of the "Parameter" entry in the response.
    """
    parameter = ssm_client.get_parameter(Name=name, WithDecryption=True)["Parameter"]
    return parameter["Value"]


# Upload one object to an S3 bucket
def write_to_s3(bucket, key, body):
    """Store `body` under `key` in `bucket` and print a confirmation line.

    The upload goes through the module-level `s3_client`.

    Args:
        bucket: Name of the destination bucket.
        key: Object key to write.
        body: Content passed as the object's `Body`.
    """
    put_arguments = {"Bucket": bucket, "Key": key, "Body": body}
    s3_client.put_object(**put_arguments)
    print(f"\tSuccessfully wrote data to s3://{bucket}/{key}")

## Read from SSM Parameter Store

In [None]:
# Get the S3 bucket name and region from Parameter Store.
# Both parameter names are prefixed with the notebook instance name, so they
# must exist under /<instance name>/s3bucket and /<instance name>/region.
bucket_name = read_from_parameter_store(f"/{notebook_instance_name}/s3bucket")
region_name = read_from_parameter_store(f"/{notebook_instance_name}/region")

# Echo both values; later cells use them for the upload and the website URL
print(f"\tS3 Bucket Name from Parameter Store: {bucket_name}")
print(f"\tRegion Name from Parameter Store: {region_name}")

## Do something really cool in the following cell...

In [None]:
# Placeholder cell: replace this with your own code
print("\tHello, World!")

## Generate data: calculating Pi

In [None]:
from decimal import Decimal, getcontext
import pandas as pd


def pi_archimedes(n):
    """Approximate pi with `n` rounds of Archimedes' polygon-doubling method.

    Starts from a squared edge length of 2 and a side counter of 2. Each round
    recomputes the squared edge length and doubles the counter. All arithmetic
    uses `Decimal`, so accuracy is bounded by the active decimal context
    precision, which the caller is expected to set.

    Args:
        n: Number of doubling rounds; `range(n)` is empty for n <= 0, so the
            starting estimate is returned unchanged in that case.

    Returns:
        Decimal: side counter multiplied by the square root of the final
        squared edge length.
    """
    edge_sq = Decimal(2)
    side_count = 2
    for _ in range(n):
        # Squared edge length after doubling the number of sides
        shrink = (1 - edge_sq / 4).sqrt()
        edge_sq = 2 - 2 * shrink
        side_count *= 2
    return side_count * edge_sq.sqrt()


# Collect successive approximations of pi until two consecutive rounded
# results are identical, or 10 * places iterations have been tried.
data = []
places = 100  # Number of significant digits kept in each stored result
old_result = None

for n in range(10 * places):
    getcontext().prec = 2 * places  # Compute with twice the target precision
    result = pi_archimedes(n)
    getcontext().prec = places  # Switch to the target precision for rounding
    result = +result  # Unary plus rounds the value to the current context precision
    # The value is appended before the convergence check, so the converged
    # value appears twice at the end of `data`.
    data.append(result)
    if result == old_result:  # Same rounded value as last iteration: converged
        break
    old_result = result

# Note: the decimal context precision is left at `places` after the loop.
# One row per iteration; "Iteration" is the n passed to pi_archimedes.
df = pd.DataFrame({"Iteration": list(range(len(data))), "Pi": data})

In [None]:
# Display the Pi approximations, one row per iteration
df

In [None]:
import os

# Local output directory; later cells also write the pickle and graphs here
data_directory = "./data"

# Create the directory if needed; exist_ok makes re-running the cell safe
os.makedirs(data_directory, exist_ok=True)
chunk_size = 25  # Number of rows per chunk

# Iterate through the DataFrame in chunks and write each to a separate HTML file.
# `i` is the starting row offset of the chunk; files are numbered from 1.
for i in range(0, len(df), chunk_size):
    chunk_df = df.iloc[i : i + chunk_size]
    filename = f"{data_directory}/data-{i//chunk_size + 1}.html"
    # index=False omits the DataFrame index column from the HTML table
    chunk_df.to_html(filename, index=False)
    print(f"\tWrote data {i} to {filename}")

## Pickle the Pi dataframe

In [None]:
import pickle

# Destination of the serialized DataFrame; a later cell reloads it from here
pickle_file = f"{data_directory}/pi-data.pkl"

# Open a file in binary write mode ('wb'); pickle output is bytes, not text
with open(pickle_file, "wb") as f:
    # Use pickle.dump() to serialize and save the object to the file
    pickle.dump(df, f)

print(f"\tPi data serialized to {pickle_file}")

## Generate graphs

In [None]:
import numpy as np
from sklearn.datasets import make_blobs
import plotly.express as px

# Generate synthetic data with 3D features: 200 samples around 7 centers.
# random_state is fixed so the same points are produced on every run.
# x holds the feature rows and y the per-sample cluster label.
x, y = make_blobs(n_samples=200, centers=7, n_features=3, random_state=42)

# Convert the data to a format suitable for Plotly Express: one column per
# feature plus the label. column_stack produces a single numeric array, so the
# "Cluster" column is stored as floats rather than integers.
# NOTE(review): Plotly may therefore colour clusters on a continuous scale —
# confirm whether discrete colours were intended.
df3 = pd.DataFrame(np.column_stack((x, y)), columns=["X", "Y", "Z", "Cluster"])

In [None]:
# Visualize the clusters in 2D (only the X and Y features are plotted)
fig3 = px.scatter(df3, x="X", y="Y", color="Cluster", title="2D Cluster Visualization")

# Write the figure to disk as a standalone HTML file in the data directory,
# then render it inline in the notebook
fig3.write_html(f"{data_directory}/graph-cluster-2d-visualization.html")
fig3.show()

In [None]:
# Visualize the clusters in 3D using all three features.
# This rebinds `fig3`, replacing the 2D figure from the previous cell.
fig3 = px.scatter_3d(
    df3, x="X", y="Y", z="Z", color="Cluster", title="3D Cluster Visualization"
)

# Write the figure to disk as a standalone HTML file in the data directory,
# then render it inline in the notebook
fig3.write_html(f"{data_directory}/graph-cluster-3d-visualization.html")
fig3.show()

## Create `index.html` and upload data to S3

In [None]:
import fnmatch
import subprocess

# Base URL of the bucket's static-website endpoint, built from the values read
# from Parameter Store.
# NOTE(review): some regions use the "s3-website.<region>" (dot) form instead
# of "s3-website-<region>" — confirm for the deployment region.
website = f"http://{bucket_name}.s3-website-{region_name}.amazonaws.com"

# Recursively collect every "*.html" file at or below the current directory,
# as "./"-prefixed paths, sorted alphabetically.
file_list = sorted(
    os.path.join(root, filename)
    for root, _dirnames, filenames in os.walk(".")
    for filename in fnmatch.filter(filenames, "*.html")
)

def _format_size(num_bytes):
    """Return a byte count as a short human-readable string (B, KB, MB or GB)."""
    units = ["B", "KB", "MB", "GB"]
    size = float(num_bytes)
    unit_index = 0
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1
    if unit_index == 0:
        return f"{int(size)} B"
    return f"{size:.1f} {units[unit_index]}"


# Path of the generated index page; it must not list itself
index_path = os.path.join(".", "index.html")

# Create the HTML file and write the header.
# The `with` block closes the file, so no explicit close() is needed.
with open(index_path, "w") as f:
    f.write(
        """<html>
        <head>
            <title>HTML Output</title>
            <style>
                table {
                    border-collapse: collapse;
                    width: 100%;
                }
                th, td {
                    text-align: left;
                    padding: 8px;
                }
                th {
                    background-color: #007bff;
                    color: #fff;
                    font-weight: bold;
                }
                tr:nth-child(even) {
                    background-color: #f2f2f2;
                }
                tr:hover {
                    background-color: #ddd;
                }
            </style>
        </head>
        <body>
            <table>
                <tr><th>Name</th><th>Size</th></tr>\n
    """
    )

    # Loop through each file and add a row to the table
    for file_name in file_list:
        # Skip an index.html left over from an earlier run; comparing
        # normalized paths avoids depending on the exact "./" spelling.
        if os.path.normpath(file_name) == os.path.normpath(index_path):
            continue

        file_size = os.path.getsize(file_name)
        # Sizes are scaled to a fitting unit; truncating to whole megabytes
        # would show "0 MB" for every file smaller than 1 MiB.
        f.write(
            f'<tr><td><a href="{website}/{file_name}" target="_blank" rel="noopener noreferrer">{file_name}</a></td><td>{_format_size(file_size)}</td></tr>\n'
        )

    # Write the footer
    f.write("</table></body></html>")

# Arguments for `aws s3 sync` from the current directory to the bucket.
# The filters are applied in order: first exclude everything, then re-include
# only the HTML pages and the pickle file.
command = [
    "aws",
    "s3",
    "sync",
    ".",
    f"s3://{bucket_name}",
    "--exclude",
    "*",
    "--include",
    "*.html",
    "--include",
    "*.pkl",
    "--no-progress",
]

# Run the command and wait for it to complete. Output is captured as text so
# it can be shown in the notebook.
output = subprocess.run(command, capture_output=True, text=True)

# Print the output
print(output.stdout)

# Surface failures instead of ignoring them: show the captured stderr, then
# raise CalledProcessError so the cell stops on a failed upload.
if output.returncode != 0:
    print(output.stderr)
    output.check_returncode()

print("\tfin")

## Read the bucket contents

In [None]:
# A single list_objects_v2 call returns at most 1000 keys, and the response
# has no "Contents" entry when the bucket is empty. Walk every page and
# default to an empty list so both cases are handled.
paginator = s3_client.get_paginator("list_objects_v2")

print(f"\tContents of bucket {bucket_name}:")
for objects in paginator.paginate(Bucket=bucket_name):
    for obj in objects.get("Contents", []):
        print(f"\t\t{obj['Key']}")

## Reload the pickled dataframe

In [None]:
# Reload the pickled Pi DataFrame from the local file written earlier.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you
# trust (this one was written by this notebook).
with open(pickle_file, "rb") as f:
    df = pickle.load(f)

# Display the restored DataFrame
df

In [None]:
from IPython.display import display, Markdown

# Build a small Markdown snippet that shows the bucket's website URL
markdown_text = f"""
## Access the data in the S3 bucket website
Use the following link to view the data in the S3 bucket's website:

## {website}
"""

# Render the snippet as formatted Markdown in the cell output
display(Markdown(markdown_text))