In [3]:
# cardio_feature_store_setup.ipynb
# Updated script for creating and ingesting to a new SageMaker Feature Store Feature Group

from datetime import datetime
import pandas as pd
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import time

# Initialize SageMaker session and role
region = sagemaker.Session().boto_region_name
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=boto_session, sagemaker_client=sagemaker_client)
role = sagemaker.get_execution_role()

# Constants
bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/feature-store/cardio"
s3_uri = f"s3://{bucket}/{prefix}/"
feature_group_name = "cardio-feature-group-v2"

# Load your dataset
df = pd.read_csv("cardio_train.csv", sep=";")

# Add and format event_time column (ISO 8601 string format)
event_time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
df["event_time"] = event_time

# Drop duplicates by id if necessary
df = df.drop_duplicates(subset=["id"])

# Define and create feature group
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)
feature_group.load_feature_definitions(data_frame=df)

feature_group.create(
    s3_uri=s3_uri,
    record_identifier_name="id",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True
)

# Wait for the feature group to become active
status = None
while status != "Created":
    status = feature_group.describe().get("FeatureGroupStatus")
    print(f"Current status: {status}... waiting.")
    time.sleep(15)

print("✅ Feature group is now active.")

# Ingest the data once the group is ready
feature_group.ingest(data_frame=df, max_workers=3, wait=True)
print("✅ Ingestion completed.")

  event_time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")


Current status: Creating... waiting.
Current status: Creating... waiting.
Current status: Created... waiting.
✅ Feature group is now active.
✅ Ingestion completed.


In [5]:
from sagemaker.feature_store.feature_group import IngestionManagerPandas

query = feature_group.athena_query()
query_string = f'SELECT * FROM "{query.table_name}" LIMIT 10;'
query.run(query_string=query_string, output_location=s3_uri)
query.wait()
df_results = query.as_dataframe()
df_results.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,event_time,write_time,api_invocation_time,is_deleted
0,23,14532,2,181,95.0,130,90,1,1,1,1,1,0,2025-05-23T19:04:54Z,2025-05-23 19:11:25.292,2025-05-23 19:05:41.000,False
1,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,2025-05-23T19:04:54Z,2025-05-23 19:11:25.220,2025-05-23 19:05:41.000,False
2,33334,19236,1,159,100.0,130,90,1,2,0,0,0,0,2025-05-23T19:04:54Z,2025-05-23 19:11:25.220,2025-05-23 19:05:41.000,False
3,66653,23253,2,165,54.0,80,60,1,1,1,0,1,0,2025-05-23T19:04:54Z,2025-05-23 19:11:25.220,2025-05-23 19:05:41.000,False
4,33419,23468,2,161,59.0,110,70,1,1,0,0,1,1,2025-05-23T19:04:54Z,2025-05-23 19:11:25.220,2025-05-23 19:05:42.000,False
