## Create Feature Store and Feature Groups

In [1]:
# Install Libraries
!pip install boto3 sagemaker pandas --quiet

In [2]:
# Import Libraries
from datetime import datetime, timezone
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import time

# Suppress specific DeprecationWarning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Feature Store Environment

In [3]:
# Feature Store Setup: Environment Initialization
# Naming constants shared by every later cell
feature_group_base = "cardio"
prefix = "sagemaker/feature-store/cardio"

# Build one boto3 session pinned to the SageMaker default region, then derive
# the SageMaker client and SDK session from it so all calls share the region.
region = sagemaker.Session().boto_region_name
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client("sagemaker")
sagemaker_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
)

# IAM role passed to Feature Store when the groups are created
role = sagemaker.get_execution_role()

# S3 location handed to Feature Store as the offline store URI
bucket = sagemaker_session.default_bucket()
s3_uri = "s3://" + bucket + "/" + prefix + "/"

# Load engineered dataset
df = pd.read_csv("cardio_engineered.csv")

if "id" in df.columns:
    # Drop rows that are exact copies of an earlier row BEFORE validating the
    # key. (Deduplicating after the uniqueness check can never remove anything,
    # and a harmless repeated row would abort the notebook.)
    df = df.drop_duplicates()
else:
    # No identifier supplied: generate a 1-based surrogate key per row.
    df["id"] = range(1, len(df) + 1)

# Any duplicate id that survives carries conflicting values for the same
# record; fail loudly rather than silently keeping one of them.
assert df["id"].is_unique, "🚨 The 'id' column must contain unique values."

# Add event_time column using a timezone-aware (UTC) ISO-8601 timestamp.
# `assign` returns a new frame, which avoids a SettingWithCopyWarning on the
# de-duplicated frame.
df = df.assign(event_time=datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))

### Feature Group Creation & Ingestion

In [4]:
# Feature Store: Create and Ingest Feature Groups
# Define logical groupings of features. Every group carries the record
# identifier ("id") and the event-time column ("event_time").
feature_groups_definitions = {
    f"{feature_group_base}-demographics": ["id", "age", "gender", "age_group", "event_time"],
    f"{feature_group_base}-vitals": ["id", "height", "weight", "bmi", "systolic_bp", "diastolic_bp", "pulse_pressure", "is_hypertensive", "event_time"],
    f"{feature_group_base}-labs-lifestyle": ["id", "cholesterol", "gluc", "smoke", "alco", "active", "chol_bmi_ratio", "lifestyle_score", "event_time"],
    f"{feature_group_base}-target": ["id", "cardio", "event_time"]
}


def _wait_until_created(feature_group, poll_seconds=15):
    """Block until ``feature_group`` has finished being created.

    Polls ``feature_group.describe()`` and prints the status on every poll.

    Args:
        feature_group: the FeatureGroup whose creation was just requested.
        poll_seconds: pause between two consecutive status checks.

    Raises:
        RuntimeError: if the status is anything other than "Creating" or
            "Created" (e.g. "CreateFailed"), so a failed creation surfaces
            instead of being polled forever.
    """
    while True:
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
        print(f"⏳ Waiting for {feature_group.name} → Current status: {status}")
        if status == "Created":
            return
        if status != "Creating":
            raise RuntimeError(
                f"Feature group '{feature_group.name}' ended in status '{status}': "
                f"{description.get('FailureReason', 'no failure reason reported')}"
            )
        # Sleep only between polls, never after the group is already usable.
        time.sleep(poll_seconds)


# Error raised by the SageMaker client when a described resource is absent.
resource_not_found = sagemaker_session.sagemaker_client.exceptions.ResourceNotFound

# Create and ingest each feature group
for group_name, features in feature_groups_definitions.items():
    print(f"Creating Feature Group: {group_name}")
    fg = FeatureGroup(name=group_name, sagemaker_session=sagemaker_session)

    try:
        fg.describe()
    except resource_not_found:
        pass  # Proceed: the feature group does not exist yet
    else:
        # Existing groups are left untouched (neither re-created nor re-ingested).
        print(f"Feature group '{group_name}' already exists. Skipping creation.")
        continue
    # Any other error (permissions, throttling, ...) propagates on purpose:
    # treating it as "group missing" would hide the real problem.

    group_frame = df[features]
    fg.load_feature_definitions(data_frame=group_frame)

    fg.create(
        s3_uri=s3_uri,
        record_identifier_name="id",
        event_time_feature_name="event_time",
        role_arn=role,
        enable_online_store=True
    )

    # Wait until the feature group is active
    _wait_until_created(fg)
    print(f"Feature group '{group_name}' is now active.")

    # Ingest the data
    fg.ingest(data_frame=group_frame, max_workers=3, wait=True)
    print(f"Ingestion completed for: {group_name}")

Creating Feature Group: cardio-demographics
Feature group 'cardio-demographics' already exists. Skipping creation.
Creating Feature Group: cardio-vitals
Feature group 'cardio-vitals' already exists. Skipping creation.
Creating Feature Group: cardio-labs-lifestyle
Feature group 'cardio-labs-lifestyle' already exists. Skipping creation.
Creating Feature Group: cardio-target
Feature group 'cardio-target' already exists. Skipping creation.


These feature groups were created to organize the dataset into meaningful clinical categories that support interpretability and modularity. The demographics group contains age, gender, and age group, which are fundamental population risk indicators. The vitals group includes physical health metrics like height, weight, BMI, blood pressure, pulse pressure, and a hypertension flag, which are key signals for cardiovascular stress. The labs and lifestyle group captures metabolic and behavioral factors such as cholesterol, glucose, smoking, alcohol use, activity level, and engineered ratios like cholesterol-to-BMI and a lifestyle risk score. The target group holds the outcome label for cardiovascular disease. Structuring the data this way enables easier experimentation, model explainability, and scalable management of features across development and production.

#### Preview Query via Athena

In [5]:
# Query any feature group via Athena (here: the target/label group).
# The name is derived from `feature_group_base` so it stays in sync with the
# names used when the groups were created.
query_fg = FeatureGroup(name=f"{feature_group_base}-target", sagemaker_session=sagemaker_session)
query = query_fg.athena_query()
query_string = f'SELECT * FROM "{query.table_name}" LIMIT 10;'

# Write Athena result files to their own sub-prefix instead of the root of the
# offline-store prefix, so query artifacts do not mix with feature data.
query_output_uri = f"{s3_uri}athena-query-results/"
query.run(query_string=query_string, output_location=query_output_uri)
query.wait()

# Preview the first rows returned by the query
df_results = query.as_dataframe()
df_results.head()

Unnamed: 0,id,cardio,event_time,write_time,api_invocation_time,is_deleted
0,45616,1,2025-05-24T08:54:17Z,2025-05-24 09:18:03.885,2025-05-24 09:12:16.000,False
1,22816,1,2025-05-24T08:54:17Z,2025-05-24 09:18:03.885,2025-05-24 09:12:16.000,False
2,22827,0,2025-05-24T08:54:17Z,2025-05-24 09:18:03.885,2025-05-24 09:12:16.000,False
3,31,1,2025-05-24T08:54:17Z,2025-05-24 09:17:56.840,2025-05-24 09:12:15.000,False
4,45644,1,2025-05-24T08:54:17Z,2025-05-24 09:18:03.885,2025-05-24 09:12:16.000,False


In [7]:
# Save feature group to S3
!aws s3 cp cardio_engineered_feature_store_setup.ipynb s3://sagemaker-us-east-1-531690656306/sagemaker/feature-store/cardio_engineered_feature_store_setup.ipynb

upload: ./cardio_engineered_feature_store_setup.ipynb to s3://sagemaker-us-east-1-531690656306/sagemaker/feature-store/cardio_engineered_feature_store_setup.ipynb
