# Create Feature Store & Groups

In [12]:
# Install Libraries
!pip install boto3 sagemaker pandas --quiet

In [13]:
# Import Libraries
from datetime import datetime, timezone
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session
import time

# Suppress specific DeprecationWarning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Feature Store Environment

In [14]:
# Feature Store Setup: Environment Initialization & Data Preparation
# Initialize SageMaker session and role
region = sagemaker.Session().boto_region_name
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=boto_session, sagemaker_client=sagemaker_client)
role = sagemaker.get_execution_role()

# Constants
bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/feature-store/cardio"
s3_uri = f"s3://{bucket}/{prefix}/"
feature_group_base = "cardio"

# Load engineered dataset
df = pd.read_csv("cardio_engineered.csv")

# Ensure 'id' exists and is unique
if 'id' not in df.columns:
    df["id"] = range(1, len(df) + 1)
assert df['id'].is_unique, "The 'id' column must contain unique values."

# Add event_time column using timezone-aware timestamp
df["event_time"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# Drop duplicate records
df = df.drop_duplicates(subset=["id"])

### Feature Group Creation & Ingestion

In [15]:
# Display all column names
print(df.columns.tolist())

['age', 'gender', 'height_ft', 'weight_lbs', 'systolic_bp', 'diastolic_bp', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio', 'bmi', 'age_group', 'cholesterol_label', 'pulse_pressure', 'chol_bmi_ratio', 'height_in', 'age_years', 'is_hypertensive', 'bp_category', 'bmi_category', 'age_gluc_interaction', 'lifestyle_score', 'id', 'event_time']


#### Run this code to delete old features groups (Only if needed)

In [17]:
# Delete Feature Group (Run if you want to restart)
# Setup
sagemaker_session = Session()
region = sagemaker_session.boto_region_name
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime", region_name=region)
sagemaker_client = boto3.client("sagemaker", region_name=region)

# List of your feature group names
feature_groups_to_delete = [
    "cardio-demographics",
    "cardio-vitals",
    "cardio-labs-lifestyle",
    "cardio-target"
]

# Delete loop
for fg_name in feature_groups_to_delete:
    try:
        print(f"Deleting feature group: {fg_name}")
        sagemaker_client.delete_feature_group(FeatureGroupName=fg_name)
    except Exception as e:
        print(f"Failed to delete {fg_name}: {e}")

Deleting feature group: cardio-demographics
Deleting feature group: cardio-vitals
Deleting feature group: cardio-labs-lifestyle
Deleting feature group: cardio-target


In [18]:
# Define logical groupings of features to match engineered dataset
feature_groups_definitions = {
    f"{feature_group_base}-demographics": [
        "id", "age", "gender", "age_group", "event_time"
    ],
    f"{feature_group_base}-vitals": [
        "id", "height_ft", "height_in", "weight_lbs", "bmi", "systolic_bp",
        "diastolic_bp", "pulse_pressure", "is_hypertensive", "bp_category", "bmi_category", "event_time"
    ],
    f"{feature_group_base}-labs-lifestyle": [
        "id", "cholesterol", "gluc", "smoke", "alco", "active",
        "chol_bmi_ratio", "age_gluc_interaction", "lifestyle_score", "cholesterol_label", "event_time"
    ],
    f"{feature_group_base}-target": [
        "id", "cardio", "event_time"
    ]
}

# Create and ingest each feature group
for group_name, features in feature_groups_definitions.items():
    print(f"Creating Feature Group: {group_name}")
    fg = FeatureGroup(name=group_name, sagemaker_session=sagemaker_session)

    try:
        fg.describe()
        print(f"Feature group '{group_name}' already exists. Skipping creation.")
        continue
    except Exception:
        pass  # Proceed if the feature group does not exist

    fg.load_feature_definitions(data_frame=df[features])

    fg.create(
        s3_uri=s3_uri,
        record_identifier_name="id",
        event_time_feature_name="event_time",
        role_arn=role,
        enable_online_store=True
    )

    # Wait until the feature group is active
    status = None
    while status != "Created":
        status = fg.describe().get("FeatureGroupStatus")
        print(f"⏳ Waiting for {group_name} → Current status: {status}")
        time.sleep(15)

    print(f"Feature group '{group_name}' is now active.")

    # Ingest the data
    fg.ingest(data_frame=df[features], max_workers=3, wait=True)
    print(f"Ingestion completed for: {group_name}")

Creating Feature Group: cardio-demographics
⏳ Waiting for cardio-demographics → Current status: Creating
⏳ Waiting for cardio-demographics → Current status: Creating
⏳ Waiting for cardio-demographics → Current status: Created
Feature group 'cardio-demographics' is now active.
Ingestion completed for: cardio-demographics
Creating Feature Group: cardio-vitals
⏳ Waiting for cardio-vitals → Current status: Creating
⏳ Waiting for cardio-vitals → Current status: Creating
⏳ Waiting for cardio-vitals → Current status: Created
Feature group 'cardio-vitals' is now active.
Ingestion completed for: cardio-vitals
Creating Feature Group: cardio-labs-lifestyle
⏳ Waiting for cardio-labs-lifestyle → Current status: Creating
⏳ Waiting for cardio-labs-lifestyle → Current status: Creating
⏳ Waiting for cardio-labs-lifestyle → Current status: Created
Feature group 'cardio-labs-lifestyle' is now active.
Ingestion completed for: cardio-labs-lifestyle
Creating Feature Group: cardio-target
⏳ Waiting for cardio

These feature groups were created to organize the dataset into meaningful clinical categories that support interpretability and modularity. The demographics group contains age, gender, and age group, which are fundamental population risk indicators. The vitals group includes physical health metrics like height, weight, BMI, blood pressure, pulse pressure, and a hypertension flag, which are key signals for cardiovascular stress. The labs and lifestyle group captures metabolic and behavioral factors such as cholesterol, glucose, smoking, alcohol use, activity level, and engineered ratios like cholesterol-to-BMI and a lifestyle risk score. The target group holds the outcome label for cardiovascular disease. Structuring the data this way enables easier experimentation, model explainability, and scalable management of features across development and production.

#### Preview Query via Athena

In [None]:
# Query any feature group via Athena
# Initialize
query_fg = FeatureGroup(name="cardio-target", sagemaker_session=sagemaker_session)

# Start Athena query
query = query_fg.athena_query()
table_name = query.table_name  # Auto-resolves the Glue table

# Define the query string
query_string = f'SELECT * FROM "{table_name}" LIMIT 10;'

# Define the S3 output path for Athena results
output_location = f's3://{bucket}/athena-results/'

# Run the query
query.run(query_string=query_string, output_location=output_location)
query.wait()

# Convert results to DataFrame
df_results = query.as_dataframe()
df_results.head()

In [None]:
# Save feature group to S3
!aws s3 cp cardio_engineered_feature_store_setup.ipynb s3://sagemaker-us-east-1-226675648827/sagemaker/feature-store/cardio_engineered_feature_store_setup.ipynb