In [None]:
# Define and Create the Feature Group

In [44]:
import sagemaker

# Look up the IAM execution role through the SageMaker SDK. `role` is later
# passed as `role_arn` when the Feature Group is created.
role = sagemaker.get_execution_role()
# NOTE(review): this prints the full role ARN, so the saved cell output
# contains the AWS account ID -- consider clearing outputs before sharing.
print(f"IAM Role ARN successfully retrieved: {role}")

IAM Role ARN successfully retrieved: arn:aws:iam::564543410445:role/LabRole


In [45]:
from sagemaker.feature_store.feature_group import FeatureGroup
import time

# Polling configuration for the creation wait loop at the end of this cell.
CREATE_POLL_INTERVAL_SECONDS = 5
# Upper bound on the wait; the progress message below estimates 5-10 minutes,
# so 15 minutes leaves headroom before the cell gives up.
CREATE_TIMEOUT_SECONDS = 15 * 60

# Define a unique name for your Feature Group
feature_group_name = f"{project_prefix}-feature-group"
print(f"Feature Group Name: {feature_group_name}")

# Instantiate the FeatureGroup object
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=sagemaker_session,
)

# Load the feature definitions from your DataFrame
# This automatically infers the name and type for each feature column
feature_group.load_feature_definitions(data_frame=df_final_features)

print("\nCreating Feature Group in the SageMaker backend...")
# This command creates the Feature Group in AWS.
# This process can take several minutes.
# NOTE(review): create() presumably fails if a group with this name already
# exists, which would make this cell non-re-runnable -- confirm and guard if needed.
feature_group.create(
    s3_uri=f"s3://{bucket}/{project_prefix}/feature-store-offline",
    record_identifier_name="record_id",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True,  # Enable for low-latency, real-time lookups
)

# --- Wait for Feature Group to be Created ---
print("\nWaiting for Feature Group to be created (this may take 5-10 minutes)...")
deadline = time.monotonic() + CREATE_TIMEOUT_SECONDS
while True:
    # Fetch the description once per iteration so the status and the failure
    # reason reported below come from the same response.
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    if status == "Created":
        print("✅ Feature Group created successfully.")
        break
    if status == "CreateFailed":
        failure_reason = description.get("FailureReason")
        print("❌ Feature Group creation failed.")
        print(failure_reason)
        # Stop the notebook here: later cells ingest into this group and would
        # otherwise fail with a less helpful error.
        raise RuntimeError(
            f"Feature Group '{feature_group_name}' creation failed: {failure_reason}"
        )
    if time.monotonic() >= deadline:
        # Any status that never reaches Created/CreateFailed would otherwise
        # keep this loop running forever.
        raise TimeoutError(
            f"Feature Group '{feature_group_name}' did not reach 'Created' within "
            f"{CREATE_TIMEOUT_SECONDS} seconds (last status: {status})."
        )
    print(".", end="")
    time.sleep(CREATE_POLL_INTERVAL_SECONDS)

Feature Group Name: crime-prediction-datalake-feature-group

Creating Feature Group in the SageMaker backend...

Waiting for Feature Group to be created (this may take 5-10 minutes)...
....✅ Feature Group created successfully.


In [46]:
print("Starting to ingest data into the Feature Group...")
# Push every row of df_final_features into the Feature Group using four
# parallel workers; wait=True keeps the cell blocked until ingestion returns.
# This can take a few minutes.
feature_group.ingest(data_frame=df_final_features, max_workers=4, wait=True)
print("\n✅ Data ingestion complete!")

Starting to ingest data into the Feature Group...

✅ Data ingestion complete!


In [None]:
# To recap, this notebook has:
# - Established a raw data lake in Amazon S3.
# - Cataloged the data with AWS Glue and queried it with Amazon Athena.
# - Performed a full suite of EDA, data cleaning, and advanced feature engineering.
# - Stored the final, machine-learning-ready features in SageMaker Feature Store.
# The crime data is now managed, versioned, and reusable.

In [51]:
from sagemaker.feature_store.feature_group import FeatureGroup

# The name of the Feature Group that already exists
# NOTE(review): hard-coded here; it matches the name printed by the creation
# cell, but will drift if the project prefix ever changes.
feature_group_name = "crime-prediction-datalake-feature-group"

# Instantiate the FeatureGroup object by name to connect to the existing one
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=sagemaker_session,
)

# Building the object above does not by itself show that the group exists.
# describe() queries the backend, so a missing group raises here, before the
# success message can be printed.
existing_status = feature_group.describe().get("FeatureGroupStatus")
if existing_status != "Created":
    raise RuntimeError(
        f"Feature Group '{feature_group.name}' is not ready for use "
        f"(status: {existing_status})."
    )

print(f"✅ Successfully connected to existing Feature Group: {feature_group.name}")

✅ Successfully connected to existing Feature Group: crime-prediction-datalake-feature-group


In [46]:
# NOTE(review): this cell repeats the earlier ingestion cell verbatim and its
# execution count is out of sequence -- confirm whether ingesting
# df_final_features a second time is intended.
print("Starting to ingest data into the Feature Group...")
# Ingest the whole DataFrame with four parallel workers, blocking until the
# call returns (wait=True). This can take a few minutes.
feature_group.ingest(data_frame=df_final_features, max_workers=4, wait=True)
print("\n✅ Data ingestion complete!")

Starting to ingest data into the Feature Group...

✅ Data ingestion complete!
