# 6 â€“ Create SageMaker Feature Store

This notebook creates a SageMaker Feature Group for the
extreme precipitation prediction project.

The feature group stores engineered weather features
and the next-day extreme precipitation label.


## Import Required Libraries and Initialize SageMaker Session


In [16]:
# --- Core Imports ---
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from pyathena import connect
import pandas as pd
import time

# --- SageMaker Session Setup ---
sess = sagemaker.Session()
bucket = sess.default_bucket()
region = boto3.Session().region_name
role = get_execution_role()

boto_session = boto3.Session(region_name=region)
sm_client = boto_session.client("sagemaker", region_name=region)
fs_runtime = boto_session.client("sagemaker-featurestore-runtime", region_name=region)

fs_session = Session(
    boto_session=boto_session,
    sagemaker_client=sm_client,
    sagemaker_featurestore_runtime_client=fs_runtime
)

# --- Athena Configuration ---
database_name = "ghcn_extreme_precip_db"

conn = connect(
    s3_staging_dir=f"s3://{bucket}/athena/staging/",
    region_name=region
)

print("Bucket:", bucket)
print("Region:", region)
print("Athena connection established")


Bucket: sagemaker-us-east-1-083422367993
Region: us-east-1
Athena connection established


## Load Partitioned Parquet Dataset from S3


In [17]:
df = pd.read_sql(
    f"SELECT * FROM {database_name}.extreme_precip_parquet",
    conn
)

df.head()


  df = pd.read_sql(


Unnamed: 0,station_id,date,tmax,tmin,prcp_lag_1,prcp_roll_7,extreme_precip_tomorrow,year,month


## Prepare Feature Store DataFrame

Add required:
- Record identifier
- Event time
Ensure all object columns are cast to string


In [18]:
fs_df = df.copy()

# Unique record identifier
fs_df["record_id"] = (
    fs_df["station_id"].astype(str) + "_" + fs_df["date"].astype(str)
)

# Event time (epoch seconds)
current_time = int(time.time())
fs_df["EventTime"] = float(current_time)

# Cast object columns to string for Feature Store compatibility
for col in fs_df.columns:
    if fs_df[col].dtype == "object":
        fs_df[col] = fs_df[col].astype("string")

fs_df.head()


Unnamed: 0,station_id,date,tmax,tmin,prcp_lag_1,prcp_roll_7,extreme_precip_tomorrow,year,month,record_id,EventTime


## Define Feature Group Name


In [19]:
feature_group_name = f"ghcn-extreme-feature-group-{int(time.time())}"

feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=fs_session
)

print("Feature Group Name:", feature_group_name)


Feature Group Name: ghcn-extreme-feature-group-1771134299


## Load Feature Definitions


In [20]:
feature_group.load_feature_definitions(data_frame=fs_df)


[FeatureDefinition(feature_name='station_id', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='date', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='tmax', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='tmin', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='prcp_lag_1', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='prcp_roll_7', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='extreme_precip_tomorrow', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='year', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='month', feature_type=<FeatureType

## Create Feature Group (Online + Offline Store Enabled)


In [21]:
offline_prefix = f"{project_prefix}/featurestore"

feature_group.create(
    s3_uri=f"s3://{bucket}/{offline_prefix}",
    record_identifier_name="record_id",
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True
)

print("Creating Feature Group...")


Creating Feature Group...


## Wait for Feature Group Creation


In [22]:
status = feature_group.describe()["FeatureGroupStatus"]

while status == "Creating":
    print("Waiting for Feature Group creation...")
    time.sleep(5)
    status = feature_group.describe()["FeatureGroupStatus"]

print("Feature Group Status:", status)


Waiting for Feature Group creation...
Waiting for Feature Group creation...
Waiting for Feature Group creation...
Waiting for Feature Group creation...
Feature Group Status: Created


## Ingest Records into Feature Store


In [23]:
feature_group.ingest(
    data_frame=fs_df,
    max_workers=4,
    wait=True
)

print("Feature ingestion complete.")


Feature ingestion complete.


## Verify Feature Group Configuration


In [24]:
feature_group.describe()


{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:083422367993:feature-group/ghcn-extreme-feature-group-1771134299',
 'FeatureGroupName': 'ghcn-extreme-feature-group-1771134299',
 'RecordIdentifierFeatureName': 'record_id',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'station_id', 'FeatureType': 'String'},
  {'FeatureName': 'date', 'FeatureType': 'String'},
  {'FeatureName': 'tmax', 'FeatureType': 'String'},
  {'FeatureName': 'tmin', 'FeatureType': 'String'},
  {'FeatureName': 'prcp_lag_1', 'FeatureType': 'String'},
  {'FeatureName': 'prcp_roll_7', 'FeatureType': 'String'},
  {'FeatureName': 'extreme_precip_tomorrow', 'FeatureType': 'String'},
  {'FeatureName': 'year', 'FeatureType': 'String'},
  {'FeatureName': 'month', 'FeatureType': 'String'},
  {'FeatureName': 'record_id', 'FeatureType': 'String'},
  {'FeatureName': 'EventTime', 'FeatureType': 'Fractional'}],
 'CreationTime': datetime.datetime(2026, 2, 15, 5, 45, 0, 176000, tzinfo=tzlocal()),
 'Onli

## Confirm Feature Store is Ready for Model Training

The Feature Group has been successfully created and populated.
It now supports both offline training workflows and real-time inference use cases.
