In [2]:
import sagemaker
import boto3
import sys
import pandas as pd
import numpy as np
import io
from sagemaker.session import Session
from sagemaker import get_execution_role
import time

from sagemaker.feature_store.feature_group import FeatureGroup

In [3]:
prefix = "sagemaker-featurestore-introduction"
role = get_execution_role()

sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
s3_bucket_name = sagemaker_session.default_bucket()

boto_session = boto3.Session(region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)


# Prepare Data

In [4]:
data = pd.read_csv("./data/campain_data.csv")
data = data.dropna(axis=0)
data['id']= data['id'].astype('string')
data['income']= data['income'].astype('string')
data['gender']= data['gender'].astype('string')
data['marital_status']= data['marital_status'].astype('string')
current_time_sec = int(round(time.time()))
data["EventTime"] = pd.Series([current_time_sec] * len(data), dtype="float64")
data_source_1 = data[['id','age','dist','EventTime']]
data_source_2 = data[['id','income','gender','marital_status','target','EventTime']]

In [5]:
data.head(5)

Unnamed: 0,id,age,dist,income,gender,marital_status,target,EventTime
0,1,73,4.371654,"90-99,999",M,S,1,1668965000.0
2,3,85,1.22381,"10-19,999",F,S,1,1668965000.0
3,4,76,2.962427,"90-99,999",M,M,1,1668965000.0
4,5,76,2.594408,"10-19,999",M,S,1,1668965000.0
5,6,71,3.877504,"50-59,999",F,S,1,1668965000.0


# Define Feature Group

In [8]:
data_souce_feature_group = FeatureGroup(
    name='data-souce-feature-group', sagemaker_session=sagemaker_session
)
data_souce1_feature_group = FeatureGroup(
    name='data-souce1-feature-group', sagemaker_session=sagemaker_session
)
data_souce2_feature_group = FeatureGroup(
    name='data-souce2-feature-group', sagemaker_session=sagemaker_session
)

# Load Feature Definitions

In [9]:
data_souce_feature_group.load_feature_definitions(data_frame=data)
data_souce1_feature_group.load_feature_definitions(data_frame=data_source_1)
data_souce2_feature_group.load_feature_definitions(data_frame=data_source_2)

[FeatureDefinition(feature_name='id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='income', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='gender', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='marital_status', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='target', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>)]

# Create Feature Stores

In [154]:
data_souce_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name='id',
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:653583447178:feature-group/data-souce-feature-group',
 'ResponseMetadata': {'RequestId': '9a0bb1e4-1ccf-4df2-893c-c14851025bb1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9a0bb1e4-1ccf-4df2-893c-c14851025bb1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '101',
   'date': 'Sun, 20 Nov 2022 01:05:17 GMT'},
  'RetryAttempts': 0}}

In [155]:
data_souce1_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name='id',
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:653583447178:feature-group/data-souce1-feature-group',
 'ResponseMetadata': {'RequestId': '94d87304-850e-43cd-96e4-3e50d3a39407',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '94d87304-850e-43cd-96e4-3e50d3a39407',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '102',
   'date': 'Sun, 20 Nov 2022 01:05:19 GMT'},
  'RetryAttempts': 0}}

In [156]:
data_souce2_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name='id',
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:653583447178:feature-group/data-souce2-feature-group',
 'ResponseMetadata': {'RequestId': '96e7e92b-0d00-482c-8529-293264328d46',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '96e7e92b-0d00-482c-8529-293264328d46',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '102',
   'date': 'Sun, 20 Nov 2022 01:05:20 GMT'},
  'RetryAttempts': 0}}

# Ingest features to feature store

In [162]:
data_souce_feature_group.ingest(
    data_frame=data, max_workers=3, wait=True
)

ERROR:sagemaker.feature_store.feature_group:Failed to ingest row 3584: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Provided Record does not contain a required FeatureValue. Please provide a FeatureValue with the name [EventTime].
ERROR:sagemaker.feature_store.feature_group:Failed to ingest row 3586: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Provided Record does not contain a required FeatureValue. Please provide a FeatureValue with the name [EventTime].
ERROR:sagemaker.feature_store.feature_group:Failed to ingest row 3587: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Provided Record does not contain a required FeatureValue. Please provide a FeatureValue with the name [EventTime].
ERROR:sagemaker.feature_store.feature_group:Failed to ingest row 3588: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Pro

IngestionError: [3584, 3586, 3587, 3588, 3589, 3592, 3593, 3594, 3595, 3596, 3598, 3599, 3600, 3601, 3602, 3603, 3604, 3605, 3606, 3608, 3609, 3610, 3611, 3612, 3613, 3614, 3616, 3617, 3618, 3619, 3620, 3621, 3622, 3623, 3624, 3625, 3626, 3627, 3628, 3629, 3630, 3631, 3633, 3634, 3635, 3636, 3637, 3638, 3639, 3640, 3641, 3642, 3643, 3644, 3645, 3646, 3647, 3648, 3649, 3650, 3651, 3652, 3653, 3654, 3655, 3656, 3658, 3659, 3660, 3661, 3662, 3664, 3665, 3666, 3667, 3668, 3669, 3670, 3671, 3672, 3673, 3674, 3675, 3676, 3677, 3678, 3679, 3680, 3681, 3682, 3683, 3684, 3686, 3687, 3688, 3689, 3690, 3691, 3692, 3693, 3694, 3695, 3696, 3697, 3698, 3699, 3700, 3701, 3702, 3704, 3705, 3707, 3708, 3709, 3711, 3712, 3714, 3715, 3716, 3717, 3718, 3719, 3720, 3721, 3722, 3723, 3724, 3725, 3726, 3727, 3728, 3729, 3730, 3731, 3732, 3733, 3734, 3735, 3736, 3738, 3739, 3741, 3742, 3743, 3744, 3745, 3747, 3748, 3749, 3750, 3751, 3752, 3753, 3754, 3755, 3756, 3757, 3758, 3759, 3762, 3763, 3764, 3765, 3766, 3767, 3768, 3769, 3770, 3771, 3772, 3773, 3774, 3775, 3776, 3777, 3779, 3781, 3782, 3783, 3784, 3785, 3786, 3787, 3788, 3789, 3790, 3791, 3792, 3794, 3795, 3796, 3797, 3799, 3800, 3801, 3802, 3803, 3805, 3806, 3807, 3808, 3810, 3811, 3812, 3813, 3814, 3815, 3816, 3817, 3818, 3820, 3821, 3822, 3823, 3824, 3825, 3827, 3828, 3829, 3830, 3831, 3832, 3833, 3834, 3835, 3836, 3837, 3838, 3839, 3840, 3841, 3842, 3844, 3845, 3847, 3848, 3849, 3850, 3851, 3852, 3853, 3854, 3855, 3856, 3857, 3858, 3859, 3860, 3861, 3862, 3864, 3865, 3866, 3867, 3868, 3869, 3872, 3873, 3874, 3875, 3876, 3877, 3878, 3879, 3880, 3882, 3883, 3884, 3885, 3886, 3887, 3888, 3889, 3890, 3891, 3892, 3893, 3894, 3895, 3896, 3897, 3898, 3899, 3900, 3901, 3902, 3904, 3905, 3906, 3908, 3909, 3910, 3911, 3912, 3913, 3914, 3915, 3916, 3917, 3918, 3919, 3920, 3921, 3922, 3923, 3924, 3925, 3926, 3928, 3929, 3930, 3931, 3932, 3933, 3937, 3938, 3939, 3940, 3942, 3943, 3944, 3945, 3946, 3947, 3948, 3949, 3951, 3952, 3953, 3954, 3955, 3956, 3957, 3958, 3959, 3960, 3963, 3964, 3965, 3966, 3967, 3968, 3969, 3970, 3971, 3972, 3973, 3974, 3975, 3976, 3977, 3978, 3980, 3981, 3982, 3983, 3984, 3985, 3987, 3988, 3989, 3990, 3991, 3992, 3993, 3995, 3996, 3997, 3998, 3999] -> Failed to ingest some data into FeatureGroup data-souce-feature-group

# Lookup feature store

In [10]:
record_identifier_values = ["1", "2", "3", "4"]
featurestore_runtime.batch_get_record(Identifiers=[{"FeatureGroupName": "data-souce-feature-group", "RecordIdentifiersValueAsString": record_identifier_values}])

{'ResponseMetadata': {'RequestId': 'a949ef05-40e8-40f3-9e0f-4a0ab2429ad9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a949ef05-40e8-40f3-9e0f-4a0ab2429ad9',
   'content-type': 'application/json',
   'content-length': '1514',
   'date': 'Sun, 20 Nov 2022 17:20:04 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'data-souce-feature-group',
   'RecordIdentifierValueAsString': '1',
   'Record': [{'FeatureName': 'id', 'ValueAsString': '1'},
    {'FeatureName': 'age', 'ValueAsString': '73'},
    {'FeatureName': 'dist', 'ValueAsString': '4.371654351'},
    {'FeatureName': 'income', 'ValueAsString': '90-99,999'},
    {'FeatureName': 'gender', 'ValueAsString': 'M'},
    {'FeatureName': 'marital_status', 'ValueAsString': 'S'},
    {'FeatureName': 'target', 'ValueAsString': '1'},
    {'FeatureName': 'EventTime', 'ValueAsString': '1668906287.0'}]},
  {'FeatureGroupName': 'data-souce-feature-group',
   'RecordIdentifierValueAsString': '3',
   'Record': [{'Feature

# Submit Athena query to retrieve features from data_souce_feature_group

In [11]:
data_souce_query = data_souce_feature_group.athena_query()
data_souce_table = data_souce_query.table_name
database_name = data_souce_query.database
%store data_souce_table
%store database_name

query_string = f"""SELECT * from "sagemaker_featurestore"."{data_souce_table}" LIMIT 10000"""
data_souce_query = data_souce_feature_group.athena_query()

Stored 'data_souce_table' (str)
Stored 'database_name' (str)


In [33]:
data_souce_query.run(query_string=query_string,output_location=f"s3://{s3_bucket_name}/{prefix}/query_results12")
data_souce_query.wait()
dataset = data_souce_query.as_dataframe()

In [35]:
dataset.head(10)

Unnamed: 0,id,age,dist,income,gender,marital_status,target,eventtime,write_time,api_invocation_time,is_deleted
0,1331,71,7.125673,"100-149,999",M,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:14.000,False
1,1346,76,1.142977,"60-69,999",M,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:14.000,False
2,1371,81,11.605287,"50-59,999",F,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:14.000,False
3,77,75,23.030903,"20-29,999",F,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:14.000,False
4,81,94,1.630203,"10-19,999",F,S,1,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:15.000,False
5,2758,76,2.614747,"40-49,999",M,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:15.000,False
6,91,68,0.451748,"60-69,999",F,M,1,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:15.000,False
7,2839,73,1.281901,"70-79,999",M,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:15.000,False
8,201,85,13.921918,"30-39,999",F,M,1,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:16.000,False
9,1541,88,2.552892,"50-59,999",M,M,0,1668906000.0,2022-11-20 01:17:15.250,2022-11-20 01:12:16.000,False


# Create SageMaker Scikit Estimator

In [46]:
from sagemaker.sklearn.estimator import SKLearn

script_path = 'train_script.py'
train_input = sagemaker_session.upload_data("data")

sklearn = SKLearn(
    entry_point=script_path,
    instance_type="ml.m4.xlarge",
    framework_version="0.23-1",
    py_version="py3",
    role=role,
    sagemaker_session=sagemaker_session)

# Train SKLearn Estimator

In [47]:
sklearn.fit({'train': train_input})

2022-11-20 18:19:31 Starting - Starting the training job...
2022-11-20 18:20:00 Starting - Preparing the instances for trainingProfilerReport-1668968371: InProgress
.........
2022-11-20 18:21:19 Downloading - Downloading input data...
2022-11-20 18:21:59 Training - Downloading the training image......
2022-11-20 18:23:00 Training - Training image download completed. Training in progress..[34m2022-11-20 18:22:56,573 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-11-20 18:22:56,576 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-20 18:22:56,587 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-11-20 18:22:57,037 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-20 18:22:57,051 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-20 18:22:57,065 sag