In [36]:
import time
from datetime import datetime
from time import strftime, gmtime

import boto3
import pandas as pd
import sagemaker
import yaml
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.feature_group import FeatureGroup

In [3]:
# Relative path of the YAML settings file that is opened and parsed below.
SETTING_FILE_PATH = "../settings.yaml"

In [15]:
# Parse the settings file with the safe YAML loader. Later cells read the
# SageMaker role and S3 bucket from aws_info['aws']['sagemaker'].
with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)

In [56]:
# SageMaker SDK session handed to the FeatureGroup object further down.
sess = sagemaker.Session()

# Role and bucket come from the parsed settings; region from the default boto3 session.
sagemaker_settings = aws_info['aws']['sagemaker']
role = sagemaker_settings['role']
bucket = sagemaker_settings['s3bucket']
region = boto3.Session().region_name

# boto3 clients: SageMaker control plane, Feature Store runtime, and S3.
sm = boto3.client('sagemaker')
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")
s3 = boto3.client('s3')

In [57]:
# Load the training sample. 'id' is used as the record identifier and every
# feature is declared as a string, so it is read as text: that way numeric
# dtype inference cannot change its textual form before it is stringified.
# NOTE(review): assumes the CSV has an 'id' column (it is listed in feature_cols).
df_train = pd.read_csv('../avazu-ctr-prediction/train_partial', dtype={'id': str})

In [58]:
# Column names registered as features, in the order used to build the
# feature definitions. 'event_time' is not read from the CSV; a later cell
# adds it to the frame.
feature_cols = [
    "id", "event_time", "click", "hour",
    "C1", "banner_pos",
    "site_id", "site_domain", "site_category",
    "app_id", "app_domain", "app_category",
    "device_id", "device_ip", "device_model", "device_type", "device_conn_type",
    "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21",
]

# Presumably the label column; it is not referenced by the other visible cells.
target = "click"

In [59]:
# Stamp every row with the current Unix time in whole seconds, stored as a
# float64 column. The epoch value is taken from the imported `datetime`
# class, and assigning a scalar broadcasts it to all rows whatever the
# frame's index looks like.
current_time_sec = int(round(datetime.now().timestamp()))
df_train['event_time'] = float(current_time_sec)

In [77]:
# S3 key prefix used for the feature store location and for query results.
prefix = "ctr-prediction-feature-store"

# Group name: fixed stem plus the UTC day-hour-minute-second of this run.
feature_group_name = "-".join(["ctr-prediction-group", strftime('%d-%H-%M-%S', gmtime())])

# Every listed column is declared as a STRING feature.
feature_definitions = [
    FeatureDefinition(feature_name=column, feature_type=FeatureTypeEnum.STRING)
    for column in feature_cols
]

# Local handle for the group; nothing is created remotely by this constructor call alone.
feature_group = FeatureGroup(
    name=feature_group_name,
    feature_definitions=feature_definitions,
    sagemaker_session=sess,
)


In [78]:
# Feature passed to create() as the record identifier.
record_identifier_name = "id"
# Feature passed to create() as the event-time column.
event_time_feature_name = "event_time"

In [79]:
# Create the feature group under s3://<bucket>/<prefix> with the online
# store disabled (offline store only).
create_response = feature_group.create(
    s3_uri=f"s3://{bucket}/{prefix}",
    record_identifier_name=record_identifier_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=False,
)

# Creation completes asynchronously: poll until the group leaves the
# "Creating" state so a top-to-bottom run does not reach ingestion early.
creation_status = feature_group.describe().get("FeatureGroupStatus")
while creation_status == "Creating":
    time.sleep(5)
    creation_status = feature_group.describe().get("FeatureGroupStatus")
if creation_status != "Created":
    raise RuntimeError(
        f"Feature group {feature_group_name} ended in status {creation_status!r}"
    )

# Keep the create() response as the cell's displayed output.
create_response

{'FeatureGroupArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:feature-group/ctr-prediction-group-10-01-14-57',
 'ResponseMetadata': {'RequestId': 'fdce3b8a-1229-4a4e-80f0-899b91fd2a5a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fdce3b8a-1229-4a4e-80f0-899b91fd2a5a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '114',
   'date': 'Tue, 10 May 2022 01:15:00 GMT'},
  'RetryAttempts': 0}}

In [80]:
# Display the group's metadata (ARN, name, identifier/event-time features, feature definitions).
feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:feature-group/ctr-prediction-group-10-01-14-57',
 'FeatureGroupName': 'ctr-prediction-group-10-01-14-57',
 'RecordIdentifierFeatureName': 'id',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'id', 'FeatureType': 'String'},
  {'FeatureName': 'event_time', 'FeatureType': 'String'},
  {'FeatureName': 'click', 'FeatureType': 'String'},
  {'FeatureName': 'hour', 'FeatureType': 'String'},
  {'FeatureName': 'C1', 'FeatureType': 'String'},
  {'FeatureName': 'banner_pos', 'FeatureType': 'String'},
  {'FeatureName': 'site_id', 'FeatureType': 'String'},
  {'FeatureName': 'site_domain', 'FeatureType': 'String'},
  {'FeatureName': 'site_category', 'FeatureType': 'String'},
  {'FeatureName': 'app_id', 'FeatureType': 'String'},
  {'FeatureName': 'app_domain', 'FeatureType': 'String'},
  {'FeatureName': 'app_category', 'FeatureType': 'String'},
  {'FeatureName': 'device_id', 'FeatureType': 'String'},
  {

In [81]:
# List feature groups through the boto3 SageMaker client and display the response.
sm.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'reviews-feature-group-1649690470',
   'FeatureGroupArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:feature-group/reviews-feature-group-1649690470',
   'CreationTime': datetime.datetime(2022, 4, 12, 0, 54, 35, 739000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'reviews-feature-group-1649337996',
   'FeatureGroupArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:feature-group/reviews-feature-group-1649337996',
   'CreationTime': datetime.datetime(2022, 4, 7, 22, 35, 33, 513000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'ctr-prediction-group-2022-05-10-10-11-49',
   'FeatureGroupArn': 'arn:aws:sagemaker:ap-northeast-1:547760918250:feature-group/ctr-prediction-group-2022-05-10-10-11-49',
   'CreationTime': datetime.datetime(2022, 5, 10, 10, 11, 52, 454000, tzinfo=tzlocal

In [82]:
def cast_object_to_string(data_frame):
    """Convert every column of ``data_frame`` to the pandas ``string`` dtype.

    Despite the name, all columns are converted, not only object-typed ones.
    Each column is first cast with ``astype("str")`` and then with
    ``astype("string")``.

    The frame is modified in place and the same object is returned.
    """
    column_labels = data_frame.columns
    for column_label in column_labels:
        as_python_str = data_frame[column_label].astype("str")
        data_frame[column_label] = as_python_str.astype("string")
    return data_frame

In [None]:
# Overwrite event_time with one ISO-8601 timestamp shared by all rows.
# The trailing "Z" marks the value as UTC, so it is built from gmtime()
# rather than from the kernel's local clock.
output_date = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime())
df_train['event_time'] = output_date

# All columns are converted to strings first because every feature is
# declared as STRING. Note that cast_object_to_string mutates df_train.
feature_group.ingest(data_frame=cast_object_to_string(df_train), max_workers=3, wait=True)


In [68]:
# Athena query helper for this feature group; reused by the query cells below.
feature_store_query = feature_group.athena_query()

In [69]:
# Table name exposed by the query helper; interpolated into the SQL text later.
feature_store_table = feature_store_query.table_name

In [70]:
# Print the Hive DDL for the group; print() keeps the multi-line statement readable.
print(feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.ctr-prediction-group-2022-05-10-10-11-49 (
  id STRING
  event_time STRING
  click STRING
  hour STRING
  C1 STRING
  banner_pos STRING
  site_id STRING
  site_domain STRING
  site_category STRING
  app_id STRING
  app_domain STRING
  app_category STRING
  device_id STRING
  device_ip STRING
  device_model STRING
  device_type STRING
  device_conn_type STRING
  C14 STRING
  C15 STRING
  C16 STRING
  C17 STRING
  C18 STRING
  C19 STRING
  C20 STRING
  C21 STRING
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://ctr-prediction/ctr-prediction-feature-store/547760918250/sagemaker/ap-northeast-1/offline-store/ctr-prediction-group-2022-05-10-10-11-49'


In [86]:
# SQL that samples five rows (id, hour, click) from the feature group's table.
# Built line by line so the exact whitespace of the statement is explicit.
query_string = (
    "\n"
    "SELECT id, hour, click \n"
    f'FROM "{feature_store_table}" LIMIT 5\n'
)


In [87]:
# Submit the query, writing results under <prefix>/query_results/ in the bucket.
query_results_uri = f"s3://{bucket}/{prefix}/query_results/"
feature_store_query.run(query_string=query_string, output_location=query_results_uri)

# wait() is expected to block until the submitted query has finished.
feature_store_query.wait()

In [88]:
# Load the query helper's result into a DataFrame and display it.
# (No empty placeholder frame is needed; the name is bound directly.)
dataset = feature_store_query.as_dataframe()

dataset

Unnamed: 0,id,hour,click
