## ライブラリ
必要なライブラリを全てimport するようにしていますが、説明のためにコード中で必要な箇所でimportを行なっています

In [None]:
from pathlib import Path
import yaml
import boto3
from datetime import datetime
from time import strftime, gmtime
import pandas as pd
import warnings

import sagemaker
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.feature_group import FeatureGroup

pd.set_option('display.max_columns', 30)
warnings.simplefilter('ignore')

## データ準備

In [246]:
import yaml
import boto3

import sagemaker

# AWSの設定ファイル・データのあるフォルダのローカルパス
SETTING_FILE_PATH = "../config/settings.yaml"
DATASET_FOLDER_PATH = "./avazu-ctr-prediction"

with open(SETTING_FILE_PATH) as file:
    aws_info = yaml.safe_load(file)

    
sess = sagemaker.Session()
role = aws_info['aws']['sagemaker']['role']
bucket = aws_info['aws']['sagemaker']['s3bucket']
region = boto3.Session().region_name

sm = boto3.client('sagemaker', region_name=region)
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime", region_name=region)

In [5]:
from pathlib import Path
import pandas as pd

dataset_folder = Path(DATASET_FOLDER_PATH)
# df_train = pd.read_csv(dataset_folder / "train")
# df_train_partial = df_train[df_train.index % 100 == 0]
# df_train_partial.to_csv(dataset_folder / "train_partial", index=False)

df_train_partial = pd.read_csv(dataset_folder / "train_partial")

In [184]:
df_train_partial.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,event_time
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,ddd2926e,44956a24,1,2,15706,320,50,1722,0,35,-1,79,2022-05-11T09:22:24Z
1,1.001579e+19,0,14102100,1005,1,856e6d3f,58a89a43,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,4375586d,5ec45883,1,0,19772,320,50,2227,0,687,100075,48,2022-05-11T09:22:24Z
2,1.002948e+18,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,f95efa07,a99f214a,ab9a5222,2ee63ff8,1,0,20596,320,50,2161,0,35,-1,157,2022-05-11T09:22:24Z
3,1.004511e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,bbe53381,542422a7,1,0,19743,320,50,2264,3,427,100000,61,2022-05-11T09:22:24Z
4,1.00599e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,8a014cbb,04f5b394,1,0,15702,320,50,1722,0,35,-1,79,2022-05-11T09:22:24Z


In [185]:
# レコードの識別子
df_train_partial['id'] = df_train_partial.index

# レコードの生成時刻を表す列を追加
output_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
df_train_partial['event_time'] = output_date

In [186]:
record_identifier_name = "id"  # レコードの識別子
event_time_feature_name = "event_time"  # レコードの生成時刻

feature_names = [
    "click",
    "hour",
    "C1",
    "banner_pos",
    "site_id",
    "site_domain",
    "site_category",
    "app_id",
    "app_domain",
    "app_category",
    "device_id",
    "device_ip",
    "device_model",
    "device_type",
    "device_conn_type",
    "C14",
    "C15",
    "C16",
    "C17",
    "C18",
    "C19",
    "C20",
    "C21",
] + [record_identifier_name, event_time_feature_name]

print(feature_names)

['click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'id', 'event_time']


## Feature Group の作成

In [187]:
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.feature_group import FeatureGroup

prefix = "ctr-prediction-feature-store"
feature_group_name = "ctr-prediction-group" + f"-{strftime('%d-%H-%M-%S', gmtime())}"
print(feature_group_name)

ctr-prediction-group-11-15-00-25


### load_feature_definitions を使用して Feature Definition のスキーマを自動で識別する場合

In [188]:
feature_group_auto = FeatureGroup(name=feature_group_name, sagemaker_session=sess)

In [189]:
# pandas の DataFrameを直接入力すると ValueError がでる
try:
    feature_group_auto.load_feature_definitions(data_frame=df_train_partial)
except ValueError as e:
    print(e)

Failed to infer Feature type based on dtype object for column site_id.


In [190]:
df_train_partial.dtypes

id                  float64
click                 int64
hour                  int64
C1                    int64
banner_pos            int64
site_id              object
site_domain          object
site_category        object
app_id               object
app_domain           object
app_category         object
device_id            object
device_ip            object
device_model         object
device_type           int64
device_conn_type      int64
C14                   int64
C15                   int64
C16                   int64
C17                   int64
C18                   int64
C19                   int64
C20                   int64
C21                   int64
event_time           object
dtype: object

In [191]:
# Pandas の object 型を Feature Group が認識できる string 型に変換する
def cast_object_to_string(df: pd.DataFrame) -> pd.DataFrame:
    df_tmp = df.copy()
    for label in df_tmp.columns:
        if df_tmp.dtypes[label] == 'object':
            df_tmp[label] = df_tmp[label].astype("str").astype("string")
    return df_tmp

In [192]:
cast_object_to_string(df_train_partial).dtypes

id                  float64
click                 int64
hour                  int64
C1                    int64
banner_pos            int64
site_id              string
site_domain          string
site_category        string
app_id               string
app_domain           string
app_category         string
device_id            string
device_ip            string
device_model         string
device_type           int64
device_conn_type      int64
C14                   int64
C15                   int64
C16                   int64
C17                   int64
C18                   int64
C19                   int64
C20                   int64
C21                   int64
event_time           string
dtype: object

In [193]:
feature_group_auto.load_feature_definitions(data_frame=cast_object_to_string(df_train_partial))

[FeatureDefinition(feature_name='id', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='click', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='hour', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='C1', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='banner_pos', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='site_id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='site_domain', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='site_category', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='app_id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='app_domain', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='app_category'

In [194]:
del feature_group_auto

### 自分で定義したFeature Definition のスキーマを使用する場合

In [253]:
feature_definitions = [
    FeatureDefinition(feature_name=feature_name, feature_type=FeatureTypeEnum.STRING)
    for feature_name in feature_names
]

feature_group_original = FeatureGroup(name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sess)
feature_group_original.feature_definitions

[FeatureDefinition(feature_name='click', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='hour', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='C1', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='banner_pos', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='site_id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='site_domain', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='site_category', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='app_id', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='app_domain', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='app_category', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='device_id', feature_type=<F

In [None]:
feature_group_original.create(
    s3_uri=f"s3://{bucket}/{prefix}", # offline feature store でデータを保存する S3 URI
    record_identifier_name=record_identifier_name, # レコード識別子のカラム名
    event_time_feature_name=event_time_feature_name, # レコード生成時刻のカラム名
    role_arn=role, 
    enable_online_store=True, # online feature store を作成するか. defualt = False
    description = "Feature Group For CTR Prediciton",
    tags = [{"Key":"author", "Value": "satsuki"},{"Key":"target", "Value": "click"} ]
) 

In [None]:
feature_group_original.describe()

In [None]:
sm.list_feature_groups()['FeatureGroupSummaries'][0]

In [263]:
df_train_tmp['event_time']

0     2022-05-12T00:00:21Z
1     2022-05-12T00:00:21Z
2     2022-05-12T00:00:21Z
3     2022-05-12T00:00:21Z
4     2022-05-12T00:00:21Z
5     2022-05-12T00:00:21Z
6     2022-05-12T00:00:21Z
7     2022-05-12T00:00:21Z
8     2022-05-12T00:00:21Z
9     2022-05-12T00:00:21Z
10    2022-05-12T00:00:21Z
11    2022-05-12T00:00:21Z
12    2022-05-12T00:00:21Z
13    2022-05-12T00:00:21Z
14    2022-05-12T00:00:21Z
15    2022-05-12T00:00:21Z
16    2022-05-12T00:00:21Z
17    2022-05-12T00:00:21Z
18    2022-05-12T00:00:21Z
19    2022-05-12T00:00:21Z
Name: event_time, dtype: object

In [279]:
df_train_tmp['banner_pos'].unique()

array([0, 1])

## データの登録

In [280]:
# boto3 の sagemaker clientを利用
record = [
    {"FeatureName": "id", "ValueAsString": str("-2")},
    {"FeatureName": "event_time", "ValueAsString": str("2022-05-12T00:00:21Z")},
    {"FeatureName": "click", "ValueAsString": str("0")},
    {"FeatureName": "C1", "ValueAsString": str("1002")},
    {"FeatureName": "banner_pos", "ValueAsString": str("1")},
]

featurestore_runtime.put_record(FeatureGroupName=feature_group_name, Record=record)


{'ResponseMetadata': {'RequestId': '69adf8c1-423f-4b9a-ae42-8f804504f7be',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '69adf8c1-423f-4b9a-ae42-8f804504f7be',
   'content-type': 'application/json',
   'content-length': '0',
   'date': 'Fri, 13 May 2022 06:39:31 GMT'},
  'RetryAttempts': 0}}

In [281]:
# FeatureGroup の put_recordを利用

from sagemaker.feature_store.inputs import FeatureValue

record = [
    FeatureValue(feature_name="id", value_as_string=str("-1")),
    FeatureValue(feature_name="event_time", value_as_string=str("2022-05-12T00:00:21Z")),
    FeatureValue(feature_name="click", value_as_string=str("1")),
    FeatureValue(feature_name="C1", value_as_string=str("1005")),
    FeatureValue(feature_name="banner_pos", value_as_string=str("0")),
]
feature_group_original.put_record(record)


{'ResponseMetadata': {'RequestId': 'dffca209-9fd9-48fd-980b-1a4d12cdbf26',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'dffca209-9fd9-48fd-980b-1a4d12cdbf26',
   'content-type': 'application/json',
   'content-length': '0',
   'date': 'Fri, 13 May 2022 06:39:40 GMT'},
  'RetryAttempts': 0}}

In [237]:
# FeatureGroup の ingest を利用
feature_group_original.ingest(data_frame=df_train_tmp, max_workers=4, wait=True)

IngestionManagerPandas(feature_group_name='ctr-prediction-group-11-15-00-25', sagemaker_session=<sagemaker.session.Session object at 0x125ed7a10>, data_frame=    id  click      hour    C1  banner_pos   site_id site_domain site_category  \
0    0      0  14102100  1005           0  1fbe01fe    f3845767      28905ebd   
1    1      0  14102100  1005           1  856e6d3f    58a89a43      f028772b   
2    2      0  14102100  1005           0  85f751fd    c4e18dd6      50e219e0   
3    3      0  14102100  1005           0  85f751fd    c4e18dd6      50e219e0   
4    4      0  14102100  1005           0  1fbe01fe    f3845767      28905ebd   
5    5      0  14102100  1005           1  85f751fd    c4e18dd6      50e219e0   
6    6      0  14102100  1005           0  85f751fd    c4e18dd6      50e219e0   
7    7      0  14102100  1002           0  34d1d55f    97df357a      50e219e0   
8    8      1  14102100  1005           0  1fbe01fe    f3845767      28905ebd   
9    9      0  14102100  1005   

## データ取り出し

### Online Feature Store から取り出し

In [273]:
record_identifier_value = str(-1)
response = featurestore_runtime.get_record(FeatureGroupName=feature_group_name, RecordIdentifierValueAsString=record_identifier_value)
response

{'ResponseMetadata': {'RequestId': '2f934a36-601f-4c09-b5d9-d2549372bfaa',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2f934a36-601f-4c09-b5d9-d2549372bfaa',
   'content-type': 'application/json',
   'content-length': '260',
   'date': 'Fri, 13 May 2022 06:20:10 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'click', 'ValueAsString': '1'},
  {'FeatureName': 'C1', 'ValueAsString': '-1005'},
  {'FeatureName': 'banner_pos', 'ValueAsString': '0'},
  {'FeatureName': 'id', 'ValueAsString': '-1'},
  {'FeatureName': 'event_time', 'ValueAsString': '2022-05-12T00:00:21Z'}]}

In [277]:
record_identifier_value = str(-2)
response = featurestore_runtime.get_record(FeatureGroupName=feature_group_name, RecordIdentifierValueAsString=record_identifier_value)
response

{'ResponseMetadata': {'RequestId': '403840f6-28cd-439f-a74b-11732f87aa63',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '403840f6-28cd-439f-a74b-11732f87aa63',
   'content-type': 'application/json',
   'content-length': '166',
   'date': 'Fri, 13 May 2022 06:29:51 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'click', 'ValueAsString': '1'},
  {'FeatureName': 'id', 'ValueAsString': '-2'},
  {'FeatureName': 'event_time', 'ValueAsString': '2022-05-12T00:00:21Z'}]}

In [241]:
record_identifier_values = ["1", "2", "3", "4"]
batch_response = featurestore_runtime.batch_get_record(Identifiers=[{"FeatureGroupName": feature_group_name, "RecordIdentifiersValueAsString": record_identifier_values}])
batch_response 

{'ResponseMetadata': {'RequestId': '1290bb4a-6ecc-4169-8bf6-55e96565fad7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1290bb4a-6ecc-4169-8bf6-55e96565fad7',
   'content-type': 'application/json',
   'content-length': '5488',
   'date': 'Wed, 11 May 2022 15:15:20 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'ctr-prediction-group-11-15-00-25',
   'RecordIdentifierValueAsString': '1',
   'Record': [{'FeatureName': 'click', 'ValueAsString': '0'},
    {'FeatureName': 'hour', 'ValueAsString': '14102100'},
    {'FeatureName': 'C1', 'ValueAsString': '1005'},
    {'FeatureName': 'banner_pos', 'ValueAsString': '1'},
    {'FeatureName': 'site_id', 'ValueAsString': '856e6d3f'},
    {'FeatureName': 'site_domain', 'ValueAsString': '58a89a43'},
    {'FeatureName': 'site_category', 'ValueAsString': 'f028772b'},
    {'FeatureName': 'app_id', 'ValueAsString': 'ecad2386'},
    {'FeatureName': 'app_domain', 'ValueAsString': '7801e8d9'},
    {'FeatureName': 'app_cate

### Offline Feature Store から取り出し

In [243]:
feature_store_query = feature_group_original.athena_query()
feature_store_table = feature_store_query.table_name
print(feature_store_table)

ctr-prediction-group-11-15-00-25-1652281250


In [None]:
print(feature_group.as_hive_ddl())

In [282]:
query_string = """
SELECT *
FROM "{}" LIMIT 5
""".format(
    feature_store_table
)

In [285]:
feature_store_query.run(query_string=query_string, output_location="s3://" + bucket + "/" + prefix + "/query_results/")
feature_store_query.wait()

In [286]:
dataset = pd.DataFrame()
dataset = feature_store_query.as_dataframe()
dataset

Unnamed: 0,click,hour,c1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,c14,c15,c16,c17,c18,c19,c20,c21,id,event_time,write_time,api_invocation_time,is_deleted
0,0,14102100.0,1005.0,0.0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,52aa6971,8a4875bd,1.0,0.0,15706.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,10,2022-05-12T00:00:21Z,2022-05-11 15:09:56.893,2022-05-11 15:04:45.000,False
1,1,14102100.0,1005.0,0.0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,bc7f9471,8b1aa260,1.0,0.0,15705.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,8,2022-05-12T00:00:21Z,2022-05-11 15:17:17.810,2022-05-11 15:12:14.000,False
2,0,14102100.0,1005.0,0.0,3a66a5a5,9e328a4d,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,e3510321,24f6b932,1.0,0.0,19666.0,300.0,250.0,2253.0,2.0,303.0,100026.0,52.0,12,2022-05-12T00:00:21Z,2022-05-11 15:09:56.642,2022-05-11 15:04:48.000,False
3,1,14102100.0,1005.0,0.0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,0c8a8801,c144e605,1.0,0.0,15708.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,17,2022-05-12T00:00:21Z,2022-05-11 15:09:56.642,2022-05-11 15:04:48.000,False
4,1,,,,,,,,,,,,,,,,,,,,,,,-1,2022-05-12T00:00:21Z,2022-05-13 06:34:21.256,2022-05-13 06:29:27.000,False
