In [185]:
import pandas as pd
import random, string
from time import gmtime, strftime, sleep
import boto3
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role

In [186]:
df = pd.read_csv('s3://sagemaker-sample-files/datasets/tabular/fraud_detection/synthethic_fraud_detection_SA/sampled_transactions.csv')

In [187]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card4,card5,card6,...,F17,N1,N2,N3,N4,N5,N6,N7,N8,N9
0,3343087,0,8810855,29.0,12469,360.0,150.0,mastercard,126.0,debit,...,519,F,F,T,T,T,T,T,F,T
1,3307318,0,7955295,107.95,16188,178.0,150.0,mastercard,224.0,debit,...,773,F,T,T,T,F,F,F,F,T
2,3555327,0,15084339,159.95,1825,555.0,150.0,visa,226.0,debit,...,771,F,T,F,F,T,T,T,T,F
3,3310736,0,8017157,159.95,10057,225.0,150.0,mastercard,224.0,debit,...,903,T,T,F,T,T,F,T,F,F
4,3034711,0,1127470,117.0,11444,555.0,150.0,visa,226.0,debit,...,579,T,T,T,F,T,F,T,F,F


In [188]:
gm_time = gmtime()
fg_timestamp = strftime("%Y-%m-%d'T'%H:%M:%SZ", gm_time)
df['EventTime'] = fg_timestamp

In [189]:
def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype("str").astype("string")

In [190]:
cast_object_to_string(df)

In [191]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card4,card5,card6,...,N1,N2,N3,N4,N5,N6,N7,N8,N9,EventTime
0,3343087,0,8810855,29.0,12469,360.0,150.0,mastercard,126.0,debit,...,F,F,T,T,T,T,T,F,T,2021-04-22'T'19:45:01Z
1,3307318,0,7955295,107.95,16188,178.0,150.0,mastercard,224.0,debit,...,F,T,T,T,F,F,F,F,T,2021-04-22'T'19:45:01Z
2,3555327,0,15084339,159.95,1825,555.0,150.0,visa,226.0,debit,...,F,T,F,F,T,T,T,T,F,2021-04-22'T'19:45:01Z
3,3310736,0,8017157,159.95,10057,225.0,150.0,mastercard,224.0,debit,...,T,T,F,T,T,F,T,F,F,2021-04-22'T'19:45:01Z
4,3034711,0,1127470,117.0,11444,555.0,150.0,visa,226.0,debit,...,T,T,T,F,T,F,T,F,F,2021-04-22'T'19:45:01Z


In [192]:
role = get_execution_role()
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
featuregroup_name = 'transactions-fg-manual-ingest'
account_id = boto3.client('sts').get_caller_identity()["Account"]

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

In [193]:
feature_group = FeatureGroup(name=featuregroup_name, sagemaker_session=feature_store_session)
feature_group.load_feature_definitions(data_frame=df)

[FeatureDefinition(feature_name='TransactionID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='isFraud', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TransactionDT', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='TransactionAmt', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card1', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='card2', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card3', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card4', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='card5', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='card6', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinit

In [194]:
record_identifier_feature_name = "TransactionID"
event_time_feature_name = "EventTime"

bucket = feature_store_session.default_bucket()
s3_folder = 'feature-store-manual-ingestion10'

def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

feature_group.create(
    s3_uri=f"s3://{bucket}/{s3_folder}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=False
)

wait_for_feature_group_creation_complete(feature_group=feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup transactions-fg-manual-ingest successfully created.


In [202]:
query = feature_group.athena_query()
fg_table = query.table_name

In [196]:
year, month, day, hour = strftime('%Y-%m-%d-%H', gm_time).split('-')

In [197]:
df['write_time'] = df['api_invocation_time'] = pd.to_datetime(fg_timestamp)
df['is_deleted'] = False

In [198]:
filepath = f"s3://{bucket}/{s3_folder}/{account_id}/sagemaker/{region}/offline-store/{fg_table}/data/year={year}/month={month}/day={day}/hour={hour}/"
filename = strftime("%Y%m%dT%H%M%SZ_", gm_time)
filename += ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(16))
filename += '.parquet'

In [199]:
df.to_parquet(filepath + filename)

In [203]:
query_string = f'SELECT * FROM "{fg_table}"'

query.run(query_string=query_string, output_location=f's3://{bucket}/{s3_folder}/query_results/')
query.wait()
dataset = query.as_dataframe()

dataset.head()

Unnamed: 0,transactionid,isfraud,transactiondt,transactionamt,card1,card2,card3,card4,card5,card6,...,n4,n5,n6,n7,n8,n9,eventtime,write_time,api_invocation_time,is_deleted
0,3343087,0,8810855,29.0,12469,360.0,150.0,mastercard,126.0,debit,...,T,T,T,T,F,T,2021-04-22'T'19:45:01Z,2021-04-22 19:45:01.000,2021-04-22 19:45:01.000,False
1,3307318,0,7955295,107.95,16188,178.0,150.0,mastercard,224.0,debit,...,T,F,F,F,F,T,2021-04-22'T'19:45:01Z,2021-04-22 19:45:01.000,2021-04-22 19:45:01.000,False
2,3555327,0,15084339,159.95,1825,555.0,150.0,visa,226.0,debit,...,F,T,T,T,T,F,2021-04-22'T'19:45:01Z,2021-04-22 19:45:01.000,2021-04-22 19:45:01.000,False
3,3310736,0,8017157,159.95,10057,225.0,150.0,mastercard,224.0,debit,...,T,T,F,T,F,F,2021-04-22'T'19:45:01Z,2021-04-22 19:45:01.000,2021-04-22 19:45:01.000,False
4,3034711,0,1127470,117.0,11444,555.0,150.0,visa,226.0,debit,...,F,T,F,T,F,F,2021-04-22'T'19:45:01Z,2021-04-22 19:45:01.000,2021-04-22 19:45:01.000,False


In [204]:
feature_group.delete()