# This notebook uses Amazon SageMaker Feature Store to create two feature groups from CSV files.
   1. **Reads the S3 files into pandas DataFrames.**
   2. **Gets the appropriate AWS sessions and clients (boto3, SageMaker, Feature Store runtime).**
   3. **Configures the "feature groups", which includes appending an "EventTime" column and loading the "feature definitions".**
   4. **Creates the "feature groups" and waits for completion. This takes 2-3 min. Again, this only establishes the file and directory structure for the offline store on S3.**
   5. **Loads the pandas DataFrames into the feature groups. This is the step that actually loads data into S3. It takes 10-15 min depending on the size of the data.**
   6. **Miscellaneous - validation that the data is loaded.**

In [1]:
# Source data: the S3 bucket and the object keys of the two input CSV files.
bucket = "fsbu-user1-s3bucket-1"
fraud_txn_file, fraud_summary_file = (
    "fraud_and_transactions.csv",
    "fraud_txn_summary.csv",
)

# S3 bucket used as the offline-store area of the feature store.
feature_store_s3_bucket = "fsbu-user1-feature-store"

# IAM role on which all required access is defined; passed as role_arn when
# the feature groups are created.
sagemaker_iam_role = "arn:aws:iam::123456789012:role/fsbu-user1-sagemaker-fullaccess"


## Step 1 - Read the S3 .csv files into pandas DataFrames
**These files could be the outputs of an AWS Data Wrangler preprocessing step.** 
**While preprocessing can be done here, it is recommended to do it before creating the "feature store".**

In [2]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import io
import boto3

# boto3 session and S3 client; both are reused by later cells.
boto3_session = boto3.Session()

s3_client = boto3_session.client('s3')

# Download both CSV objects. ClientError is reached through the client's own
# exception namespace because this notebook never imports botocore.exceptions
# (the bare name `ClientError` would raise NameError inside the except clause).
try:
    fraud_txn_data_object = s3_client.get_object(Bucket=bucket, Key=fraud_txn_file)
    fraud_summary_data_object = s3_client.get_object(Bucket=bucket, Key=fraud_summary_file)
except s3_client.exceptions.ClientError as e:
    print("Error with AWS call while reading files from S3: %s" % e)
    # Re-raise: the parsing below needs both response objects, so continuing
    # would only fail later with a less helpful NameError.
    raise

# Parse the downloaded bytes into DataFrames.
txn_data = pd.read_csv(io.BytesIO(fraud_txn_data_object["Body"].read()))
summary_data = pd.read_csv(io.BytesIO(fraud_summary_data_object["Body"].read()))

# Only the last expression of a cell is rendered, so the summary preview is
# what appears below; the txn preview is printed in the next cell.
txn_data.head()
summary_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,auth_account_id,auth_date,daily_sum,daily_count
0,4019012000171232,1997-09-19,32.44,1
1,4019012000171232,1997-10-29,68.49,1
2,4019012000198253,1997-10-12,10.0,2
3,4019012000198253,1997-10-16,10.0,2
4,4019012000198253,1997-10-26,5.0,1


In [3]:
# Print the (rows, columns) shape of each frame, then a plain-text preview of
# the first transaction rows.
for shape_label, frame in (
    ("Txn dataframe sahape: ", txn_data),
    ("Summary dataframe sahape: ", summary_data),
):
    print(shape_label, frame.shape)
print("Txn dataframe head: ", txn_data.head())


Txn dataframe sahape:  (201648, 53)
Summary dataframe sahape:  (171408, 4)
Txn dataframe head:     Unique_ID   recnum   AUTH_ACCOUNT_ID   AUTH_DATE         AUTH_TIME  \
0   18683804   603802  4024004756639495  1996-12-08  30DEC99:18:05:28   
1   18683904   556031  4024004655600437  1996-11-19  30DEC99:17:57:26   
2   18684004   710837  4024006030055626  1997-05-20  30DEC99:14:53:10   
3   18684104  1651398  4071006999539689  1997-02-17  30DEC99:18:54:10   
4   18684204  1817317  4301781037190134  1996-11-02  30DEC99:12:18:40   

   AUTH_HOUR  AUTH_AMOUNT  AUTH_CURR_CODE  AUTH_CURR_RATE AUTH_DECISION  ...  \
0         18        78.00             NaN             NaN             A  ...   
1         17        27.00             NaN             NaN             A  ...   
2         14        36.96             NaN             0.0             A  ...   
3         18        24.76             NaN             0.0             A  ...   
4         12        80.07             NaN             0.0        

## Step 2 - Create SageMaker sessions and feature group objects
**Load feature definitions from pandas DataFrames.** 


In [4]:
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# boto3 clients built from the shared session: `sagemaker_client` is used
# later to list feature groups, `featurestore_runtime` to read single records.
sagemaker_client = boto3_session.client(service_name="sagemaker")
featurestore_runtime = boto3_session.client(service_name="sagemaker-featurestore-runtime")

# SageMaker SDK session wired to the clients above.
feature_store_session = Session(
    boto_session=boto3_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# One FeatureGroup handle per data set; nothing is created in AWS until
# .create() is called in a later cell.
fraud_txn_feature_group, fraud_summary_feature_group = (
    FeatureGroup(name=group_name, sagemaker_session=feature_store_session)
    for group_name in ("fraud-txn-feature-group", "fraud-summary-feature-group")
)



## Step 3 - Append the additional details required by "feature stores" to the pandas DataFrames
**Read about feature definitions and the event time of feature stores.** 


In [5]:
import time

# Current Unix time in whole seconds; round() without ndigits already yields an int.
current_time_sec = round(time.time())

def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of ``data_frame`` to pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns of any
    other dtype are left untouched.
    """
    object_columns = [
        column_name
        for column_name in data_frame.columns
        if data_frame.dtypes[column_name] == "object"
    ]
    for column_name in object_columns:
        # Two-step cast: first to Python str values, then to the "string" extension dtype.
        as_python_str = data_frame[column_name].astype("str")
        data_frame[column_name] = as_python_str.astype("string")

# Cast object dtype to string. The SageMaker FeatureStore Python SDK will then
# map the string dtype to the String feature type.
cast_object_to_string(txn_data)
cast_object_to_string(summary_data)

# Record identifier and event time feature names.
# NOTE: record_identifier_feature_name is re-assigned per feature group in the
# cell that creates the groups; the value set here is not used in this cell.
record_identifier_feature_name = "auth_account_id"
event_time_feature_name = "EventTime"

# Append the EventTime feature as a float64 column holding the same timestamp
# for every row. A scalar is broadcast to all rows regardless of the frame's
# index; assigning a freshly built pd.Series instead would align on index
# labels and leave NaN wherever the index is not exactly 0..n-1.
txn_data[event_time_feature_name] = float(current_time_sec)
summary_data[event_time_feature_name] = float(current_time_sec)

print("summary head with new eventtime: ", summary_data.head())

# Load feature definitions to the feature groups. The SageMaker FeatureStore
# Python SDK auto-detects the data schema based on the input data. The result
# of the last call (the summary definitions) is the cell's displayed value.
fraud_txn_feature_group.load_feature_definitions(data_frame=txn_data)
fraud_summary_feature_group.load_feature_definitions(data_frame=summary_data)



summary head with new eventtime:      auth_account_id   auth_date  daily_sum  daily_count     EventTime
0  4019012000171232  1997-09-19      32.44            1  1.628479e+09
1  4019012000171232  1997-10-29      68.49            1  1.628479e+09
2  4019012000198253  1997-10-12      10.00            2  1.628479e+09
3  4019012000198253  1997-10-16      10.00            2  1.628479e+09
4  4019012000198253  1997-10-26       5.00            1  1.628479e+09


[FeatureDefinition(feature_name='auth_account_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='auth_date', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='daily_sum', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='daily_count', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>)]

In [6]:
# Same call as in the previous cell, repeated on its own so that the txn
# feature definitions it returns are rendered as this cell's output.
fraud_txn_feature_group.load_feature_definitions(data_frame=txn_data)

[FeatureDefinition(feature_name='Unique_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='recnum', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='AUTH_ACCOUNT_ID', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='AUTH_DATE', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='AUTH_TIME', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='AUTH_HOUR', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='AUTH_AMOUNT', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='AUTH_CURR_CODE', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='AUTH_CURR_RATE', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='AUTH_DECISION', feature_type=<FeatureTypeEnum.STRING: 'St

## Step 4 - Create the feature groups. This lays down the structure on S3 and does not actually load data yet.
 

In [11]:
def wait_for_feature_group_creation_complete(feature_group, poll_interval_sec=5, timeout_sec=None):
    """Block until ``feature_group`` is no longer in the "Creating" state.

    Args:
        feature_group: object exposing ``describe()`` (returning a dict with
            "FeatureGroupStatus") and a ``name`` attribute.
        poll_interval_sec: seconds to sleep between two ``describe()`` calls.
        timeout_sec: maximum number of seconds to keep polling; ``None``
            (the default) polls for as long as the status stays "Creating".

    Raises:
        TimeoutError: if ``timeout_sec`` elapses while the status is still "Creating".
        RuntimeError: if the final status is anything other than "Created".
    """
    deadline = None if timeout_sec is None else time.monotonic() + timeout_sec
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out waiting for feature group {feature_group.name} to be created"
            )
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
    if status != "Created":
        # Surface the final status and, when the response carries one, the
        # failure reason so the cause is visible without a second describe().
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} "
            f"(status: {status}, reason: {reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")

# Create both feature groups, each with its own record identifier column.
# All other settings (offline-store bucket, event time column, IAM role,
# online store enabled) are shared.
for feature_group, record_identifier_feature_name in (
    (fraud_txn_feature_group, "Unique_ID"),
    (fraud_summary_feature_group, "auth_account_id"),
):
    feature_group.create(
        s3_uri=f"s3://{feature_store_s3_bucket}/",
        record_identifier_name=record_identifier_feature_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=sagemaker_iam_role,
        enable_online_store=True,
    )

# Both creations are requested first, then awaited in the same order.
for feature_group in (fraud_txn_feature_group, fraud_summary_feature_group):
    wait_for_feature_group_creation_complete(feature_group=feature_group)


Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup fraud-txn-feature-group successfully created.
Waiting for Feature Group Creation
FeatureGroup fraud-summary-feature-group successfully created.


In [12]:
# List the feature groups visible to the SageMaker client; the raw API
# response is rendered as the cell output.
sagemaker_client.list_feature_groups()

# Optional inspection of each group (uncomment to run):
#fraud_txn_feature_group.describe()
#fraud_summary_feature_group.describe()

# Optional cleanup (uncomment to delete both feature groups):
#fraud_txn_feature_group.delete()
#fraud_summary_feature_group.delete()


{'FeatureGroupSummaries': [{'FeatureGroupName': 'fraud-txn-feature-group',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:123456789012:feature-group/fraud-txn-feature-group',
   'CreationTime': datetime.datetime(2021, 8, 8, 23, 20, 58, 636000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'fraud-summary-feature-group',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:123456789012:feature-group/fraud-summary-feature-group',
   'CreationTime': datetime.datetime(2021, 8, 8, 23, 20, 59, 340000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'}],
 'ResponseMetadata': {'RequestId': 'e00dfe14-7db7-46d3-b969-012e46e9ea80',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e00dfe14-7db7-46d3-b969-012e46e9ea80',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '452',
   'date': 'Mon, 09 Aug 2021 03:21:45 GMT'},
  'RetryAttempts': 0}}

## Step 4 (continued) - Use the following to save the DDL for EP and any other processes. When called during feature group creation, the as_hive_ddl() output includes all features.
#### When I ran the same call during exploration, after the feature groups had been created, as_hive_ddl() did not show all features. TODO: this needs additional research.

In [13]:
# Render the Hive DDL string (a CREATE EXTERNAL TABLE statement) for the txn feature group.
fraud_txn_feature_group.as_hive_ddl()

"CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.fraud-txn-feature-group (\n  Unique_ID INT\n  recnum INT\n  AUTH_ACCOUNT_ID INT\n  AUTH_DATE STRING\n  AUTH_TIME STRING\n  AUTH_HOUR INT\n  AUTH_AMOUNT FLOAT\n  AUTH_CURR_CODE FLOAT\n  AUTH_CURR_RATE FLOAT\n  AUTH_DECISION STRING\n  AUTH_TRAN_TYPE STRING\n  AUTH_AVAIL_CREDIT INT\n  AUTH_CRED_LINE INT\n  AUTH_SIC INT\n  AUTH_ZIP3_CODE STRING\n  AUTH_ZIP_REST_CODE STRING\n  AUTH_MERCH_CNTRY STRING\n  AUTH_PIN_VER STRING\n  AUTH_CVV STRING\n  AUTH_KEY_SWIPE STRING\n  AUTH_CARD_EXP_DATE STRING\n  AUTH_AUTH_ID FLOAT\n  AUTH_REASON_CODE FLOAT\n  AUTH_ADVICE STRING\n  AUTH_MERCHANT_ID STRING\n  AUTH_TERM_ID STRING\n  AUTH_WHICH_CARD FLOAT\n  EXT_AUTH_YEAR INT\n  EXT_AUTH_MONTH INT\n  EXT_AUTH_DAY INT\n  EXT_AUTH_BIN INT\n  FRAUD_TYPE STRING\n  FRAUD_DATE_FIRST_FRAUD FLOAT\n  FRAUD_DETECTED_DATE FLOAT\n  FRAUD_LABEL INT\n  CARD_ZIP3_CODE FLOAT\n  CARD_OPEN_DATE STRING\n  CARD_EXPIRE_DATE FLOAT\n  CARD_CRED_LINE FLOAT\n  CARD_TYPE STRI

In [14]:
# Render the Hive DDL string (a CREATE EXTERNAL TABLE statement) for the summary feature group.
fraud_summary_feature_group.as_hive_ddl()

"CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.fraud-summary-feature-group (\n  auth_account_id INT\n  auth_date STRING\n  daily_sum FLOAT\n  daily_count INT\n  EventTime FLOAT\n  write_time TIMESTAMP\n  event_time TIMESTAMP\n  is_deleted BOOLEAN\n)\nROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'\n  STORED AS\n  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'\n  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'\nLOCATION 's3://fsbu-user1-feature-store/123456789012/sagemaker/us-east-1/offline-store/fraud-summary-feature-group-1628479259/data'"

## Step 5 - Ingest the data from the pandas DataFrames into the feature groups and wait until all data is loaded

In [15]:
# Ingest both DataFrames into their feature groups with identical settings:
# three workers each, and wait=True so each call finishes before the next starts.
ingest_options = {"max_workers": 3, "wait": True}

fraud_txn_feature_group.ingest(data_frame=txn_data, **ingest_options)
fraud_summary_feature_group.ingest(data_frame=summary_data, **ingest_options)

IngestionManagerPandas(feature_group_name='fraud-summary-feature-group', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f9b69c48860>, max_workers=3, max_processes=1, _async_result=<multiprocess.pool.MapResult object at 0x7f9b6592dba8>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [18]:
## This cell waits for data to be loaded to both feature groups.
## it may take a while 

def wait_for_data_load(input_feature_group_s3_prefix, poll_interval_sec=60):
    """Poll the offline-store bucket until data shows up under a prefix.

    Lists ``feature_store_s3_bucket`` with the module-level ``s3_client`` and
    returns once the listing for ``input_feature_group_s3_prefix`` contains
    more than one object.

    Args:
        input_feature_group_s3_prefix: object-key prefix to look under.
        poll_interval_sec: seconds to sleep between two listings.

    Returns:
        None.
    """
    while True:
        objects_in_bucket = s3_client.list_objects(
            Bucket=feature_store_s3_bucket, Prefix=input_feature_group_s3_prefix
        )
        # A single object is not treated as "data loaded"; the listing must
        # hold at least two entries.
        if len(objects_in_bucket.get("Contents", [])) > 1:
            return
        print("Waiting for data in offline store...\n")
        # The notebook imports the `time` module only, so a bare `sleep`
        # would raise NameError here.
        time.sleep(poll_interval_sec)

#### 
# --- txn feature group ---
# Offline-store location as reported by describe().
txn_s3_storage_config = (
    fraud_txn_feature_group.describe().get("OfflineStoreConfig").get("S3StorageConfig")
)
fraud_txn_feature_group_resolved_output_s3_uri = txn_s3_storage_config.get(
    "ResolvedOutputS3Uri"
)
# Drop the "s3://<bucket>/" part so that only the object-key prefix remains.
fraud_txn_feature_group_s3_prefix = fraud_txn_feature_group_resolved_output_s3_uri.replace(
    f"s3://{feature_store_s3_bucket}/", ""
)

print("Path of data : ", fraud_txn_feature_group_resolved_output_s3_uri)
wait_for_data_load(fraud_txn_feature_group_s3_prefix)
print("Data available for txn data.")

# --- summary feature group ---
summary_s3_storage_config = (
    fraud_summary_feature_group.describe().get("OfflineStoreConfig").get("S3StorageConfig")
)
fraud_summary_feature_group_resolved_output_s3_uri = summary_s3_storage_config.get(
    "ResolvedOutputS3Uri"
)
fraud_summary_feature_group_s3_prefix = fraud_summary_feature_group_resolved_output_s3_uri.replace(
    f"s3://{feature_store_s3_bucket}/", ""
)

print("Path of data : ", fraud_summary_feature_group_resolved_output_s3_uri)
wait_for_data_load(fraud_summary_feature_group_s3_prefix)
print("Data available for summary data.")

Path of data :  s3://fsbu-user1-feature-store/123456789012/sagemaker/us-east-1/offline-store/fraud-txn-feature-group-1628479258/data
Data available for txn data.
Path of data :  s3://fsbu-user1-feature-store/123456789012/sagemaker/us-east-1/offline-store/fraud-summary-feature-group-1628479259/data
Data available for summary data.


## Step 6 - Explore the feature store for some of the data

In [19]:
# Fetch one record from each feature group by its record identifier.
# The txn group is keyed on Unique_ID, the summary group on auth_account_id.
record_identifier_value = 18683804

record_txn = featurestore_runtime.get_record(
    FeatureGroupName=fraud_txn_feature_group.name,
    RecordIdentifierValueAsString=str(record_identifier_value),
)
# Label the output with the identifier that was actually requested.
print(f"Txn record for {record_identifier_value} :", record_txn)
print ("------------------------------")

record_identifier_value = 4019012000171232
record_summary = featurestore_runtime.get_record(
    FeatureGroupName=fraud_summary_feature_group.name,
    RecordIdentifierValueAsString=str(record_identifier_value),
)
print(f"summary record for {record_identifier_value} :", record_summary)

Txn record for 0  : {'ResponseMetadata': {'RequestId': '15a05fae-e202-49d1-bacb-97fcb4091cbf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '15a05fae-e202-49d1-bacb-97fcb4091cbf', 'content-type': 'application/json', 'content-length': '2779', 'date': 'Mon, 09 Aug 2021 12:18:04 GMT'}, 'RetryAttempts': 0}, 'Record': [{'FeatureName': 'Unique_ID', 'ValueAsString': '18683804'}, {'FeatureName': 'recnum', 'ValueAsString': '603802'}, {'FeatureName': 'AUTH_ACCOUNT_ID', 'ValueAsString': '4024004756639495'}, {'FeatureName': 'AUTH_DATE', 'ValueAsString': '1996-12-08'}, {'FeatureName': 'AUTH_TIME', 'ValueAsString': '30DEC99:18:05:28'}, {'FeatureName': 'AUTH_HOUR', 'ValueAsString': '18'}, {'FeatureName': 'AUTH_AMOUNT', 'ValueAsString': '78.0'}, {'FeatureName': 'AUTH_DECISION', 'ValueAsString': 'A'}, {'FeatureName': 'AUTH_TRAN_TYPE', 'ValueAsString': 'M'}, {'FeatureName': 'AUTH_AVAIL_CREDIT', 'ValueAsString': '248'}, {'FeatureName': 'AUTH_CRED_LINE', 'ValueAsString': '7000'}, {'FeatureNa