# This Notebook explores features in AWS feature store created earlier.
##### Step 1. **Explore earlier created feature groups**
##### Step 2. **Update summary_feature_group with additional data and see how offline and online stores differ in terms of data returned** 
##### Step 3. **Leverage "tags" with versions and see how you can only pull specific versioned data**
   

In [1]:
# General-purpose S3 bucket used throughout the notebook (input files, query results).
default_bucket = "fsbu-user1-s3bucket-1"

# S3 bucket that backs the feature store's offline store.
feature_store_s3_bucket = "fsbu-user1-feature-store"

# IAM role on which all required access is defined.
sagemaker_iam_role = "arn:aws:iam::123456789012:role/fsbu-user1-sagemaker-fullaccess"


## Step1  Begin
##### Create sessions 
##### Describe feature groups
##### Total number of records and validate against the original input CSV files

#### Step 1 - Create the required sessions and config objects to access the feature stores.

In [None]:
import numpy as np
import pandas as pd
import boto3

import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# One boto3 session is shared by every AWS client created in this notebook.
boto3_session = boto3.Session()

# Service clients: S3, the SageMaker control plane, and the feature store runtime.
s3_client = boto3_session.client("s3")
sagemaker_client = boto3_session.client("sagemaker")
featurestore_runtime = boto3_session.client("sagemaker-featurestore-runtime")

# SageMaker SDK session wired to the clients above; FeatureGroup objects use it.
feature_store_session = Session(
    boto_session=boto3_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Build a FeatureGroup handle for every feature group visible to this session.
# list_feature_groups is a paginated API: a single call returns only the first
# page, so walk every page to avoid silently missing groups.
feature_group_list = []
print("Feature groups in our store are: ")
feature_group_pages = sagemaker_client.get_paginator("list_feature_groups").paginate()
for page in feature_group_pages:
    for i in page.get("FeatureGroupSummaries", []):
        feature_group_name = i.get("FeatureGroupName")
        feature_group_list.append(
            FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
        )
        print(feature_group_name)

# Describe each group, naming it first so the outputs can be told apart.
print("----------------")
for feature_group in feature_group_list:
    print("--- Describe output of ", feature_group.name)
    print(feature_group.describe())


##### Validate features saved into feature store
##### The outputs from the following cell should match the original contents of the input files in terms of number of rows. The output has additional columns, such as "EventTime" and last-access time, which were not in the original input CSV files.
##### dir(fraud_txn_feature_group) ==> shows all methods available

In [3]:
#fraud_txn_feature_group = feature_group_list[0]
#fraud_summary_feature_group = feature_group_list[1]
#dir(fraud_txn_feature_group.athena_query())
def explore_feature_group(feature_group, output_location=None):
    """Query every row of a feature group through Athena and preview the result.

    Runs ``select *`` against the table exposed by ``feature_group.athena_query()``,
    waits for the query to finish, prints the shape and the first rows of the
    resulting frame, and returns the frame so callers can validate it further
    (for example, compare row counts against the input CSV files).

    Args:
        feature_group: Object providing ``athena_query()``, whose result exposes
            ``table_name``, ``run()``, ``wait()`` and ``as_dataframe()``.
        output_location: S3 URI where the query results are written. When
            omitted, ``s3://<default_bucket>/query_results`` is used.

    Returns:
        The query result as returned by ``as_dataframe()``.
    """
    if output_location is None:
        output_location = "s3://" + default_bucket + "/query_results"

    a_query = feature_group.athena_query()
    a_query_string = 'select * from "' + a_query.table_name + '"'
    a_query.run(query_string=a_query_string, output_location=output_location)
    # as_dataframe() is only called after wait() returns.
    a_query.wait()

    a_pd = a_query.as_dataframe()
    print("pd frame shape :", a_pd.shape)
    print(a_pd.head())
    return a_pd
    
# Preview every feature group collected in feature_group_list, printing a
# separator line before each one.
for feature_group in feature_group_list:
    print("=========")
    explore_feature_group(feature_group)
    
#print("out: ", txn)
# txn_pd.shape()
#fraud_summary_feature_group.delete()



  exec(code_obj, self.user_global_ns, self.user_ns)


pd frame shape : (201648, 57)
   unique_id   recnum   auth_account_id   auth_date         auth_time  \
0   18687804  2486410  5425769053974909  1996-11-11  30DEC99:10:34:20   
1   16425806  2063110  5254002122218203  1997-03-07  30DEC99:15:18:52   
2   15564705  1918971  4301781950162187  1997-02-06  30DEC99:08:40:04   
3   16426506   497549  4024004529636483  1997-05-17  30DEC99:12:18:16   
4   18694804  2557455  5432259408087973  1996-11-26  30DEC99:10:34:09   

   auth_hour  auth_amount  auth_curr_code  auth_curr_rate auth_decision  ...  \
0         10        99.68             NaN             0.0             A  ...   
1         15        70.35             NaN             NaN             A  ...   
2          8         1.00             NaN             0.0             A  ...   
3         12        59.53             NaN             0.0             A  ...   
4         10        23.65             NaN             0.0             A  ...   

  fraud_stage   padzip  txtcode                   

#### Print equivalent DDL statements that can be accessed thru EP

In [None]:
# For each feature group, print its describe() output followed by the DDL
# returned by as_hive_ddl(), with separator lines between the sections.
for feature_group in feature_group_list:
    print(feature_group.describe())
    print("----------------")
    print(feature_group.as_hive_ddl())
    print("================================================-")

##  Step 2 starts here 
#### Update summary feature group with new data rows and then explore..
#### See the difference between Offline store and Online store ...

In [6]:
# Load the update data into the feature store.
# Reuse the notebook-wide bucket from the configuration cell instead of
# repeating the literal, so the two values cannot drift apart.
bucket = default_bucket
fraud_summary_file = "fraud_txn_summary_update.csv"

# sagemaker_iam_role is already defined in the configuration cell at the top of
# the notebook with the same value, so it is not redefined here.


In [7]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import io
import boto3

# ClientError must be imported for the handler below to work; botocore is
# installed as a dependency of boto3.
from botocore.exceptions import ClientError

# Download the update CSV from S3.
try:
    fraud_summary_data_object = s3_client.get_object(Bucket=bucket, Key=fraud_summary_file)
except ClientError as e:
    print("Error with AWS call while reading files from S3: %s" % e)
    # Re-raise: without the object there is nothing to parse below, and carrying
    # on would only fail later with a misleading NameError.
    raise

# NOTE(review): the recorded output of this cell shows an "Unnamed: 0" column,
# which looks like a saved index — confirm whether it should be dropped before ingest.
summary_data = pd.read_csv(io.BytesIO(fraud_summary_data_object["Body"].read()))

summary_data.head()

Unnamed: 0,auth_account_id,auth_date,daily_sum,daily_count
0,4019012000171232,1997-09-19,42.44,1
1,4019012000171232,1997-10-29,78.49,1
2,4019012000198253,1997-10-12,20.0,2
3,4019012000198253,1997-10-16,10.0,2
4,4019012000198253,1997-10-26,15.0,1


In [8]:
import time

# Current time as whole seconds since the epoch, rounded to the nearest second.
current_time_sec = int(round(time.time()))

def cast_object_to_string(data_frame):
    """Convert every ``object``-dtype column of ``data_frame`` to pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns with any
    other dtype are left untouched. Values are first rendered with
    ``astype("str")`` and then cast to the ``string`` extension dtype.

    Args:
        data_frame: The pandas DataFrame to convert in place.
    """
    for column_name in data_frame.columns:
        is_object_column = data_frame.dtypes[column_name] == "object"
        if not is_object_column:
            continue
        as_text = data_frame[column_name].astype("str")
        data_frame[column_name] = as_text.astype("string")

# The SageMaker FeatureStore Python SDK maps the pandas string dtype to the
# String feature type, so object columns are converted before ingestion.
cast_object_to_string(summary_data)

# Names of the event-time and record-identifier features.
event_time_feature_name = "EventTime"
record_identifier_feature_name = "auth_account_id"

# Add the EventTime column: the same timestamp on every row, stored as float64.
summary_data[event_time_feature_name] = pd.Series([current_time_sec] * len(summary_data), dtype="float64")

print("summary head with new eventtime: ", summary_data.head())



summary head with new eventtime:      auth_account_id   auth_date  daily_sum  daily_count     EventTime
0  4019012000171232  1997-09-19      42.44            1  1.628512e+09
1  4019012000171232  1997-10-29      78.49            1  1.628512e+09
2  4019012000198253  1997-10-12      20.00            2  1.628512e+09
3  4019012000198253  1997-10-16      10.00            2  1.628512e+09
4  4019012000198253  1997-10-26      15.00            1  1.628512e+09


In [9]:
## This cell ingests the update rows and then waits for them to be loaded.
## Only the summary feature group is updated with the additional data.
## The new data will get updated in both the online and offline stores.

# Select the summary group by name rather than by position: the order of
# feature_group_list follows the listing API and changes as groups are added.
FRAUD_SUMMARY_FEATURE_GROUP_NAME = "fraud-summary-feature-group"
fraud_summary_feature_group = next(
    (fg for fg in feature_group_list if fg.name == FRAUD_SUMMARY_FEATURE_GROUP_NAME),
    None,
)
if fraud_summary_feature_group is None:
    raise LookupError(
        "Feature group %r not found; available groups: %s"
        % (FRAUD_SUMMARY_FEATURE_GROUP_NAME, [fg.name for fg in feature_group_list])
    )

fraud_summary_feature_group.ingest(data_frame=summary_data, max_workers=2, wait=True)

def wait_for_data_load(input_feature_group_s3_prefix, poll_interval_sec=60, s3=None, bucket=None):
    """Block until more than one object exists under the given offline-store prefix.

    Lists the objects under ``input_feature_group_s3_prefix`` and, while the
    listing holds one object or fewer, prints a waiting message and sleeps
    before trying again.

    Args:
        input_feature_group_s3_prefix: Key prefix (without the ``s3://bucket/``
            part) to poll.
        poll_interval_sec: Seconds to sleep between polls. Defaults to 60.
        s3: Client exposing ``list_objects(Bucket=..., Prefix=...)``. Defaults
            to the notebook-level ``s3_client``.
        bucket: Bucket to list. Defaults to the notebook-level
            ``feature_store_s3_bucket``.

    Returns:
        None. The function returns once the data is visible.
    """
    client = s3_client if s3 is None else s3
    bucket_name = feature_store_s3_bucket if bucket is None else bucket

    while True:
        objects_in_bucket = client.list_objects(
            Bucket=bucket_name, Prefix=input_feature_group_s3_prefix
        )
        # A listing without a "Contents" entry is treated as empty.
        if len(objects_in_bucket.get("Contents", [])) > 1:
            return
        print("Waiting for data in offline store...\n")
        # Qualified call: only the `time` module is imported in this notebook,
        # so a bare sleep() would raise NameError.
        time.sleep(poll_interval_sec)

#### 
            
# Read the resolved offline-store S3 URI out of the feature group description.
summary_description = fraud_summary_feature_group.describe()
summary_s3_storage = summary_description.get("OfflineStoreConfig").get("S3StorageConfig")
fraud_summary_feature_group_resolved_output_s3_uri = summary_s3_storage.get("ResolvedOutputS3Uri")

# Strip the "s3://<bucket>/" part so only the key prefix remains.
fraud_summary_feature_group_s3_prefix = fraud_summary_feature_group_resolved_output_s3_uri.replace(
    "s3://" + feature_store_s3_bucket + "/", ""
)

print("Path of data : ", fraud_summary_feature_group_resolved_output_s3_uri)
wait_for_data_load(fraud_summary_feature_group_s3_prefix)
print("Data available for summary data.")

Path of data :  s3://fsbu-user1-feature-store/123456789012/sagemaker/us-east-1/offline-store/fraud-summary-feature-group-1628479259/data
Data available for summary data.


#### Ok. Let us review summary_feature after updates. 
##### Get Record gives you the latest record. 4019012000171232,1997-10-29,78.49,1 - the one with $78.49 is returned this time after the update, which means getRecord reads from the online store. This can be used during inference if we need the latest record.
##### See if you get all records of the summary feature: the original ones plus the new ones. This means the batch query goes to the offline store, and hence all records are returned. This is for training.

In [10]:
# Fetch the record for one specific account id through the feature store
# runtime client.
record_identifier_value = 4019012000171232
record_summary = featurestore_runtime.get_record(
    RecordIdentifierValueAsString=f"{record_identifier_value}",
    FeatureGroupName="fraud-summary-feature-group",
)
print("summary record for 4019012000171232 :", record_summary)

summary record for 4019012000171232 : {'ResponseMetadata': {'RequestId': 'a19cb1c5-0b1c-4cd2-a265-5aeb0f7e5c9c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'a19cb1c5-0b1c-4cd2-a265-5aeb0f7e5c9c', 'content-type': 'application/json', 'content-length': '299', 'date': 'Mon, 09 Aug 2021 12:25:34 GMT'}, 'RetryAttempts': 0}, 'Record': [{'FeatureName': 'auth_account_id', 'ValueAsString': '4019012000171232'}, {'FeatureName': 'auth_date', 'ValueAsString': '1997-10-29'}, {'FeatureName': 'daily_sum', 'ValueAsString': '78.49'}, {'FeatureName': 'daily_count', 'ValueAsString': '1'}, {'FeatureName': 'EventTime', 'ValueAsString': '1628511922.0'}]}


In [12]:
# The following takes a while to reflect the latest rows, since data needs to be
# synced from the online store to the offline store. It took more than 10 minutes in some cases.
explore_feature_group(fraud_summary_feature_group)

pd frame shape : (171428, 8)
    auth_account_id   auth_date  daily_sum  daily_count     eventtime  \
0  5432257460521789  1996-12-24      26.56            1  1.628479e+09   
1  5432257470541781  1996-11-11       7.49            1  1.628479e+09   
2  5432257470541781  1996-11-12       3.95            1  1.628479e+09   
3  5432257470541781  1996-11-16      23.69            2  1.628479e+09   
4  5432257470541781  1996-11-25      16.68            1  1.628479e+09   

                write_time      api_invocation_time  is_deleted  
0  2021-08-09 04:05:06.237  2021-08-09 04:00:06.000       False  
1  2021-08-09 04:05:06.237  2021-08-09 04:00:07.000       False  
2  2021-08-09 04:05:06.237  2021-08-09 04:00:07.000       False  
3  2021-08-09 04:05:06.237  2021-08-09 04:00:07.000       False  
4  2021-08-09 04:05:06.237  2021-08-09 04:00:07.000       False  


## Step 3 - 
#### Leverage tags/versions to identify different versions of data.

In [None]:
# Exploration aid: list the attributes and methods of the object returned by athena_query().
dir(fraud_summary_feature_group.athena_query())

In [10]:

# List up to 100 objects under the summary feature group's offline-store prefix.
# NOTE(review): the prefix is hard-coded and its numeric suffix differs from the
# path printed by the ingest cell — confirm it points at the intended feature group.
offline_store_listing = s3_client.list_objects_v2(
    Bucket="fsbu-user1-feature-store",
    Prefix="123456789012/sagemaker/us-east-1/offline-store/fraud-summary-feature-group-1627918092/data",
    MaxKeys=100,
)

# Collect the object keys; a listing without "Contents" is treated as empty
# instead of raising KeyError.
offline_store_keys = [obj["Key"] for obj in offline_store_listing.get("Contents", [])]

# `response` keeps its original meaning: the key of the first listed object
# (None when nothing is listed).
response = offline_store_keys[0] if offline_store_keys else None

# Iterate over the keys themselves; the original loop had no body (a syntax
# error) and would have iterated over the characters of a single key.
for rec in offline_store_keys:
    print("response: ", rec)


response:  123456789012/sagemaker/us-east-1/offline-store/fraud-summary-feature-group-1627918092/data/year=2021/month=08/day=02/hour=15/20210802T151642Z_028pMefNM5x5IQyM.parquet
