In [168]:
import json
import uuid
import boto3
import sagemaker
import pandas as pd

from sagemaker.feature_store.feature_group import FeatureGroup, FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.inputs import TableFormatEnum
from datetime import datetime, timezone, date
import time

In [169]:
s3 = boto3.client('s3')
session = sagemaker.Session()
feature_store_client = session.boto_session.client('sagemaker-featurestore-runtime')
sagemaker_client = session.boto_session.client('sagemaker')
bucket_name = 'team1-index-predictor-bucket'
feature_group_name = 'index-predictor-feature-group-v7'
feature_store_bucket_prefix = 'feature-store-v7'
sm_role = sagemaker.get_execution_role()

In [170]:
def generate_event_timestamp():
    # naive datetime representing local time
    naive_dt = datetime.now()
    # take timezone into account
    aware_dt = naive_dt.astimezone()
    # time in UTC
    utc_dt = aware_dt.astimezone(timezone.utc)
    # transform to ISO-8601 format
    event_time = utc_dt.isoformat(timespec='milliseconds')
    event_time = event_time.replace('+00:00', 'Z')
    return event_time

In [171]:
df = pd.read_csv("../data/processed/data.csv")


def convert_col_name(c):
    return c.lower().replace('.', '_').replace('-', '_').rstrip('_')


df = df.rename(columns=convert_col_name)

In [172]:
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=session,
)

feature_group.load_feature_definitions(data_frame=df)

print()




In [173]:
feature_group.create(
    s3_uri=f's3://{bucket_name}/{feature_store_bucket_prefix}',
    record_identifier_name='datetime',
    event_time_feature_name='datetime',
    role_arn=sm_role, 
    enable_online_store=False,
    table_format=TableFormatEnum.ICEBERG 
)

{'FeatureGroupArn': 'arn:aws:sagemaker:eu-central-1:567821811420:feature-group/index-predictor-feature-group-v7',
 'ResponseMetadata': {'RequestId': 'a3dfe05c-1eab-4682-92d6-4a656e2bdbcd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a3dfe05c-1eab-4682-92d6-4a656e2bdbcd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '112',
   'date': 'Fri, 21 Jun 2024 07:15:55 GMT'},
  'RetryAttempts': 0}}

In [174]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get('FeatureGroupStatus')
    print(f'Initial status: {status}')
    while status == 'Creating':
        print(f'Waiting for feature group: {feature_group.name} to be created ...')
        time.sleep(5)
        status = feature_group.describe().get('FeatureGroupStatus')
    if status != 'Created':
        raise SystemExit(f'Failed to create feature group {feature_group.name}: {status}')
    print(f'FeatureGroup {feature_group.name} was successfully created.')

In [175]:
wait_for_feature_group_creation_complete(feature_group)

feature_group.describe()

Initial status: Creating
Waiting for feature group: index-predictor-feature-group-v7 to be created ...
Waiting for feature group: index-predictor-feature-group-v7 to be created ...
Waiting for feature group: index-predictor-feature-group-v7 to be created ...
Waiting for feature group: index-predictor-feature-group-v7 to be created ...
Waiting for feature group: index-predictor-feature-group-v7 to be created ...
Waiting for feature group: index-predictor-feature-group-v7 to be created ...
FeatureGroup index-predictor-feature-group-v7 was successfully created.


{'FeatureGroupArn': 'arn:aws:sagemaker:eu-central-1:567821811420:feature-group/index-predictor-feature-group-v7',
 'FeatureGroupName': 'index-predictor-feature-group-v7',
 'RecordIdentifierFeatureName': 'datetime',
 'EventTimeFeatureName': 'datetime',
 'FeatureDefinitions': [{'FeatureName': 'datetime', 'FeatureType': 'String'},
  {'FeatureName': 'open', 'FeatureType': 'Fractional'},
  {'FeatureName': 'high', 'FeatureType': 'Fractional'},
  {'FeatureName': 'low', 'FeatureType': 'Fractional'},
  {'FeatureName': 'close', 'FeatureType': 'Fractional'},
  {'FeatureName': 'volume', 'FeatureType': 'Integral'},
  {'FeatureName': 'hour', 'FeatureType': 'Integral'},
  {'FeatureName': 'minute', 'FeatureType': 'Integral'},
  {'FeatureName': 'dayofweek_0', 'FeatureType': 'Integral'},
  {'FeatureName': 'dayofweek_1', 'FeatureType': 'Integral'},
  {'FeatureName': 'dayofweek_2', 'FeatureType': 'Integral'},
  {'FeatureName': 'dayofweek_3', 'FeatureType': 'Integral'},
  {'FeatureName': 'dayofweek_4', 'Fe

In [154]:
feature_group.ingest(data_frame=df, max_processes=3, wait=True)

IngestionManagerPandas(feature_group_name='index-predictor-feature-group-v6', feature_definitions={'datetime': {'FeatureName': 'datetime', 'FeatureType': 'String'}, 'open': {'FeatureName': 'open', 'FeatureType': 'Fractional'}, 'high': {'FeatureName': 'high', 'FeatureType': 'Fractional'}, 'low': {'FeatureName': 'low', 'FeatureType': 'Fractional'}, 'close': {'FeatureName': 'close', 'FeatureType': 'Fractional'}, 'volume': {'FeatureName': 'volume', 'FeatureType': 'Integral'}, 'hour': {'FeatureName': 'hour', 'FeatureType': 'Integral'}, 'minute': {'FeatureName': 'minute', 'FeatureType': 'Integral'}, 'dayofweek_0': {'FeatureName': 'dayofweek_0', 'FeatureType': 'Integral'}, 'dayofweek_1': {'FeatureName': 'dayofweek_1', 'FeatureType': 'Integral'}, 'dayofweek_2': {'FeatureName': 'dayofweek_2', 'FeatureType': 'Integral'}, 'dayofweek_3': {'FeatureName': 'dayofweek_3', 'FeatureType': 'Integral'}, 'dayofweek_4': {'FeatureName': 'dayofweek_4', 'FeatureType': 'Integral'}, 'open_lag_1': {'FeatureName':

In [50]:
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=session,
)

In [155]:
feature_names = [f.feature_name for f in feature_group.feature_definitions]
len(feature_names)

165

In [199]:
query = feature_group.athena_query()
query_string = f"""
SELECT * FROM "{query.table_name}"
"""

output_location = f"s3://{bucket_name}/data/retrieved/"

query.run(query_string=query_string, output_location=output_location)
query.wait()

In [200]:
query_results = query.as_dataframe()
len(query_results), len(df)

(7627, 7623)

In [167]:
query_results.head()

Unnamed: 0,write_time,api_invocation_time,is_deleted,datetime,open,high,low,close,volume,hour,...,high_lag_29,low_lag_29,volume_lag_29,open_lag_30,close_lag_30,high_lag_30,low_lag_30,volume_lag_30,close_target,type
0,2024-06-20 22:44:15.578000 UTC,2024-06-20 22:39:17.000000 UTC,False,2024-06-07T09:34:00.000000Z,5335.959961,5336.040039,5331.330078,5331.47998,6560401,9,...,5350.830078,5348.97998,4825000.0,5352.319824,5350.580078,5352.540039,5350.52002,4934000.0,1,train
1,2024-06-20 22:44:15.578000 UTC,2024-06-20 22:39:18.000000 UTC,False,2024-06-07T09:49:00.000000Z,5343.529785,5345.830078,5343.529785,5345.810059,3962175,9,...,5353.700195,5351.990234,12785000.0,5352.189941,5351.97998,5353.680176,5351.899902,8510000.0,1,train
2,2024-06-20 22:44:15.578000 UTC,2024-06-20 22:39:18.000000 UTC,False,2024-06-07T10:07:00.000000Z,5349.080078,5349.649902,5348.350098,5349.649902,3982549,10,...,5336.069824,5333.509766,4595323.0,5336.319824,5335.410156,5336.560059,5335.009766,5202327.0,0,train
3,2024-06-20 22:39:14.416000 UTC,2024-06-20 22:37:15.000000 UTC,False,2024-05-28T09:41:00.000000Z,5311.689941,5312.640137,5310.02002,5311.680176,6481790,9,...,5302.680176,5300.439941,5286000.0,5302.419922,5302.689941,5302.919922,5302.330078,4506000.0,0,train
4,2024-06-20 22:39:14.416000 UTC,2024-06-20 22:37:16.000000 UTC,False,2024-05-28T10:11:00.000000Z,5306.450195,5306.660156,5305.509766,5305.509766,4847915,10,...,5312.560059,5310.390137,6801446.0,5311.689941,5311.680176,5312.640137,5310.02002,6481790.0,0,train


In [55]:
!aws s3 ls s3://team1-index-predictor-bucket/data/retrieved/

2024-06-20 20:43:27   33455213 ccac4477-2b47-437d-beb7-e35d5d3ba7fb.csv
2024-06-20 20:43:28       8406 ccac4477-2b47-437d-beb7-e35d5d3ba7fb.csv.metadata


In [56]:
!aws s3 cp s3://team1-index-predictor-bucket/data/retrieved/ccac4477-2b47-437d-beb7-e35d5d3ba7fb.csv.metadata ../data/retrieved/metadata.csv.metadata

download: s3://team1-index-predictor-bucket/data/retrieved/ccac4477-2b47-437d-beb7-e35d5d3ba7fb.csv.metadata to ../data/retrieved/metadata.csv.metadata


In [None]:
!aws s3 cp s3://team1-index-predictor-bucket/data/retrieved/ccac4477-2b47-437d-beb7-e35d5d3ba7fb.csv ../data/retrieved/metadata.csv

In [89]:
query_results.to_csv("../data/retrieved/data_0.csv")