In [3]:
from sagemaker import get_execution_role

# Look up the execution role through the SageMaker SDK helper and keep it in `role`.
# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
# Echo the resolved role so it can be checked before any resources are created.
print(role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
arn:aws:iam::676076160400:role/LabRole


## Ingesting Data Into Feature Store

In [6]:
from sagemaker import get_execution_role
import pandas as pd
import time
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session


In [7]:
# Load the cleaned data
cleaned_data_path = 'cleaned_data_with_features.csv'
data_cleaned = pd.read_csv(cleaned_data_path)

# Print the column names to verify
print("Columns in DataFrame:", data_cleaned.columns)

# Add 'event_time' column with the current timestamp in ISO-8601 format.
# The format string ends in a literal 'Z' (UTC designator), so the timestamp must
# really be taken in UTC: pd.to_datetime('now') yields *local* time on pandas >= 2.0.
data_cleaned['event_time'] = pd.Timestamp.now(tz='UTC').strftime('%Y-%m-%dT%H:%M:%SZ')

# Ensure 'reviews.numHelpful' is numeric, fill NaN with 0, and cap values to fit
# within the range of a 64-bit signed integer.
max_int64 = 9223372036854775807  # 64-bit signed integer max value
if 'reviews.numHelpful' in data_cleaned.columns:
    helpful_votes = pd.to_numeric(data_cleaned['reviews.numHelpful'], errors='coerce').fillna(0)
    # The cap has to be decided *before* the integer cast: once a value is int64 it
    # can no longer exceed max_int64, and an out-of-range float cast yields garbage.
    # float(max_int64) rounds to exactly 2**63, the first value that does not fit.
    too_large = helpful_votes >= float(max_int64)
    data_cleaned['reviews.numHelpful'] = (
        helpful_votes
        .mask(too_large, 0)           # placeholder so the cast below cannot overflow
        .astype('int64')              # explicit width; plain `int` is platform-dependent
        .mask(too_large, max_int64)   # put the cap back on the overflowing rows
    )

# Rename the columns to conform to AWS constraints: every '.' and '-' in a column
# name becomes '_' (e.g. 'reviews.rating' -> 'reviews_rating'). Names without those
# characters, such as 'primaryCategories' and 'event_time', are left unchanged.
data_cleaned.columns = [col.replace('.', '_').replace('-', '_') for col in data_cleaned.columns]


Columns in DataFrame: Index(['id', 'asins', 'brand', 'categories', 'dateAdded', 'dateUpdated',
       'imageURLs', 'keys', 'manufacturerNumber', 'name', 'primaryCategories',
       'reviews.date', 'reviews.dateSeen', 'reviews.doRecommend',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs',
       'upc', 'weight', 'processed_reviews', 'review_length', 'sentiment',
       'word_count', 'num_reviews'],
      dtype='object')


In [8]:
# Open the SageMaker session used by every Feature Store call below
sagemaker_session = Session()

# Feature group naming: the epoch-seconds suffix gives each run its own group name
unique_suffix = str(int(time.time()))
new_feature_group_name = 'electronics_reviews_feature_group_' + unique_suffix

# Columns acting as the record identifier and the event timestamp
record_identifier_name = 'id'
event_time_feature_name = 'event_time'

# S3 URI handed to the feature group when it is created
s3_uri = 's3://electronics-dataset/feature-store/'

# Schema table: one (feature name, feature type) pair per column, in schema order
_STRING = FeatureTypeEnum.STRING
_INTEGRAL = FeatureTypeEnum.INTEGRAL
_FRACTIONAL = FeatureTypeEnum.FRACTIONAL

_feature_schema = [
    ('id', _STRING),
    ('asins', _STRING),
    ('brand', _STRING),
    ('categories', _STRING),
    ('dateAdded', _STRING),
    ('dateUpdated', _STRING),
    ('imageURLs', _STRING),
    ('keys', _STRING),
    ('manufacturerNumber', _STRING),
    ('name', _STRING),
    ('primaryCategories', _STRING),
    ('reviews_date', _STRING),
    ('reviews_dateSeen', _STRING),
    ('reviews_doRecommend', _STRING),
    ('reviews_numHelpful', _INTEGRAL),
    ('reviews_rating', _FRACTIONAL),
    ('reviews_sourceURLs', _STRING),
    ('reviews_text', _STRING),
    ('reviews_title', _STRING),
    ('reviews_username', _STRING),
    ('sourceURLs', _STRING),
    ('upc', _STRING),
    ('weight', _STRING),
    ('processed_reviews', _STRING),
    ('sentiment', _STRING),
    ('word_count', _INTEGRAL),
    ('num_reviews', _INTEGRAL),
    ('review_length', _INTEGRAL),
    ('event_time', _STRING),
]

# Expand the table into the FeatureDefinition objects the SDK expects
feature_definitions = [
    FeatureDefinition(feature_name=column, feature_type=kind)
    for column, kind in _feature_schema
]

# Delete existing feature group if it exists.
# This step is best-effort: a failure here is reported but never stops the notebook.
try:
    feature_group = FeatureGroup(name=new_feature_group_name, sagemaker_session=sagemaker_session)
    feature_group.delete()
    print(f"Deleted existing feature group: {new_feature_group_name}")
except Exception as e:
    # AWS client errors carry the service error code under response["Error"]["Code"].
    # Only "ResourceNotFound" actually means there was nothing to delete; any other
    # failure (permissions, throttling, network, ...) is reported as such instead of
    # being mislabelled as a missing feature group.
    response = getattr(e, "response", None)
    error_code = response.get("Error", {}).get("Code") if isinstance(response, dict) else None
    if error_code == "ResourceNotFound":
        print(f"No existing feature group to delete: {e}")
    else:
        print(f"Could not delete feature group {new_feature_group_name}: {e}")


No existing feature group to delete: An error occurred (ResourceNotFound) when calling the DeleteFeatureGroup operation: Resource Not Found: Amazon SageMaker can't find a FeatureGroup with name electronics_reviews_feature_group_1719844110


In [9]:
# Create the new feature group
feature_group = FeatureGroup(
    name=new_feature_group_name,
    feature_definitions=feature_definitions,
    sagemaker_session=sagemaker_session
)

# Create the feature group in SageMaker.
# Use the execution role resolved at the top of the notebook rather than a
# hard-coded ARN, so the notebook is not tied to a single AWS account.
feature_group.create(
    s3_uri=s3_uri,
    record_identifier_name=record_identifier_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role
)

# Wait for the feature group to be active.
# Poll only while the group is still "Creating": waiting for "Created" alone would
# loop forever if creation ends in a terminal failure state such as "CreateFailed".
description = feature_group.describe()
while description.get("FeatureGroupStatus") == "Creating":
    print("Waiting for feature group creation...")
    time.sleep(5)
    description = feature_group.describe()

# Anything other than "Created" at this point means creation did not succeed;
# stop here with the reported reason instead of attempting to ingest.
final_status = description.get("FeatureGroupStatus")
if final_status != "Created":
    failure_reason = description.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Feature group {new_feature_group_name} ended in status {final_status!r}: {failure_reason}"
    )

# Ingest the data into the feature store (blocks until all workers finish)
feature_group.ingest(data_frame=data_cleaned, max_workers=3, wait=True)
print("Data ingestion completed.")

Waiting for feature group creation...
Waiting for feature group creation...
Waiting for feature group creation...
Waiting for feature group creation...
Data ingestion completed.
