# AAI-540 Group 6 Project
# VenueSignal

# 4. Feature Store

In [1]:
!python --version

Python 3.12.9


In [2]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
from datetime import datetime
import time
from time import gmtime, strftime, sleep

# Check boto3 version
print(f"boto3 version: {boto3.__version__}")
print(f"sagemaker version: {sagemaker.__version__}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
boto3 version: 1.42.34
sagemaker version: 2.245.0


In [3]:
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role

# Configure region and session
REGION = "us-east-1"

boto_session = boto3.Session(region_name=REGION)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=REGION)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=REGION
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Get execution role
role = get_execution_role()
print(f"SageMaker Role: {role}")

SageMaker Role: arn:aws:iam::297628177412:role/LabRole


In [21]:
# Your existing Athena bucket
ATHENA_BUCKET = "yelp-aai540-group6-athena-297628177412"
ATHENA_DB = "yelp"

# FeatureStore configuration
FEATURE_STORE_PREFIX = "feature-store"
FEATURE_STORE_OFFLINE_PREFIX = f"{FEATURE_STORE_PREFIX}/offline-store"

# S3 clients
s3_client = boto_session.client("s3")
athena_client = boto_session.client("athena")

print(f"Athena Bucket: {ATHENA_BUCKET}")
print(f"Feature Store Prefix: {FEATURE_STORE_PREFIX}")
print(f"Athena Database: {ATHENA_DB}")

Athena Bucket: yelp-aai540-group6-athena-297628177412
Feature Store Prefix: feature-store
Athena Database: yelp


In [5]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor

# Athena connection
athena_results_path = f"s3://{ATHENA_BUCKET}/athena-results/"

conn = connect(
    s3_staging_dir=athena_results_path,
    region_name=REGION,
    cursor_class=PandasCursor
)

In [6]:
# Query business data with key features
business_query = f"""
SELECT 
    b.business_id,
    b.name,
    b.city,
    b.state,
    b.latitude,
    b.longitude,
    b.stars,
    b.review_count,
    b.is_open,
    b.categories
FROM {ATHENA_DB}.business b
WHERE b.is_open = 1
LIMIT 10000
"""

print("Fetching business data from Athena...")
business_df = pd.read_sql(business_query, conn)
print(f"Loaded {len(business_df)} businesses")
business_df.head()

Fetching business data from Athena...


  business_df = pd.read_sql(business_query, conn)


‚úÖ Loaded 10000 businesses


Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,review_count,is_open,categories
0,nHUJCK-Ek_lAVTSyEz9GEQ,Adventure Coast Fun Park,Spring Hill,FL,28.435263,-82.566966,4.5,10,1,"American (New), Restaurants, Event Planning & ..."
1,xHspDrdyq1g27yRDezB5yA,Kaiserman JCC,Philadelphia,PA,39.982542,-75.268542,3.5,11,1,"Preschools, Recreation Centers, Education, Gym..."
2,XNBExNkAx5fjYM6fjqfzAA,Armory Park,Tucson,AZ,32.21943,-110.968498,3.5,6,1,"Parks, Active Life"
3,1p15mHF9dFRBYH_VKcdDRg,Bill's European Autoworks,Spring HIll,FL,28.434833,-82.542742,4.5,9,1,"Automotive, Auto Repair, Oil Change Stations, ..."
4,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,Edmonton,AB,53.494577,-113.517946,4.0,57,1,"Restaurants, Sushi Bars, Japanese"


In [7]:
# Query business attributes with parking features
attributes_query = f"""
SELECT 
    ba.business_id,
    ba.parking_garage,
    ba.parking_street,
    ba.parking_lot,
    ba.parking_valet,
    ba.parking_validated,
    ba.wifi,
    ba.alcohol,
    ba.noiselevel,
    ba.restaurantspricerange2,
    ba.outdoorseating,
    ba.restaurantsdelivery,
    ba.restaurantstakeout,
    ba.goodforkids,
    ba.restaurantsgoodforgroups,
    ba.ambience_casual,
    ba.ambience_trendy,
    ba.ambience_upscale,
    ba.open_days_count,
    ba.open_on_weekend
FROM {ATHENA_DB}.business_attributes ba
LIMIT 10000
"""

print("Fetching business attributes from Athena...")
attributes_df = pd.read_sql(attributes_query, conn)
print(f"Loaded {len(attributes_df)} business attribute records")
attributes_df.head()

Fetching business attributes from Athena...


  attributes_df = pd.read_sql(attributes_query, conn)


‚úÖ Loaded 10000 business attribute records


Unnamed: 0,business_id,parking_garage,parking_street,parking_lot,parking_valet,parking_validated,wifi,alcohol,noiselevel,restaurantspricerange2,outdoorseating,restaurantsdelivery,restaurantstakeout,goodforkids,restaurantsgoodforgroups,ambience_casual,ambience_trendy,ambience_upscale,open_days_count,open_on_weekend
0,nHUJCK-Ek_lAVTSyEz9GEQ,,,,,,no,,,,,,,True,,,,,7.0,True
1,xHspDrdyq1g27yRDezB5yA,,,,,,,,,,,,,True,,,,,7.0,True
2,HCbJPXWXvwN-C7XfmVy3gA,True,True,False,False,False,free,full_bar,average,2.0,True,,True,True,True,True,False,False,7.0,True
3,XNBExNkAx5fjYM6fjqfzAA,False,False,False,False,False,,,,,,,,True,,,,,,False
4,1p15mHF9dFRBYH_VKcdDRg,,,,,,free,,,,,,,,,,,,5.0,False


In [8]:
# Query aggregated review statistics per business
review_stats_query = f"""
SELECT 
    business_id,
    COUNT(*) as total_reviews,
    AVG(stars) as avg_review_stars,
    STDDEV(stars) as stddev_review_stars,
    SUM(useful) as total_useful_votes,
    SUM(funny) as total_funny_votes,
    SUM(cool) as total_cool_votes,
    COUNT(DISTINCT user_id) as unique_reviewers
FROM {ATHENA_DB}.review
GROUP BY business_id
HAVING COUNT(*) >= 5
LIMIT 10000
"""

print("Calculating review statistics from Athena...")
review_stats_df = pd.read_sql(review_stats_query, conn)
print(f"Calculated stats for {len(review_stats_df)} businesses")
review_stats_df.head()

Calculating review statistics from Athena...


  review_stats_df = pd.read_sql(review_stats_query, conn)


‚úÖ Calculated stats for 10000 businesses


Unnamed: 0,business_id,total_reviews,avg_review_stars,stddev_review_stars,total_useful_votes,total_funny_votes,total_cool_votes,unique_reviewers
0,bqR_3sT_rNp0hZglPtX6rw,594,4.181818,1.130867,416,162,237,578
1,quCIR7UcrMmpaKXYwet8pw,352,4.298295,1.050951,380,53,276,339
2,XyGRDrgCK0z4CiA6nhwEaw,268,4.354478,1.13065,222,49,106,258
3,PVVFos1LDfD7iETY0w4vaA,441,4.129252,1.067743,336,114,179,417
4,KMF17foSqysDXLZKpiPj9A,101,2.465347,1.48704,230,69,51,101


## Feature Engineering

In [9]:
# Merge business data with attributes
features_df = business_df.merge(attributes_df, on='business_id', how='left')

# Merge with review statistics
features_df = features_df.merge(review_stats_df, on='business_id', how='left')

print(f" Combined dataset shape: {features_df.shape}")
features_df.head()

 Combined dataset shape: (10000, 36)


Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,review_count,is_open,categories,...,ambience_upscale,open_days_count,open_on_weekend,total_reviews,avg_review_stars,stddev_review_stars,total_useful_votes,total_funny_votes,total_cool_votes,unique_reviewers
0,nHUJCK-Ek_lAVTSyEz9GEQ,Adventure Coast Fun Park,Spring Hill,FL,28.435263,-82.566966,4.5,10,1,"American (New), Restaurants, Event Planning & ...",...,,7.0,True,,,,,,,
1,xHspDrdyq1g27yRDezB5yA,Kaiserman JCC,Philadelphia,PA,39.982542,-75.268542,3.5,11,1,"Preschools, Recreation Centers, Education, Gym...",...,,7.0,True,,,,,,,
2,XNBExNkAx5fjYM6fjqfzAA,Armory Park,Tucson,AZ,32.21943,-110.968498,3.5,6,1,"Parks, Active Life",...,,,False,,,,,,,
3,1p15mHF9dFRBYH_VKcdDRg,Bill's European Autoworks,Spring HIll,FL,28.434833,-82.542742,4.5,9,1,"Automotive, Auto Repair, Oil Change Stations, ...",...,,5.0,False,,,,,,,
4,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,Edmonton,AB,53.494577,-113.517946,4.0,57,1,"Restaurants, Sushi Bars, Japanese",...,False,6.0,True,,,,,,,


In [11]:
# Feature Engineering

# 1. Parking Score: Aggregate parking availability
def calculate_parking_score(row):
    """Calculate a parking availability score (0-5)"""
    score = 0
    if pd.notna(row['parking_lot']) and row['parking_lot']:
        score += 2
    if pd.notna(row['parking_garage']) and row['parking_garage']:
        score += 2
    if pd.notna(row['parking_street']) and row['parking_street']:
        score += 1
    if pd.notna(row['parking_valet']) and row['parking_valet']:
        score += 1
    if pd.notna(row['parking_validated']) and row['parking_validated']:
        score += 1
    return min(score, 5)  # Cap at 5

features_df['parking_availability_score'] = features_df.apply(calculate_parking_score, axis=1)

# 2. Has Parking (binary)
features_df['has_parking'] = (features_df['parking_availability_score'] > 0).astype(int)

# 3. Price Range (convert to numeric, handle missing)
features_df['price_range'] = pd.to_numeric(features_df['restaurantspricerange2'], errors='coerce').fillna(2.0)

# 4. Review Engagement Score
features_df['review_engagement'] = (
    features_df['total_useful_votes'].fillna(0) + 
    features_df['total_funny_votes'].fillna(0) + 
    features_df['total_cool_votes'].fillna(0)
) / features_df['total_reviews'].fillna(1)

# 5. Rating Consistency (inverse of std dev)
features_df['rating_consistency'] = 1 / (features_df['stddev_review_stars'].fillna(0.5) + 0.1)

# 6. Is Urban (based on city categorization - simplified)
urban_cities = ['Las Vegas', 'Philadelphia', 'Phoenix', 'Charlotte', 'Toronto', 'Tampa']
features_df['is_urban'] = features_df['city'].isin(urban_cities).astype(int)

# 7. Category features
features_df['is_restaurant'] = features_df['categories'].fillna('').str.contains('Restaurant', case=False, na=False).astype(int)
features_df['is_food_service'] = features_df['categories'].fillna('').str.contains('Food|Restaurant|Bar|Cafe', case=False, na=False).astype(int)

# 8. Operational features
features_df['open_days_count'] = pd.to_numeric(features_df['open_days_count'], errors='coerce').fillna(0).astype(int)

# Convert string boolean to int for open_on_weekend
if 'open_on_weekend' in features_df.columns:
    features_df['open_on_weekend'] = features_df['open_on_weekend'].map({
        'true': 1, 'True': 1, True: 1, 1: 1,
        'false': 0, 'False': 0, False: 0, 0: 0,
        None: 0, '': 0
    }).fillna(0).astype(int)

# Fill remaining NaNs with appropriate defaults
numeric_cols = features_df.select_dtypes(include=[np.number]).columns
features_df[numeric_cols] = features_df[numeric_cols].fillna(0)

# Boolean columns to int - handle Athena's string booleans
bool_cols = ['parking_garage', 'parking_street', 'parking_lot', 'parking_valet', 
             'parking_validated', 'goodforkids', 'restaurantsgoodforgroups',
             'ambience_casual', 'ambience_trendy', 'ambience_upscale']

def convert_bool_to_int(series):
    """Convert various boolean representations to integer"""
    return series.map({
        'true': 1, 'True': 1, True: 1, 1: 1,
        'false': 0, 'False': 0, False: 0, 0: 0,
        None: 0, '': 0
    }).fillna(0).astype(int)

for col in bool_cols:
    if col in features_df.columns:
        features_df[col] = convert_bool_to_int(features_df[col])

print("Feature engineering complete")
print(f"Total features: {features_df.shape[1]}")
features_df[['business_id', 'name', 'parking_availability_score', 'has_parking', 
             'review_engagement', 'rating_consistency', 'is_urban']].head(10)

‚úÖ Feature engineering complete
Total features: 44


Unnamed: 0,business_id,name,parking_availability_score,has_parking,review_engagement,rating_consistency,is_urban
0,nHUJCK-Ek_lAVTSyEz9GEQ,Adventure Coast Fun Park,0,0,0.0,10.0,0
1,xHspDrdyq1g27yRDezB5yA,Kaiserman JCC,0,0,0.0,10.0,1
2,XNBExNkAx5fjYM6fjqfzAA,Armory Park,0,0,0.0,10.0,0
3,1p15mHF9dFRBYH_VKcdDRg,Bill's European Autoworks,0,0,0.0,10.0,0
4,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,2,1,0.0,10.0,0
5,SBgr-5n-kV3EeZztYsstUQ,Locust Lane Craft Brewery,2,1,0.0,10.0,0
6,wR8V8u3_wv-yjR-CIHyY0Q,The Collins Apartments,0,0,0.0,10.0,1
7,_CFqjUDwqpfKlsbl01KQ8Q,Disc Replay,0,0,0.0,10.0,0
8,v_C_FZW_rPsqHdBEY_O4kQ,Shapiro Supply,0,0,0.0,10.0,0
9,O4oK6A5wJuVWpYP4IMgUqw,Journeys Shoes,0,0,0.0,10.0,1


## Prepare the Data for the Feature Store

In [12]:
# Select features for Feature Store
feature_columns = [
    'business_id',
    'name',
    'city',
    'state',
    'latitude',
    'longitude',
    'stars',
    'review_count',
    'is_open',
    'parking_availability_score',
    'has_parking',
    'parking_garage',
    'parking_street',
    'parking_lot',
    'parking_valet',
    'price_range',
    'total_reviews',
    'avg_review_stars',
    'review_engagement',
    'rating_consistency',
    'is_urban',
    'is_restaurant',
    'is_food_service',
    'open_days_count',
    'open_on_weekend',
    'goodforkids',
    'restaurantsgoodforgroups',
    'ambience_casual',
    'ambience_trendy',
    'ambience_upscale',
]

feature_store_df = features_df[feature_columns].copy()

# Convert object types to string
def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype('str').astype('string')

cast_object_to_string(feature_store_df)

# Add EventTime (required by Feature Store)
current_time_sec = int(round(time.time()))
feature_store_df['event_time'] = pd.Series([float(current_time_sec)] * len(feature_store_df), dtype='float64')

# Ensure business_id is string type
feature_store_df['business_id'] = feature_store_df['business_id'].astype('string')

print(f"Prepared {len(feature_store_df)} records for Feature Store")
print(f"Data types:\n{feature_store_df.dtypes}")
feature_store_df.head()

‚úÖ Prepared 10000 records for Feature Store
üìã Data types:
business_id                   string[python]
name                          string[python]
city                          string[python]
state                         string[python]
latitude                             float64
longitude                            float64
stars                                float64
review_count                           int64
is_open                                int64
parking_availability_score             int64
has_parking                            int64
parking_garage                         int64
parking_street                         int64
parking_lot                            int64
parking_valet                          int64
price_range                          float64
total_reviews                        float64
avg_review_stars                     float64
review_engagement                    float64
rating_consistency                   float64
is_urban                              

Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,review_count,is_open,parking_availability_score,...,is_restaurant,is_food_service,open_days_count,open_on_weekend,goodforkids,restaurantsgoodforgroups,ambience_casual,ambience_trendy,ambience_upscale,event_time
0,nHUJCK-Ek_lAVTSyEz9GEQ,Adventure Coast Fun Park,Spring Hill,FL,28.435263,-82.566966,4.5,10,1,0,...,1,1,7,1,1,0,0,0,0,1769396000.0
1,xHspDrdyq1g27yRDezB5yA,Kaiserman JCC,Philadelphia,PA,39.982542,-75.268542,3.5,11,1,0,...,0,0,7,1,1,0,0,0,0,1769396000.0
2,XNBExNkAx5fjYM6fjqfzAA,Armory Park,Tucson,AZ,32.21943,-110.968498,3.5,6,1,0,...,0,0,0,0,1,0,0,0,0,1769396000.0
3,1p15mHF9dFRBYH_VKcdDRg,Bill's European Autoworks,Spring HIll,FL,28.434833,-82.542742,4.5,9,1,0,...,0,0,5,0,0,0,0,0,0,1769396000.0
4,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,Edmonton,AB,53.494577,-113.517946,4.0,57,1,2,...,1,1,6,1,1,1,1,0,0,1769396000.0


In [14]:
# Create FeatureGroup name with timestamp
feature_group_name = f"venuesignal-business-features-{strftime('%d-%H-%M-%S', gmtime())}"

print(f"Feature Group Name: {feature_group_name}")

Feature Group Name: venuesignal-business-features-26-02-57-35


In [15]:
# Initialize FeatureGroup
business_feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session
)

# Load feature definitions from DataFrame
business_feature_group.load_feature_definitions(data_frame=feature_store_df)

print("Feature definitions loaded")
print(f"Total features: {len(business_feature_group.feature_definitions)}")

Feature definitions loaded
Total features: 31


In [16]:
# Create FeatureGroup in SageMaker
print(f"Creating FeatureGroup: {feature_group_name}")

business_feature_group.create(
    s3_uri=f"s3://{ATHENA_BUCKET}/{FEATURE_STORE_OFFLINE_PREFIX}",
    record_identifier_name="business_id",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True
)

print("Waiting for FeatureGroup creation to complete.")

def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("  ... still creating")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

wait_for_feature_group_creation_complete(business_feature_group)

Creating FeatureGroup: venuesignal-business-features-26-02-57-35
‚è≥ Waiting for FeatureGroup creation to complete.
  ... still creating
  ... still creating
  ... still creating
  ... still creating
‚úÖ FeatureGroup venuesignal-business-features-26-02-57-35 successfully created.


In [17]:
# Verify FeatureGroup was created
business_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:297628177412:feature-group/venuesignal-business-features-26-02-57-35',
 'FeatureGroupName': 'venuesignal-business-features-26-02-57-35',
 'RecordIdentifierFeatureName': 'business_id',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'business_id',
   'FeatureType': 'String'},
  {'FeatureName': 'name', 'FeatureType': 'String'},
  {'FeatureName': 'city', 'FeatureType': 'String'},
  {'FeatureName': 'state', 'FeatureType': 'String'},
  {'FeatureName': 'latitude', 'FeatureType': 'Fractional'},
  {'FeatureName': 'longitude', 'FeatureType': 'Fractional'},
  {'FeatureName': 'stars', 'FeatureType': 'Fractional'},
  {'FeatureName': 'review_count', 'FeatureType': 'Integral'},
  {'FeatureName': 'is_open', 'FeatureType': 'Integral'},
  {'FeatureName': 'parking_availability_score', 'FeatureType': 'Integral'},
  {'FeatureName': 'has_parking', 'FeatureType': 'Integral'},
  {'FeatureName': 'parking_garage', 'FeatureType': 'I

In [18]:
# Ingest data into FeatureGroup
print(f"Ingesting {len(feature_store_df)} records into FeatureGroup")

business_feature_group.ingest(
    data_frame=feature_store_df,
    max_workers=3,
    wait=True
)

print("Data ingestion complete!")

Ingesting 10000 records into FeatureGroup
Data ingestion complete!


## Verify the Data Ingestion

In [19]:
# Get a sample record from online store
sample_business_id = str(feature_store_df['business_id'].iloc[0])

print(f"Fetching record for business_id: {sample_business_id}")

record_response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group_name,
    RecordIdentifierValueAsString=sample_business_id
)

print(f"Retrieved record:")
for feature in record_response['Record']:
    print(f"  {feature['FeatureName']}: {feature['ValueAsString']}")

Fetching record for business_id: nHUJCK-Ek_lAVTSyEz9GEQ
Retrieved record:
  business_id: nHUJCK-Ek_lAVTSyEz9GEQ
  name: Adventure Coast Fun Park
  city: Spring Hill
  state: FL
  latitude: 28.4352625
  longitude: -82.5669664
  stars: 4.5
  review_count: 10
  is_open: 1
  parking_availability_score: 0
  has_parking: 0
  parking_garage: 0
  parking_street: 0
  parking_lot: 0
  parking_valet: 0
  price_range: 2.0
  total_reviews: 0.0
  avg_review_stars: 0.0
  review_engagement: 0.0
  rating_consistency: 10.0
  is_urban: 0
  is_restaurant: 1
  is_food_service: 1
  open_days_count: 7
  open_on_weekend: 1
  goodforkids: 1
  restaurantsgoodforgroups: 0
  ambience_casual: 0
  ambience_trendy: 0
  ambience_upscale: 0
  event_time: 1769396225.0


In [20]:
# Wait for offline store to be populated
print("Waiting for data to appear in offline store")

offline_store_s3_uri = business_feature_group.describe().get("OfflineStoreConfig").get("S3StorageConfig").get("ResolvedOutputS3Uri")
offline_store_prefix = offline_store_s3_uri.replace(f"s3://{ATHENA_BUCKET}/", "")

print(f"Offline store location: {offline_store_s3_uri}")

offline_store_contents = None
max_wait_minutes = 10
wait_count = 0

while offline_store_contents is None and wait_count < max_wait_minutes:
    objects_in_bucket = s3_client.list_objects(
        Bucket=ATHENA_BUCKET,
        Prefix=offline_store_prefix
    )
    if 'Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1:
        offline_store_contents = objects_in_bucket['Contents']
        print(f"‚úÖ Data available in offline store! ({len(offline_store_contents)} objects)")
    else:
        wait_count += 1
        print(f"  ... waiting ({wait_count}/{max_wait_minutes} minutes)")
        sleep(60)

if offline_store_contents is None:
    print("Offline store data not yet available. Continue anyway - it will appear shortly.")
else:
    print(f"Offline store ready with {len(offline_store_contents)} files")

Waiting for data to appear in offline store
Offline store location: s3://yelp-aai540-group6-athena-297628177412/feature-store/offline-store/297628177412/sagemaker/us-east-1/offline-store/venuesignal-business-features-26-02-57-35-1769396281/data
  ... waiting (1/10 minutes)
  ... waiting (2/10 minutes)
  ... waiting (3/10 minutes)
‚úÖ Data available in offline store! (54 objects)
Offline store ready with 54 files


In [22]:
# Generate Hive DDL for the FeatureGroup
print("üìã Hive DDL for FeatureGroup:\n")
print(business_feature_group.as_hive_ddl())

üìã Hive DDL for FeatureGroup:

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.venuesignal-business-features-26-02-57-35 (
  business_id STRING
  name STRING
  city STRING
  state STRING
  latitude FLOAT
  longitude FLOAT
  stars FLOAT
  review_count INT
  is_open INT
  parking_availability_score INT
  has_parking INT
  parking_garage INT
  parking_street INT
  parking_lot INT
  parking_valet INT
  price_range FLOAT
  total_reviews FLOAT
  avg_review_stars FLOAT
  review_engagement FLOAT
  rating_consistency FLOAT
  is_urban INT
  is_restaurant INT
  is_food_service INT
  open_days_count INT
  open_on_weekend INT
  goodforkids INT
  restaurantsgoodforgroups INT
  ambience_casual INT
  ambience_trendy INT
  ambience_upscale INT
  event_time FLOAT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMA

In [23]:
# Query offline store using the FeatureGroup's Athena query interface
athena_query = business_feature_group.athena_query()

# Get the table name
table_name = athena_query.table_name
print(f"Athena table name: {table_name}")

Athena table name: venuesignal_business_features_26_02_57_35_1769396281


In [24]:
# Build a training dataset query
# Select businesses with good parking vs poor parking and their ratings

training_query = f"""
SELECT 
    business_id,
    stars,
    review_count,
    parking_availability_score,
    has_parking,
    price_range,
    total_reviews,
    avg_review_stars,
    review_engagement,
    rating_consistency,
    is_urban,
    is_restaurant,
    open_days_count,
    open_on_weekend,
    goodforkids,
    restaurantsgoodforgroups,
    CASE 
        WHEN parking_availability_score >= 3 THEN 1 
        ELSE 0 
    END as has_good_parking,
    CASE
        WHEN avg_review_stars >= 4.0 THEN 1
        ELSE 0
    END as is_highly_rated
FROM "{table_name}"
WHERE total_reviews >= 10
    AND is_restaurant = 1
    AND is_urban = 1
LIMIT 5000
"""

print("Running Athena query on offline store...")
print(f"Query: {training_query}\n")

# Execute query
athena_query.run(
    query_string=training_query,
    output_location=f"s3://{ATHENA_BUCKET}/{FEATURE_STORE_PREFIX}/query_results/"
)

print("Waiting for query to complete")
athena_query.wait()

# Load results into DataFrame
training_dataset = athena_query.as_dataframe()

print(f"Training dataset ready: {training_dataset.shape}")
training_dataset.head(10)

Running Athena query on offline store...
Query: 
SELECT 
    business_id,
    stars,
    review_count,
    parking_availability_score,
    has_parking,
    price_range,
    total_reviews,
    avg_review_stars,
    review_engagement,
    rating_consistency,
    is_urban,
    is_restaurant,
    open_days_count,
    open_on_weekend,
    goodforkids,
    restaurantsgoodforgroups,
    CASE 
        WHEN parking_availability_score >= 3 THEN 1 
        ELSE 0 
    END as has_good_parking,
    CASE
        WHEN avg_review_stars >= 4.0 THEN 1
        ELSE 0
    END as is_highly_rated
FROM "venuesignal_business_features_26_02_57_35_1769396281"
WHERE total_reviews >= 10
    AND is_restaurant = 1
    AND is_urban = 1
LIMIT 5000


Waiting for query to complete
Training dataset ready: (30, 18)


Unnamed: 0,business_id,stars,review_count,parking_availability_score,has_parking,price_range,total_reviews,avg_review_stars,review_engagement,rating_consistency,is_urban,is_restaurant,open_days_count,open_on_weekend,goodforkids,restaurantsgoodforgroups,has_good_parking,is_highly_rated
0,_HcAoG6Jj_XTj4-XNEbo6Q,3.0,172,2,1,1.0,175.0,3.205714,1.857143,0.618306,1,1,7,1,1,1,0,0
1,y1HgVnt4K5owmYXC5yXLGg,4.5,138,3,1,2.0,140.0,4.307143,1.492857,0.777099,1,1,7,1,1,1,1,1
2,hc7I_QpCszT3mlIP9uW19w,4.0,102,0,0,2.0,112.0,3.857143,1.1875,0.899157,1,1,0,0,0,0,0,0
3,Zjg5nixJ-sTr9B6tsDzClw,4.0,153,0,0,2.0,156.0,4.115385,2.871795,0.743969,1,1,0,0,0,0,0,1
4,Fi2aJPlbnkeVyRKErn0IBQ,4.0,348,2,1,2.0,366.0,3.800546,1.527322,0.702961,1,1,7,1,1,1,0,0
5,txasGoZ1qB5toVaFiTbpmQ,4.0,88,0,0,2.0,88.0,4.056818,3.772727,0.714995,1,1,7,1,1,1,0,1
6,ZPjokVSTWDyRMWxaW7z_9A,4.0,75,1,1,1.0,81.0,3.950617,1.518519,0.829407,1,1,7,1,1,0,0,0
7,oBhJuukGRqPVvYBfTkhuZA,3.5,385,0,0,2.0,401.0,3.668329,1.361596,0.750889,1,1,0,0,0,0,0,0
8,d5fAUl4lKaNxGfiXj4Kygg,3.5,241,1,1,1.0,251.0,3.701195,1.956175,0.655175,1,1,7,1,1,1,0,0
9,fr2qDm_mY1afIGMvqsKUCg,3.5,50,3,1,1.0,53.0,3.377358,1.188679,0.636715,1,1,7,1,1,1,1,0


In [25]:
# Analyze the training dataset
print("Training Dataset Summary:\n")
print(f"Total records: {len(training_dataset)}")
print(f"\nTarget Distribution (is_highly_rated):")
print(training_dataset['is_highly_rated'].value_counts())
print(f"\nParking Distribution (has_good_parking):")
print(training_dataset['has_good_parking'].value_counts())
print(f"\nCorrelation between parking and rating:")
print(pd.crosstab(training_dataset['has_good_parking'], training_dataset['is_highly_rated'], normalize='index'))

Training Dataset Summary:

Total records: 30

Target Distribution (is_highly_rated):
is_highly_rated
0    19
1    11
Name: count, dtype: int64

Parking Distribution (has_good_parking):
has_good_parking
0    26
1     4
Name: count, dtype: int64

Correlation between parking and rating:
is_highly_rated          0         1
has_good_parking                    
0                 0.692308  0.307692
1                 0.250000  0.750000


In [None]:
# Reproducibility
np.random.seed(12341)

# Create a random number for each row
rand = np.random.rand(len(training_dataset))

# Assign split based on cumulative ratios
training_dataset["split"] = np.select(
    [
        rand < 0.40,
        (rand >= 0.40) & (rand < 0.50),
        (rand >= 0.50) & (rand < 0.60),
        rand >= 0.60
    ],
    ["train", "val", "test", "prod"]
)

# split values check
print(training_dataset["split"].value_counts(normalize=True))

training_dataset.to_csv("./training_data/split_dataset.csv", index=False)

In [26]:
# Save training dataset to S3 for model training
training_data_s3_path = f"s3://{ATHENA_BUCKET}/{FEATURE_STORE_PREFIX}/training_data/venuesignal_training.csv"

# Save without index
training_dataset.to_csv('venuesignal_training.csv', index=False)

# Upload to S3
s3_client.upload_file(
    'venuesignal_training.csv',
    ATHENA_BUCKET,
    f"{FEATURE_STORE_PREFIX}/training_data/venuesignal_training.csv"
)

print(f"Training dataset saved to: {training_data_s3_path}")

Training dataset saved to: s3://yelp-aai540-group6-athena-297628177412/feature-store/training_data/venuesignal_training.csv


## Query the Feature Store

In [27]:
# Simulate real-time feature retrieval for inference
test_business_ids = feature_store_df['business_id'].head(5).tolist()

print("Retrieving features for sample businesses:\n")

for business_id in test_business_ids:
    try:
        record = featurestore_runtime.get_record(
            FeatureGroupName=feature_group_name,
            RecordIdentifierValueAsString=business_id
        )
        
        # Extract key features
        features = {f['FeatureName']: f['ValueAsString'] for f in record['Record']}
        
        print(f"Business: {features.get('name', 'N/A')}")
        print(f"  Location: {features.get('city', 'N/A')}, {features.get('state', 'N/A')}")
        print(f"  Stars: {features.get('stars', 'N/A')} | Reviews: {features.get('total_reviews', 'N/A')}")
        print(f"  Parking Score: {features.get('parking_availability_score', 'N/A')}")
        print(f"  Rating Consistency: {features.get('rating_consistency', 'N/A')}")
        print(f"  Review Engagement: {features.get('review_engagement', 'N/A')}\n")
        
    except Exception as e:
        print(f"Error retrieving {business_id}: {e}\n")

Retrieving features for sample businesses:

Business: Adventure Coast Fun Park
  Location: Spring Hill, FL
  Stars: 4.5 | Reviews: 0.0
  Parking Score: 0
  Rating Consistency: 10.0
  Review Engagement: 0.0

Business: Kaiserman JCC
  Location: Philadelphia, PA
  Stars: 3.5 | Reviews: 0.0
  Parking Score: 0
  Rating Consistency: 10.0
  Review Engagement: 0.0

Business: Armory Park
  Location: Tucson, AZ
  Stars: 3.5 | Reviews: 0.0
  Parking Score: 0
  Rating Consistency: 10.0
  Review Engagement: 0.0

Business: Bill's European Autoworks
  Location: Spring HIll, FL
  Stars: 4.5 | Reviews: 0.0
  Parking Score: 0
  Rating Consistency: 10.0
  Review Engagement: 0.0

Business: Sushi Wasabi
  Location: Edmonton, AB
  Stars: 4.0 | Reviews: 0.0
  Parking Score: 2
  Rating Consistency: 10.0
  Review Engagement: 0.0



In [28]:
# Batch retrieval
print("Batch retrieving features for multiple businesses\n")

batch_response = featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": feature_group_name,
            "RecordIdentifiersValueAsString": test_business_ids[:3]
        }
    ]
)

print(f"Retrieved {len(batch_response['Records'])} records")
print(f"Errors: {len(batch_response.get('Errors', []))}")

# Display first record
if batch_response['Records']:
    first_record = batch_response['Records'][0]
    print(f"\nSample record:")
    for feature in first_record['Record']:
        print(f"  {feature['FeatureName']}: {feature['ValueAsString']}")

Batch retrieving features for multiple businesses

Retrieved 3 records
Errors: 0

Sample record:
  business_id: XNBExNkAx5fjYM6fjqfzAA
  name: Armory Park
  city: Tucson
  state: AZ
  latitude: 32.2194299
  longitude: -110.9684979
  stars: 3.5
  review_count: 6
  is_open: 1
  parking_availability_score: 0
  has_parking: 0
  parking_garage: 0
  parking_street: 0
  parking_lot: 0
  parking_valet: 0
  price_range: 2.0
  total_reviews: 0.0
  avg_review_stars: 0.0
  review_engagement: 0.0
  rating_consistency: 10.0
  is_urban: 0
  is_restaurant: 0
  is_food_service: 0
  open_days_count: 0
  open_on_weekend: 0
  goodforkids: 1
  restaurantsgoodforgroups: 0
  ambience_casual: 0
  ambience_trendy: 0
  ambience_upscale: 0
  event_time: 1769396225.0


## Uncomment to Run

These cells below can be uncommented to clear out the FeatureGroup and S3 FeatureStore data. We need this for the coming weeks as we work on this project.

In [None]:
# # Delete FeatureGroup (uncomment to run)
# business_feature_group.delete()
# print(f"Deleted FeatureGroup: {feature_group_name}")

# # Clean up S3 data (uncomment to run)
# # Warning: This will delete all Feature Store data
# 
# import boto3
# s3 = boto3.resource('s3')
# bucket = s3.Bucket(ATHENA_BUCKET)
# 
# # Delete offline store data
# bucket.objects.filter(Prefix=f"{FEATURE_STORE_OFFLINE_PREFIX}/").delete()
# print(f"Deleted offline store data")
# 
# # Delete query results
# bucket.objects.filter(Prefix=f"{FEATURE_STORE_PREFIX}/query_results/").delete()
# print(f"Deleted query results")


In [None]:
# # This Deletes all feature groups you might have.
# groups = sagemaker_client.list_feature_groups()

# for group in groups["FeatureGroupSummaries"]:
#     sagemaker_client.delete_feature_group(
#         FeatureGroupName=group["FeatureGroupName"]
#     )