# AAI-540 Group 6 Project
# VenueSignal

# 4. Feature Store

In [3]:
!python --version

Python 3.12.9


In [4]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
from datetime import datetime
import time
from time import gmtime, strftime, sleep

# Check boto3 version
print(f"boto3 version: {boto3.__version__}")
print(f"sagemaker version: {sagemaker.__version__}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
boto3 version: 1.37.3
sagemaker version: 2.245.0


In [5]:
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker import get_execution_role

# Configure region and session
REGION = "us-east-1"

boto_session = boto3.Session(region_name=REGION)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=REGION)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=REGION
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Get execution role
role = get_execution_role()
print(f"SageMaker Role: {role}")

SageMaker Role: arn:aws:iam::297628177412:role/LabRole


In [6]:
# Existing Athena bucket
ATHENA_BUCKET = "yelp-aai540-group6-athena-297628177412"
ATHENA_DB = "yelp"

# FeatureStore configuration
FEATURE_STORE_PREFIX = "feature-store"
FEATURE_STORE_OFFLINE_PREFIX = f"{FEATURE_STORE_PREFIX}/offline-store"

# S3 clients
s3_client = boto_session.client("s3")
athena_client = boto_session.client("athena")

print(f"Athena Bucket: {ATHENA_BUCKET}")
print(f"Feature Store Prefix: {FEATURE_STORE_PREFIX}")
print(f"Athena Database: {ATHENA_DB}")

Athena Bucket: yelp-aai540-group6-athena-297628177412
Feature Store Prefix: feature-store
Athena Database: yelp


In [7]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor

# Athena connection
athena_results_path = f"s3://{ATHENA_BUCKET}/athena-results/"

conn = connect(
    s3_staging_dir=athena_results_path,
    region_name=REGION,
    cursor_class=PandasCursor
)

In [8]:
# Load processed business data with parking features (already engineered from EDA notebook)
processed_data_path = f"s3://{ATHENA_BUCKET}/processed-data/business_with_parking.parquet"

print("Loading processed business data with parking features...")
print(f"Source: {processed_data_path}")
business_df = pd.read_parquet(processed_data_path)
print(f"âœ… Loaded {len(business_df)} businesses from processed data")
print(f"Columns: {list(business_df.columns)}")
business_df.head()

Loading processed business data with parking features...
Source: s3://yelp-aai540-group6-athena-297628177412/processed-data/business_with_parking.parquet
âœ… Loaded 21232 businesses from processed data
Columns: ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories', 'parking_garage', 'parking_street', 'parking_validated', 'parking_lot', 'parking_valet', 'price_range', 'restaurantsreservations', 'restaurantstakeout', 'restaurantsdelivery', 'outdoorseating', 'wifi', 'alcohol', 'has_garage', 'has_street', 'has_validated', 'has_lot', 'has_valet', 'parking_types_count', 'has_any_parking']


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,outdoorseating,wifi,alcohol,has_garage,has_street,has_validated,has_lot,has_valet,parking_types_count,has_any_parking
0,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,5714 111 Street NW,Edmonton,AB,T6H 3G1,53.494577,-113.517946,4.0,57,...,False,free,beer_and_wine,False,False,False,True,False,1,True
1,kGjpBqAqG4mIuFSgU2KKEA,The Royal Indian Cuisine,272 S 20th St,Philadelphia,PA,19103,39.948365,-75.1745,4.0,28,...,False,no,,False,True,False,False,False,1,True
2,uKEabBQrn0gLzvTppOrKIA,Casita Taqueria,2701 4th St,Saint Petersburg,FL,33704,27.796976,-82.638168,4.5,207,...,True,no,beer_and_wine,False,False,False,True,False,1,True
3,8JEOmVGQffkr6xtn4vvplQ,Pita Kebob,3028 E College Ave,Ruskin,FL,33570,27.713401,-82.394372,4.0,142,...,True,free,,False,True,False,True,False,2,True
4,rF0xI_3jjlsEKp3N0Z0BuQ,Trattoria Totaro,639 Spring Mill Ave,Conshohocken,PA,19428,40.075052,-75.294523,4.0,135,...,True,no,,False,True,False,False,False,1,True


In [9]:
# Load review data from parquet for text analysis
review_parquet_path = f"s3://{ATHENA_BUCKET}/parquet/review/"

print("Loading review data from parquet for text feature extraction...")
print(f"Source: {review_parquet_path}")
review_df = pd.read_parquet(review_parquet_path)
print(f"âœ… Loaded {len(review_df):,} reviews from parquet data")
print(f"Columns: {list(review_df.columns)}")
print(f"Memory usage: {review_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
review_df.head()

Loading review data from parquet for text feature extraction...
Source: s3://yelp-aai540-group6-athena-297628177412/parquet/review/
âœ… Loaded 6,990,280 reviews from parquet data
Columns: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date', 'year']
Memory usage: 6135.74 MB


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year
0,4uijRkoBeofzTiiZxqY9IA,5mHprbXJYoz-7ACL-xL2eA,agRssYozLGjGEsBZAo1Cbw,1.0,3,3,2,Jesus - this place sucks. This and other place...,2005-09-15 23:32:15,2005
1,tgXlgpUx0KcCIlyU3_dNQQ,3MYdpmHeNwC6FquRWi3YOg,iGMS1iM71HLrZN3V_kqd2g,4.0,0,1,2,"Pretty good place to bowl, went on $0.50 bowli...",2005-06-10 04:49:33,2005
2,G7ZpIMCu12HLlAeIeZJ4tg,uSxeEUPMW1nUJr-5jHYt6Q,RGVyb36fh2M2BTTUYxURLg,4.0,0,0,0,Second visit: the staff was so nice today - I ...,2005-11-13 20:22:56,2005
3,ps1ImlE0Vor3JBg1eWIKBw,n-lBS02-3yvlY5Q91mmwDA,NJTikPQCpas4q7HZInlykQ,3.0,0,0,0,Old school locals bar- a fine example of what ...,2005-07-10 22:06:30,2005
4,24fpHfJygvv9D-G0DjVtDg,ZftKc54UnKJSbuzyEg4SjA,AnFDd2B4rt3J5fiTg3g3wg,5.0,0,0,0,It may be commercial (and I never knew it was ...,2005-04-10 23:19:57,2005


In [10]:
# Calculate review statistics per business (keeping this for businesses not in processed data)
print("Calculating review statistics from review data...")

review_stats_df = review_df.groupby('business_id').agg({
    'stars': ['count', 'mean', 'std'],
    'useful': 'sum',
    'funny': 'sum',
    'cool': 'sum',
    'user_id': 'nunique'
}).reset_index()

# Flatten column names
review_stats_df.columns = ['business_id', 'total_reviews', 'avg_review_stars', 'stddev_review_stars',
                           'total_useful_votes', 'total_funny_votes', 'total_cool_votes', 'unique_reviewers']

# Filter to businesses with at least 5 reviews
review_stats_df = review_stats_df[review_stats_df['total_reviews'] >= 5]

print(f"âœ… Calculated stats for {len(review_stats_df):,} businesses")
review_stats_df.head()

Calculating review statistics from review data...
âœ… Calculated stats for 150,346 businesses


Unnamed: 0,business_id,total_reviews,avg_review_stars,stddev_review_stars,total_useful_votes,total_funny_votes,total_cool_votes,unique_reviewers
0,---kPU91CF4Lq2-WlRu9Lw,24,4.5,0.978019,16,1,13,24
1,--0iUa4sNDFiZFrAdIWhZQ,14,3.214286,1.368805,25,3,4,14
2,--30_8IhuyMHbSOcNWd6DQ,9,3.555556,1.943651,7,2,0,9
3,--7PUidqRWpRSpXebiyxTg,12,1.75,1.05529,9,2,0,12
4,--7jw19RH9JKXgFohspgQw,13,4.230769,1.535895,16,0,0,13


In [11]:
# Extract text features from reviews
print("Extracting parking-related text features from reviews...")
print("This may take a few minutes for large datasets...")

def extract_parking_text_features(business_reviews):
    """Extract parking-related features from review text"""
    if business_reviews.empty or 'text' not in business_reviews.columns:
        return pd.Series({
            'parking_mentions': 0,
            'parking_positive_sentiment': 0,
            'parking_negative_sentiment': 0,
            'free_parking_mentions': 0,
            'valet_mentions': 0
        })
    
    # Combine all review text for this business (limit to save memory)
    all_text = ' '.join(business_reviews['text'].fillna('').astype(str).head(100)).lower()
    
    # Parking-related keywords
    parking_keywords = ['parking', 'park ', 'lot', 'garage', 'valet']
    positive_parking = ['easy parking', 'free parking', 'plenty of parking', 
                       'convenient parking', 'good parking', 'ample parking',
                       'easy to park', 'plenty of spots']
    negative_parking = ['no parking', 'hard to park', 'expensive parking', 
                       'parking nightmare', 'difficult parking', 'limited parking',
                       'parking is terrible', 'impossible to park']
    
    # Count mentions
    parking_mentions = sum(all_text.count(keyword) for keyword in parking_keywords)
    positive_mentions = sum(all_text.count(phrase) for phrase in positive_parking)
    negative_mentions = sum(all_text.count(phrase) for phrase in negative_parking)
    free_parking = all_text.count('free parking') + all_text.count('no charge')
    valet_mentions = all_text.count('valet')
    
    return pd.Series({
        'parking_mentions': min(parking_mentions, 100),  # Cap to avoid outliers
        'parking_positive_sentiment': positive_mentions,
        'parking_negative_sentiment': negative_mentions,
        'free_parking_mentions': free_parking,
        'valet_mentions': valet_mentions
    })

# Filter reviews for businesses in processed data
business_ids_in_processed = set(business_df['business_id'])
relevant_reviews = review_df[review_df['business_id'].isin(business_ids_in_processed)]

print(f"Processing reviews for {len(business_ids_in_processed):,} businesses...")
print(f"Found {len(relevant_reviews):,} relevant reviews")

# Extract text features by business (process in batches to manage memory)
text_features = relevant_reviews.groupby('business_id').apply(extract_parking_text_features).reset_index()

print(f"âœ… Extracted text features for {len(text_features):,} businesses")
print(f"\nText feature summary:")
print(text_features[['parking_mentions', 'parking_positive_sentiment', 'parking_negative_sentiment']].describe())
text_features.head(10)

Extracting parking-related text features from reviews...
This may take a few minutes for large datasets...
Processing reviews for 21,232 businesses...
Found 3,364,709 relevant reviews
âœ… Extracted text features for 21,232 businesses

Text feature summary:
       parking_mentions  parking_positive_sentiment  \
count      21232.000000                21232.000000   
mean          11.406980                    0.296015   
std           10.177281                    0.688581   
min            0.000000                    0.000000   
25%            5.000000                    0.000000   
50%            9.000000                    0.000000   
75%           15.000000                    0.000000   
max          100.000000                    9.000000   

       parking_negative_sentiment  
count                21232.000000  
mean                     0.058026  
std                      0.277465  
min                      0.000000  
25%                      0.000000  
50%                      0.0000

  text_features = relevant_reviews.groupby('business_id').apply(extract_parking_text_features).reset_index()


Unnamed: 0,business_id,parking_mentions,parking_positive_sentiment,parking_negative_sentiment,free_parking_mentions,valet_mentions
0,---kPU91CF4Lq2-WlRu9Lw,1,0,0,0,0
1,--epgcb7xHGuJ-4PUeSLAw,14,0,0,0,0
2,--lqIzK-ZVTtgwiQM63XgQ,4,0,0,0,0
3,-09Oc2D14vRnmirPh0vlXw,9,0,0,0,0
4,-0FX23yAacC4bbLaGPvyxw,27,0,0,0,9
5,-0TffRSXXIlBYVbb5AwfTg,14,0,0,0,0
6,-0__F9fnKt8uioCKztF5Ww,10,0,0,0,0
7,-0iIxySkp97WNlwK66OGWg,22,0,0,0,0
8,-0jK77zdE3-plqXuwXtilQ,2,0,0,0,0
9,-0jzoPt3UeXn6FUXVQvyPg,39,0,0,1,15


## Feature Engineering

In [12]:
# Merge processed data with text features and review stats
print("Merging all data sources...")

# Start with processed business data (already has most features)
features_df = business_df.copy()

# Merge with text features from reviews
features_df = features_df.merge(text_features, on='business_id', how='left')

# Merge with review statistics (to update/supplement existing stats)
features_df = features_df.merge(
    review_stats_df[['business_id', 'total_reviews', 'avg_review_stars', 
                     'total_useful_votes', 'total_funny_votes', 'total_cool_votes']], 
    on='business_id', 
    how='left',
    suffixes=('', '_new')
)

# Use new stats if available, otherwise keep existing
for col in ['total_reviews', 'avg_review_stars', 'total_useful_votes', 'total_funny_votes', 'total_cool_votes']:
    new_col = f'{col}_new'
    if new_col in features_df.columns:
        features_df[col] = features_df[new_col].fillna(features_df[col])
        features_df.drop(columns=[new_col], inplace=True)

# Fill missing text features with 0 (businesses with no reviews)
text_feature_cols = ['parking_mentions', 'parking_positive_sentiment', 
                    'parking_negative_sentiment', 'free_parking_mentions', 'valet_mentions']
for col in text_feature_cols:
    if col in features_df.columns:
        features_df[col] = features_df[col].fillna(0)

print(f"Combined dataset shape: {features_df.shape}")
print(f"Columns: {list(features_df.columns)}")
features_df.head()

Merging all data sources...
Combined dataset shape: (21232, 41)
Columns: ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories', 'parking_garage', 'parking_street', 'parking_validated', 'parking_lot', 'parking_valet', 'price_range', 'restaurantsreservations', 'restaurantstakeout', 'restaurantsdelivery', 'outdoorseating', 'wifi', 'alcohol', 'has_garage', 'has_street', 'has_validated', 'has_lot', 'has_valet', 'parking_types_count', 'has_any_parking', 'parking_mentions', 'parking_positive_sentiment', 'parking_negative_sentiment', 'free_parking_mentions', 'valet_mentions', 'total_reviews', 'avg_review_stars', 'total_useful_votes', 'total_funny_votes', 'total_cool_votes']


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,parking_mentions,parking_positive_sentiment,parking_negative_sentiment,free_parking_mentions,valet_mentions,total_reviews,avg_review_stars,total_useful_votes,total_funny_votes,total_cool_votes
0,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,5714 111 Street NW,Edmonton,AB,T6H 3G1,53.494577,-113.517946,4.0,57,...,11,1,0,1,0,57,4.087719,74,18,26
1,kGjpBqAqG4mIuFSgU2KKEA,The Royal Indian Cuisine,272 S 20th St,Philadelphia,PA,19103,39.948365,-75.1745,4.0,28,...,0,0,0,0,0,28,4.0,31,1,6
2,uKEabBQrn0gLzvTppOrKIA,Casita Taqueria,2701 4th St,Saint Petersburg,FL,33704,27.796976,-82.638168,4.5,207,...,35,0,0,0,0,211,4.407583,145,60,87
3,8JEOmVGQffkr6xtn4vvplQ,Pita Kebob,3028 E College Ave,Ruskin,FL,33570,27.713401,-82.394372,4.0,142,...,8,0,0,0,0,151,4.13245,71,6,19
4,rF0xI_3jjlsEKp3N0Z0BuQ,Trattoria Totaro,639 Spring Mill Ave,Conshohocken,PA,19428,40.075052,-75.294523,4.0,135,...,13,2,0,0,0,141,3.985816,85,11,22


In [13]:
# Display top 25 cities by business count
print("Top 25 cities by business count in the dataset:\n")

city_counts = features_df['city'].value_counts().head(25)

for i, (city, count) in enumerate(city_counts.items(), 1):
    print(f"{i:2d}. {city:25s} - {count:,} businesses")

print(f"\nTotal unique cities in dataset: {features_df['city'].nunique():,}")

Top 25 cities by business count in the dataset:

 1. Philadelphia              - 2,143 businesses
 2. Tampa                     - 1,187 businesses
 3. Indianapolis              - 1,148 businesses
 4. Nashville                 - 1,086 businesses
 5. Tucson                    - 1,009 businesses
 6. New Orleans               - 992 businesses
 7. Edmonton                  - 690 businesses
 8. Saint Louis               - 665 businesses
 9. Reno                      - 583 businesses
10. Boise                     - 393 businesses
11. Santa Barbara             - 352 businesses
12. St. Louis                 - 259 businesses
13. Metairie                  - 259 businesses
14. Clearwater                - 258 businesses
15. Wilmington                - 252 businesses
16. Saint Petersburg          - 204 businesses
17. Franklin                  - 190 businesses
18. Sparks                    - 162 businesses
19. St. Petersburg            - 154 businesses
20. Meridian                  - 146 businesses
2

In [14]:
# Enhanced Feature Engineering with Text Data

print("Engineering enhanced features...")

# Check if parking_availability_score already exists from processed data
if 'parking_availability_score' not in features_df.columns:
    # 1. Parking Score: Aggregate parking availability (if not already present)
    def calculate_parking_score(row):
        """Calculate a parking availability score (0-5)"""
        score = 0
        if pd.notna(row.get('parking_lot')) and row.get('parking_lot'):
            score += 2
        if pd.notna(row.get('parking_garage')) and row.get('parking_garage'):
            score += 2
        if pd.notna(row.get('parking_street')) and row.get('parking_street'):
            score += 1
        if pd.notna(row.get('parking_valet')) and row.get('parking_valet'):
            score += 1
        if pd.notna(row.get('parking_validated')) and row.get('parking_validated'):
            score += 1
        return min(score, 5)  # Cap at 5
    
    features_df['parking_availability_score'] = features_df.apply(calculate_parking_score, axis=1)
    print("Created parking_availability_score from structured data")
else:
    print("Using existing parking_availability_score from processed data")

# 2. Enhanced Parking Score: Combine structured + text data
def calculate_enhanced_parking_score(row):
    """Enhanced parking score using both structured and text data"""
    # Start with original structured score
    structured_score = row.get('parking_availability_score', 0)
    
    # Add text-based adjustments
    text_boost = 0
    
    # Positive sentiment boosts score
    if row['parking_positive_sentiment'] > row['parking_negative_sentiment']:
        text_boost += 1
    
    # Free parking is a major plus
    if row['free_parking_mentions'] > 0:
        text_boost += 1
    
    # Valet service adds convenience
    if row['valet_mentions'] > 0:
        text_boost += 0.5
    
    # Strong negative sentiment reduces score
    if row['parking_negative_sentiment'] > row['parking_positive_sentiment'] * 2:
        text_boost -= 1
    
    # Parking mentions without sentiment indicates awareness (small boost)
    if row['parking_mentions'] > 5 and row['parking_positive_sentiment'] == 0 and row['parking_negative_sentiment'] == 0:
        text_boost += 0.5
        
    return min(max(structured_score + text_boost, 0), 5)  # Keep between 0-5

features_df['enhanced_parking_score'] = features_df.apply(calculate_enhanced_parking_score, axis=1)

# 3. Has Parking (binary)
features_df['has_parking'] = (features_df['parking_availability_score'] > 0).astype(int)

# 4. Ensure other features exist
if 'price_range' not in features_df.columns:
    features_df['price_range'] = pd.to_numeric(features_df.get('restaurantspricerange2', 2), errors='coerce').fillna(2.0)

if 'review_engagement' not in features_df.columns:
    features_df['review_engagement'] = (
        features_df['total_useful_votes'].fillna(0) + 
        features_df['total_funny_votes'].fillna(0) + 
        features_df['total_cool_votes'].fillna(0)
    ) / features_df['total_reviews'].fillna(1)

if 'rating_consistency' not in features_df.columns:
    if 'stddev_review_stars' in features_df.columns:
        features_df['rating_consistency'] = 1 / (features_df['stddev_review_stars'].fillna(0.5) + 0.1)
    else:
        features_df['rating_consistency'] = 1 / (0.5 + 0.1)

# 5. Category features
if 'is_restaurant' not in features_df.columns:
    features_df['is_restaurant'] = features_df.get('categories', '').fillna('').str.contains('Restaurant', case=False, na=False).astype(int)
if 'is_food_service' not in features_df.columns:
    features_df['is_food_service'] = features_df.get('categories', '').fillna('').str.contains('Food|Restaurant|Bar|Cafe', case=False, na=False).astype(int)

# Handle data type conversions
numeric_cols = features_df.select_dtypes(include=[np.number]).columns
features_df[numeric_cols] = features_df[numeric_cols].fillna(0)

# Boolean columns to int - handle various boolean representations
bool_cols = ['parking_garage', 'parking_street', 'parking_lot', 'parking_valet', 
             'parking_validated', 'goodforkids', 'restaurantsgoodforgroups']

def convert_bool_to_int(series):
    """Convert various boolean representations to integer"""
    if series.dtype == 'bool':
        return series.astype(int)
    return series.map({
        'true': 1, 'True': 1, True: 1, 1: 1,
        'false': 0, 'False': 0, False: 0, 0: 0,
        None: 0, '': 0, 'None': 0
    }).fillna(0).astype(int)

for col in bool_cols:
    if col in features_df.columns:
        features_df[col] = convert_bool_to_int(features_df[col])

print("Enhanced feature engineering complete")
print(f"Total features: {features_df.shape[1]}")
print("\nKey parking features comparison:")
features_df[['business_id', 'name', 'parking_availability_score', 'enhanced_parking_score',
             'parking_mentions', 'parking_positive_sentiment', 'parking_negative_sentiment']].head(10)

Engineering enhanced features...
Created parking_availability_score from structured data
Enhanced feature engineering complete
Total features: 48

Key parking features comparison:


Unnamed: 0,business_id,name,parking_availability_score,enhanced_parking_score,parking_mentions,parking_positive_sentiment,parking_negative_sentiment
0,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,2,4.0,11,1,0
1,kGjpBqAqG4mIuFSgU2KKEA,The Royal Indian Cuisine,1,1.0,0,0,0
2,uKEabBQrn0gLzvTppOrKIA,Casita Taqueria,2,2.5,35,0,0
3,8JEOmVGQffkr6xtn4vvplQ,Pita Kebob,3,3.5,8,0,0
4,rF0xI_3jjlsEKp3N0Z0BuQ,Trattoria Totaro,1,2.0,13,2,0
5,qma1sGQv7ArUtzuUTQElRg,Katie's Pizza & Pasta Osteria,2,2.5,6,0,0
6,zjQDk4tZyhEroyqtkgvx0g,The Cobblestone Eatery and Drinkery,1,1.5,6,0,0
7,jsQQu9rVerR3OGxIa0hRQQ,Kelly's Taproom,1,1.5,7,0,0
8,Hp3Ony7yW60VPuWHQFIIHA,Frady's One Stop Food Store,1,1.5,6,0,0
9,UX3eq0WsVva-cqlVrzyzFg,Cantina Los Tres Hombres,5,5.0,17,0,0


## Prepare the Data for the Feature Store

In [15]:
# Select enhanced features for Feature Store
feature_columns = [
    'business_id',
    'name',
    'city',
    'state',
    'latitude',
    'longitude',
    'stars',
    'review_count',
    'is_open',
    'parking_availability_score',
    'enhanced_parking_score',
    'has_parking',
    'parking_garage',
    'parking_street',
    'parking_lot',
    'parking_valet',
    'price_range',
    'total_reviews',
    'avg_review_stars',
    'review_engagement',
    'rating_consistency',
    'is_restaurant',
    'is_food_service',
    'open_days_count',
    'open_on_weekend',
    'goodforkids',
    'restaurantsgoodforgroups',
    'parking_mentions',
    'parking_positive_sentiment',
    'parking_negative_sentiment',
    'free_parking_mentions',
    'valet_mentions'
]

# Filter to only columns that exist
available_columns = [col for col in feature_columns if col in features_df.columns]
missing_columns = [col for col in feature_columns if col not in features_df.columns]

if missing_columns:
    print(f"Missing columns (will be skipped): {missing_columns}")

feature_store_df = features_df[available_columns].copy()

# Convert object types to string
def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == 'object':
            data_frame[label] = data_frame[label].astype('str').astype('string')

cast_object_to_string(feature_store_df)

# Add EventTime (required by Feature Store)
current_time_sec = int(round(time.time()))
feature_store_df['event_time'] = pd.Series([float(current_time_sec)] * len(feature_store_df), dtype='float64')

# Ensure business_id is string type
feature_store_df['business_id'] = feature_store_df['business_id'].astype('string')

print(f"Prepared {len(feature_store_df):,} records for Feature Store")
print(f"Total features: {len(feature_store_df.columns)}")
print(f"\nData types:\n{feature_store_df.dtypes}")
feature_store_df.head()

Missing columns (will be skipped): ['open_days_count', 'open_on_weekend', 'goodforkids', 'restaurantsgoodforgroups']
Prepared 21,232 records for Feature Store
Total features: 29

Data types:
business_id                   string[python]
name                          string[python]
city                          string[python]
state                         string[python]
latitude                             float64
longitude                            float64
stars                                float64
review_count                           int64
is_open                                int64
parking_availability_score             int64
enhanced_parking_score               float64
has_parking                            int64
parking_garage                         int64
parking_street                         int64
parking_lot                            int64
parking_valet                          int64
price_range                   string[python]
total_reviews                          int64

Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,review_count,is_open,parking_availability_score,...,review_engagement,rating_consistency,is_restaurant,is_food_service,parking_mentions,parking_positive_sentiment,parking_negative_sentiment,free_parking_mentions,valet_mentions,event_time
0,jZoeTGRfhG9n_Jo6VMqwxw,Sushi Wasabi,Edmonton,AB,53.494577,-113.517946,4.0,57,1,2,...,2.070175,1.666667,1,1,11,1,0,1,0,1769562000.0
1,kGjpBqAqG4mIuFSgU2KKEA,The Royal Indian Cuisine,Philadelphia,PA,39.948365,-75.1745,4.0,28,1,1,...,1.357143,1.666667,1,1,0,0,0,0,0,1769562000.0
2,uKEabBQrn0gLzvTppOrKIA,Casita Taqueria,Saint Petersburg,FL,27.796976,-82.638168,4.5,207,1,2,...,1.383886,1.666667,1,1,35,0,0,0,0,1769562000.0
3,8JEOmVGQffkr6xtn4vvplQ,Pita Kebob,Ruskin,FL,27.713401,-82.394372,4.0,142,1,3,...,0.635762,1.666667,1,1,8,0,0,0,0,1769562000.0
4,rF0xI_3jjlsEKp3N0Z0BuQ,Trattoria Totaro,Conshohocken,PA,40.075052,-75.294523,4.0,135,1,1,...,0.836879,1.666667,1,1,13,2,0,0,0,1769562000.0


In [16]:
# Create FeatureGroup name with timestamp
feature_group_name = f"venuesignal-business-features-{strftime('%d-%H-%M-%S', gmtime())}"

print(f"Feature Group Name: {feature_group_name}")

Feature Group Name: venuesignal-business-features-28-01-05-34


In [17]:
# Initialize FeatureGroup
business_feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session
)

# Load feature definitions from DataFrame
business_feature_group.load_feature_definitions(data_frame=feature_store_df)

print("Feature definitions loaded")
print(f"Total features: {len(business_feature_group.feature_definitions)}")

Feature definitions loaded
Total features: 29


In [18]:
# Create FeatureGroup in SageMaker
print(f"Creating FeatureGroup: {feature_group_name}")

business_feature_group.create(
    s3_uri=f"s3://{ATHENA_BUCKET}/{FEATURE_STORE_OFFLINE_PREFIX}",
    record_identifier_name="business_id",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True
)

print("Waiting for FeatureGroup creation to complete.")

def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("  ... still creating")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

wait_for_feature_group_creation_complete(business_feature_group)

Creating FeatureGroup: venuesignal-business-features-28-01-05-34
Waiting for FeatureGroup creation to complete.
  ... still creating
  ... still creating
  ... still creating
  ... still creating
  ... still creating
FeatureGroup venuesignal-business-features-28-01-05-34 successfully created.


In [19]:
# Verify FeatureGroup was created
business_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:297628177412:feature-group/venuesignal-business-features-28-01-05-34',
 'FeatureGroupName': 'venuesignal-business-features-28-01-05-34',
 'RecordIdentifierFeatureName': 'business_id',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'business_id',
   'FeatureType': 'String'},
  {'FeatureName': 'name', 'FeatureType': 'String'},
  {'FeatureName': 'city', 'FeatureType': 'String'},
  {'FeatureName': 'state', 'FeatureType': 'String'},
  {'FeatureName': 'latitude', 'FeatureType': 'Fractional'},
  {'FeatureName': 'longitude', 'FeatureType': 'Fractional'},
  {'FeatureName': 'stars', 'FeatureType': 'Fractional'},
  {'FeatureName': 'review_count', 'FeatureType': 'Integral'},
  {'FeatureName': 'is_open', 'FeatureType': 'Integral'},
  {'FeatureName': 'parking_availability_score', 'FeatureType': 'Integral'},
  {'FeatureName': 'enhanced_parking_score', 'FeatureType': 'Fractional'},
  {'FeatureName': 'has_parking', 'Featur

In [20]:
# Ingest data into FeatureGroup
print(f"Ingesting {len(feature_store_df)} records into FeatureGroup")

business_feature_group.ingest(
    data_frame=feature_store_df,
    max_workers=3,
    wait=True
)

print("Data ingestion complete")

Ingesting 21232 records into FeatureGroup
Data ingestion complete


## Verify the Data Ingestion

In [21]:
# Get a sample record from online store
sample_business_id = str(feature_store_df['business_id'].iloc[0])

print(f"Fetching record for business_id: {sample_business_id}")

record_response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group_name,
    RecordIdentifierValueAsString=sample_business_id
)

print(f"Retrieved record:")
for feature in record_response['Record']:
    print(f"  {feature['FeatureName']}: {feature['ValueAsString']}")

Fetching record for business_id: jZoeTGRfhG9n_Jo6VMqwxw
Retrieved record:
  business_id: jZoeTGRfhG9n_Jo6VMqwxw
  name: Sushi Wasabi
  city: Edmonton
  state: AB
  latitude: 53.4945772799
  longitude: -113.517946463
  stars: 4.0
  review_count: 57
  is_open: 1
  parking_availability_score: 2
  enhanced_parking_score: 4.0
  has_parking: 1
  parking_garage: 0
  parking_street: 0
  parking_lot: 1
  parking_valet: 0
  price_range: 2
  total_reviews: 57
  avg_review_stars: 4.087719298245614
  review_engagement: 2.0701754385964914
  rating_consistency: 1.6666666666666667
  is_restaurant: 1
  is_food_service: 1
  parking_mentions: 11
  parking_positive_sentiment: 1
  parking_negative_sentiment: 0
  free_parking_mentions: 1
  valet_mentions: 0
  event_time: 1769562334.0


In [22]:
# Wait for offline store to be populated
print("Waiting for data to appear in offline store")

offline_store_s3_uri = business_feature_group.describe().get("OfflineStoreConfig").get("S3StorageConfig").get("ResolvedOutputS3Uri")
offline_store_prefix = offline_store_s3_uri.replace(f"s3://{ATHENA_BUCKET}/", "")

print(f"Offline store location: {offline_store_s3_uri}")

offline_store_contents = None
max_wait_minutes = 10
wait_count = 0

while offline_store_contents is None and wait_count < max_wait_minutes:
    objects_in_bucket = s3_client.list_objects(
        Bucket=ATHENA_BUCKET,
        Prefix=offline_store_prefix
    )
    if 'Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1:
        offline_store_contents = objects_in_bucket['Contents']
        print(f"Data available in offline store ({len(offline_store_contents)} objects)")
    else:
        wait_count += 1
        print(f"  ... waiting ({wait_count}/{max_wait_minutes} minutes)")
        sleep(60)

if offline_store_contents is None:
    print("Offline store data not yet available. Continue anyway - it will appear shortly.")
else:
    print(f"Offline store ready with {len(offline_store_contents)} files")

Waiting for data to appear in offline store
Offline store location: s3://yelp-aai540-group6-athena-297628177412/feature-store/offline-store/297628177412/sagemaker/us-east-1/offline-store/venuesignal-business-features-28-01-05-34-1769562334/data
  ... waiting (1/10 minutes)
  ... waiting (2/10 minutes)
  ... waiting (3/10 minutes)
  ... waiting (4/10 minutes)
  ... waiting (5/10 minutes)
Data available in offline store (54 objects)
Offline store ready with 54 files


In [23]:
# Generate Hive DDL for the FeatureGroup
print("ðŸ“‹ Hive DDL for FeatureGroup:\n")
print(business_feature_group.as_hive_ddl())

ðŸ“‹ Hive DDL for FeatureGroup:

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.venuesignal-business-features-28-01-05-34 (
  business_id STRING
  name STRING
  city STRING
  state STRING
  latitude FLOAT
  longitude FLOAT
  stars FLOAT
  review_count INT
  is_open INT
  parking_availability_score INT
  enhanced_parking_score FLOAT
  has_parking INT
  parking_garage INT
  parking_street INT
  parking_lot INT
  parking_valet INT
  price_range STRING
  total_reviews INT
  avg_review_stars FLOAT
  review_engagement FLOAT
  rating_consistency FLOAT
  is_restaurant INT
  is_food_service INT
  parking_mentions INT
  parking_positive_sentiment INT
  parking_negative_sentiment INT
  free_parking_mentions INT
  valet_mentions INT
  event_time FLOAT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parq

In [24]:
# Query offline store using the FeatureGroup's Athena query interface
athena_query = business_feature_group.athena_query()

# Get the table name
table_name = athena_query.table_name
print(f"Athena table name: {table_name}")

Athena table name: venuesignal_business_features_28_01_05_34_1769562334


In [25]:
# Check what columns are actually in the Feature Store table
print("Columns available in Feature Store:\n")
feature_names = [fd.feature_name for fd in business_feature_group.feature_definitions]
for i, fname in enumerate(sorted(feature_names), 1):
    print(f"{i:2d}. {fname}")
print(f"\nTotal columns: {len(feature_names)}")

Columns available in Feature Store:

 1. avg_review_stars
 2. business_id
 3. city
 4. enhanced_parking_score
 5. event_time
 6. free_parking_mentions
 7. has_parking
 8. is_food_service
 9. is_open
10. is_restaurant
11. latitude
12. longitude
13. name
14. parking_availability_score
15. parking_garage
16. parking_lot
17. parking_mentions
18. parking_negative_sentiment
19. parking_positive_sentiment
20. parking_street
21. parking_valet
22. price_range
23. rating_consistency
24. review_count
25. review_engagement
26. stars
27. state
28. total_reviews
29. valet_mentions

Total columns: 29


In [26]:
# Build an enhanced training dataset query with text features
# Select businesses with comprehensive parking data

training_query = f"""
SELECT 
    business_id,
    stars,
    review_count,
    parking_availability_score,
    enhanced_parking_score,
    has_parking,
    price_range,
    total_reviews,
    avg_review_stars,
    review_engagement,
    rating_consistency,
    is_restaurant,
    parking_mentions,
    parking_positive_sentiment,
    parking_negative_sentiment,
    free_parking_mentions,
    valet_mentions,
    CASE 
        WHEN enhanced_parking_score >= 3 THEN 1 
        ELSE 0 
    END as has_good_parking,
    CASE
        WHEN avg_review_stars >= 4.0 THEN 1
        ELSE 0
    END as is_highly_rated
FROM "{table_name}"
WHERE total_reviews >= 10
    AND is_restaurant = 1
LIMIT 10000
"""

print("Running enhanced Athena query on offline store...")
print(f"Query: {training_query}\n")

# Execute query
athena_query.run(
    query_string=training_query,
    output_location=f"s3://{ATHENA_BUCKET}/{FEATURE_STORE_PREFIX}/query_results/"
)

print("Waiting for query to complete...")
athena_query.wait()

# Load results into DataFrame
training_dataset = athena_query.as_dataframe()

print(f"Training dataset ready: {training_dataset.shape}")
print(f"Rows: {training_dataset.shape[0]:,} | Features: {training_dataset.shape[1]}")
training_dataset.head(10)

Running enhanced Athena query on offline store...
Query: 
SELECT 
    business_id,
    stars,
    review_count,
    parking_availability_score,
    enhanced_parking_score,
    has_parking,
    price_range,
    total_reviews,
    avg_review_stars,
    review_engagement,
    rating_consistency,
    is_restaurant,
    parking_mentions,
    parking_positive_sentiment,
    parking_negative_sentiment,
    free_parking_mentions,
    valet_mentions,
    CASE 
        WHEN enhanced_parking_score >= 3 THEN 1 
        ELSE 0 
    END as has_good_parking,
    CASE
        WHEN avg_review_stars >= 4.0 THEN 1
        ELSE 0
    END as is_highly_rated
FROM "venuesignal_business_features_28_01_05_34_1769562334"
WHERE total_reviews >= 10
    AND is_restaurant = 1
LIMIT 10000


Waiting for query to complete...
Training dataset ready: (10000, 19)
Rows: 10,000 | Features: 19


Unnamed: 0,business_id,stars,review_count,parking_availability_score,enhanced_parking_score,has_parking,price_range,total_reviews,avg_review_stars,review_engagement,rating_consistency,is_restaurant,parking_mentions,parking_positive_sentiment,parking_negative_sentiment,free_parking_mentions,valet_mentions,has_good_parking,is_highly_rated
0,71U7MxQEhwitJOm4CQpRwQ,4.0,325,2,2.5,1,2.0,345,3.881159,1.44058,1.666667,1,14,0,0,0,0,0,0
1,qEzhExWx4nogW_0B5JfUCw,3.5,99,2,2.5,1,1.0,104,3.567308,1.153846,1.666667,1,18,0,0,0,0,0,0
2,z0SRVH4OpDQxgf2-m_wcZg,3.5,68,2,2.5,1,1.0,71,3.43662,1.521127,1.666667,1,14,0,0,0,0,0,0
3,dUctvEfHQccW_uxtRup2QQ,3.0,212,2,2.5,1,2.0,217,3.235023,1.0553,1.666667,1,28,0,0,0,0,0,0
4,N6VbRekOBjRfru_Ci1AFnQ,4.0,20,1,1.0,1,1.0,21,3.952381,1.142857,1.666667,1,2,0,0,0,0,0,0
5,56X2VNNLIi4aopkIncSNvg,4.5,15,1,1.0,1,,15,4.733333,0.733333,1.666667,1,0,0,0,0,0,0,1
6,M6ZoEY54OKKPq9O9Boee7g,3.5,351,1,1.5,1,2.0,368,3.736413,3.720109,1.666667,1,7,0,0,0,0,0,0
7,BiSdlIuOlf6MqyEZ6aWhfQ,3.5,301,5,5.0,1,2.0,304,3.638158,0.8125,1.666667,1,11,0,0,0,0,1,0
8,DMxgL0TdpR9HCD4P1CdxdQ,3.0,144,2,3.0,1,1.0,151,3.039735,1.05298,1.666667,1,10,1,0,0,0,1,0
9,UnUUSoiBSP1IGa-IdgehYQ,3.5,80,2,3.0,1,2.0,80,3.675,1.1625,1.666667,1,18,1,0,0,0,1,0


In [27]:
# Analyze the training dataset
print("Training Dataset Summary:\n")
print(f"Total records: {len(training_dataset)}")
print(f"\nTarget Distribution (is_highly_rated):")
print(training_dataset['is_highly_rated'].value_counts())
print(f"\nParking Distribution (has_good_parking):")
print(training_dataset['has_good_parking'].value_counts())
print(f"\nCorrelation between parking and rating:")
print(pd.crosstab(training_dataset['has_good_parking'], training_dataset['is_highly_rated'], normalize='index'))

Training Dataset Summary:

Total records: 10000

Target Distribution (is_highly_rated):
is_highly_rated
0    6330
1    3670
Name: count, dtype: int64

Parking Distribution (has_good_parking):
has_good_parking
0    6821
1    3179
Name: count, dtype: int64

Correlation between parking and rating:
is_highly_rated          0         1
has_good_parking                    
0                 0.657968  0.342032
1                 0.579427  0.420573


In [29]:
# Reproducibility
np.random.seed(12341)

# Create a random number for each row
rand = np.random.rand(len(training_dataset))

# Assign split based on cumulative ratios
training_dataset["split"] = np.select(
    [
        rand < 0.40,
        (rand >= 0.40) & (rand < 0.50),
        (rand >= 0.50) & (rand < 0.60),
        rand >= 0.60
    ],
    ["train", "val", "test", "prod"]
)

# split values check
print(training_dataset["split"].value_counts(normalize=True))

training_dataset.to_csv("./training_data/split_dataset.csv", index=False)

split
train    0.4025
prod     0.3984
val      0.1003
test     0.0988
Name: proportion, dtype: float64


In [30]:
# Save training dataset to S3 for model training
training_data_s3_path = f"s3://{ATHENA_BUCKET}/{FEATURE_STORE_PREFIX}/training_data/venuesignal_training.csv"

# Save without index
training_dataset.to_csv('venuesignal_training.csv', index=False)

# Upload to S3
s3_client.upload_file(
    'venuesignal_training.csv',
    ATHENA_BUCKET,
    f"{FEATURE_STORE_PREFIX}/training_data/venuesignal_training.csv"
)

print(f"Training dataset saved to: {training_data_s3_path}")

Training dataset saved to: s3://yelp-aai540-group6-athena-297628177412/feature-store/training_data/venuesignal_training.csv


## Query the Feature Store

In [31]:
# Simulate real-time feature retrieval for inference
test_business_ids = feature_store_df['business_id'].head(5).tolist()

print("Retrieving features for sample businesses:\n")

for business_id in test_business_ids:
    try:
        record = featurestore_runtime.get_record(
            FeatureGroupName=feature_group_name,
            RecordIdentifierValueAsString=business_id
        )
        
        # Extract key features
        features = {f['FeatureName']: f['ValueAsString'] for f in record['Record']}
        
        print(f"Business: {features.get('name', 'N/A')}")
        print(f"  Location: {features.get('city', 'N/A')}, {features.get('state', 'N/A')}")
        print(f"  Stars: {features.get('stars', 'N/A')} | Reviews: {features.get('total_reviews', 'N/A')}")
        print(f"  Parking Score: {features.get('parking_availability_score', 'N/A')}")
        print(f"  Rating Consistency: {features.get('rating_consistency', 'N/A')}")
        print(f"  Review Engagement: {features.get('review_engagement', 'N/A')}\n")
        
    except Exception as e:
        print(f"Error retrieving {business_id}: {e}\n")

Retrieving features for sample businesses:

Business: Sushi Wasabi
  Location: Edmonton, AB
  Stars: 4.0 | Reviews: 57
  Parking Score: 2
  Rating Consistency: 1.6666666666666667
  Review Engagement: 2.0701754385964914

Business: The Royal Indian Cuisine
  Location: Philadelphia, PA
  Stars: 4.0 | Reviews: 28
  Parking Score: 1
  Rating Consistency: 1.6666666666666667
  Review Engagement: 1.3571428571428572

Business: Casita Taqueria
  Location: Saint Petersburg, FL
  Stars: 4.5 | Reviews: 211
  Parking Score: 2
  Rating Consistency: 1.6666666666666667
  Review Engagement: 1.3838862559241707

Business: Pita Kebob
  Location: Ruskin, FL
  Stars: 4.0 | Reviews: 151
  Parking Score: 3
  Rating Consistency: 1.6666666666666667
  Review Engagement: 0.6357615894039735

Business: Trattoria Totaro
  Location: Conshohocken, PA
  Stars: 4.0 | Reviews: 141
  Parking Score: 1
  Rating Consistency: 1.6666666666666667
  Review Engagement: 0.8368794326241135



In [32]:
# Batch retrieval
print("Batch retrieving features for multiple businesses\n")

batch_response = featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": feature_group_name,
            "RecordIdentifiersValueAsString": test_business_ids[:3]
        }
    ]
)

print(f"Retrieved {len(batch_response['Records'])} records")
print(f"Errors: {len(batch_response.get('Errors', []))}")

# Display first record
if batch_response['Records']:
    first_record = batch_response['Records'][0]
    print(f"\nSample record:")
    for feature in first_record['Record']:
        print(f"  {feature['FeatureName']}: {feature['ValueAsString']}")

Batch retrieving features for multiple businesses

Retrieved 3 records
Errors: 0

Sample record:
  business_id: uKEabBQrn0gLzvTppOrKIA
  name: Casita Taqueria
  city: Saint Petersburg
  state: FL
  latitude: 27.7969762597
  longitude: -82.6381678127
  stars: 4.5
  review_count: 207
  is_open: 1
  parking_availability_score: 2
  enhanced_parking_score: 2.5
  has_parking: 1
  parking_garage: 0
  parking_street: 0
  parking_lot: 1
  parking_valet: 0
  price_range: 1
  total_reviews: 211
  avg_review_stars: 4.407582938388626
  review_engagement: 1.3838862559241707
  rating_consistency: 1.6666666666666667
  is_restaurant: 1
  is_food_service: 1
  parking_mentions: 35
  parking_positive_sentiment: 0
  parking_negative_sentiment: 0
  free_parking_mentions: 0
  valet_mentions: 0
  event_time: 1769562334.0


## Uncomment to Run

These cells below can be uncommented to clear out the FeatureGroup and S3 FeatureStore data. We need this for the coming weeks as we work on this project.

In [None]:
# Delete FeatureGroup (uncomment to run)
# business_feature_group.delete()
# print(f"Deleted FeatureGroup: {feature_group_name}")

# # Clean up S3 data (uncomment to run)
 # Warning: This will delete all Feature Store data
# 
# import boto3
# s3 = boto3.resource('s3')
# bucket = s3.Bucket(ATHENA_BUCKET)
# 
# Delete offline store data
# bucket.objects.filter(Prefix=f"{FEATURE_STORE_OFFLINE_PREFIX}/").delete()
# print(f"Deleted offline store data")
# 
# # Delete query results
# bucket.objects.filter(Prefix=f"{FEATURE_STORE_PREFIX}/query_results/").delete()
# print(f"Deleted query results")


In [None]:
# # This Deletes all feature groups you might have.
# groups = sagemaker_client.list_feature_groups()

# for group in groups["FeatureGroupSummaries"]:
#     sagemaker_client.delete_feature_group(
#         FeatureGroupName=group["FeatureGroupName"]
#     )