# AAI-540 Group 6 Project<br>
# VenueSignal<br>
<br>

# Setup, Configuration, and Connect

In [None]:
!python --version

Python 3.12.9


In [None]:
!pip list

Package                                   Version
----------------------------------------- -----------
absl-py                                   2.3.1
accelerate                                1.12.0
adagio                                    0.2.6
aioboto3                                  14.3.0
aiobotocore                               2.22.0
aiofiles                                  25.1.0
aiohappyeyeballs                          2.6.1
aiohttp                                   3.13.2
aiohttp-cors                              0.8.1
aioitertools                              0.12.0
aiosignal                                 1.4.0
aiosqlite                                 0.19.0
alembic                                   1.17.2
altair                                    5.5.0
amazon-q-developer-jupyterlab-ext         3.4.8
amazon_sagemaker_jupyter_ai_q_developer   1.2.8
amazon_sagemaker_jupyter_scheduler        3.1.15
amazon-sagemaker-sql-editor               0.1.19
amazon-sagemaker-sql-e

In [None]:
# Upgrade pip itself first (its output is discarded), then install the extra
# packages quietly (-q) without pip's own version-check notice.
!pip install --disable-pip-version-check -q pip --upgrade > /dev/null
!pip install --disable-pip-version-check -q wrapt==1.17.2
!pip install --disable-pip-version-check -q sparkmagic==0.22.0
!pip install --disable-pip-version-check -q nvidia-ml-py3==7.352.0
# NOTE(review): pydynamodb is the only package in this cell without a version pin.
!pip install --disable-pip-version-check -q pydynamodb

In [None]:
# NOTE(review): the output recorded for this cell reports ResolutionImpossible
# (awscli==1.18.216 conflicts with botocore==1.37.2) — confirm the intended pins.
!pip install --disable-pip-version-check -q awscli==1.18.216 boto3==1.29.6 botocore==1.37.2

[31mERROR: Cannot install awscli==1.18.216 and botocore==1.37.2 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [None]:
!pip install --disable-pip-version-check -q botocore==1.37.4

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.22.0 requires botocore<1.37.4,>=1.37.2, but you have botocore 1.37.4 which is incompatible.
boto3 1.42.34 requires botocore<1.43.0,>=1.42.34, but you have botocore 1.37.4 which is incompatible.
awscli 1.44.24 requires botocore==1.42.34, but you have botocore 1.37.4 which is incompatible.[0m[31m
[0m

In [None]:
!pip install --disable-pip-version-check -q awswrangler

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.22.0 requires botocore<1.37.4,>=1.37.2, but you have botocore 1.42.34 which is incompatible.[0m[31m
[0m

In [None]:
!pip list

Package                                   Version
----------------------------------------- -----------
absl-py                                   2.3.1
accelerate                                1.12.0
adagio                                    0.2.6
aioboto3                                  14.3.0
aiobotocore                               2.22.0
aiofiles                                  25.1.0
aiohappyeyeballs                          2.6.1
aiohttp                                   3.13.2
aiohttp-cors                              0.8.1
aioitertools                              0.12.0
aiosignal                                 1.4.0
aiosqlite                                 0.19.0
alembic                                   1.17.2
altair                                    5.5.0
amazon-q-developer-jupyterlab-ext         3.4.8
amazon_sagemaker_jupyter_ai_q_developer   1.2.8
amazon_sagemaker_jupyter_scheduler        3.1.15
amazon-sagemaker-sql-editor               0.1.19
amazon-sagemaker-sql-e

In [None]:
# Flag persisted with %store in the next cell.
# NOTE(review): it is set unconditionally and does not check whether the
# installs above succeeded — TODO confirm that is intended.
setup_dependencies_passed = True

In [None]:
%store setup_dependencies_passed

Stored 'setup_dependencies_passed' (bool)


In [None]:
%store

Stored variables and their in-db values:
setup_dependencies_passed             -> True


In [None]:
pip install --upgrade boto3 botocore awscli

Note: you may need to restart the kernel to use updated packages.


In [None]:
#@title Setup and Import libraries
import pandas as pd
import numpy as np
import os
import json
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# AWS
import boto3
import sagemaker
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("Libraries imported successfully")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Libraries imported successfully


In [None]:
#@title S3 Bucket Configuration
# The bucket is located at s3://yelp-aai540-group6/yelp-dataset/
# The ARN for the bucket is arn:aws:s3:::yelp-aai540-group6
S3_BUCKET = 'yelp-aai540-group6'
# Database name interpolated into the Athena queries as `{DATABASE_NAME}.<table>`.
# NOTE(review): the name contains a hyphen — confirm it is quoted wherever it is used in SQL.
DATABASE_NAME = 'yelp-dataset'
# Region used when building the S3 console link.
REGION = 'us-west-2'


In [None]:
#@title Display S3 Bucket - Yelp Dataset
from IPython.display import display, HTML

# Link to the dataset prefix in the S3 console.
# `_blank` is the HTML keyword for "open in a new tab"; a bare `blank` would instead
# target a window *named* "blank". rel="noopener noreferrer" is the usual companion
# for links opened in a new tab.
s3_console_url = (
    f"https://s3.console.aws.amazon.com/s3/buckets/{S3_BUCKET}/yelp-dataset/json/"
    f"?region={REGION}&tab=overview"
)

display(
    HTML(
        f'<b>Review <a target="_blank" rel="noopener noreferrer" href="{s3_console_url}">S3 Bucket - Yelp Dataset</a></b>'
    )
)

In [None]:
#@title Athena Configuration
# S3 prefix where Athena writes its query results (PyAthena's staging directory).
ATHENA_OUTPUT = f's3://{S3_BUCKET}/athena-results/'

# DB-API connection used by the pd.read_sql(...) calls in the data-loading cells.
# Without it those cells fail with NameError on a fresh kernel.
conn = connect(
    s3_staging_dir=ATHENA_OUTPUT,
    region_name=REGION,
)

In [None]:
#@title Display Athena Tables - Yelp Dataset

# Load Data


In [None]:
#@title Query to load restaurant data with parking info
# Open restaurants with at least 10 reviews. The database name contains a hyphen,
# so it is double-quoted: unquoted, `yelp-dataset.business` does not parse as a
# single identifier in an Athena SELECT.
business_query = f"""
SELECT
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories
FROM "{DATABASE_NAME}".business
WHERE categories LIKE '%Restaurant%'
  AND is_open = 1
  AND review_count >= 10
"""

print("Loading restaurant data from Athena...")
print("This may take 1-2 minutes...")

# `conn` is the Athena connection object; it must exist before this cell runs.
df_business = pd.read_sql(business_query, conn)

print(f"\n✓ Loaded {len(df_business):,} restaurants")
print(f"\nDataset shape: {df_business.shape}")
print(f"\nMemory usage: {df_business.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
#@title Basic Data Overview
# Preview the first rows of the business table
print("Sample Data:")
display(df_business.head())

# Column dtypes of the loaded frame
print("\nData Types:")
print(df_business.dtypes)

# Summary statistics for the numeric columns of interest
summary_columns = ['stars', 'review_count', 'latitude', 'longitude']
print("\nBasic Statistics:")
display(df_business[summary_columns].describe())

In [None]:
#@title Parse Parking Attributes
def parse_parking_attributes(attr_str):
    """
    Parse the ``BusinessParking`` entry out of a business ``attributes`` value.

    Parameters
    ----------
    attr_str : str | mapping | None
        Either a string holding a Python-literal dict (for example
        ``"{'BusinessParking': \\"{'garage': False}\\"}"``) or an object that
        already behaves like a dict (has ``.get``). ``None``/NaN and any other
        non-mapping value are treated as "no data".

    Returns
    -------
    dict | None
        Mapping of parking type -> bool, or ``None`` when there is no parking
        entry or the value cannot be parsed.

    Notes
    -----
    Strings are parsed with ``ast.literal_eval`` rather than ``eval``: the
    attribute text is data, and ``eval`` would execute any expression embedded
    in it. ``literal_eval`` only accepts Python literals.
    """
    # Local import keeps the function self-contained within its cell.
    import ast

    try:
        if isinstance(attr_str, str):
            # Only dict-looking strings can carry attributes.
            if not attr_str.startswith('{'):
                return None
            attr_dict = ast.literal_eval(attr_str)
        elif attr_str is None or not hasattr(attr_str, 'get'):
            # Covers None, NaN/pd.NA and any other non-mapping value.
            return None
        else:
            attr_dict = attr_str

        # Extract BusinessParking
        parking_raw = attr_dict.get('BusinessParking', None)
        if parking_raw is None or parking_raw == 'None':
            return None

        # The parking entry may itself be a stringified dict
        if isinstance(parking_raw, str):
            # Turn quoted booleans into real ones before parsing
            parking_raw = parking_raw.replace("'True'", "True").replace("'False'", "False")
            parking_dict = ast.literal_eval(parking_raw)
        else:
            parking_dict = parking_raw

        # Normalise every value to a plain bool
        result = {}
        for key, value in parking_dict.items():
            if isinstance(value, str):
                result[key] = value.lower() == 'true'
            else:
                result[key] = bool(value)

        return result

    except (ValueError, SyntaxError, TypeError, AttributeError, RecursionError, MemoryError):
        # Best-effort parsing: malformed or unexpected attribute values count as
        # "no parking data" instead of aborting the whole column.
        return None

# Parse every row's raw attributes into a parking dict (None when unavailable)
print("Parsing parking attributes...")
df_business['parking_info'] = df_business['attributes'].apply(parse_parking_attributes)

# Share of restaurants for which a parking dict was obtained
has_parking = df_business['parking_info'].notna().sum()
parking_share = has_parking / len(df_business) * 100
print(f"\n✓ Restaurants with parking data: {has_parking:,} ({parking_share:.1f}%)")

In [None]:
#@title Extract individual parking features
def extract_parking_features(parking_dict):
    """Build one row of per-type parking flags plus a count of available types.

    A ``None`` input yields all-False flags and a count of 0. Otherwise each
    ``has_<type>`` flag is the dict's value for that type (False when the key
    is absent) and ``parking_types_count`` is the sum of all values in the dict.
    """
    flag_keys = ('garage', 'street', 'validated', 'lot', 'valet')
    source = {} if parking_dict is None else parking_dict

    features = {f'has_{key}': source.get(key, False) for key in flag_keys}
    features['parking_types_count'] = sum(source.values())
    return pd.Series(features)

# Apply extraction
parking_features = df_business['parking_info'].apply(extract_parking_features)

# Drop any feature columns left over from a previous run of this cell before
# attaching the fresh ones; otherwise re-running the cell duplicates the
# has_* / parking_types_count columns and later column lookups return frames.
df_business = pd.concat(
    [df_business.drop(columns=list(parking_features), errors='ignore'), parking_features],
    axis=1,
)

print("Parking features extracted:")
print(parking_features.head(10))

# Exploratory Data Analysis - Business Data

## Geographic Distribution

In [None]:
#@title  Top Cities
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1 = axes[0]

# The 15 cities with the most restaurants, drawn on the left panel
top_cities = df_business['city'].value_counts().head(15)
top_cities.plot(kind='barh', ax=ax1, color='steelblue')
ax1.set(xlabel='Number of Restaurants', title='Top 15 Cities by Restaurant Count')
ax1.invert_yaxis()

city_total = df_business['city'].nunique()
print(f"\nTotal cities: {city_total}")

In [None]:
#@title Top States
top_states = df_business['state'].value_counts().head(10)

# Draw on a figure created in this cell. With the inline backend a figure is
# rendered and closed at the end of the cell that created it, so plotting onto
# axes from an earlier cell would display nothing here.
fig, ax2 = plt.subplots(figsize=(8, 6))
top_states.plot(kind='barh', ax=ax2, color='coral')
ax2.set_xlabel('Number of Restaurants')
ax2.set_title('Top 10 States by Restaurant Count')
ax2.invert_yaxis()

plt.tight_layout()
plt.show()

print(f"Total states: {df_business['state'].nunique()}")

## Rating Distribution

In [None]:
#@title Rating statistics
# Headline statistics of the star rating column
star_ratings = df_business['stars']

print("Rating Statistics:")
print(f"Mean: {star_ratings.mean():.2f}")
print(f"Median: {star_ratings.median():.2f}")
print(f"Std Dev: {star_ratings.std():.2f}")
print(f"Min: {star_ratings.min():.1f}")
print(f"Max: {star_ratings.max():.1f}")

In [None]:
#@title Plot Rating Statistics Distribution
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

star_ratings = df_business['stars']
mean_rating = star_ratings.mean()

# Left panel: histogram with the mean marked by a dashed line
star_ratings.hist(bins=20, ax=ax1, color='skyblue', edgecolor='black')
ax1.axvline(mean_rating, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_rating:.2f}')
ax1.set(
    xlabel='Star Rating',
    ylabel='Number of Restaurants',
    title='Distribution of Restaurant Ratings',
)
ax1.legend()

# Right panel: box plot of the same column
df_business.boxplot(column='stars', ax=ax2)
ax2.set(ylabel='Star Rating', title='Rating Distribution (Box Plot)')

plt.tight_layout()
plt.show()

## Review Count Distribution

In [None]:
#@title Review count statistics
# Central tendency and upper tail of the review count column
review_counts = df_business['review_count']

print("Review Count Statistics:")
print(f"Mean: {review_counts.mean():.0f}")
print(f"Median: {review_counts.median():.0f}")
print(f"75th percentile: {review_counts.quantile(0.75):.0f}")
print(f"95th percentile: {review_counts.quantile(0.95):.0f}")
print(f"Max: {review_counts.max():.0f}")

In [None]:
# Plot Review Count Statistics
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

review_counts = df_business['review_count']

# Left panel: histogram of review counts with a log-scaled y axis
review_counts.hist(bins=50, ax=ax1, color='lightgreen', edgecolor='black')
ax1.set(
    xlabel='Review Count',
    ylabel='Number of Restaurants',
    title='Distribution of Review Counts',
)
ax1.set_yscale('log')

# Right panel: rating against review count, x axis log-scaled
ax2.scatter(review_counts, df_business['stars'], alpha=0.3, s=10)
ax2.set(xlabel='Review Count', ylabel='Star Rating', title='Rating vs Review Count')
ax2.set_xscale('log')

plt.tight_layout()
plt.show()

# Correlation between the two columns
corr = df_business[['stars', 'review_count']].corr().loc['stars', 'review_count']
print(f"\nCorrelation between stars and review_count: {corr:.3f}")

# Parking Analysis

In [None]:
## Parking Data Availability

In [None]:
#@title  Filter for businesses with parking data
# Keep only rows whose parking_info is non-null; .copy() gives an independent
# frame so columns can be added to it later without touching df_business.
df_parking = df_business[df_business['parking_info'].notna()].copy()

print(f"Restaurants with parking data: {len(df_parking):,}")
print(f"Percentage of dataset: {len(df_parking)/len(df_business)*100:.1f}%")
print(f"\nThis is our primary dataset for modeling!")

# Summary statistics for the parking subset (nothing is written to disk here)
print(f"\nAverage rating (with parking data): {df_parking['stars'].mean():.2f}")
print(f"Average review count (with parking data): {df_parking['review_count'].mean():.0f}")

In [None]:
#@title Parking Type Distribution
# Number of restaurants offering each parking type
parking_types = {
    label: df_parking[f'has_{label.lower()}'].sum()
    for label in ('Garage', 'Street', 'Validated', 'Lot', 'Valet')
}

print("Parking Type Availability:")
ranked_types = sorted(parking_types.items(), key=lambda item: item[1], reverse=True)
for ptype, count in ranked_types:
    pct = count / len(df_parking) * 100
    print(f"  {ptype}: {count:,} ({pct:.1f}%)")

# Two panels: availability per type, and how many types each restaurant offers
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Left panel: horizontal bars, sorted by count
parking_df = (
    pd.DataFrame(list(parking_types.items()), columns=['Type', 'Count'])
    .sort_values('Count', ascending=True)
)
parking_df.plot(x='Type', y='Count', kind='barh', ax=ax1, legend=False, color='teal')
ax1.set(xlabel='Number of Restaurants', title='Parking Type Availability')

# Right panel: distribution of the per-restaurant parking type count
type_count_distribution = df_parking['parking_types_count'].value_counts().sort_index()
type_count_distribution.plot(kind='bar', ax=ax2, color='orange')
ax2.set(
    xlabel='Number of Parking Types Available',
    ylabel='Number of Restaurants',
    title='Distribution of Parking Type Count',
)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

In [None]:
#@title Parking Combinations
# Create parking combination string
def get_parking_combo(row):
    """Return the row's available parking types as a comma-separated label.

    Types are listed in the fixed order Garage, Street, Validated, Lot, Valet;
    the string 'None' is returned when no flag is truthy.
    """
    labelled_flags = (
        ('has_garage', 'Garage'),
        ('has_street', 'Street'),
        ('has_validated', 'Validated'),
        ('has_lot', 'Lot'),
        ('has_valet', 'Valet'),
    )
    present = [label for column, label in labelled_flags if row[column]]
    if not present:
        return 'None'
    return ', '.join(present)

# Label every restaurant with its combination of parking types
df_parking['parking_combo'] = df_parking.apply(get_parking_combo, axis=1)

# The 15 most frequent combinations and their share of the parking subset
top_combos = df_parking['parking_combo'].value_counts().head(15)
total_with_parking = len(df_parking)
print("Top 15 Parking Combinations:")
for combo, count in top_combos.items():
    pct = count / total_with_parking * 100
    print(f"  {combo}: {count:,} ({pct:.1f}%)")

# Horizontal bar chart of the same counts, most frequent at the top
plt.figure(figsize=(14, 8))
combo_ax = top_combos.plot(kind='barh', color='mediumseagreen')
combo_ax.set_xlabel('Number of Restaurants')
combo_ax.set_title('Top 15 Parking Combinations')
combo_ax.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
#@title Parking vs Rating
# Mean star rating of restaurants with vs. without each parking type
parking_cols = ['has_garage', 'has_street', 'has_validated', 'has_lot', 'has_valet']
parking_names = ['Garage', 'Street', 'Validated', 'Lot', 'Valet']

print("Average Rating by Parking Type:")
print("\n(Restaurants WITH this parking type vs WITHOUT)\n")

results = []
for col, name in zip(parking_cols, parking_names):
    flag = df_parking[col]
    with_parking = df_parking.loc[flag == True, 'stars'].mean()
    without_parking = df_parking.loc[flag == False, 'stars'].mean()
    diff = with_parking - without_parking

    # Pre-formatted strings so the summary table shows three decimals
    summary_row = {
        'Parking Type': name,
        'Avg Rating (With)': f"{with_parking:.3f}",
        'Avg Rating (Without)': f"{without_parking:.3f}",
        'Difference': f"{diff:+.3f}"
    }
    results.append(summary_row)

    print(f"{name}:")
    print(f"  With: {with_parking:.3f} stars")
    print(f"  Without: {without_parking:.3f} stars")
    print(f"  Difference: {diff:+.3f} stars\n")

# Same numbers as a table
results_df = pd.DataFrame(results)
display(results_df)

In [None]:
#@title Visualize Parking Type Impact
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# One box-plot panel per parking type; zip stops after the five types,
# leaving the sixth panel unused
for ax, col, name in zip(axes, parking_cols, parking_names):
    flag = df_parking[col]
    stars_without = df_parking.loc[flag == False, 'stars']
    stars_with = df_parking.loc[flag == True, 'stars']
    data_to_plot = [stars_without, stars_with]

    ax.boxplot(data_to_plot, labels=['Without', 'With'])
    ax.set_ylabel('Star Rating')
    ax.set_title(f'{name} Parking')
    ax.grid(True, alpha=0.3)

# Hide the unused last panel
axes[-1].axis('off')

plt.suptitle('Rating Distribution by Parking Type', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
#@title Analyze by Number of Parking Types
print("Average Rating by Number of Parking Types:")
parking_count_stats = (
    df_parking.groupby('parking_types_count')['stars']
    .agg(['mean', 'count', 'std'])
    .rename(columns={'mean': 'Avg Rating', 'count': 'Count', 'std': 'Std Dev'})
)
display(parking_count_stats)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes
avg_rating_by_count = parking_count_stats['Avg Rating']

# Left panel: mean rating as a line over the number of parking types
avg_rating_by_count.plot(kind='line', marker='o', ax=ax1, color='darkblue', linewidth=2, markersize=8)
ax1.set(
    xlabel='Number of Parking Types',
    ylabel='Average Star Rating',
    title='Rating vs Number of Parking Types',
)
ax1.grid(True, alpha=0.3)

# Right panel: the same means as bars, with the standard deviation as error bars
avg_rating_by_count.plot(
    kind='bar', ax=ax2, color='steelblue',
    yerr=parking_count_stats['Std Dev'], capsize=4,
)
ax2.set(
    xlabel='Number of Parking Types',
    ylabel='Average Star Rating',
    title='Rating by Parking Type Count (with std dev)',
)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

# Pearson correlation between the parking type count and the rating
from scipy import stats
corr, p_value = stats.pearsonr(df_parking['parking_types_count'], df_parking['stars'])
print(f"\nCorrelation between parking types count and rating: {corr:.4f}")
print(f"P-value: {p_value:.4e}")
significance_note = "✓ Statistically significant!" if p_value < 0.05 else "Not statistically significant"
print(significance_note)

# Load and Analyze Review Data

In [None]:
#@title Load Sample Reviews
# Get sample of businesses with parking data
sample_business_ids = df_parking['business_id'].head(1000).tolist()
# Build the body of the IN (...) list. Single quotes inside an id are doubled so
# an id can never terminate the SQL string literal early.
business_ids_str = "','".join(str(bid).replace("'", "''") for bid in sample_business_ids)

# Query reviews for these businesses. The database name contains a hyphen, so it
# is double-quoted to be read as a single identifier.
# NOTE(review): LIMIT without ORDER BY does not guarantee the same 50,000 rows
# on every run — confirm whether a reproducible sample is needed.
review_query = f"""
SELECT
    review_id,
    business_id,
    user_id,
    stars,
    useful,
    funny,
    cool,
    text,
    date
FROM "{DATABASE_NAME}".review
WHERE business_id IN ('{business_ids_str}')
LIMIT 50000
"""

print("Loading sample reviews...")
print("This may take 2-3 minutes...")

# `conn` is the Athena connection object; it must exist before this cell runs.
df_reviews = pd.read_sql(review_query, conn)

print(f"\n✓ Loaded {len(df_reviews):,} reviews")
print(f"✓ For {df_reviews['business_id'].nunique():,} businesses")

In [None]:
#@title Parking Mentions in Reviews
# Search for parking mentions
parking_keywords = ['parking', 'park', 'valet', 'garage', 'lot']

# Keywords are matched as whole words. Plain substring matching counts words
# such as "pilot", "plot" or "sparkle" as parking mentions.
_PARKING_KEYWORD_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in parking_keywords) + r')\b'
)
# The quantity idiom "a lot" / "a whole lot" is removed before matching so that
# "liked it a lot" does not count as a mention of a parking lot.
_A_LOT_IDIOM_PATTERN = re.compile(r'\ba\s+(?:whole\s+)?lot\b')

def contains_parking_mention(text):
    """Check if review text mentions parking.

    Parameters
    ----------
    text : str | None
        Review text. Anything that is not a string (None, NaN, numbers)
        counts as "no mention".

    Returns
    -------
    bool
        True when any entry of ``parking_keywords`` occurs as a whole word
        (case-insensitive) outside the "a lot" idiom.
    """
    if not isinstance(text, str):
        return False
    text_lower = _A_LOT_IDIOM_PATTERN.sub(' ', text.lower())
    return _PARKING_KEYWORD_PATTERN.search(text_lower) is not None

# Flag every review whose text mentions parking
mention_flags = df_reviews['text'].apply(contains_parking_mention)
df_reviews['mentions_parking'] = mention_flags

# Number and share of flagged reviews
parking_mentions = mention_flags.sum()
pct = parking_mentions / len(df_reviews) * 100

print(f"Reviews mentioning parking: {parking_mentions:,} ({pct:.2f}%)")
print("\nThis is valuable for sentiment analysis!")

In [None]:
#@title Compare ratings for reviews that mention parking vs those that don't
# Split the star ratings by the mention flag once and reuse the two groups
mention_flag = df_reviews['mentions_parking']
stars_with_mention = df_reviews.loc[mention_flag == True, 'stars']
stars_without_mention = df_reviews.loc[mention_flag == False, 'stars']

with_mention = stars_with_mention.mean()
without_mention = stars_without_mention.mean()

print("Average Rating:")
print(f"  Reviews mentioning parking: {with_mention:.3f} stars")
print(f"  Reviews not mentioning parking: {without_mention:.3f} stars")
print(f"  Difference: {with_mention - without_mention:+.3f} stars")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Left panel: overlaid rating histograms of the two groups
stars_without_mention.hist(bins=5, alpha=0.5, label='No mention', ax=ax1, color='blue')
stars_with_mention.hist(bins=5, alpha=0.5, label='Mentions parking', ax=ax1, color='red')
ax1.set(
    xlabel='Star Rating',
    ylabel='Number of Reviews',
    title='Rating Distribution: Parking Mention vs No Mention',
)
ax1.legend()

# Right panel: box plots of the same two groups
data_to_plot = [stars_without_mention, stars_with_mention]
ax2.boxplot(data_to_plot, labels=['No Mention', 'Mentions Parking'])
ax2.set(ylabel='Star Rating', title='Rating Comparison')

plt.tight_layout()
plt.show()

In [None]:
#@title Sample Review Text Analysis
# First ten reviews flagged as mentioning parking
parking_reviews = df_reviews[df_reviews['mentions_parking'] == True].head(10)
separator = "-" * 80

print("Sample Reviews Mentioning Parking:\n")
for _, review in parking_reviews.iterrows():
    # Only the first 200 characters of each review are shown
    snippet = review['text'][:200]
    print(f"Rating: {review['stars']} stars")
    print(f"Text: {snippet}...")
    print(separator)
    print()

# Data Quality Checks

In [None]:
#@title Missing Values
# Per-column null counts and percentages for the parking subset
print("Missing Values in Business Data:")
missing = df_parking.isnull().sum()
missing_pct = (missing / len(df_parking) * 100).round(2)

# Keep only columns that have at least one null, worst first
missing_df = (
    pd.DataFrame({
        'Column': missing.index,
        'Missing Count': missing.values,
        'Percentage': missing_pct.values
    })
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)
display(missing_df)

if missing_df.empty:
    print("✓ No missing values in key columns!")

In [None]:
#@title Data Validity Checks
# Check for data issues
print("Data Validity Checks:\n")

# Rating range
invalid_ratings = df_parking[(df_parking['stars'] < 1) | (df_parking['stars'] > 5)]
print(f"Invalid ratings (< 1 or > 5): {len(invalid_ratings)}")

# Negative review counts
negative_reviews = df_parking[df_parking['review_count'] < 0]
print(f"Negative review counts: {len(negative_reviews)}")

# Missing coordinates
missing_coords = df_parking[(df_parking['latitude'].isna()) | (df_parking['longitude'].isna())]
print(f"Missing coordinates: {len(missing_coords)}")

# Duplicate business IDs
duplicates = df_parking['business_id'].duplicated().sum()
print(f"Duplicate business IDs: {duplicates}")

# Overall verdict: all four checks must come back clean. Missing coordinates and
# duplicate ids count as issues too, so the summary cannot report "looks good"
# while one of the lines above shows a non-zero count.
issue_counts = [len(invalid_ratings), len(negative_reviews), len(missing_coords), duplicates]
if any(issue_counts):
    print("⚠️  Found some data quality issues")
else:
    print("\n✓ Data quality looks good!")

# City-Level Analysis

In [None]:
#@title Aggregate statistics by city
# Per-city aggregates, rounded to two decimals
city_stats = df_parking.groupby('city').agg({
    'business_id': 'count',
    'stars': 'mean',
    'review_count': 'mean',
    'parking_types_count': 'mean',
    'has_valet': 'sum'
}).round(2)

city_stats.columns = ['Restaurant Count', 'Avg Rating', 'Avg Reviews', 'Avg Parking Types', 'Valet Count']
# Keep the 15 cities with the most restaurants
city_stats = city_stats.sort_values('Restaurant Count', ascending=False).head(15)

print("Top 15 Cities - Key Statistics:")
display(city_stats)

# One horizontal bar panel per statistic, largest city at the top
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

panels = [
    (axes[0, 0], 'Restaurant Count', 'steelblue', 'Number of Restaurants', 'Restaurant Count by City'),
    (axes[0, 1], 'Avg Rating', 'coral', 'Average Star Rating', 'Average Rating by City'),
    (axes[1, 0], 'Avg Parking Types', 'mediumseagreen', 'Average Number of Parking Types', 'Parking Variety by City'),
    (axes[1, 1], 'Valet Count', 'gold', 'Number of Restaurants with Valet', 'Valet Parking Availability by City'),
]
for panel_ax, column, color, xlabel, title in panels:
    city_stats[column].plot(kind='barh', ax=panel_ax, color=color)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_title(title)
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

# Correlation Analysis

In [None]:
# Columns included in the correlation matrix
numeric_cols = ['stars', 'review_count', 'latitude', 'longitude',
                'has_garage', 'has_street', 'has_validated', 'has_lot', 'has_valet',
                'parking_types_count']

# Pairwise correlations between those columns
corr_matrix = df_parking[numeric_cols].corr()

# Annotated heatmap, colour scale centred on zero
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
                         square=True, linewidths=1, cbar_kws={"shrink": 0.8})
heatmap_ax.set_title('Correlation Matrix: Features vs Rating', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

# Correlation of every column with the star rating, highest first
print("\nCorrelations with Star Rating:")
stars_corr = corr_matrix['stars'].sort_values(ascending=False)
print(stars_corr)

# Save Processed Data

In [None]:
# Write the parking subset and the review sample to S3 as parquet
output_path = f's3://{S3_BUCKET}/processed-data/'
business_parquet_path = f'{output_path}business_with_parking.parquet'
reviews_parquet_path = f'{output_path}reviews_sample.parquet'

print("Saving processed data to S3...")
print(f"Location: {output_path}")

# Both frames use the same settings: no index column, snappy compression
parquet_options = {'index': False, 'compression': 'snappy'}
df_parking.to_parquet(business_parquet_path, **parquet_options)
df_reviews.to_parquet(reviews_parquet_path, **parquet_options)

print("\n✓ Data saved successfully!")
print("\nSaved files:")
print(f"  - business_with_parking.parquet ({len(df_parking):,} rows)")
print(f"  - reviews_sample.parquet ({len(df_reviews):,} rows)")
print("\nReady for Module 3: Feature Engineering!")

# Release Resources

In [None]:

%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Click the hidden command button above, which is linked to the
// "kernelmenu:shutdown" command. The collection is held in a block-scoped
// const so no implicit global is created on the page.
try {
    const commandButtons = document.getElementsByClassName("sm-command-button");
    commandButtons[0].click();
}
catch(err) {
    // NoOp: any failure (for example, no matching button) is ignored
}
</script>

In [None]:
%%javascript

// Save a checkpoint of the notebook, then delete its session.
// NOTE(review): relies on a global `Jupyter` object — presumably only present
// in the classic Notebook front end; confirm for the environment in use.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp: any error raised above is deliberately ignored
}