# AAI-540 Group 6 Project
# VenueSignal

# Setup, Configuration, and Connection

In [None]:
!python --version

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os
import json
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# AWS
import boto3
import sagemaker
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor

# Plot styling: seaborn grid theme plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("Libraries imported successfully")

In [None]:
#@title Configuration
# AWS region passed to the Athena connection.
REGION = "us-east-1"

# Bucket names: the dataset bucket (read-only public dataset) and a writable
# bucket that receives Athena query results.
DATA_BUCKET = "yelp-aai540-group6"
ATHENA_BUCKET = "yelp-aai540-group6-cll"

# Athena database (created in the AthenaTables notebook) and the S3 prefix
# under which query results are staged.
ATHENA_DB = "yelp"
ATHENA_RESULTS_S3 = "s3://" + ATHENA_BUCKET + "/athena-results/"

# Echo the active settings, one per line.
print(
    f"Region: {REGION}",
    f"Data Bucket: {DATA_BUCKET}",
    f"Athena Bucket: {ATHENA_BUCKET}",
    f"Athena Database: {ATHENA_DB}",
    f"Athena Results Location: {ATHENA_RESULTS_S3}",
    sep="\n",
)

In [None]:
# Create the Athena connection object used by the queries in this notebook.
# schema_name makes ATHENA_DB the connection's default database, so the
# "Database" line printed below describes what the connection actually uses
# (without it PyAthena falls back to its "default" schema).
conn = connect(
    s3_staging_dir=ATHENA_RESULTS_S3,  # S3 prefix where Athena writes results
    region_name=REGION,
    schema_name=ATHENA_DB,
    cursor_class=PandasCursor,
)

# NOTE(review): no query is issued in this cell, so this message presumably
# does not prove that credentials/permissions are valid — confirm if needed.
print("✓ Connected to Athena")
print(f"  Database: {ATHENA_DB}")
print(f"  Results: {ATHENA_RESULTS_S3}")

In [None]:
#@title List available tables
# Ask Athena which tables exist in the configured database and show them.
tables_query = "SHOW TABLES IN " + ATHENA_DB
tables_df = pd.read_sql(sql=tables_query, con=conn)

print("Available tables in '" + ATHENA_DB + "' database:")
display(tables_df)

# Expected tables: business, business_attributes, review, user, checkin, tip

# Load Data

In [None]:
#@title Query to load restaurant data with parking info
# Open restaurants with at least 10 reviews, left-joined to their pre-parsed
# attributes (parking flags plus a few service attributes).
business_query = f"""
SELECT
    b.business_id,
    b.name,
    b.address,
    b.city,
    b.state,
    b.postal_code,
    b.latitude,
    b.longitude,
    b.stars,
    b.review_count,
    b.is_open,
    b.categories,
    -- Parking features from business_attributes table (pre-parsed)
    ba.parking_garage,
    ba.parking_street,
    ba.parking_validated,
    ba.parking_lot,
    ba.parking_valet,
    -- Additional useful attributes
    ba.restaurantspricerange2 as price_range,
    ba.restaurantsreservations,
    ba.restaurantstakeout,
    ba.restaurantsdelivery,
    ba.outdoorseating,
    ba.wifi,
    ba.alcohol
FROM {ATHENA_DB}.business b
LEFT JOIN {ATHENA_DB}.business_attributes ba
    ON b.business_id = ba.business_id
WHERE b.categories LIKE '%Restaurant%'
  AND b.is_open = 1
  AND b.review_count >= 10
"""

print("Loading restaurant data from Athena...\nThis may take 1-2 minutes...")

# Run the query and materialize the result as a DataFrame.
df_business = pd.read_sql(sql=business_query, con=conn)

# Report row count, shape, and in-memory footprint (deep=True includes the
# contents of object columns).
business_memory_mb = df_business.memory_usage(deep=True).sum() / 1024**2
print(f"\n✓ Loaded {len(df_business):,} restaurants")
print(f"Dataset shape: {df_business.shape}")
print(f"Memory usage: {business_memory_mb:.2f} MB")

In [None]:
# Quick look at the loaded business data: first rows, dtypes, numeric summary
print("Sample Data:")
display(df_business.head())

print("\nData Types:")
print(df_business.dtypes)

# describe() is limited to the rating, review-count and coordinate columns.
overview_cols = ['stars', 'review_count', 'latitude', 'longitude']
print("\nBasic Statistics:")
display(df_business[overview_cols].describe())

## Process Parking Features


In [None]:
# Normalize the parking flags to real booleans and derive summary columns.
# The flags may arrive from Athena as strings ('true'/'false'), as native
# booleans, or as NULL. A string-keyed .map() would turn native booleans into
# NaN (and then False), so values are compared as lower-cased text instead.
parking_cols = ['parking_garage', 'parking_street', 'parking_validated', 'parking_lot', 'parking_valet']

# Fail early with a clear message: the has_* columns below need every flag.
missing_parking_cols = [col for col in parking_cols if col not in df_business.columns]
if missing_parking_cols:
    raise KeyError(f"df_business is missing parking columns: {missing_parking_cols}")

for col in parking_cols:
    # Only an explicit true value (any letter case, optionally padded with
    # whitespace, or a native True) becomes True; everything else, including
    # missing values, becomes False. The result is a plain bool column.
    as_text = df_business[col].astype(str).str.strip().str.lower()
    df_business[col] = as_text.eq('true').astype(bool)

# Create convenience columns matching the original notebook naming
has_cols = ['has_garage', 'has_street', 'has_validated', 'has_lot', 'has_valet']
for has_col, source_col in zip(has_cols, parking_cols):
    df_business[has_col] = df_business[source_col]

# Count how many parking types each restaurant has (row-wise sum of the flags)
df_business['parking_types_count'] = df_business[has_cols].sum(axis=1)

# Flag for having any parking: True when at least one parking flag is True.
# Restaurants whose flags are all False or missing are therefore not flagged.
df_business['has_any_parking'] = df_business['parking_types_count'] > 0

print("Parking Features Summary:")
print(f"Total restaurants: {len(df_business):,}")
print(f"With any parking data: {df_business['has_any_parking'].sum():,} ({df_business['has_any_parking'].mean()*100:.1f}%)")
print(f"\nParking Type Distribution:")
for col in has_cols:
    count = df_business[col].sum()
    pct = count / len(df_business) * 100
    print(f"  {col:20s}: {count:6,} ({pct:5.1f}%)")

In [None]:
# Keep only restaurants flagged with at least one parking type
# (has_any_parking); .copy() detaches the subset from df_business.
has_parking_mask = df_business['has_any_parking']
df_parking = df_business.loc[has_parking_mask].copy()

parking_share_pct = len(df_parking) / len(df_business) * 100
print(f"Filtered dataset with parking info: {len(df_parking):,} restaurants")
print(f"({parking_share_pct:.1f}% of all restaurants)")

## Load Sample Reviews

In [None]:
# Pull up to 50,000 reviews from 2018 onward for open restaurants.
# Note: The review table is partitioned by year and stored in parquet format
# NOTE(review): LIMIT is used without ORDER BY, so which rows come back is
# presumably arbitrary rather than a random sample — confirm if that matters.
# NOTE(review): the subquery does not apply the review_count >= 10 filter that
# business_query uses, so some reviews may belong to restaurants that are not
# in df_business — verify this is intended.
reviews_query = f"""
SELECT
    review_id,
    user_id,
    business_id,
    stars,
    useful,
    funny,
    cool,
    text,
    date,
    year
FROM {ATHENA_DB}.review
WHERE year >= 2018
  AND business_id IN (
    SELECT business_id
    FROM {ATHENA_DB}.business
    WHERE categories LIKE '%Restaurant%'
    AND is_open = 1
  )
LIMIT 50000
"""

print("Loading sample reviews...\nThis may take 2-3 minutes...")

df_reviews = pd.read_sql(sql=reviews_query, con=conn)

# Report how many reviews were loaded and the span of their dates.
review_dates = df_reviews['date']
print(f"\n✓ Loaded {len(df_reviews):,} reviews")
print(f"Date range: {review_dates.min()} to {review_dates.max()}")

# Exploratory Data Analysis - Business Data

## Geographic Distribution

In [None]:
# Side-by-side horizontal bar charts: busiest cities (left) and states (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Left panel: the 15 cities with the most restaurants
top_cities = df_parking['city'].value_counts().head(15)
top_cities.plot.barh(ax=ax1, color='steelblue')
ax1.set_xlabel('Number of Restaurants')
ax1.set_title('Top 15 Cities by Restaurant Count (with parking data)')
ax1.invert_yaxis()  # largest bar at the top

# Right panel: the 10 states with the most restaurants
top_states = df_parking['state'].value_counts().head(10)
top_states.plot.barh(ax=ax2, color='coral')
ax2.set_xlabel('Number of Restaurants')
ax2.set_title('Top 10 States by Restaurant Count (with parking data)')
ax2.invert_yaxis()  # largest bar at the top

plt.tight_layout()
plt.show()

# Number of distinct cities and states in the filtered dataset
n_cities = df_parking['city'].nunique()
n_states = df_parking['state'].nunique()
print(f"Total cities: {n_cities}")
print(f"Total states: {n_states}")

## Rating Distribution

In [None]:
# Summary statistics for the star ratings of restaurants in df_parking
star_ratings = df_parking['stars']
mean_rating = star_ratings.mean()

print("Rating Statistics:")
print(f"Mean: {mean_rating:.2f}")
print(f"Median: {star_ratings.median():.2f}")
print(f"Std Dev: {star_ratings.std():.2f}")
print(f"Min: {star_ratings.min():.1f}")
print(f"Max: {star_ratings.max():.1f}")

# Two panels: histogram with the mean marked, and a box plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Histogram; the dashed red line marks the mean rating
star_ratings.hist(bins=20, ax=ax1, color='skyblue', edgecolor='black')
ax1.axvline(mean_rating, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {mean_rating:.2f}')
ax1.set_xlabel('Star Rating')
ax1.set_ylabel('Number of Restaurants')
ax1.set_title('Distribution of Restaurant Ratings')
ax1.legend()

# Box plot of the same column
df_parking.boxplot(column='stars', ax=ax2)
ax2.set_ylabel('Star Rating')
ax2.set_title('Rating Distribution (Box Plot)')

plt.tight_layout()
plt.show()

## Review Count Distribution

In [None]:
# Summary statistics for the per-restaurant review counts in df_parking
review_counts = df_parking['review_count']

print("Review Count Statistics:")
print(f"Mean: {review_counts.mean():.1f}")
print(f"Median: {review_counts.median():.1f}")
print(f"Std Dev: {review_counts.std():.1f}")
print(f"Min: {review_counts.min()}")
print(f"Max: {review_counts.max()}")

# Same histogram twice: linear counts (left) and log-scaled counts (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes
hist_style = dict(bins=50, color='lightgreen', edgecolor='black')

review_counts.hist(ax=ax1, **hist_style)
ax1.set_xlabel('Number of Reviews')
ax1.set_ylabel('Number of Restaurants')
ax1.set_title('Review Count Distribution')

review_counts.hist(ax=ax2, log=True, **hist_style)
ax2.set_xlabel('Number of Reviews')
ax2.set_ylabel('Number of Restaurants (log scale)')
ax2.set_title('Review Count Distribution (Log Scale)')

plt.tight_layout()
plt.show()

## Parking Analysis

In [None]:
# Parking type distribution: how many restaurants offer each parking type.
# Display label -> flag column in df_parking.
parking_flag_by_label = {
    'Garage': 'has_garage',
    'Street': 'has_street',
    'Validated': 'has_validated',
    'Lot': 'has_lot',
    'Valet': 'has_valet',
}
parking_summary = pd.DataFrame({
    'Parking Type': list(parking_flag_by_label),
    'Count': [df_parking[flag].sum() for flag in parking_flag_by_label.values()],
})
# Percentage is relative to all restaurants in df_parking; a restaurant can
# offer several types, so the percentages need not add up to 100.
parking_summary['Percentage'] = (parking_summary['Count'] / len(df_parking) * 100).round(1)
parking_summary = parking_summary.sort_values('Count', ascending=False)

print("Parking Type Availability:")
display(parking_summary)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart
ax1 = axes[0]
parking_summary.plot(x='Parking Type', y='Count', kind='bar', ax=ax1, color='steelblue', legend=False)
ax1.set_xlabel('Parking Type')
ax1.set_ylabel('Number of Restaurants')
ax1.set_title('Parking Type Availability')
ax1.tick_params(axis='x', rotation=45)

# Pie chart
# pandas labels pie wedges from the index and ignores x= for kind='pie', so
# the counts are indexed by parking type first; otherwise the wedges would be
# labelled with the integer row index instead of the parking type names.
ax2 = axes[1]
parking_summary.set_index('Parking Type')['Count'].plot(
    kind='pie', ax=ax2, autopct='%1.1f%%', ylabel='', legend=False)
ax2.set_ylabel('')  # keep the y-axis label hidden
ax2.set_title('Parking Type Distribution')

plt.tight_layout()
plt.show()

In [None]:
# How many restaurants offer 1, 2, 3, ... parking types (ordered by type count)
parking_types_dist = df_parking['parking_types_count'].value_counts().sort_index()
n_parking_restaurants = len(df_parking)

print("Distribution of Number of Parking Types:")
for types, count in parking_types_dist.items():
    pct = 100 * count / n_parking_restaurants
    print(f"  {types} type(s): {count:6,} restaurants ({pct:5.1f}%)")

# Bar chart of the same distribution
plt.figure(figsize=(12, 6))
parking_types_dist.plot.bar(color='coral')
plt.xlabel('Number of Parking Types Available')
plt.ylabel('Number of Restaurants')
plt.title('Distribution of Number of Parking Types per Restaurant')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## Parking vs Rating Analysis

In [None]:
# Average star rating and restaurant count for each parking type.
# Each entry pairs a display label with its flag column in df_parking.
rating_type_flags = [
    ('Garage', 'has_garage'),
    ('Street', 'has_street'),
    ('Validated', 'has_validated'),
    ('Lot', 'has_lot'),
    ('Valet', 'has_valet'),
]
overall_avg_rating = df_parking['stars'].mean()

parking_rating_summary = pd.DataFrame({
    'Parking Type': [label for label, _ in rating_type_flags],
    # Mean rating over the restaurants whose flag for this type is True
    'Avg Rating': [df_parking.loc[df_parking[flag], 'stars'].mean()
                   for _, flag in rating_type_flags],
    'Count': [df_parking[flag].sum() for _, flag in rating_type_flags],
})
parking_rating_summary = parking_rating_summary.sort_values('Avg Rating', ascending=False)

print("Average Rating by Parking Type:")
display(parking_rating_summary)

# Bar chart; the dashed red line marks the mean rating across df_parking
plt.figure(figsize=(12, 6))
plt.bar(parking_rating_summary['Parking Type'], parking_rating_summary['Avg Rating'], color='gold')
plt.axhline(overall_avg_rating, color='red', linestyle='--', linewidth=2,
            label=f'Overall Avg: {overall_avg_rating:.2f}')
plt.xlabel('Parking Type')
plt.ylabel('Average Star Rating')
plt.title('Average Restaurant Rating by Parking Type')
plt.ylim(3, 4.5)  # fixed y-range of 3 to 4.5 stars
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Mean rating and number of restaurants for each parking_types_count value
rating_by_parking_count = (
    df_parking.groupby('parking_types_count')['stars']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Avg Rating', 'count': 'Restaurant Count'})
)

print("Rating by Number of Parking Types:")
display(rating_by_parking_count)

# Dual-axis chart: average rating as a line, restaurant count as bars
fig, ax1 = plt.subplots(figsize=(12, 6))
x = rating_by_parking_count.index

# Left axis: average rating, with the overall mean as a dashed reference line
ax1.plot(x, rating_by_parking_count['Avg Rating'], 'o-', color='steelblue', linewidth=2, markersize=8)
ax1.axhline(df_parking['stars'].mean(), color='red', linestyle='--', alpha=0.5)
ax1.set_xlabel('Number of Parking Types Available')
ax1.set_ylabel('Average Star Rating', color='steelblue')
ax1.tick_params(axis='y', labelcolor='steelblue')

# Right axis (shares the x-axis): restaurant count per group
ax2 = ax1.twinx()
ax2.bar(x, rating_by_parking_count['Restaurant Count'], alpha=0.3, color='coral')
ax2.set_ylabel('Number of Restaurants', color='coral')
ax2.tick_params(axis='y', labelcolor='coral')

plt.title('Restaurant Rating vs Number of Parking Types')
plt.tight_layout()
plt.show()

## City-Level Analysis

In [None]:
# Per-city aggregates: restaurant count, mean rating, mean review count,
# mean number of parking types, and number of restaurants with valet parking
city_aggregations = {
    'business_id': 'count',
    'stars': 'mean',
    'review_count': 'mean',
    'parking_types_count': 'mean',
    'has_valet': 'sum',
}
city_stats = df_parking.groupby('city').agg(city_aggregations)
city_stats.columns = ['Restaurant Count', 'Avg Rating', 'Avg Reviews', 'Avg Parking Types', 'Valet Count']

# Keep the 15 cities with the most restaurants
city_stats = city_stats.sort_values('Restaurant Count', ascending=False).head(15)

print("Top 15 Cities - Key Statistics:")
display(city_stats)

# 2x2 grid of horizontal bar charts, one metric per panel
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# (column, axes, bar color, x-axis label, title) for each panel
city_panels = [
    ('Restaurant Count', axes[0, 0], 'steelblue',
     'Number of Restaurants', 'Restaurant Count by City'),
    ('Avg Rating', axes[0, 1], 'coral',
     'Average Star Rating', 'Average Rating by City'),
    ('Avg Parking Types', axes[1, 0], 'mediumseagreen',
     'Average Number of Parking Types', 'Parking Variety by City'),
    ('Valet Count', axes[1, 1], 'gold',
     'Number of Restaurants with Valet', 'Valet Parking Availability by City'),
]
for panel_col, panel_ax, panel_color, panel_xlabel, panel_title in city_panels:
    city_stats[panel_col].plot(kind='barh', ax=panel_ax, color=panel_color)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_title(panel_title)
    panel_ax.invert_yaxis()  # first row of city_stats at the top

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Columns included in the correlation matrix: rating, review count,
# coordinates, the parking flags, and the parking-type count
numeric_cols = [
    'stars', 'review_count', 'latitude', 'longitude',
    'has_garage', 'has_street', 'has_validated', 'has_lot', 'has_valet',
    'parking_types_count',
]

# Pairwise correlations between the selected columns
corr_matrix = df_parking.loc[:, numeric_cols].corr()

# Annotated heatmap with the color scale centered at zero
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix: Features vs Rating', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

# Correlation of every selected column with the star rating, highest first
stars_corr = corr_matrix['stars'].sort_values(ascending=False)
print("\nCorrelations with Star Rating:")
print(stars_corr)

# Save Processed Data

In [None]:
# Write the processed DataFrames to S3 as snappy-compressed parquet files
output_path = 's3://' + ATHENA_BUCKET + '/processed-data/'

print("Saving processed data to S3...")
print(f"Location: {output_path}")

# The reviews sample is optional: it is saved only if df_reviews exists
# (i.e. the cell that loads it has been run).
reviews_loaded = 'df_reviews' in locals()
parquet_options = dict(index=False, compression='snappy')

# Restaurants with parking info
df_parking.to_parquet(output_path + 'business_with_parking.parquet', **parquet_options)

if reviews_loaded:
    df_reviews.to_parquet(output_path + 'reviews_sample.parquet', **parquet_options)

print("\n✓ Data saved successfully!")
print("\nSaved files:")
print(f"  - business_with_parking.parquet ({len(df_parking):,} rows)")
if reviews_loaded:
    print(f"  - reviews_sample.parquet ({len(df_reviews):,} rows)")
print("\nReady for Module 3: Feature Engineering!")