In [1]:
import sqlite3
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load the combined restaurant view from the project's SQLite database.
# The connection is closed in a `finally` clause so the database handle is
# released even when the query raises (for example, if the view is missing).
conn = sqlite3.connect("../db/database_sqlite.db")
try:
    df = pd.read_sql_query("SELECT * FROM Combined_Restaurant_View", conn)
finally:
    conn.close()

# Ordinal encoding for the dollar-sign strings in the 'Price' column.
price_map = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}

# Model inputs, in the column order handed to the regressor.
# The prediction target is the 'Rating' column.
features = [
    'Review_Count',
    'Score',
    'Median_Household_Income',
    'Median_Home_Value',
    'Population_Density_per_sq_mi',
    'Employment_Rate',
    'Total_Population',
    'Total_Employer_Establishments',
    'Price_Encoded'
]

# One pass over the frame:
#   1. keep only rows that have a (Yelp) rating,
#   2. add the numeric price tier (prices absent from price_map become NaN),
#   3. drop every row that is missing at least one model input.
df = (
    df.loc[df['Rating'].notnull()]
    .assign(Price_Encoded=lambda rated: rated['Price'].map(price_map))
    .dropna(subset=features)
)

# Feature matrix and target vector share the cleaned frame's index.
X = df[features]
y = df['Rating']

# Standardize the feature columns: learn the scaling parameters from X, then
# apply them to the same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Fit a 100-tree random forest on the scaled features; the fixed random_state
# keeps the fitted model (and its importances) repeatable between runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_scaled, y)

# Human-friendly labels for the model's feature columns.
readable_names = dict(
    Review_Count='Review Count',
    Score='Inspection Score',
    Median_Household_Income='Median Household Income',
    Median_Home_Value='Median Home Value',
    Population_Density_per_sq_mi='Population Density',
    Employment_Rate='Employment Rate',
    Total_Population='Total Population',
    Total_Employer_Establishments='Employer Establishments',
    Price_Encoded='Price Level',
)

# Pair each label with the fitted forest's importance for that column, then
# rank the rows from most to least important.
importance_df = pd.DataFrame(
    {
        'Feature': [readable_names[column] for column in features],
        'Importance': rf.feature_importances_,
    }
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Emit heading, table and a trailing blank line in a single print call.
print(
    "\nTop Features Influencing Rating:\n",
    importance_df.to_string(index=False),
    "",
    sep="\n",
)


📊 Top Features Influencing Rating:

                Feature  Importance
           Review Count    0.487687
            Price Level    0.129595
       Inspection Score    0.107063
      Median Home Value    0.054850
     Population Density    0.048966
Employer Establishments    0.047088
        Employment Rate    0.044551
       Total Population    0.042780
Median Household Income    0.037421



In [2]:
import pandas as pd

# Feature importance values, as (model column name, importance score) pairs
# listed from most to least important.
# NOTE(review): these scores are hard-coded rather than read from a fitted
# model, so they go stale whenever the model is retrained — TODO confirm they
# still match the latest feature-importance run.
importance_data = [
    ('Review_Count', 0.4869),
    ('Price_Encoded', 0.1299),
    ('Score', 0.1087),
    ('Median_Home_Value', 0.0548),
    ('Population_Density_per_sq_mi', 0.0493),
    ('Total_Employer_Establishments', 0.0470),
    ('Employment_Rate', 0.0429),
    ('Total_Population', 0.0419),
    ('Median_Household_Income', 0.0385)
]

# Readable feature names shown in the table's "Feature" column.
readable_names = {
    'Review_Count': 'Review Count',
    'Price_Encoded': 'Price',
    'Score': 'Inspection Score',
    'Median_Home_Value': 'Median Home Value',
    'Population_Density_per_sq_mi': 'Population Density',
    'Total_Employer_Establishments': 'Employer Establishments',
    'Employment_Rate': 'Employment Rate',
    'Total_Population': 'Total Population',
    'Median_Household_Income': 'Median Household Income'
}

# Report-section columns, in the order used by each tuple in `usage` below.
section_columns = (
    "5.1 Opportunity Score",
    "5.2 Clustering/Correlation",
    "5.3 Composite Success Index",
    "5.4 Success Prediction Model",
)

# Usage across report sections: one status per entry of `section_columns`.
# The check/cross marks were previously mis-encoded (UTF-8 bytes read as
# Mac Roman); they are restored to the intended characters here.
usage = {
    'Review_Count': ("✅ Must include", "✅", "✅ Must include", "✅"),
    'Price_Encoded': ("❌", "❌", "✅", "✅"),
    'Score': ("✅", "✅", "✅", "✅"),
    'Median_Home_Value': ("Optional", "✅", "Optional", "✅"),
    'Population_Density_per_sq_mi': ("Optional", "✅", "Optional", "✅"),
    'Total_Employer_Establishments': ("Optional", "✅", "Optional", "✅"),
    'Employment_Rate': ("Optional", "✅", "Optional", "✅"),
    'Total_Population': ("❌ Skip", "✅", "❌", "✅"),
    'Median_Household_Income': ("❌ Skip", "✅", "❌", "✅")
}

# Build table rows: label and rounded score first, then one status column per
# report section (paired by position with the feature's `usage` tuple).
table_rows = [
    {
        "Feature": readable_names.get(feature, feature),
        "Importance Score": round(score, 4),
        **dict(zip(section_columns, usage[feature])),
    }
    for feature, score in importance_data
]

# Create and display table
feature_table = pd.DataFrame(table_rows)
print("\nFeature Usage Table:\n")

# Number the rows from 1 for presentation. A freshly built frame already has
# a default 0..n-1 index, so no in-place reset is needed beforehand.
feature_table.index = range(1, len(feature_table) + 1)
feature_table


📊 Feature Usage Table (Sections 6.1 to 6.4):



Unnamed: 0,Feature,Importance Score,5.1 Opportunity Score,5.2 Clustering/Correlation,5.3 Composite Success Index,5.4 Success Prediction Model
1,Review Count,0.4869,✅ Must include,✅,✅ Must include,✅
2,Price,0.1299,❌,❌,✅,✅
3,Inspection Score,0.1087,✅,✅,✅,✅
4,Median Home Value,0.0548,Optional,✅,Optional,✅
5,Population Density,0.0493,Optional,✅,Optional,✅
6,Employer Establishments,0.047,Optional,✅,Optional,✅
7,Employment Rate,0.0429,Optional,✅,Optional,✅
8,Total Population,0.0419,❌ Skip,✅,❌,✅
9,Median Household Income,0.0385,❌ Skip,✅,❌,✅
