# SF Business Intelligence Platform - Data Exploration

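This notebook explores San Francisco open data from DataSF (data.sfgov.org). It downloads the business registry, permits, and complaints datasets (falling back to locally cached JSON when a download fails), cleans and merges them, plots a few quick views, and trains a first risk model on the merged data.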

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path (project root /src).
# Probe the working directory and its two nearest ancestors so the notebook
# works whether it is launched from the repo root or from a subfolder.
cwd = Path.cwd()
for candidate in (cwd / "src", cwd.parent / "src", cwd.parent.parent / "src"):
    # Require a directory: a stray regular file named "src" is not importable.
    if not candidate.is_dir():
        continue
    src_path = str(candidate)
    # Drop any earlier copy first so re-running this cell keeps exactly one
    # entry, and that entry stays at the front of the search order.
    sys.path[:] = [entry for entry in sys.path if entry != src_path]
    sys.path.insert(0, src_path)
    print("Added to sys.path:", candidate)
    break
else:
    # None of the probed locations holds a src directory.
    raise FileNotFoundError("Could not find a 'src' directory from current working dir")

# Force reload project modules to pick up any code changes.
# Only modules whose *top-level package* is one of ours are evicted. A plain
# substring test (e.g. `'utils' in mod`) would also drop unrelated modules such
# as IPython.utils or matplotlib._c_internal_utils from sys.modules.
PROJECT_PACKAGES = ('data_pipeline', 'utils', 'risk_engine', 'city_intelligence', 'lease_intelligence')


def purge_project_modules(packages=PROJECT_PACKAGES):
    """Remove cached project modules from ``sys.modules``.

    Args:
        packages: Top-level package names; each package and all of its
            submodules are dropped so the next import re-executes them.

    Returns:
        Sorted list of the module names that were removed.
    """
    wanted = set(packages)
    # Snapshot the keys first: sys.modules must not change size while iterated.
    stale = sorted(name for name in list(sys.modules) if name.partition('.')[0] in wanted)
    for name in stale:
        del sys.modules[name]
    return stale


purge_project_modules()

# Package imports from src
from data_pipeline import (
    download_business_registry,
    download_permits,
    download_complaints,
)
from data_pipeline.clean import clean_business_data
from data_pipeline.merge import merge_datasets
from utils.config import Config

# Echo the dataset identifiers read from Config, one per line.
print(
    "Using dataset IDs:",
    f"  Business: {Config.BUSINESS_LICENSE_DATASET}",
    f"  Permits:  {Config.PERMITS_DATASET}",
    f"  Complaints: {Config.COMPLAINTS_DATASET}",
    sep="\n",
)

Added to sys.path: /Users/dhruv/SJSU/Nvidia_hack/slowcal/src


## Download Data

In [2]:
# Download business registry data.
# On any failure, fall back to business_registry.json under Config.RAW_DATA_DIR.
try:
    business_df = download_business_registry()
    print(f"Downloaded {len(business_df)} business records")
    print(f"Columns: {list(business_df.columns)}")
except Exception as err:
    print("Business registry download failed:", err)
    cached = Config.RAW_DATA_DIR / "business_registry.json"
    if not cached.exists():
        # Nothing to fall back on: surface the original download error.
        raise
    business_df = pd.read_json(cached, orient="records")
    print("Loaded cached business_registry.json:", len(business_df), "records")

Downloaded 50000 business records
Columns: ['uniqueid', 'certificate_number', 'ttxid', 'ownership_name', 'dba_name', 'full_business_address', 'city', 'state', 'business_zip', 'dba_start_date', 'dba_end_date', 'location_start_date', 'location_end_date', 'administratively_closed', 'parking_tax', 'transient_occupancy_tax', 'location', 'data_as_of', 'data_loaded_at', 'neighborhoods_analysis_boundaries', 'supervisor_district', 'community_benefit_district', ':@computed_region_6qbp_sg9q', ':@computed_region_qgnn_b9vv', ':@computed_region_26cr_cadq', ':@computed_region_ajp5_b2md', ':@computed_region_jwn9_ihcz', 'mailing_address_1', 'mail_city', 'mail_state', 'mail_zipcode', 'naic_code', 'naic_code_description', 'naics_code_descriptions_list', 'business_corridor', 'lic', 'lic_code_description', 'lic_code_descriptions_list']


In [7]:
# Download permits data.
# On any failure, fall back to permits.json under Config.RAW_DATA_DIR.
try:
    permits_df = download_permits()
    print(f"Downloaded {len(permits_df)} permit records")
except Exception as err:
    print("Permits download failed:", err)
    cached = Config.RAW_DATA_DIR / "permits.json"
    if not cached.exists():
        # Nothing to fall back on: surface the original download error.
        raise
    permits_df = pd.read_json(cached, orient="records")
    print("Loaded cached permits.json:", len(permits_df), "records")

Error downloading data: 404 Client Error: Not Found for url: https://data.sfgov.org/resource/p4e4-5k3y.json?%24limit=50000&%24order=record_id+DESC


Permits download failed: 404 Client Error: Not Found for url: https://data.sfgov.org/resource/p4e4-5k3y.json?%24limit=50000&%24order=record_id+DESC
Loaded cached permits.json: 50000 records
Loaded cached permits.json: 50000 records


In [None]:
# Download complaints data.
# On any failure, fall back to complaints.json under Config.RAW_DATA_DIR.
try:
    complaints_df = download_complaints()
    print(f"Downloaded {len(complaints_df)} complaint records")
except Exception as err:
    print("Complaints download failed:", err)
    cached = Config.RAW_DATA_DIR / "complaints.json"
    if not cached.exists():
        # Nothing to fall back on: surface the original download error.
        raise
    complaints_df = pd.read_json(cached, orient="records")
    print("Loaded cached complaints.json:", len(complaints_df), "records")

## Clean and Explore Data

In [None]:
# Run the registry frame through clean_business_data in "business" mode,
# then preview the first rows of the result.
business_clean = clean_business_data(business_df, dataset_type="business")
business_clean.head()

In [None]:
# Basic statistics: describe() summary of the cleaned registry frame.
print("Business Registry Statistics:")
print(business_clean.describe())

# Report the sum of the `is_active` column when it exists, otherwise 'N/A'.
# The column is indexed directly: the membership test already guards the
# lookup, so no `.get(..., pd.Series())` default is needed (that default was
# built eagerly on every run and warns about its dtype on pandas 1.x).
if 'is_active' in business_clean.columns:
    active_businesses = business_clean['is_active'].sum()
else:
    active_businesses = 'N/A'
print("\nActive Businesses:", active_businesses)

## Merge Datasets

In [None]:
# Pass the permits and complaints frames through clean_business_data,
# each with its own dataset_type.
permits_clean = clean_business_data(permits_df, dataset_type="permits")
complaints_clean = clean_business_data(complaints_df, dataset_type="complaints")

# Combine the three cleaned frames, report the row count, and preview the result.
merged_df = merge_datasets(
    business_clean,
    permits_clean,
    complaints_clean,
)
print(f"Merged dataset: {len(merged_df)} records")
merged_df.head()

## Visualizations

In [None]:
# Business age distribution (histogram of years since business_start_year).
if 'business_start_year' in merged_df.columns:
    REFERENCE_YEAR = 2024  # fixed year that business ages are measured against
    # NOTE(review): this adds a column to merged_df in place, so later cells that
    # receive merged_df (e.g. model training) get 'business_age' only if this
    # cell has been run first.
    merged_df['business_age'] = REFERENCE_YEAR - merged_df['business_start_year']

    fig, ax = plt.subplots(figsize=(10, 6))
    merged_df['business_age'].hist(bins=30, ax=ax)
    ax.set_title('Business Age Distribution')
    ax.set_xlabel('Business Age (years)')
    ax.set_ylabel('Number of Businesses')
    plt.show()

In [None]:
# Complaints vs Permits: one point per row of the merged frame.
# Skipped silently when either count column is missing.
if {'total_complaints', 'total_permits'}.issubset(merged_df.columns):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(merged_df['total_permits'], merged_df['total_complaints'], alpha=0.5)
    ax.set_title('Complaints vs Permits')
    ax.set_xlabel('Total Permits')
    ax.set_ylabel('Total Complaints')
    plt.show()

## Risk Model Exploration

In [None]:
from risk_engine.model import RiskPredictor

# Build a RiskPredictor with model_type "random_forest" and train it on the merged frame.
predictor = RiskPredictor(model_type="random_forest")
metrics = predictor.train(merged_df)

# Report the ROC-AUC entry of the returned metrics, to three decimals.
# NOTE(review): how train() splits or evaluates the data is not visible here — confirm
# what this score is measured on before quoting it.
print("Model Training Metrics:")
print(f"ROC-AUC: {metrics['roc_auc']:.3f}")

In [None]:
# Feature importance: show the first ten rows returned by the trained predictor.
feature_importance = predictor.get_feature_importance()
print("Top Risk Factors:", feature_importance.head(10), sep="\n")

In [None]:
# Predictions: run the predictor over the merged frame (the same frame passed to train()).
predictions = predictor.predict(merged_df)

# Summary statistics of the risk_score column.
print("Risk Score Distribution:")
score_summary = predictions['risk_score'].describe()
print(score_summary)

# Row count per risk_level value.
print("\nRisk Levels:")
level_counts = predictions['risk_level'].value_counts()
print(level_counts)