# SF Business Intelligence Platform - Data Exploration

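This notebook explores SF.gov Open Data (business registry, permits, and complaints), merges the datasets, and develops insights from them, including an exploratory risk model.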

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the project's src/ directory to the import path.
# Path() is the relative path ".", and the parent of "." is still ".", so
# Path().parent.parent / 'src' was always just "src" relative to the kernel's
# working directory. Walk up from the working directory instead and use the
# first ancestor that actually contains src/, inserting an absolute path so a
# later chdir cannot break the imports.
_cwd = Path.cwd()
_project_root = next(
    (d for d in (_cwd, *_cwd.parents) if (d / 'src').is_dir()),
    _cwd,  # nothing found: fall back to the original ./src location
)
sys.path.insert(0, str(_project_root / 'src'))

from data_pipeline.download import download_business_registry, download_permits, download_complaints
from data_pipeline.clean import clean_business_data
from data_pipeline.merge import merge_datasets
from utils.config import Config

## Download Data

In [None]:
# Fetch the business registry through the project's download helper and keep
# it in `business_df`, which the cleaning cell further down consumes.
# NOTE(review): the helper presumably calls the SF.gov Open Data API, so this
# cell may be slow or fail offline -- confirm in data_pipeline.download.
business_df = download_business_registry()
# Quick sanity check: row count and the column names that came back.
print(f"Downloaded {len(business_df)} business records")
print(f"Columns: {list(business_df.columns)}")

In [None]:
# Fetch permit records through the project's download helper; the result is
# cleaned later in the notebook with dataset_type='permits'.
permits_df = download_permits()
# Row count as a quick sanity check on the download.
print(f"Downloaded {len(permits_df)} permit records")

In [None]:
# Fetch complaint records through the project's download helper; the result
# is cleaned later in the notebook with dataset_type='complaints'.
complaints_df = download_complaints()
# Row count as a quick sanity check on the download.
print(f"Downloaded {len(complaints_df)} complaint records")

## Clean and Explore Data

In [None]:
# Run the shared cleaning helper on the raw registry data. The same helper is
# reused for permits and complaints later, with a different dataset_type.
# NOTE(review): dataset_type presumably selects dataset-specific cleaning
# rules -- confirm in data_pipeline.clean.
business_clean = clean_business_data(business_df, dataset_type='business')
# Trailing expression: the notebook renders the first rows as the cell output.
business_clean.head()

In [None]:
# Summary statistics for the cleaned business registry.
print("Business Registry Statistics:")
print(business_clean.describe())

# Count active businesses when the flag column exists, otherwise report 'N/A'.
# The column is indexed directly: the membership check already guarantees it
# is present, so the old .get(..., pd.Series()) default could never be used
# and only built a throwaway, dtype-less Series (a warning on pandas 1.x).
active_businesses = (
    business_clean['is_active'].sum()
    if 'is_active' in business_clean.columns
    else 'N/A'
)
print("\nActive Businesses:", active_businesses)

## Merge Datasets

In [None]:
# Clean the two remaining raw datasets with the same helper used for the
# business registry, switching dataset_type to match each one.
permits_clean = clean_business_data(permits_df, dataset_type='permits')
complaints_clean = clean_business_data(complaints_df, dataset_type='complaints')

# Combine the three cleaned datasets into a single frame.
# NOTE(review): the join keys and join type live in data_pipeline.merge and
# are not visible here -- check them before reading the record count below as
# a number of businesses.
merged_df = merge_datasets(business_clean, permits_clean, complaints_clean)
print(f"Merged dataset: {len(merged_df)} records")
# Trailing expression: rendered as the cell's output.
merged_df.head()

## Visualizations

In [None]:
# Histogram of business age, drawn only when the start-year column is present.
if 'business_start_year' in merged_df.columns:
    # Derive the age from the calendar year at run time instead of a
    # hard-coded 2024, so the ages stay correct when the notebook is rerun
    # against freshly downloaded data in later years.
    current_year = pd.Timestamp.now().year
    # NOTE: this adds a 'business_age' column to merged_df in place, so the
    # model cells further down receive the frame with that column included.
    merged_df['business_age'] = current_year - merged_df['business_start_year']
    plt.figure(figsize=(10, 6))
    merged_df['business_age'].hist(bins=30)
    plt.title('Business Age Distribution')
    plt.xlabel('Business Age (years)')
    plt.ylabel('Number of Businesses')
    plt.show()

In [None]:
# Scatter of complaint counts against permit counts; skipped unless the merged
# frame carries both count columns.
if {'total_permits', 'total_complaints'}.issubset(merged_df.columns):
    plt.figure(figsize=(10, 6))
    plt.scatter(
        x=merged_df['total_permits'],
        y=merged_df['total_complaints'],
        alpha=0.5,  # semi-transparent markers so overlapping points stay visible
    )
    plt.title('Complaints vs Permits')
    plt.ylabel('Total Complaints')
    plt.xlabel('Total Permits')
    plt.show()

## Risk Model Exploration

In [None]:
# Imported in this cell rather than with the other imports at the top of the
# notebook; only the risk-model cells use it.
from risk_engine.model import RiskPredictor

# Build a predictor configured with model_type='random_forest' and train it on
# the full merged frame.
# NOTE(review): if the business-age cell above has run, merged_df also carries
# the derived 'business_age' column; whether the model uses it is not visible
# here -- confirm in risk_engine.model.
predictor = RiskPredictor(model_type='random_forest')
metrics = predictor.train(merged_df)

# train() returns a mapping; only its 'roc_auc' entry is reported here,
# formatted to three decimals.
print("Model Training Metrics:")
print(f"ROC-AUC: {metrics['roc_auc']:.3f}")

In [None]:
# Ask the trained predictor for its feature importances and show the first
# ten rows.
# NOTE(review): "top" assumes get_feature_importance() returns rows already
# sorted by descending importance -- confirm in risk_engine.model.
feature_importance = predictor.get_feature_importance()
print("Top Risk Factors:")
print(feature_importance.head(10))

In [None]:
# Score every row of the merged frame with the trained predictor.
# NOTE(review): this is the same frame that was passed to train(), so the
# distributions below are in-sample, not an estimate on unseen data.
predictions = predictor.predict(merged_df)

# The headings are plain literals: they contain no placeholders, so the
# f-prefix the original carried was dropped (output is unchanged).
print("Risk Score Distribution:")
print(predictions['risk_score'].describe())
print("\nRisk Levels:")
print(predictions['risk_level'].value_counts())