# 空间冗余可视化分析

分析数据采集的空间冗余情况，量化"X%的空间包含Y%的数据"。

## 使用前准备

### 1. 创建基础表（首次运行）
```bash
python ../dataset/bbox_examples/analyze_spatial_redundancy.py --create-table
```

### 2. 生成grid数据
```bash
# 分析指定城市
python ../dataset/bbox_examples/batch_grid_analysis.py --cities A263 B001

# 或分析所有城市（不指定--cities参数）
python ../dataset/bbox_examples/batch_grid_analysis.py

# 或指定返回的网格百分比/数量
python ../dataset/bbox_examples/batch_grid_analysis.py --top-percent 5  # 前5%最密集网格
python ../dataset/bbox_examples/batch_grid_analysis.py --top-n 10      # 前10个最密集网格
```

### 3. 快速查看冗余度
```bash
python ../dataset/bbox_examples/analyze_spatial_redundancy.py --export-csv
```

## 核心指标

**冗余指数** = 场景占比 / 面积占比（本 notebook 中以 bbox 数量占比 ÷ grid 数量占比近似计算）

- **< 5**: 数据分布均匀 ✅
- **5-10**: 轻度集中 
- **10-20**: 中度冗余 ⚠️
- **> 20**: 严重冗余 ❌

**例子**: 冗余指数=15 表示 1%的面积包含了15%的场景（15倍集中）


In [None]:
# 导入库
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text

# Plot defaults: 12x6 inch figures with seaborn's "whitegrid" style.
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style("whitegrid")

# Put "<project_root>/src" at the front of sys.path. The project root is taken
# to be two directory levels above the current working directory.
project_root = Path.cwd().parent.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Imported only after the sys.path tweak above — presumably because the
# `spdatalab` package lives under src/ (confirm against the repo layout).
from spdatalab.dataset.bbox import LOCAL_DSN

print("✅ 库导入成功")


## 1. 数据加载


In [None]:
# Create the database engine from the project's DSN.
engine = create_engine(LOCAL_DSN)

# One summary row per city over today's grid rows: number of grids plus the
# total / maximum / average bbox count per grid, largest total first.
cities_sql = """
SELECT 
    city_id,
    COUNT(*) as grid_count,
    SUM(bbox_count) as total_bboxes,
    MAX(bbox_count) as max_density,
    AVG(bbox_count)::numeric(10,2) as avg_density
FROM city_grid_density
WHERE analysis_date = CURRENT_DATE
GROUP BY city_id
ORDER BY total_bboxes DESC
"""

cities_df = pd.read_sql(sql=cities_sql, con=engine)
city_total = cities_df.shape[0]
print(f"📊 发现 {city_total} 个城市的grid数据")

# Cell output: preview of the first ten cities.
cities_df.head(10)


## 2. 单城市Grid密度分析


In [None]:
# Pick the city to analyse: the first row of `cities_df`.
if cities_df.empty:
    # Fail with an actionable message instead of the bare IndexError that
    # `iloc[0]` raises on an empty frame (e.g. no grid rows dated today).
    raise ValueError("没有找到今天的grid数据，请先运行 batch_grid_analysis.py 生成grid数据")
selected_city = cities_df.iloc[0]['city_id']
print(f"🎯 分析城市: {selected_city}")

# Load today's grid rows for that city. The city id is sent as a bound
# parameter instead of being formatted into the SQL text, so an id containing
# a quote can neither break nor alter the statement.
city_grid_sql = text("""
SELECT 
    grid_x, grid_y, bbox_count, scene_count
FROM city_grid_density
WHERE city_id = :city_id AND analysis_date = CURRENT_DATE
ORDER BY bbox_count DESC
""")

city_grid_df = pd.read_sql(city_grid_sql, engine, params={"city_id": str(selected_city)})
print(f"Grid数量: {len(city_grid_df)}, 密度范围: {city_grid_df['bbox_count'].min()} ~ {city_grid_df['bbox_count'].max()}")

# Grid density map: one marker per grid cell, coloured by its bbox count.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(city_grid_df['grid_x'], city_grid_df['grid_y'], 
                     c=city_grid_df['bbox_count'], s=100, 
                     cmap='YlOrRd', alpha=0.7, edgecolors='black', linewidth=0.5)
ax.set_xlabel('Grid X')
ax.set_ylabel('Grid Y')
ax.set_title(f'{selected_city} - BBox密度分布')
# Equal aspect keeps grid cells square on screen.
ax.set_aspect('equal')
plt.colorbar(scatter, ax=ax, label='BBox数量/Grid')
plt.tight_layout()
plt.show()


## 3. 帕累托分析（数据集中度）


In [None]:
# Pareto (concentration) analysis: rank grids from densest to sparsest and
# compare the cumulative share of bboxes with the cumulative share of grids.
sorted_counts = np.sort(city_grid_df['bbox_count'])[::-1]
n_grids = len(sorted_counts)
cumulative_pct = np.cumsum(sorted_counts) / sorted_counts.sum() * 100
grid_pct = np.arange(1, n_grids + 1) / n_grids * 100


def _top_share(pct):
    """Return ``(grid share %, bbox share %)`` for the densest ``pct``% of grids.

    The grid count is rounded half-up with a minimum of one grid. Entry
    ``k - 1`` of the cumulative arrays covers exactly the top ``k`` grids, so
    that is the entry read here; indexing with ``k`` itself would silently
    include one extra grid. Returns ``None`` when there are no grids at all.
    """
    if n_grids == 0:
        return None
    k = max(1, int(n_grids * pct / 100 + 0.5))
    k = min(k, n_grids)
    return grid_pct[k - 1], cumulative_pct[k - 1]


# Pareto curve against the diagonal that a perfectly uniform spread would give.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(grid_pct, cumulative_pct, linewidth=2, color='steelblue', label='实际分布')
ax.plot([0, 100], [0, 100], linestyle='--', color='gray', linewidth=1.5, label='均匀分布')

# Mark key points; guides are drawn at the grid share actually selected so
# that they meet the curve.
for pct in [1, 5, 10, 20]:
    share = _top_share(pct)
    if share is not None:
        actual_pct, coverage = share
        ax.plot([actual_pct, actual_pct], [0, coverage], 'r:', alpha=0.5)
        ax.plot([0, actual_pct], [coverage, coverage], 'r:', alpha=0.5)
        ax.text(actual_pct+1, coverage+3, f'{pct}%→{coverage:.1f}%', fontsize=9)

ax.set_xlabel('累积Grid百分比 (%)')
ax.set_ylabel('累积BBox百分比 (%)')
ax.set_title(f'{selected_city} - 帕累托曲线')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

# Redundancy summary. The index divides by the grid share actually selected,
# which differs from the nominal percentage when the city has few grids.
print("冗余度指标:")
for pct in [1, 5, 10]:
    share = _top_share(pct)
    if share is not None:
        actual_pct, coverage = share
        print(f"  Top {pct}%: 包含 {coverage:.1f}% 的bbox，冗余指数≈{coverage/actual_pct:.1f}")


## 4. 多城市冗余度对比


In [None]:
# Simplified redundancy computation
def calc_redundancy(city_id, top_pct=1.0):
    """Return the redundancy index of one city's densest grids.

    The index is the share of today's bboxes held by the top ``top_pct``
    percent of grids (ranked by ``bbox_count``, at least one grid) divided by
    the share of grids that selection actually represents.

    Args:
        city_id: Value matched against ``city_grid_density.city_id``.
        top_pct: Percentage of grids to treat as the "top" set.

    Returns:
        ``bbox_pct / grid_pct`` as computed by the query, or ``0`` when the
        city has no grid rows for today or its bbox total is zero (the query
        yields NULL in both cases).
    """
    # Values are bound as parameters rather than formatted into the SQL text.
    # `top_grids` repeats the ORDER BY because row order coming out of a CTE is
    # not guaranteed unless the consuming query orders the rows itself.
    sql = text("""
    WITH all_grids AS (
        SELECT bbox_count FROM city_grid_density
        WHERE city_id = :city_id AND analysis_date = CURRENT_DATE
        ORDER BY bbox_count DESC
    ),
    top_grids AS (
        SELECT * FROM all_grids
        ORDER BY bbox_count DESC
        LIMIT GREATEST(1, (SELECT ROUND(COUNT(*) * CAST(:top_frac AS numeric)) FROM all_grids)::int)
    )
    SELECT 
        (SELECT SUM(bbox_count) FROM top_grids)::float / NULLIF((SELECT SUM(bbox_count) FROM all_grids), 0) * 100 as bbox_pct,
        (SELECT COUNT(*) FROM top_grids)::float / NULLIF((SELECT COUNT(*) FROM all_grids), 0) * 100 as grid_pct
    """)
    params = {"city_id": str(city_id), "top_frac": float(top_pct) / 100.0}
    r = pd.read_sql(sql, engine, params=params).iloc[0]
    bbox_pct, grid_pct = r['bbox_pct'], r['grid_pct']
    # NULL results arrive as None/NaN; check them before comparing or dividing.
    if pd.isna(bbox_pct) or pd.isna(grid_pct) or not grid_pct > 0:
        return 0
    return bbox_pct / grid_pct

# Compute the redundancy index of every city, then rank cities by it.
print("计算各城市冗余度...")
cities_df['redundancy_index'] = cities_df['city_id'].apply(calc_redundancy)
cities_df = cities_df.sort_values('redundancy_index', ascending=False)


def _severity_color(index_value):
    """Map a redundancy index to its bar colour (red > 20, orange > 10, else green)."""
    if index_value > 20:
        return 'red'
    if index_value > 10:
        return 'orange'
    return 'green'


# Bar chart of the ten highest-ranked cities.
top10 = cities_df.head(10)
positions = range(len(top10))
fig, ax = plt.subplots(figsize=(12, 6))
colors = [_severity_color(x) for x in top10['redundancy_index']]
bars = ax.bar(positions, top10['redundancy_index'], color=colors, alpha=0.7, edgecolor='black')
ax.set_xticks(positions)
ax.set_xticklabels(top10['city_id'], rotation=45)
ax.set_ylabel('冗余指数')
ax.set_title('Top 10城市冗余度对比')
# Dashed reference lines at the two severity thresholds.
for threshold, line_color, line_label in ((10, 'orange', '中度阈值'), (20, 'red', '严重阈值')):
    ax.axhline(threshold, color=line_color, linestyle='--', alpha=0.7, label=line_label)
ax.legend()
ax.grid(axis='y', alpha=0.3)
# Numeric label just above each bar.
for i, val in enumerate(top10['redundancy_index']):
    ax.text(i, val+0.5, f'{val:.1f}', ha='center', fontsize=9)
plt.tight_layout()
plt.show()

# Summary statistics over all cities, not only the ten plotted.
mean_index = cities_df['redundancy_index'].mean()
median_index = cities_df['redundancy_index'].median()
print(f"\n统计: 平均={mean_index:.2f}, 中位数={median_index:.2f}")


## 5. 导出结果


In [None]:
# Write the per-city table to CSV in the working directory; the 'utf-8-sig'
# codec prepends a BOM to the file.
output_file = 'city_redundancy_analysis.csv'
cities_df.to_csv(path_or_buf=output_file, index=False, encoding='utf-8-sig')
print(f"✅ 已导出: {output_file}")

# Cell output: preview of the key columns.
preview_columns = ['city_id', 'grid_count', 'total_bboxes', 'max_density', 'redundancy_index']
cities_df[preview_columns].head()
