# NYC电动出租车数据下载与仿真环境

本Notebook用于下载NYC出租车数据、存储数据，并通过 `NYEEnvironment`（代码中实际导入的环境类）生成仿真环境预订单。

## 目标
1. 🚕 下载真实的NYC出租车数据
2. 💾 存储和预处理数据
3. 🏙️ 初始化NYC仿真环境
4. 📋 生成预订单数据
5. 📊 可视化分析结果

---

## 1. 导入所需库
首先导入所有必要的Python库，包括数据处理、环境仿真和可视化相关的模块。

In [None]:
# 基础库
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# 添加项目路径
sys.path.append(os.path.dirname(os.getcwd()))

# 数据下载和处理
try:
    from data.download_data import download_data, describedata
    print("✓ 数据下载模块导入成功")
    DATA_DOWNLOAD_AVAILABLE = True
except ImportError as e:
    print(f"⚠️ 数据下载模块导入失败: {e}")
    DATA_DOWNLOAD_AVAILABLE = False

# NYC环境和请求类
try:
    from src.NYEEnvironment import NYEEnvironment
    print("✓ NYC环境模块导入成功")
except ImportError as e:
    print(f"⚠️ NYC环境模块导入失败: {e}")

try:
    from src.NYCRequest import NYCRequest, NYCRequestGenerator
    print("✓ NYC请求模块导入成功")
except ImportError as e:
    print(f"⚠️ NYC请求模块导入失败: {e}")

# 可视化相关
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    print("✓ Plotly可视化库导入成功")
except ImportError:
    print("⚠️ Plotly未安装，将使用matplotlib")

# 地理相关
try:
    from geopy.distance import geodesic
    import folium
    print("✓ 地理处理库导入成功")
except ImportError:
    print("⚠️ 地理处理库导入失败")

print(f"\n📅 当前时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"🐍 Python版本: {sys.version}")
print("=" * 50)

## 2. 设置数据下载参数
配置NYC出租车数据的下载参数，包括时间范围、数据类型等。

In [None]:
# Data download configuration
DATA_CONFIG = {
    # Data time range
    # NOTE(review): the download cell in this notebook iterates a hard-coded
    # month list and does not read start_date / end_date — confirm which one
    # is meant to be the source of truth.
    'start_date': '2024-01',  # YYYY-MM format
    'end_date': '2024-02',
    
    # Dataset type (Yellow Taxi)
    'data_type': 'yellow_taxi',
    
    # Sampling parameters
    'sample_size': 10000,  # number of rows sampled per processed file
    'random_seed': 42,
    
    # Storage paths
    'data_dir': 'data/nyc_cache',
    'processed_dir': 'data/processed',
    
    # Geographic bounding box (Manhattan)
    'geo_bounds': {
        'min_lat': 40.7000,
        'max_lat': 40.8800,
        'min_lon': -74.0200,
        'max_lon': -73.9300
    }
}

# Simulation environment configuration
SIMULATION_CONFIG = {
    # Simulation clock
    'start_time': datetime(2024, 10, 20, 8, 0),  # start at 08:00
    'duration_hours': 12,  # simulate 12 hours
    'time_step_minutes': 1,  # 1-minute time step
    
    # Fleet configuration
    'num_vehicles': 50,
    'initial_battery': 0.8,  # initial state of charge: 80%
    
    # Demand configuration
    'requests_per_hour': 100,  # 100 requests per hour
    'use_real_data': True,  # whether to use real trip data
}

# Create the output directories up front so later cells can write into them
os.makedirs(DATA_CONFIG['data_dir'], exist_ok=True)
os.makedirs(DATA_CONFIG['processed_dir'], exist_ok=True)

# Echo both configurations so the notebook output records what was used
print("📋 数据下载配置:")
for key, value in DATA_CONFIG.items():
    print(f"   {key}: {value}")

print("\n🚗 仿真环境配置:")
for key, value in SIMULATION_CONFIG.items():
    print(f"   {key}: {value}")

print(f"\n📁 数据存储路径: {os.path.abspath(DATA_CONFIG['data_dir'])}")
print("=" * 50)

## 3. 下载NYC环境数据
使用download_data模块下载真实的NYC Yellow Taxi数据。

In [None]:
# Download NYC Yellow Taxi trip data.
print("🚕 开始下载NYC Yellow Taxi数据...")
downloaded_files = []
# 'YYYY-MM' label of each entry in downloaded_files (same order), so that the
# follow-up analysis targets a month that was actually downloaded.
downloaded_months = []

# Months to fetch, as 'YYYY-MM' strings.
# NOTE(review): DATA_CONFIG['start_date'] / ['end_date'] are not consulted
# here — confirm which of the two is meant to be the source of truth.
MONTHS_TO_DOWNLOAD = ['2025-01', '2025-02']


def download_nyc_data_wrapper(year_month_str):
    """
    Download one month of NYC data given a 'YYYY-MM' string.

    Splits the string into year and month, converts both to int and forwards
    them to download_data(year, month, 'nyc').

    Args:
        year_month_str: Month identifier in 'YYYY-MM' format.

    Returns:
        Whatever download_data returns for that month.

    Raises:
        ValueError: If the string does not split into exactly two
            integer-convertible parts.
    """
    year, month = year_month_str.split('-')
    return download_data(int(year), int(month), 'nyc')


if DATA_DOWNLOAD_AVAILABLE:
    try:
        # Download every configured month; one failing month must not stop the others.
        for month_str in MONTHS_TO_DOWNLOAD:
            print(f"\n📅 下载 {month_str} 数据...")

            try:
                file_path = download_nyc_data_wrapper(month_str)

                # Success is judged by the presence of the expected parquet file,
                # not by the value returned from download_data.
                expected_file = f"data/parquet/yellow_tripdata_{month_str}.parquet"

                if os.path.exists(expected_file):
                    downloaded_files.append(expected_file)
                    downloaded_months.append(month_str)
                    file_size = os.path.getsize(expected_file) / (1024 * 1024)  # MB
                    print(f"   ✓ 成功下载: {expected_file}")
                    print(f"   📦 文件大小: {file_size:.1f} MB")
                else:
                    print(f"   ❌ 下载失败: {month_str}")

            except Exception as e:
                print(f"   ❌ 下载 {month_str} 时出错: {e}")

        print(f"\n📊 总共下载了 {len(downloaded_files)} 个文件")

        # When at least one file is available, print a short overview of it.
        if downloaded_files:
            print("\n📋 数据概述:")
            try:
                # Inspect the first downloaded file to learn the schema.
                sample_file = downloaded_files[0]
                # read_parquet has no nrows argument, so load and then truncate.
                df_sample = pd.read_parquet(sample_file).head(1000)

                print(f"   🔢 样本数据形状: {df_sample.shape}")
                print(f"   📊 列名: {list(df_sample.columns)}")
                if 'tpep_pickup_datetime' in df_sample.columns:
                    print(f"   📅 时间范围: {df_sample['tpep_pickup_datetime'].min()} 到 {df_sample['tpep_pickup_datetime'].max()}")

                # Run describedata on the same month as sample_file; the month
                # used to be hard-coded and could point at a failed download.
                try:
                    year, month = downloaded_months[0].split('-')
                    print(f"\n📊 使用describedata分析数据...")
                    describedata(int(year), int(month), 'nyc')
                except Exception as desc_error:
                    print(f"   ⚠️ describedata调用失败: {desc_error}")

            except Exception as e:
                print(f"   ⚠️ 读取数据概述时出错: {e}")
        else:
            print("❌ 没有成功下载任何文件")

    except Exception as e:
        print(f"❌ 数据下载过程出错: {e}")
        print("将继续使用合成数据进行演示")
else:
    print("⚠️ 数据下载模块不可用，跳过真实数据下载")
    print("将使用合成数据进行演示")

print("=" * 50)

## 4. 数据预处理和清理
对下载的原始数据进行清理、格式化和预处理，确保数据质量。

In [None]:
# Inspect the column structure of the data that was actually downloaded
print("🔍 检查实际下载数据的列结构...")

if downloaded_files:
    try:
        # Read the first file and keep only its first 1000 rows for inspection
        sample_file = downloaded_files[0]
        df_sample = pd.read_parquet(sample_file).head(1000)
        
        print(f"📊 文件: {os.path.basename(sample_file)}")
        print(f"📋 数据形状: {df_sample.shape}")
        print(f"📝 所有列名:")
        for i, col in enumerate(df_sample.columns):
            print(f"   {i+1:2d}. {col}")
        
        # Collect columns whose (lower-cased) name contains a location-related keyword
        coordinate_cols = [col for col in df_sample.columns if 
                          any(keyword in col.lower() for keyword in 
                              ['lat', 'lon', 'pickup', 'dropoff', 'location'])]
        
        print(f"\n🗺️ 可能的坐标相关列:")
        for col in coordinate_cols:
            print(f"   • {col}")
            # Show the first non-null value, if the column has any
            if not df_sample[col].isnull().all():
                print(f"     样本值: {df_sample[col].dropna().iloc[0]}")
        
        # Show a few sample rows
        print(f"\n👀 数据样本 (前3行):")
        print(df_sample.head(3))
        
    except Exception as e:
        print(f"❌ 检查数据结构时出错: {e}")
        import traceback
        traceback.print_exc()
else:
    print("⚠️ 没有下载的文件可供检查")

print("=" * 50)

In [None]:
# 数据预处理函数 (更新版 - 适配2025年数据格式)
def clean_nyc_data(df, geo_bounds=None):
    """
    Clean NYC taxi trip records that use the LocationID-based schema.

    Steps, in order: drop rows with a null pickup/dropoff LocationID, keep
    LocationIDs in 1..263, derive pickup hour/weekday and trip duration
    (minutes) from the tpep_* timestamps when present, then keep rows with
    duration 1..240, trip_distance 0.1..50, fare_amount 1..200 and
    passenger_count 1..6. Finally approximate coordinates are attached through
    add_approximate_coordinates. Progress is printed after each step.

    Args:
        df: Raw trip DataFrame.
        geo_bounds: Geographic bounds dict; accepted but not used, because the
            schema carries LocationIDs rather than coordinates.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index. If any of the
        required columns is missing, the input is returned unchanged.
    """
    print(f"📊 原始数据形状: {df.shape}")

    rows_at_start = df.shape[0]

    # Bail out early when the schema is not the expected one.
    mandatory = ('PULocationID', 'DOLocationID', 'trip_distance', 'fare_amount')
    absent = [name for name in mandatory if name not in df.columns]
    if absent:
        print(f"❌ 缺少必要列: {absent}")
        return df

    def keep_rows(frame, mask, label):
        """Return frame[mask] and report how many rows survived the filter."""
        rows_before = frame.shape[0]
        kept = frame[mask]
        print(f"{label}: 保留 {kept.shape[0]} / {rows_before} 行")
        return kept

    # 1. Rows without a pickup or dropoff zone are unusable.
    df = df.dropna(subset=['PULocationID', 'DOLocationID'])
    print(f"🧹 删除LocationID空值: {rows_at_start - df.shape[0]} 行")

    # 2. Keep LocationIDs inside the 1-263 range.
    zones_ok = df['PULocationID'].between(1, 263) & df['DOLocationID'].between(1, 263)
    df = keep_rows(df, zones_ok, "🗺️ LocationID范围过滤")

    # 3. Timestamp-derived features and duration filter.
    if 'tpep_pickup_datetime' in df.columns:
        pickup_ts = pd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_pickup_datetime'] = pickup_ts
        df['pickup_hour'] = pickup_ts.dt.hour
        df['pickup_weekday'] = pickup_ts.dt.weekday

        if 'tpep_dropoff_datetime' in df.columns:
            df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
            elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
            df['trip_duration'] = elapsed.dt.total_seconds() / 60

            # Plausible trips last between 1 minute and 4 hours.
            df = keep_rows(df, df['trip_duration'].between(1, 240), "⏱️ 行程时长过滤")

    # 4. Trip distance must lie in [0.1, 50].
    if 'trip_distance' in df.columns:
        df = keep_rows(df, df['trip_distance'].between(0.1, 50), "📏 距离过滤")

    # 5. Fare must lie in [1, 200] dollars.
    if 'fare_amount' in df.columns:
        df = keep_rows(df, df['fare_amount'].between(1, 200), "💰 费用过滤")

    # 6. Passenger count must lie in [1, 6]; rows with a null count are dropped too.
    if 'passenger_count' in df.columns:
        df = keep_rows(df, df['passenger_count'].between(1, 6), "👥 乘客数过滤")

    # 7. Attach approximate coordinates derived from the LocationIDs.
    df = add_approximate_coordinates(df)

    print(f"✅ 清理完成，最终数据形状: {df.shape}")
    return df.reset_index(drop=True)

def add_approximate_coordinates(df):
    """
    Attach approximate pickup/dropoff coordinates derived from LocationIDs.

    This is a synthetic placeholder, not a real zone lookup: each LocationID
    in 1..263 is assigned a cell of a 16-column grid laid over a Manhattan
    bounding box, and Gaussian jitter (sigma 0.001 degrees) is added so trips
    from the same zone do not share one exact point. A real application
    should use the official taxi-zone lookup table instead.

    The grid has ceil(263 / 16) = 17 rows, so every LocationID gets its own
    cell. The coordinates are computed with vectorised NumPy operations, and
    the input frame is never modified.

    Args:
        df: DataFrame with 'PULocationID' and 'DOLocationID' columns.

    Returns:
        A new DataFrame with 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude' added, restricted to rows
        whose two LocationIDs are valid (original index labels are kept).
        If the mapping fails for any reason the original ``df`` is returned
        untouched and a warning is printed.
    """
    # (south-west corner, north-east corner) of the synthetic grid
    manhattan_bounds = [(40.7000, -74.0200), (40.8800, -73.9300)]
    max_location_id = 263
    grid_cols = 16
    # With only 16 rows, IDs 257-263 wrapped onto row 0 and collided with
    # IDs 1-7; ceil-divide so each ID owns a distinct cell.
    grid_rows = -(-max_location_id // grid_cols)

    (min_lat, min_lon), (max_lat, max_lon) = manhattan_bounds
    lat_range = max_lat - min_lat
    lon_range = max_lon - min_lon

    def ids_to_coords(location_ids):
        """Map a Series of LocationIDs to (lat, lon) arrays; invalid IDs become NaN."""
        ids = pd.to_numeric(location_ids, errors='coerce').to_numpy(
            dtype=float, na_value=np.nan
        )
        valid = (ids >= 1) & (ids <= max_location_id)
        # Substitute a harmless ID for invalid entries so the arithmetic stays
        # finite; those entries are blanked out again below.
        cell = np.where(valid, ids, 1.0) - 1
        row = cell // grid_cols
        col = np.floor(cell % grid_cols)

        lat = min_lat + (row / grid_rows) * lat_range
        lon = min_lon + (col / grid_cols) * lon_range

        # Jitter of roughly 100 m to mimic spread inside a zone.
        lat = lat + np.random.normal(0, 0.001, size=ids.shape)
        lon = lon + np.random.normal(0, 0.001, size=ids.shape)

        lat[~valid] = np.nan
        lon[~valid] = np.nan
        return lat, lon

    try:
        pickup_lat, pickup_lon = ids_to_coords(df['PULocationID'])
        dropoff_lat, dropoff_lon = ids_to_coords(df['DOLocationID'])

        # assign() builds a new frame, leaving the caller's DataFrame untouched.
        result = df.assign(
            pickup_latitude=pickup_lat,
            pickup_longitude=pickup_lon,
            dropoff_latitude=dropoff_lat,
            dropoff_longitude=dropoff_lon,
        )

        # Drop rows for which either endpoint could not be mapped.
        coord_cols = ['pickup_latitude', 'pickup_longitude',
                      'dropoff_latitude', 'dropoff_longitude']
        result = result[result[coord_cols].notna().all(axis=1)]

        print(f"🗺️ 添加近似坐标: {result.shape[0]} 条记录获得坐标")
        return result

    except Exception as e:
        # Best-effort step: report and hand back the unmodified input.
        print(f"⚠️ 坐标映射失败: {e}")
        return df

# Process the downloaded files: clean each one, sample it, then merge the samples
processed_data = []

if downloaded_files:
    print("🔧 开始数据预处理 (2025年新格式)...")
    
    for file_path in downloaded_files:
        try:
            print(f"\n📂 处理文件: {os.path.basename(file_path)}")
            
            # Load the raw file
            df_raw = pd.read_parquet(file_path)
            
            # Clean it (geo_bounds is passed through but clean_nyc_data does not use it)
            df_clean = clean_nyc_data(df_raw, DATA_CONFIG.get('geo_bounds'))
            
            # Down-sample when the cleaned file exceeds the configured sample size
            if df_clean.shape[0] > DATA_CONFIG['sample_size']:
                df_sample = df_clean.sample(n=DATA_CONFIG['sample_size'], 
                                          random_state=DATA_CONFIG['random_seed'])
                print(f"🎲 随机采样: {DATA_CONFIG['sample_size']} 行")
            else:
                df_sample = df_clean
            
            processed_data.append(df_sample)
            
        except Exception as e:
            # A failing file is reported and skipped; the remaining files are still processed
            print(f"❌ 处理文件 {file_path} 时出错: {e}")
            import traceback
            traceback.print_exc()
    
    # Merge the per-file samples into one frame
    if processed_data:
        df_final = pd.concat(processed_data, ignore_index=True)
        print(f"\n📊 合并后的最终数据形状: {df_final.shape}")
        
        # Summary statistics
        # NOTE(review): the TLC yellow-taxi data dictionary documents
        # trip_distance in miles, so the "km" label below may be wrong — confirm.
        print("\n📈 数据统计:")
        if 'trip_distance' in df_final.columns:
            print(f"   平均行程距离: {df_final['trip_distance'].mean():.2f} km")
        if 'fare_amount' in df_final.columns:
            print(f"   平均费用: ${df_final['fare_amount'].mean():.2f}")
        if 'passenger_count' in df_final.columns:
            print(f"   平均乘客数: {df_final['passenger_count'].mean():.1f}")
        if 'trip_duration' in df_final.columns:
            print(f"   平均行程时长: {df_final['trip_duration'].mean():.1f} 分钟")
        
        # LocationID distribution
        # NOTE(review): these lookups are outside any try block and assume the
        # LocationID columns exist in df_final.
        print(f"\n🗺️ LocationID分布:")
        print(f"   上车地点范围: {df_final['PULocationID'].min():.0f} - {df_final['PULocationID'].max():.0f}")
        print(f"   下车地点范围: {df_final['DOLocationID'].min():.0f} - {df_final['DOLocationID'].max():.0f}")
        print(f"   热门上车地点: {df_final['PULocationID'].mode().iloc[0]:.0f} (ID)")
        
    else:
        # Every file failed: fall back to an empty frame so later cells can test .empty
        df_final = pd.DataFrame()
        print("❌ 没有成功处理的数据")
else:
    print("⚠️ 没有下载数据，将使用合成数据")
    df_final = pd.DataFrame()

print("=" * 50)

## 5. 数据存储到本地文件
将处理后的数据保存到本地，便于后续使用和分析。

In [None]:
# Save the processed data; the timestamp makes each run's file names unique
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

if not df_final.empty:
    # Paths of everything written, keyed by format name
    save_paths = {}
    
    try:
        # 1. CSV (easy to inspect and analyse)
        csv_path = os.path.join(DATA_CONFIG['processed_dir'], f'nyc_taxi_processed_{timestamp}.csv')
        df_final.to_csv(csv_path, index=False)
        save_paths['csv'] = csv_path
        print(f"💾 CSV文件已保存: {csv_path}")
        
        # 2. Parquet (compact storage)
        parquet_path = os.path.join(DATA_CONFIG['processed_dir'], f'nyc_taxi_processed_{timestamp}.parquet')
        df_final.to_parquet(parquet_path, index=False)
        save_paths['parquet'] = parquet_path
        print(f"💾 Parquet文件已保存: {parquet_path}")
        
        # 3. Plain-text summary report
        summary_path = os.path.join(DATA_CONFIG['processed_dir'], f'data_summary_{timestamp}.txt')
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write("NYC Taxi Data Processing Summary\n")
            f.write("=" * 40 + "\n")
            f.write(f"Processing Time: {datetime.now()}\n")
            f.write(f"Data Shape: {df_final.shape}\n")
            f.write(f"Columns: {list(df_final.columns)}\n\n")
            
            # Basic statistics
            f.write("Data Statistics:\n")
            f.write("-" * 20 + "\n")
            if 'trip_distance' in df_final.columns:
                f.write(f"Trip Distance - Mean: {df_final['trip_distance'].mean():.2f}km, "
                       f"Std: {df_final['trip_distance'].std():.2f}km\n")
            if 'fare_amount' in df_final.columns:
                f.write(f"Fare Amount - Mean: ${df_final['fare_amount'].mean():.2f}, "
                       f"Std: ${df_final['fare_amount'].std():.2f}\n")
            
            # Trips per pickup hour, in clock order
            if 'pickup_hour' in df_final.columns:
                f.write("\nHourly Distribution:\n")
                hourly_dist = df_final['pickup_hour'].value_counts().sort_index()
                for hour, count in hourly_dist.items():
                    f.write(f"  Hour {hour:2d}: {count:4d} trips\n")
        
        save_paths['summary'] = summary_path
        print(f"📋 数据摘要已保存: {summary_path}")
        
        # Report the size of each file that was written
        for format_name, path in save_paths.items():
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"   {format_name.upper()}: {size_mb:.1f} MB")
            
    except Exception as e:
        # Files written before the failure stay on disk
        print(f"❌ 保存数据时出错: {e}")

else:
    print("⚠️ 没有数据需要保存")

# List the most recently written files in the processed-data directory.
processed_files = [f for f in os.listdir(DATA_CONFIG['processed_dir'])
                  if f.endswith(('.csv', '.parquet', '.txt'))]

# Order by modification time (oldest first). Sorting by name grouped the files
# by prefix, so the "latest five" never included the data_summary_* reports.
processed_files.sort(
    key=lambda name: os.path.getmtime(os.path.join(DATA_CONFIG['processed_dir'], name))
)

print(f"\n📁 处理后的数据目录 ({DATA_CONFIG['processed_dir']}):")
for file in processed_files[-5:]:  # the five most recently modified files
    file_path = os.path.join(DATA_CONFIG['processed_dir'], file)
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    mod_time = datetime.fromtimestamp(os.path.getmtime(file_path))
    print(f"   📄 {file} ({size_mb:.1f}MB, {mod_time.strftime('%Y-%m-%d %H:%M')})")

print("=" * 50)

## 6. 初始化NYC仿真环境
使用 `NYEEnvironment` 类（即上文从 `src.NYEEnvironment` 导入的环境类）初始化仿真环境，并查看其中的地理范围、充电站和车辆信息。

In [None]:
# Initialise the NYC electric-taxi simulation environment and print its state
print("🏙️ 初始化NYC电动出租车仿真环境...")

try:
    # Create the environment instance
    nyc_env = NYEEnvironment()
    print("✓ NYC环境创建成功")
    
    # Basic environment information
    print(f"\n🏗️ 环境配置信息:")
    print(f"   🗺️ 地理范围: 曼哈顿 ({nyc_env.manhattan_bounds})")
    print(f"   🔋 充电站数量: {len(nyc_env.charging_stations)}")
    print(f"   🚗 车辆数量: {len(nyc_env.vehicles)}")
    # Fix: use current_time rather than current_time_minutes
    print(f"   ⏰ 当前时间: {nyc_env.current_time}")
    
    # Charging-station locations
    if nyc_env.charging_stations:
        print(f"\n⚡ 充电站分布:")
        for i, (station_id, station) in enumerate(list(nyc_env.charging_stations.items())[:5]):  # first 5 only
            # Fix: a station may be an object with attributes or a plain dict
            if hasattr(station, 'location'):
                lat, lon = station.location
                capacity = station.capacity if hasattr(station, 'capacity') else 'N/A'
            elif isinstance(station, dict):
                lat, lon = station.get('location', (0, 0))
                capacity = station.get('capacity', 'N/A')
            else:
                # Unknown shape: show placeholder values instead of failing
                lat, lon = 0, 0
                capacity = 'N/A'
            
            print(f"   站点 {i+1}: ({lat:.4f}, {lon:.4f}) - {capacity}个充电桩")
        if len(nyc_env.charging_stations) > 5:
            print(f"   ... 还有 {len(nyc_env.charging_stations) - 5} 个充电站")
    else:
        print(f"\n⚡ 充电站分布: 未初始化")
    
    # Initial fleet state
    print(f"\n🚗 车队状态:")
    if nyc_env.vehicles:
        # Only attribute-style vehicles are counted here (no dict fallback in this cell)
        available_vehicles = [v for v in nyc_env.vehicles.values() if hasattr(v, 'available') and v.available]
        charging_vehicles = [v for v in nyc_env.vehicles.values() if hasattr(v, 'is_charging') and v.is_charging]
        
        print(f"   可用车辆: {len(available_vehicles)}")
        print(f"   充电中车辆: {len(charging_vehicles)}")
        
        # Details of one example vehicle
        sample_vehicle = list(nyc_env.vehicles.values())[0]
        if hasattr(sample_vehicle, 'location'):
            lat, lon = sample_vehicle.location
            print(f"   示例车辆位置: ({lat:.4f}, {lon:.4f})")
        if hasattr(sample_vehicle, 'battery_level'):
            print(f"   示例车辆电量: {sample_vehicle.battery_level:.1%}")
    else:
        print(f"   车辆未初始化")
    
    print("✅ 环境初始化完成")
    
except Exception as e:
    # NOTE(review): this handler covers the whole cell, so an error raised while
    # printing the report also resets nyc_env to None even though the environment
    # itself was created — confirm that this is intended.
    print(f"❌ 环境初始化失败: {e}")
    print("详细错误信息:")
    import traceback
    traceback.print_exc()
    nyc_env = None

print("=" * 50)

## 7. 配置环境参数
设置仿真环境的各种参数，包括车辆配置、需求模式等。

In [None]:
# Configure the simulation environment.
# request_generator is bound before anything can fail, so the order-generation
# cell can always test it; previously an early exception left it undefined.
request_generator = None

if nyc_env is not None:
    print("⚙️ 配置仿真环境参数...")
    
    try:
        # 1. Clock: the environment exposes current_time (not current_time_minutes)
        nyc_env.current_time = SIMULATION_CONFIG['start_time']
        print(f"⏰ 设置开始时间: {SIMULATION_CONFIG['start_time'].strftime('%Y-%m-%d %H:%M')}")
        
        # 2. Vehicle parameters: set the initial battery level on every vehicle.
        # Vehicles may be attribute-style objects or plain dicts; only vehicles
        # that were actually updated are counted.
        vehicle_config_applied = 0
        if nyc_env.vehicles:
            for vehicle in nyc_env.vehicles.values():
                if hasattr(vehicle, 'battery_level'):
                    vehicle.battery_level = SIMULATION_CONFIG['initial_battery']
                    vehicle_config_applied += 1
                elif isinstance(vehicle, dict):
                    vehicle['battery_level'] = SIMULATION_CONFIG['initial_battery']
                    vehicle_config_applied += 1
        
        print(f"🔋 配置 {vehicle_config_applied} 辆车的初始电量: {SIMULATION_CONFIG['initial_battery']:.0%}")
        
        # 3. Request generator (stays None when construction fails)
        try:
            request_generator = NYCRequestGenerator()
            print("✓ 请求生成器创建成功")
        except Exception as e:
            print(f"⚠️ 请求生成器创建失败: {e}")
            request_generator = None
        
        # Report which demand source will be used. The real data is not wired
        # into the generator here; this only announces the mode.
        if not df_final.empty and SIMULATION_CONFIG['use_real_data']:
            print("📊 将使用真实数据模式生成请求")
        else:
            print("🎲 将使用合成数据模式生成请求")
        
        # 4. Demand configuration
        demand_config = {
            'base_requests_per_hour': SIMULATION_CONFIG['requests_per_hour'],
            'time_step_minutes': SIMULATION_CONFIG['time_step_minutes'],
            'duration_hours': SIMULATION_CONFIG['duration_hours']
        }
        
        print(f"📋 需求配置:")
        for key, value in demand_config.items():
            print(f"   {key}: {value}")
        
        # 5. Verify the resulting environment state
        print(f"\n🔍 环境状态验证:")
        print(f"   车辆总数: {len(nyc_env.vehicles)}")
        
        if nyc_env.vehicles:
            available_count = 0
            battery_levels = []
            
            for vehicle in nyc_env.vehicles.values():
                # Availability: attribute first, dict key (default True) otherwise
                if hasattr(vehicle, 'available'):
                    if vehicle.available:
                        available_count += 1
                elif isinstance(vehicle, dict):
                    if vehicle.get('available', True):
                        available_count += 1
                
                # Battery level: attribute first, dict key (default 0.8) otherwise
                if hasattr(vehicle, 'battery_level'):
                    battery_levels.append(vehicle.battery_level)
                elif isinstance(vehicle, dict):
                    battery_levels.append(vehicle.get('battery_level', 0.8))
            
            print(f"   可用车辆: {available_count}")
            if battery_levels:
                print(f"   平均电量: {np.mean(battery_levels):.1%}")
        else:
            print(f"   可用车辆: 0 (车辆未初始化)")
            
        print(f"   充电站: {len(nyc_env.charging_stations)} 个")
        
        # Number of time steps implied by the configured duration and step size
        total_steps = SIMULATION_CONFIG['duration_hours'] * 60 // SIMULATION_CONFIG['time_step_minutes']
        print(f"   预计仿真步数: {total_steps}")
        
        print("✅ 环境参数配置完成")
        
    except Exception as e:
        print(f"❌ 环境配置失败: {e}")
        import traceback
        traceback.print_exc()

else:
    print("⚠️ 环境未初始化，跳过参数配置")
    request_generator = None

print("=" * 50)

## 8. 生成预订单数据
在仿真环境中生成出租车预订单，包括起点、终点、时间等信息。

In [None]:
# Generate pre-order (ride request) data.
print("📋 开始生成预订单数据...")

# Both result containers are bound on every path, so the save cell never
# depends on which branch ran (or on values left over from an earlier run).
generated_orders = []
simulation_log = []

# Hard cap on the number of simulated steps, to keep the demo run short.
MAX_SIMULATION_STEPS = 100
# Hours of day with raised / reduced demand.
PEAK_HOURS = {7, 8, 17, 18, 19}
NIGHT_HOURS = {22, 23, 0, 1, 2, 3, 4, 5}

if nyc_env is not None and request_generator is not None:
    
    try:
        # Simulation window
        current_time = SIMULATION_CONFIG['start_time']
        end_time = current_time + timedelta(hours=SIMULATION_CONFIG['duration_hours'])
        step_minutes = SIMULATION_CONFIG['time_step_minutes']
        
        step_count = 0
        total_requests = 0
        
        print(f"⏰ 仿真时间: {current_time.strftime('%H:%M')} - {end_time.strftime('%H:%M')}")
        print(f"⚡ 时间步长: {step_minutes} 分钟")
        
        # Expected requests per step at normal demand
        base_rate = SIMULATION_CONFIG['requests_per_hour'] / 60 * step_minutes
        
        # Simulation loop
        while current_time < end_time and step_count < MAX_SIMULATION_STEPS:
            
            # 1. Generate the orders for this time step
            try:
                # Demand multiplier by time of day: peak x1.5, late night x0.3
                hour = current_time.hour
                if hour in PEAK_HOURS:
                    num_orders = np.random.poisson(base_rate * 1.5)
                elif hour in NIGHT_HOURS:
                    num_orders = np.random.poisson(base_rate * 0.3)
                else:
                    num_orders = np.random.poisson(base_rate)
                
                step_orders = []
                for _ in range(num_orders):
                    try:
                        order = request_generator.generate_request(current_time)
                        step_orders.append(order)
                        total_requests += 1
                    except Exception as e:
                        # One failed request must not abort the whole step
                        print(f"⚠️ 生成订单时出错: {e}")
                
                generated_orders.extend(step_orders)
                
                # Count available vehicles (attribute-style objects or dicts)
                available_vehicle_count = 0
                for v in nyc_env.vehicles.values():
                    if hasattr(v, 'available'):
                        if v.available:
                            available_vehicle_count += 1
                    elif isinstance(v, dict):
                        if v.get('available', True):
                            available_vehicle_count += 1
                
                log_entry = {
                    'step': step_count,
                    'time': current_time,
                    'hour': current_time.hour,
                    'orders_generated': len(step_orders),
                    'total_orders': len(generated_orders),
                    'available_vehicles': available_vehicle_count
                }
                simulation_log.append(log_entry)
                
                # Progress line every 10 steps
                if step_count % 10 == 0:
                    print(f"   Step {step_count}: {current_time.strftime('%H:%M')} - "
                          f"生成 {len(step_orders)} 个订单 (总计: {total_requests})")
                
            except Exception as e:
                print(f"❌ 第 {step_count} 步出错: {e}")
            
            # 2. Advance the simulation clock and mirror it into the environment
            current_time += timedelta(minutes=step_minutes)
            nyc_env.current_time = current_time
            step_count += 1
        
        # Say so explicitly when the step cap, not the configured end time,
        # stopped the loop; otherwise the truncation goes unnoticed.
        if current_time < end_time:
            print(f"\n⚠️ 达到步数上限 {MAX_SIMULATION_STEPS}，仿真在 {current_time.strftime('%H:%M')} 提前结束 "
                  f"(计划结束时间 {end_time.strftime('%H:%M')})")
        
        print(f"\n✅ 仿真完成!")
        print(f"   总步数: {step_count}")
        print(f"   生成订单数: {len(generated_orders)}")
        print(f"   仿真时长: {step_count * step_minutes} 分钟")
        
        # Analyse the generated orders
        if generated_orders:
            print(f"\n📊 订单数据分析:")
            
            # Orders per hour, in clock order
            order_hours = [order.request_time.hour for order in generated_orders]
            hour_counts = pd.Series(order_hours).value_counts().sort_index()
            
            # The heading promises the five busiest hours, so rank by count
            # rather than taking the first five hours of the day.
            print(f"   时间分布 (前5个高峰时段):")
            for hour, count in hour_counts.nlargest(5).items():
                print(f"     {hour:2d}点: {count:3d} 个订单")
            
            # Trip distance
            distances = [order.trip_distance for order in generated_orders]
            print(f"   行程距离: 平均 {np.mean(distances):.2f}km, 最大 {np.max(distances):.2f}km")
            
            # Fare
            fares = [order.total_amount for order in generated_orders]
            print(f"   订单价值: 平均 ${np.mean(fares):.2f}, 总计 ${np.sum(fares):.2f}")
            
            # Geographic extent of the pickups
            pickup_lats = [order.pickup_location[0] for order in generated_orders]
            pickup_lons = [order.pickup_location[1] for order in generated_orders]
            print(f"   地理范围: 纬度 {min(pickup_lats):.4f}-{max(pickup_lats):.4f}, "
                  f"经度 {min(pickup_lons):.4f}-{max(pickup_lons):.4f}")
        
    except Exception as e:
        print(f"❌ 订单生成过程出错: {e}")
        import traceback
        traceback.print_exc()
        
else:
    print("⚠️ 环境或生成器未初始化，生成示例订单数据...")
    
    # Fallback: 50 sample orders spaced 5 minutes apart from the configured start time
    try:
        sample_generator = NYCRequestGenerator()
        for i in range(50):
            sample_time = SIMULATION_CONFIG['start_time'] + timedelta(minutes=i*5)
            order = sample_generator.generate_request(sample_time)
            generated_orders.append(order)
        
        print(f"✅ 生成了 {len(generated_orders)} 个示例订单")
    except Exception as e:
        print(f"❌ 示例订单生成失败: {e}")

print("=" * 50)

## 9. 保存仿真结果
将生成的预订单数据和仿真结果导出保存，用于进一步分析和可视化。

In [None]:
# Save the simulation results
print("💾 保存仿真结果...")

# Timestamped file names keep successive runs from overwriting each other
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_dir = os.path.join('results', 'simulation_orders')
os.makedirs(results_dir, exist_ok=True)

if generated_orders:
    try:
        # 1. Convert the orders to a DataFrame via each order's to_dict()
        orders_data = []
        for order in generated_orders:
            order_dict = order.to_dict()
            orders_data.append(order_dict)
        
        df_orders = pd.DataFrame(orders_data)
        
        # 2. Write the orders as CSV and Parquet
        orders_csv_path = os.path.join(results_dir, f'simulation_orders_{timestamp}.csv')
        df_orders.to_csv(orders_csv_path, index=False)
        print(f"📋 订单数据已保存: {orders_csv_path}")
        
        orders_parquet_path = os.path.join(results_dir, f'simulation_orders_{timestamp}.parquet')
        df_orders.to_parquet(orders_parquet_path, index=False)
        print(f"📋 订单数据 (Parquet): {orders_parquet_path}")
        
        # 3. Write the simulation log, if one exists and is non-empty.
        # At notebook top level locals() is the global namespace, so this checks
        # whether an earlier cell defined simulation_log.
        if 'simulation_log' in locals() and simulation_log:
            log_df = pd.DataFrame(simulation_log)
            log_path = os.path.join(results_dir, f'simulation_log_{timestamp}.csv')
            log_df.to_csv(log_path, index=False)
            print(f"📊 仿真日志已保存: {log_path}")
        
        # 4. Write a detailed plain-text report
        report_path = os.path.join(results_dir, f'simulation_report_{timestamp}.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("NYC Electric Taxi Simulation Report\n")
            f.write("=" * 50 + "\n")
            f.write(f"Generated: {datetime.now()}\n")
            f.write(f"Simulation Config: {SIMULATION_CONFIG}\n\n")
            
            # Order statistics
            f.write("Order Statistics:\n")
            f.write("-" * 30 + "\n")
            f.write(f"Total Orders: {len(generated_orders)}\n")
            f.write(f"Time Range: {min(order.request_time for order in generated_orders)} to "
                   f"{max(order.request_time for order in generated_orders)}\n")
            
            # Geographic statistics (pickup_location is indexed as [0]=lat, [1]=lon here)
            pickup_lats = [order.pickup_location[0] for order in generated_orders]
            pickup_lons = [order.pickup_location[1] for order in generated_orders]
            f.write(f"Geographic Range:\n")
            f.write(f"  Latitude: {min(pickup_lats):.6f} to {max(pickup_lats):.6f}\n")
            f.write(f"  Longitude: {min(pickup_lons):.6f} to {max(pickup_lons):.6f}\n")
            
            # Distance and fare statistics
            distances = [order.trip_distance for order in generated_orders]
            fares = [order.total_amount for order in generated_orders]
            f.write(f"\nTrip Statistics:\n")
            f.write(f"  Average Distance: {np.mean(distances):.2f} km\n")
            f.write(f"  Average Fare: ${np.mean(fares):.2f}\n")
            f.write(f"  Total Revenue: ${np.sum(fares):.2f}\n")
            
            # Orders per hour, in clock order
            hour_distribution = pd.Series([order.request_time.hour for order in generated_orders]).value_counts().sort_index()
            f.write(f"\nHourly Distribution:\n")
            for hour, count in hour_distribution.items():
                f.write(f"  {hour:2d}:00 - {count:3d} orders\n")
        
        print(f"📄 仿真报告已保存: {report_path}")
        
        # 5. Show what was written and how large it is
        saved_files = [
            ('订单数据 (CSV)', orders_csv_path),
            ('订单数据 (Parquet)', orders_parquet_path),
            ('仿真报告', report_path)
        ]
        
        if 'simulation_log' in locals() and simulation_log:
            saved_files.append(('仿真日志', log_path))
        
        print(f"\n📁 保存的文件:")
        total_size = 0
        for desc, path in saved_files:
            size_mb = os.path.getsize(path) / (1024 * 1024)
            total_size += size_mb
            print(f"   {desc}: {os.path.basename(path)} ({size_mb:.2f} MB)")
        
        print(f"\n💾 总文件大小: {total_size:.2f} MB")
        
        # 6. Quick preview
        # NOTE(review): assumes order.to_dict() yields these column names — confirm
        # against NYCRequest.to_dict.
        print(f"\n👀 订单数据预览 (前5条):")
        print(df_orders[['request_id', 'pickup_latitude', 'pickup_longitude', 
                        'dropoff_latitude', 'dropoff_longitude', 'total_amount']].head())
        
    except Exception as e:
        print(f"❌ 保存结果时出错: {e}")
        import traceback
        traceback.print_exc()

else:
    print("⚠️ 没有订单数据需要保存")

# List the most recently modified entries of the results directory.
# isdir (rather than exists) keeps os.listdir from raising if the path
# happens to be a regular file.
if os.path.isdir(results_dir):
    # Pair each entry with its full path and modification time once, so the
    # sort and the printout below agree on the same timestamp.
    dir_entries = []
    for entry_name in os.listdir(results_dir):
        entry_path = os.path.join(results_dir, entry_name)
        dir_entries.append((os.path.getmtime(entry_path), entry_name, entry_path))

    # Sort by modification time (name breaks ties) so the slice really is the
    # 10 newest entries; sorting by name alone groups files by their prefix
    # (orders_*, simulation_report_*, ...) instead of by age.
    dir_entries.sort(key=lambda item: (item[0], item[1]))

    print(f"\n📂 仿真结果目录 ({results_dir}):")
    for mtime, file, file_path in dir_entries[-10:]:  # the 10 newest entries
        size_kb = os.path.getsize(file_path) / 1024
        mod_time = datetime.fromtimestamp(mtime)
        print(f"   📄 {file} ({size_kb:.1f}KB, {mod_time.strftime('%m-%d %H:%M')})")

print("=" * 50)

## 10. 数据可视化分析
对生成的订单数据进行可视化分析，包括时间分布、地理分布、需求热力图等。

In [None]:
# Visual analysis of the generated orders: a 2x2 overview figure, an optional
# demand-density scatter plot, and a printed statistics summary.
# Relies on names created by earlier cells: generated_orders, nyc_env, timestamp.
print("📊 开始数据可视化分析...")

if generated_orders:

    # Apply the style sheet BEFORE customizing rcParams: plt.style.use('default')
    # resets rcParams to their defaults, so calling it afterwards would silently
    # discard the font settings below.
    plt.style.use('default')
    # Font fallbacks able to render CJK text, plus a minus sign those fonts have.
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False

    # Output directory for the figures
    viz_dir = os.path.join('results', 'visualizations')
    os.makedirs(viz_dir, exist_ok=True)

    try:
        # Per-order series shared by the plots and the summary.
        # pickup_location is indexed as [0] -> latitude, [1] -> longitude.
        hours = [order.request_time.hour for order in generated_orders]
        distances = [order.trip_distance for order in generated_orders]
        fares = [order.total_amount for order in generated_orders]
        pickup_lats = [order.pickup_location[0] for order in generated_orders]
        pickup_lons = [order.pickup_location[1] for order in generated_orders]

        mean_distance = np.mean(distances)
        mean_fare = np.mean(fares)

        # 1. Overview figure (2x2 grid)
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('NYC Electric Taxi Order Analysis', fontsize=16, fontweight='bold')

        # Orders per hour of day (bar chart)
        hour_counts = pd.Series(hours).value_counts().sort_index()
        axes[0, 0].bar(hour_counts.index, hour_counts.values, color='steelblue', alpha=0.7)
        axes[0, 0].set_title('Hourly Order Distribution')
        axes[0, 0].set_xlabel('Hour of Day')
        axes[0, 0].set_ylabel('Number of Orders')
        axes[0, 0].grid(True, alpha=0.3)

        # Trip distance histogram with the mean marked
        axes[0, 1].hist(distances, bins=30, color='green', alpha=0.7, edgecolor='black')
        axes[0, 1].set_title('Trip Distance Distribution')
        axes[0, 1].set_xlabel('Distance (km)')
        axes[0, 1].set_ylabel('Frequency')
        axes[0, 1].axvline(mean_distance, color='red', linestyle='--',
                          label=f'Mean: {mean_distance:.1f}km')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)

        # Fare histogram with the mean marked
        axes[1, 0].hist(fares, bins=30, color='orange', alpha=0.7, edgecolor='black')
        axes[1, 0].set_title('Fare Distribution')
        axes[1, 0].set_xlabel('Fare Amount ($)')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].axvline(mean_fare, color='red', linestyle='--',
                          label=f'Mean: ${mean_fare:.1f}')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

        # Pickup locations (longitude on x, latitude on y)
        axes[1, 1].scatter(pickup_lons, pickup_lats, alpha=0.6, s=1, c='red')
        axes[1, 1].set_title('Pickup Locations Geographic Distribution')
        axes[1, 1].set_xlabel('Longitude')
        axes[1, 1].set_ylabel('Latitude')
        axes[1, 1].grid(True, alpha=0.3)

        # Clamp the map panel to the environment's Manhattan bounds.
        # NOTE(review): presumably manhattan_bounds is
        # ((min_lat, min_lon), (max_lat, max_lon)) - confirm against NYEEnvironment.
        if nyc_env:
            bounds = nyc_env.manhattan_bounds
            axes[1, 1].set_xlim(bounds[0][1], bounds[1][1])
            axes[1, 1].set_ylim(bounds[0][0], bounds[1][0])

        plt.tight_layout()

        # Save, then display, the overview figure
        viz_path = os.path.join(viz_dir, f'order_analysis_{timestamp}.png')
        plt.savefig(viz_path, dpi=300, bbox_inches='tight')
        print(f"📈 订单分析图已保存: {viz_path}")
        plt.show()

        # 2. Demand density plot (only with enough orders for a KDE).
        # This step is optional, so it gets its own try block: a missing scipy
        # or a degenerate point set must not suppress the summary below.
        if len(generated_orders) > 50:
            try:
                from scipy.stats import gaussian_kde

                # Evaluate the kernel density estimate at every pickup point.
                # Done before creating the figure so a failure here leaves no
                # empty figure behind.
                xy = np.vstack([pickup_lons, pickup_lats])
                density = gaussian_kde(xy)(xy)

                fig, ax = plt.subplots(1, 1, figsize=(12, 8))
                scatter = ax.scatter(pickup_lons, pickup_lats, c=density,
                                   s=10, alpha=0.6, cmap='YlOrRd')
                ax.set_title('Order Demand Heatmap', fontsize=14, fontweight='bold')
                ax.set_xlabel('Longitude')
                ax.set_ylabel('Latitude')

                # Colour bar for the density scale
                cbar = plt.colorbar(scatter, ax=ax)
                cbar.set_label('Demand Density')

                # Overlay charging stations when the environment has any
                if nyc_env and nyc_env.charging_stations:
                    station_lats = [s['location'][0] for s in nyc_env.charging_stations]
                    station_lons = [s['location'][1] for s in nyc_env.charging_stations]
                    ax.scatter(station_lons, station_lats, c='blue', s=100,
                              marker='^', label='Charging Stations', alpha=0.8)
                    ax.legend()

                ax.grid(True, alpha=0.3)

                heatmap_path = os.path.join(viz_dir, f'demand_heatmap_{timestamp}.png')
                plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
                print(f"🔥 需求热力图已保存: {heatmap_path}")
                plt.show()
            except Exception as e:
                # Best-effort step: report the failure and carry on
                print(f"⚠️ 需求热力图生成失败，已跳过: {e}")
                import traceback
                traceback.print_exc()

        # 3. Statistics summary table
        print(f"\n📋 订单数据统计摘要:")
        print(f"{'指标':<20} {'数值':<15} {'备注'}")
        print("-" * 50)
        print(f"{'订单总数':<18} {len(generated_orders):<15} {'个'}")
        print(f"{'平均距离':<18} {mean_distance:<15.2f} {'km'}")
        print(f"{'平均费用':<18} ${mean_fare:<14.2f} {'USD'}")
        print(f"{'总收入':<18} ${np.sum(fares):<14.2f} {'USD'}")
        print(f"{'最长距离':<18} {np.max(distances):<15.2f} {'km'}")
        print(f"{'最高费用':<18} ${np.max(fares):<14.2f} {'USD'}")

        # The three busiest hours (value_counts sorts by count, descending)
        peak_hours = pd.Series(hours).value_counts().head(3)
        print(f"\n🕐 高峰时段:")
        for hour, count in peak_hours.items():
            print(f"   {hour:2d}:00 - {count:3d} 订单")

        # Geographic extent of the pickups, converted to km with the file's
        # factors: 111 km per degree of latitude, 85 km per degree of longitude.
        lat_range = max(pickup_lats) - min(pickup_lats)
        lon_range = max(pickup_lons) - min(pickup_lons)
        print(f"\n🗺️ 地理覆盖:")
        print(f"   纬度范围: {lat_range:.4f}° ({lat_range*111:.1f}km)")
        print(f"   经度范围: {lon_range:.4f}° ({lon_range*85:.1f}km)")  # about 85 km per degree at NYC's latitude

    except Exception as e:
        print(f"❌ 可视化过程出错: {e}")
        import traceback
        traceback.print_exc()

else:
    print("⚠️ 没有订单数据可供可视化")

print("=" * 50)