In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.font_manager as fm
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题
plt.style.use('seaborn-v0_8-whitegrid')  # 使用seaborn样式

# 创建中文字体对象
font_path = 'C:/Windows/Fonts/simhei.ttf'
if os.path.exists(font_path):
    chinese_font = fm.FontProperties(fname=font_path)
    print("使用SimHei字体文件路径")
else:
    # 回退到rcParams方法
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    chinese_font = None
    print("使用rcParams设置SimHei字体")

# 创建输出目录
output_dir = 'dengue_ensemble_results'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

使用SimHei字体文件路径


In [3]:
print("=== 开始加载数据 ===")
df = pd.read_csv('../pre_data/processed_dengue_data.csv')

# 确保日期列为datetime类型
df['Date'] = pd.to_datetime(df['Date'])

print(f"原始数据形状: {df.shape}")
print(f"数据时间范围: {df['Date'].min()} 至 {df['Date'].max()}")
print("数据加载完成！\n")

# 确保数据按地区和日期排序
df = df.sort_values(['Region', 'Date'])

# 特征工程
print("1. 特征工程")

# 创建滞后特征
print("  创建滞后特征...")
for lag in range(1, 4):
    df[f'Lag{lag}_Cases'] = df.groupby('Region')['Dengue_Cases'].shift(lag)

# 创建移动平均特征
print("  创建移动平均特征...")
df['MA3_Cases'] = df.groupby('Region')['Dengue_Cases'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean())
df['MA6_Cases'] = df.groupby('Region')['Dengue_Cases'].transform(
    lambda x: x.rolling(window=6, min_periods=1).mean())
df['MA12_Cases'] = df.groupby('Region')['Dengue_Cases'].transform(
    lambda x: x.rolling(window=12, min_periods=1).mean())

# 创建季节性特征
print("  创建季节性特征...")
df['Month_Sin'] = np.sin(2 * np.pi * df['MonthNum'] / 12)
df['Month_Cos'] = np.cos(2 * np.pi * df['MonthNum'] / 12)

# 创建趋势特征
print("  创建趋势特征...")
df['Year_Num'] = df['Year'] - df['Year'].min()  # 从0开始的年份数值

# 添加增长率特征
print("  创建增长率特征...")
df['MoM_Growth'] = df.groupby('Region')['Dengue_Cases'].pct_change()
df['YoY_Growth'] = df.groupby(['Region', 'MonthNum'])['Dengue_Cases'].pct_change()

# 创建地区编码（独热编码）
print("  创建地区编码...")
region_dummies = pd.get_dummies(df['Region'], prefix='region')
df = pd.concat([df, region_dummies], axis=1)

# 处理缺失值
print("  处理缺失值...")
# 对于增长率特征，将无穷大替换为NaN然后替换为0
for col in ['MoM_Growth', 'YoY_Growth']:
    df[col] = df[col].replace([np.inf, -np.inf], np.nan)

# 填充缺失值
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(0)

print(f"  特征工程完成，数据形状: {df.shape}")

=== 开始加载数据 ===
原始数据形状: (1020, 8)
数据时间范围: 2016-01-01 00:00:00 至 2020-12-01 00:00:00
数据加载完成！

1. 特征工程
  创建滞后特征...
  创建移动平均特征...
  创建季节性特征...
  创建趋势特征...
  创建增长率特征...
  创建地区编码...
  处理缺失值...
  特征工程完成，数据形状: (1020, 36)


In [4]:
print("\n2. 准备训练、验证和测试数据")

# 定义特征和目标列
target_col = 'Dengue_Cases'

# 基础特征集
feature_cols = [
    # 时间特征
    'Year_Num', 'MonthNum', 'Month_Sin', 'Month_Cos',
    # 滞后特征
    'Lag1_Cases', 'Lag2_Cases', 'Lag3_Cases',
    # 移动平均特征
    'MA3_Cases', 'MA6_Cases', 'MA12_Cases',
    # 增长率特征
    'MoM_Growth', 'YoY_Growth'
]

# 添加地区编码特征
region_cols = [col for col in df.columns if col.startswith('region_')]
feature_cols.extend(region_cols)

print(f"  选择的特征数量: {len(feature_cols)}")
print(f"  前5个特征: {feature_cols[:5]}")

# 按时间分割数据
# 使用最后12个月作为测试集
test_start_date = df['Date'].max() - pd.DateOffset(months=11)
print(f"  测试集开始日期: {test_start_date}")

# 分割数据
train_df = df[df['Date'] < test_start_date].copy()
test_df = df[df['Date'] >= test_start_date].copy()

print(f"  训练集形状: {train_df.shape}")
print(f"  测试集形状: {test_df.shape}")

# 准备X和y
X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_test = test_df[feature_cols]
y_test = test_df[target_col]


2. 准备训练、验证和测试数据
  选择的特征数量: 29
  前5个特征: ['Year_Num', 'MonthNum', 'Month_Sin', 'Month_Cos', 'Lag1_Cases']
  测试集开始日期: 2020-01-01 00:00:00
  训练集形状: (816, 36)
  测试集形状: (204, 36)


In [5]:
# 加载XGBoost模型（如果存在）或创建新模型
try:
    xgb_model = joblib.load('dengue_xgboost_results/optimized_xgboost_model.pkl')
    print("成功加载已有的XGBoost模型")
except:
    print("未找到已有的XGBoost模型，创建新模型...")
    # 创建默认XGBoost模型
    xgb_model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        random_state=42
    )
    # 训练模型
    xgb_model.fit(X_train, y_train)

成功加载已有的XGBoost模型


In [6]:
print("\n3. 创建基础模型集合")

# 随机森林回归器
rf_model = RandomForestRegressor(
    n_estimators=100, 
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

# 梯度提升回归器
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# LightGBM回归器
lgbm_model = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# Ridge回归器（线性模型）
ridge_model = Ridge(alpha=1.0, random_state=42)

# 定义基础模型列表
base_models = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('gb', gb_model),
    ('lgbm', lgbm_model),
    ('ridge', ridge_model)
]

# 训练所有基础模型
for name, model in base_models:
    if name != 'xgb' or not isinstance(xgb_model, XGBRegressor):  # 如果XGBoost模型是新创建的
        print(f"  训练{name}模型...")
        model.fit(X_train, y_train)
    else:
        print(f"  使用已有的{name}模型")

# 定义评估函数
def evaluate_model(model, X_test, y_test, model_name="模型"):
    """评估模型并返回多种指标"""
    # 进行预测
    y_pred = model.predict(X_test)
    
    # 计算指标
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # 计算MAPE（处理零值问题）
    mask = y_test != 0
    mape = np.mean(np.abs((y_test[mask] - y_pred[mask]) / y_test[mask])) * 100 if sum(mask) > 0 else np.nan
    
    # 汇总结果
    results = {
        'Model': model_name,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R2': r2
    }
    
    return results, y_pred


3. 创建基础模型集合
  使用已有的xgb模型
  训练rf模型...
  训练gb模型...
  训练lgbm模型...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2061
[LightGBM] [Info] Number of data points in the train set: 816, number of used features: 29
[LightGBM] [Info] Start training from score 1294.588235
  训练ridge模型...


In [8]:
print("\n4. 评估单一模型性能")
single_model_results = []
single_model_predictions = {}

for name, model in base_models:
    print(f"  评估{name}模型...")
    result, y_pred = evaluate_model(model, X_test, y_test, name)
    single_model_results.append(result)
    single_model_predictions[name] = y_pred
    print(f"    RMSE: {result['RMSE']:.4f}")
    print(f"    MAE: {result['MAE']:.4f}")
    print(f"    MAPE: {result['MAPE']:.4f}%")
    print(f"    R2: {result['R2']:.4f}")

# 创建单一模型评估结果DataFrame
single_results_df = pd.DataFrame(single_model_results)
single_results_df.set_index('Model', inplace=True)

# 保存单一模型评估结果
single_results_df.to_csv(f'{output_dir}/single_model_results.csv')
print(f"  单一模型评估结果已保存至: {output_dir}/single_model_results.csv")

# 可视化单一模型性能比较
plt.figure(figsize=(12, 8))
sns.barplot(x=single_results_df.index, y='RMSE', data=single_results_df)
plt.title('单一模型RMSE比较', fontsize=16, fontproperties=chinese_font)
plt.xlabel('模型', fontsize=14, fontproperties=chinese_font)
plt.ylabel('RMSE', fontsize=14, fontproperties=chinese_font)
plt.xticks(rotation=45)
plt.grid(True, axis='y')
plt.tight_layout()
plt.savefig(f'{output_dir}/single_model_rmse_comparison.png', dpi=300)
print(f"  保存图表：{output_dir}/single_model_rmse_comparison.png")
plt.close()


4. 评估单一模型性能
  评估xgb模型...
    RMSE: 418.6332
    MAE: 159.7621
    MAPE: 118.2177%
    R2: 0.5524
  评估rf模型...
    RMSE: 91.0321
    MAE: 57.2523
    MAPE: 44.8385%
    R2: 0.9788
  评估gb模型...
    RMSE: 71.4136
    MAE: 47.5968
    MAPE: 35.1103%
    R2: 0.9870
  评估lgbm模型...
    RMSE: 726.0842
    MAE: 325.6497
    MAPE: 188.1242%
    R2: -0.3464
  评估ridge模型...
    RMSE: 247.2214
    MAE: 204.6309
    MAPE: 173.4623%
    R2: 0.8439
  单一模型评估结果已保存至: dengue_ensemble_results/single_model_results.csv
  保存图表：dengue_ensemble_results/single_model_rmse_comparison.png


In [9]:
print("\n5. 实现投票集成模型")

# 定义权重（基于单一模型的R2分数）
r2_scores = [result['R2'] for result in single_model_results]
# 处理负的R2分数
adjusted_r2 = [max(0, r2) for r2 in r2_scores]
# 如果所有R2都是0，使用等权重
if sum(adjusted_r2) == 0:
    weights = [1/len(base_models)] * len(base_models)
else:
    weights = [r2/sum(adjusted_r2) for r2 in adjusted_r2]

print(f"  计算的模型权重: {weights}")

# 创建投票集成模型
voting_regressor = VotingRegressor(
    estimators=base_models,
    weights=weights
)

# 训练投票集成模型
print("  训练投票集成模型...")
voting_regressor.fit(X_train, y_train)

# 评估投票集成模型
print("  评估投票集成模型...")
voting_result, voting_pred = evaluate_model(voting_regressor, X_test, y_test, "Voting")
print(f"    RMSE: {voting_result['RMSE']:.4f}")
print(f"    MAE: {voting_result['MAE']:.4f}")
print(f"    MAPE: {voting_result['MAPE']:.4f}%")
print(f"    R2: {voting_result['R2']:.4f}")

# 保存投票集成模型
joblib.dump(voting_regressor, f'{output_dir}/voting_ensemble_model.pkl')
print(f"  投票集成模型已保存至: {output_dir}/voting_ensemble_model.pkl")


5. 实现投票集成模型
  计算的模型权重: [0.1643070666446027, 0.29113423544021017, 0.29355502369166997, 0.0, 0.2510036742235171]
  训练投票集成模型...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2061
[LightGBM] [Info] Number of data points in the train set: 816, number of used features: 29
[LightGBM] [Info] Start training from score 1294.588235
  评估投票集成模型...
    RMSE: 102.6093
    MAE: 71.2586
    MAPE: 64.6952%
    R2: 0.9731
  投票集成模型已保存至: dengue_ensemble_results/voting_ensemble_model.pkl


In [10]:
print("\n6. 实现堆叠集成模型")

# 定义元模型（可以尝试不同的元模型）
meta_model = Ridge(alpha=1.0)

# 创建堆叠集成模型
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5  # 5折交叉验证
)

# 训练堆叠集成模型
print("  训练堆叠集成模型...")
stacking_regressor.fit(X_train, y_train)

# 评估堆叠集成模型
print("  评估堆叠集成模型...")
stacking_result, stacking_pred = evaluate_model(stacking_regressor, X_test, y_test, "Stacking")
print(f"    RMSE: {stacking_result['RMSE']:.4f}")
print(f"    MAE: {stacking_result['MAE']:.4f}")
print(f"    MAPE: {stacking_result['MAPE']:.4f}%")
print(f"    R2: {stacking_result['R2']:.4f}")

# 保存堆叠集成模型
joblib.dump(stacking_regressor, f'{output_dir}/stacking_ensemble_model.pkl')
print(f"  堆叠集成模型已保存至: {output_dir}/stacking_ensemble_model.pkl")


6. 实现堆叠集成模型
  训练堆叠集成模型...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2061
[LightGBM] [Info] Number of data points in the train set: 816, number of used features: 29
[LightGBM] [Info] Start training from score 1294.588235
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 26
[LightGBM] [Info] Start training from score 1373.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 653, number of used features: 26


In [11]:
print("\n7. 优化堆叠集成模型")

# 定义多种元学习器及其参数空间
meta_learners = {
    'ridge': {
        'model': Ridge(),
        'params': {
            'final_estimator__alpha': [0.1, 1.0, 10.0, 100.0]
        }
    },
    'lasso': {
        'model': Lasso(),
        'params': {
            'final_estimator__alpha': [0.1, 1.0, 10.0, 100.0]
        }
    },
    'gbr': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'final_estimator__n_estimators': [50, 100],
            'final_estimator__learning_rate': [0.05, 0.1],
            'final_estimator__max_depth': [3, 5]
        }
    }
}

# 时间序列交叉验证
tscv = TimeSeriesSplit(n_splits=3)

best_meta_model = None
best_meta_score = float('inf')
best_meta_name = None

# 尝试不同的元学习器
for name, meta_info in meta_learners.items():
    print(f"  尝试元学习器: {name}")
    
    # 创建堆叠集成模型
    meta_stacking = StackingRegressor(
        estimators=base_models,
        final_estimator=meta_info['model'],
        cv=3  # 使用3折交叉验证以加快速度
    )
    
    # 进行参数搜索
    search = RandomizedSearchCV(
        meta_stacking,
        param_distributions=meta_info['params'],
        n_iter=5,  # 尝试5组参数组合
        scoring='neg_root_mean_squared_error',
        cv=tscv,
        random_state=42,
        n_jobs=-1
    )
    
    # 训练并优化
    search.fit(X_train, y_train)
    
    # 评估最佳模型
    best_estimator = search.best_estimator_
    _, y_pred = evaluate_model(best_estimator, X_train, y_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    
    print(f"    最佳参数: {search.best_params_}")
    print(f"    训练RMSE: {rmse:.4f}")
    
    # 更新最佳模型
    if rmse < best_meta_score:
        best_meta_score = rmse
        best_meta_model = best_estimator
        best_meta_name = name
        
print(f"  最佳元学习器: {best_meta_name}，训练RMSE: {best_meta_score:.4f}")

# 在测试集上评估最佳堆叠模型
print("  评估最佳堆叠集成模型...")
optimized_stacking_result, optimized_stacking_pred = evaluate_model(best_meta_model, X_test, y_test, "Optimized_Stacking")
print(f"    RMSE: {optimized_stacking_result['RMSE']:.4f}")
print(f"    MAE: {optimized_stacking_result['MAE']:.4f}")
print(f"    MAPE: {optimized_stacking_result['MAPE']:.4f}%")
print(f"    R2: {optimized_stacking_result['R2']:.4f}")

# 保存最佳堆叠集成模型
joblib.dump(best_meta_model, f'{output_dir}/optimized_stacking_model.pkl')
print(f"  最佳堆叠集成模型已保存至: {output_dir}/optimized_stacking_model.pkl")


7. 优化堆叠集成模型
  尝试元学习器: ridge
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2061
[LightGBM] [Info] Number of data points in the train set: 816, number of used features: 29
[LightGBM] [Info] Start training from score 1294.588235
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1446
[LightGBM] [Info] Number of data points in the train set: 544, number of used features: 23
[LightGBM] [Info] Start training from score 1374.193015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1446
[LightGBM] [Info] Number of data points in the train set: 544, number of used features: 2

In [13]:
print("\n8. 比较所有模型性能")

# 汇总所有模型的评估结果
all_results = single_model_results + [voting_result, stacking_result, optimized_stacking_result]
all_results_df = pd.DataFrame(all_results)
all_results_df.set_index('Model', inplace=True)

# 保存所有模型的评估结果
all_results_df.to_csv(f'{output_dir}/all_model_results.csv')
print(f"  所有模型评估结果已保存至: {output_dir}/all_model_results.csv")

# 可视化RMSE比较
plt.figure(figsize=(14, 8))
sns.barplot(x=all_results_df.index, y='RMSE', data=all_results_df)
plt.title('所有模型RMSE比较', fontsize=16, fontproperties=chinese_font)
plt.xlabel('模型', fontsize=14, fontproperties=chinese_font)
plt.ylabel('RMSE', fontsize=14, fontproperties=chinese_font)
plt.xticks(rotation=45)
plt.grid(True, axis='y')
plt.tight_layout()
plt.savefig(f'{output_dir}/all_model_rmse_comparison.png', dpi=300)
print(f"  保存图表：{output_dir}/all_model_rmse_comparison.png")
plt.close()

# 可视化R2比较
plt.figure(figsize=(14, 8))
sns.barplot(x=all_results_df.index, y='R2', data=all_results_df)
plt.title('所有模型R2比较', fontsize=16, fontproperties=chinese_font)
plt.xlabel('模型', fontsize=14, fontproperties=chinese_font)
plt.ylabel('R2', fontsize=14, fontproperties=chinese_font)
plt.xticks(rotation=45)
plt.grid(True, axis='y')
plt.tight_layout()
plt.savefig(f'{output_dir}/all_model_r2_comparison.png', dpi=300)
print(f"  保存图表：{output_dir}/all_model_r2_comparison.png")
plt.close()


8. 比较所有模型性能
  所有模型评估结果已保存至: dengue_ensemble_results/all_model_results.csv
  保存图表：dengue_ensemble_results/all_model_rmse_comparison.png
  保存图表：dengue_ensemble_results/all_model_r2_comparison.png


In [20]:
print("\n9. 可视化预测结果")

# 获取主要地区
top_regions = df.groupby('Region')['Dengue_Cases'].sum().sort_values(ascending=False).head(3).index.tolist()
print(f"  主要地区: {', '.join(top_regions)}")

# 为每个主要地区创建预测可视化
for region in top_regions:
    # 获取该地区的测试数据
    region_test = test_df[test_df['Region'] == region].copy()
    if len(region_test) == 0:
        continue  # 如果测试集中没有该地区的数据，则跳过
        
    # 获取该地区的X和y
    X_region_test = region_test[feature_cols]
    y_region_test = region_test[target_col]
    
    # 获取各个模型对该地区的预测
    predictions = {}
    
    # 获取单一模型预测
    for name, model in base_models:
        predictions[name] = model.predict(X_region_test)
    
    # 获取集成模型预测
    predictions['Voting'] = voting_regressor.predict(X_region_test)
    predictions['Stacking'] = stacking_regressor.predict(X_region_test)
    predictions['Optimized_Stacking'] = best_meta_model.predict(X_region_test)
    
    # 创建预测比较图
    plt.figure(figsize=(14, 8))
    
    # 绘制实际值
    plt.plot(region_test['Date'], y_region_test, 'k-', linewidth=2, label='true value')
    
    # 绘制单一模型预测
    for name, pred in predictions.items():
        if name in ['xgb', 'Voting', 'Stacking', 'Optimized_Stacking']:  # 只显示部分模型以保持图表清晰
            plt.plot(region_test['Date'], pred, '--', linewidth=1.5, label=f'{name}')
    
    plt.title(f'{region} - 模型预测比较', fontsize=16, fontproperties=chinese_font)
    plt.xlabel('日期', fontsize=14, fontproperties=chinese_font)
    plt.ylabel('登革热病例数', fontsize=14, fontproperties=chinese_font)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    
    # 保存图表
    plt.savefig(f'{output_dir}/{region}_prediction_comparison.png', dpi=300)
    print(f"  保存图表：{output_dir}/{region}_prediction_comparison.png")
    plt.close()

# 可视化全国预测结果
# 按日期汇总预测结果
test_results = test_df[['Date', 'Dengue_Cases']].copy()
test_results.set_index('Date', inplace=True)
test_national = test_results.groupby('Date').sum()

# 将所有模型的全国预测添加到结果中
for name, model in base_models:
    test_df['Pred_' + name] = model.predict(X_test)

test_df['Pred_Voting'] = voting_regressor.predict(X_test)
test_df['Pred_Stacking'] = stacking_regressor.predict(X_test)
test_df['Pred_Optimized_Stacking'] = best_meta_model.predict(X_test)

# 计算全国总预测
national_predictions = {}
for col in test_df.columns:
    if col.startswith('Pred_'):
        name = col.replace('Pred_', '')
        national_pred = test_df.groupby('Date')[col].sum()
        national_predictions[name] = national_pred

# 创建全国预测比较图
plt.figure(figsize=(14, 8))

# 绘制实际值
plt.plot(test_national.index, test_national['Dengue_Cases'], 'k-', linewidth=2, label='true value')

# 绘制主要模型预测
for name, pred in national_predictions.items():
    if name in ['xgb', 'Voting', 'Stacking', 'Optimized_Stacking']:  # 只显示部分模型
        plt.plot(pred.index, pred.values, '--', linewidth=1.5, label=f'{name}')

plt.title('全国 - 模型预测比较', fontsize=16, fontproperties=chinese_font)
plt.xlabel('日期', fontsize=14, fontproperties=chinese_font)
plt.ylabel('登革热病例数', fontsize=14, fontproperties=chinese_font)
plt.legend()
plt.grid(True)
plt.tight_layout()

# 保存图表
plt.savefig(f'{output_dir}/national_prediction_comparison.png', dpi=300)
print(f"  保存图表：{output_dir}/national_prediction_comparison.png")
plt.close()


9. 可视化预测结果
  主要地区: Region IV-A, Region III, Region VI
  保存图表：dengue_ensemble_results/Region IV-A_prediction_comparison.png
  保存图表：dengue_ensemble_results/Region III_prediction_comparison.png
  保存图表：dengue_ensemble_results/Region VI_prediction_comparison.png
  保存图表：dengue_ensemble_results/national_prediction_comparison.png


In [24]:
print("\n10. 创建最终预测模型")

# 获取最佳模型
best_model_name = all_results_df['RMSE'].idxmin()
print(f"  最佳模型: {best_model_name}，基于RMSE指标")

# 使用最佳模型进行未来12个月的预测
# 创建用于未来预测的数据框
last_date = df['Date'].max()
print(f"  最后数据日期: {last_date}")

# 创建包含最后12个月数据的预测帧
future_df = pd.DataFrame()

# 为每个地区创建未来预测
all_future_predictions = pd.DataFrame()

for region in df['Region'].unique():
    # 获取该地区的最后12个月数据
    region_last = df[(df['Region'] == region) & (df['Date'] > last_date - pd.DateOffset(months=12))].copy()
    
    if len(region_last) == 0:
        continue  # 如果没有该地区的最近数据，则跳过
        
    # 创建未来12个月的日期
    future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=12, freq='MS')
    
    # 创建未来预测数据框
    region_future = pd.DataFrame()
    region_future['Date'] = future_dates
    region_future['Year'] = region_future['Date'].dt.year
    region_future['MonthNum'] = region_future['Date'].dt.month
    region_future['Region'] = region
    
    # 复制最近的特征值
    for col in feature_cols:
        if col in ['Year_Num', 'MonthNum', 'Month_Sin', 'Month_Cos']:
            # 为时间特征重新计算
            if col == 'Year_Num':
                region_future[col] = region_future['Year'] - df['Year'].min()
            elif col == 'Month_Sin':
                region_future[col] = np.sin(2 * np.pi * region_future['MonthNum'] / 12)
            elif col == 'Month_Cos':
                region_future[col] = np.cos(2 * np.pi * region_future['MonthNum'] / 12)
        elif col.startswith('region_'):
            # 设置地区编码
            if col == f'region_{region}':
                region_future[col] = 1
            else:
                region_future[col] = 0
        else:
            # 对于其他特征，使用历史平均值
            region_future[col] = region_last[col].mean()
    
    # 加入到未来预测数据框中
    future_df = pd.concat([future_df, region_future])
    
    # 获取对应的模型
    if best_model_name == 'Voting':
        best_model = voting_regressor
    elif best_model_name == 'Stacking':
        best_model = stacking_regressor
    elif best_model_name == 'Optimized_Stacking':
        best_model = best_meta_model
    else:
        # 从base_models中获取单一模型
        best_model = next((model for name, model in base_models if name == best_model_name), None)
    
    if best_model is None:
        print(f"  警告: 未找到名为{best_model_name}的模型")
        continue
    
    # 预测未来值
    X_future = region_future[feature_cols]
    region_future['Predicted_Cases'] = best_model.predict(X_future)
    
    # 添加到全局预测结果
    region_result = region_future[['Date', 'Region', 'Predicted_Cases']].copy()
    all_future_predictions = pd.concat([all_future_predictions, region_result])

# 保存未来预测结果
all_future_predictions.to_csv(f'{output_dir}/future_predictions.csv', index=False)
print(f"  未来预测结果已保存至: {output_dir}/future_predictions.csv")

# 可视化未来预测结果（全国总和）
national_future = all_future_predictions.groupby('Date')['Predicted_Cases'].sum().reset_index()
national_past = df.groupby('Date')['Dengue_Cases'].sum().reset_index()

plt.figure(figsize=(14, 8))
plt.plot(national_past['Date'], national_past['Dengue_Cases'], 'b-', linewidth=2, label='历史病例')
plt.plot(national_future['Date'], national_future['Predicted_Cases'], 'r--', linewidth=2, label='未来预测')

# 标记预测开始点
plt.axvline(x=last_date, color='g', linestyle='--', label='预测起点')

plt.title('全国登革热病例未来12个月预测', fontsize=16, fontproperties=chinese_font)
plt.xlabel('日期', fontsize=14, fontproperties=chinese_font)
plt.ylabel('登革热病例数', fontsize=14, fontproperties=chinese_font)
plt.legend(prop=chinese_font)
plt.grid(True)
plt.tight_layout()

# 保存图表
plt.savefig(f'{output_dir}/national_future_prediction.png', dpi=300)
print(f"  保存图表：{output_dir}/national_future_prediction.png")
plt.close()



10. 创建最终预测模型
  最佳模型: gb，基于RMSE指标
  最后数据日期: 2020-12-01 00:00:00
  未来预测结果已保存至: dengue_ensemble_results/future_predictions.csv
  保存图表：dengue_ensemble_results/national_future_prediction.png
