# 虚拟井生成


In [None]:
# 确保src目录在Python路径中
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
)
from src.feature_selection import select_best_features
from src.visualization import visualize_attribute_map

# Directory that receives every figure and CSV written by this notebook.
# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# directory already exists and cannot fail if the directory appears between
# the check and the creation.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)


# Configure a font that can render Chinese text in matplotlib figures
plt.rcParams["font.family"] = "SimHei"  # SimHei (a Hei-style typeface) supports Chinese
plt.rcParams["axes.unicode_minus"] = False  # display the minus sign correctly

## 导入地震数据


In [None]:
# Load the seismic attribute data from the H6-2 attribute file using the
# project parser (presumably returns a table with X/Y and attribute columns,
# as later cells index it that way).
data_H6_2_attr = parse_petrel_file("../data/H6-2_attr")

## 导入井震数据


In [None]:
# Read the processed well table from the Excel workbook
file_H6_2_well = "../data/well_processed.xlsx"
data_H6_2_well = pd.read_excel(file_H6_2_well, sheet_name="Sheet1")

# Keep only rows whose Surface is H6-2 ...
surface_rows = data_H6_2_well[data_H6_2_well["Surface"] == "H6-2"]
# ... leave out wells PH6, PH8, PH3 and PH2 ...
kept_wells = surface_rows.query("Well != 'PH6' and Well != 'PH8' and Well != 'PH3' and Well != 'PH2'")
# ... turn the -999 placeholder (usually the missing-value code) into NaN ...
with_nan = kept_wells.replace(-999, np.nan)
# ... and drop rows without a sand thickness, then renumber the index
data_H6_2_well_selected = with_nan.dropna(subset=["Thickness of facies(1: Fine sand)"]).reset_index(drop=True)

# Show the first rows of the filtered table
data_H6_2_well_selected.head()

## 提取共同属性


In [None]:
# Attribute names present in the seismic attribute file
seismic_attr, _ = identify_attributes("../data/H6-2_attr")

# Attribute names in the well workbook: every column from the 8th onward
well_seismic_attr = data_H6_2_well.columns[7:].tolist()

# Intersect the two lists while keeping the seismic-file order.
# A bare set intersection yields a hash-dependent order for strings, which can
# differ between interpreter sessions and would make the downstream attribute
# filtering / feature selection non-reproducible. dict.fromkeys removes
# duplicates without disturbing the order.
well_attr_lookup = set(well_seismic_attr)
common_attributes = list(dict.fromkeys(attr for attr in seismic_attr if attr in well_attr_lookup))

# Report the result
print(f"地震属性数量: {len(seismic_attr)}")
print(f"Excel属性数量: {len(well_seismic_attr)}")
print(f"共同属性数量: {len(common_attributes)}")
print("\n共同属性列表:")
for attr in common_attributes:
    print(f"- {attr}")

## 根据井点分布，缩小工区范围


In [None]:
# Restrict the survey area based on where the wells are
data_H6_2_attr_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=data_H6_2_attr,
    well_data=data_H6_2_well_selected,
    expansion_factor=1.5,  # expand by 50%
    plot=True,
    output_dir=output_dir,
)

# The boundary information in area_bounds can be used directly by later steps
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

## 生成统计摘要


In [None]:
# Keep only the attributes of good quality
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=data_H6_2_attr_filtered,
    well_data=data_H6_2_well_selected,
    common_attributes=common_attributes,
    ratio_threshold=5.0,  # threshold on the ratio of means
    range_ratio_threshold=10.0,  # threshold on the ratio of value ranges
    std_ratio_threshold=10.0,  # threshold on the ratio of standard deviations
    output_dir=None,  # directory for output charts
    verbose=True,  # print detailed information
)

print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")

## 随机森林重要性和相关性分析


In [None]:
# Rank attributes by random-forest importance and remove redundant ones
feature_selection_options = {
    "n_features": 3,
    "corr_threshold": 0.85,
    "output_dir": output_dir,
    "verbose": True,
}
selected_features = select_best_features(
    well_data=data_H6_2_well_selected,
    attribute_columns=good_attributes,
    target_column="Thickness of facies(1: Fine sand)",
    **feature_selection_options,
)

# Print the selected features as a 1-based list
print("\n基于随机森林重要性和相关性分析的最佳特征:")
for rank, feature in enumerate(selected_features, start=1):
    print(f"{rank}. {feature}")

## 提取样本，准备设置虚拟井


In [None]:
# Extract an evenly spaced grid of samples from the filtered seismic area
seismic_samples = extract_uniform_seismic_samples(
    seismic_data=data_H6_2_attr_filtered,
    n_rows=40,
    n_cols=40,
    area_bounds=area_bounds,  # pass the bounds dictionary directly
)

# Plot the real wells together with the sampled points
plt.figure(figsize=(15, 10))

# Background: a subsample of roughly 5000 seismic points.
# random_state pins the subsample so the saved figure is identical between
# runs; without it every execution produced a different background.
sample_ratio = min(1.0, 5000 / len(data_H6_2_attr_filtered))
seismic_sample = data_H6_2_attr_filtered.sample(frac=sample_ratio, random_state=42)
plt.scatter(seismic_sample["X"], seismic_sample["Y"], color="lightgray", alpha=0.3, s=10, label="地震数据(抽样)")

# Real well locations
plt.scatter(
    data_H6_2_well_selected["X"], data_H6_2_well_selected["Y"], color="red", s=100, marker="^", label="真实井点"
)

# Evenly spaced sample locations
plt.scatter(seismic_samples["X"], seismic_samples["Y"], color="blue", s=50, marker="o", label="等间距采样点")

# Title, axis labels, legend and grid
plt.title("真实井点与等间距采样点分布", fontsize=16)
plt.xlabel("X坐标", fontsize=14)
plt.ylabel("Y坐标", fontsize=14)
plt.legend(loc="upper right")
plt.grid(True, linestyle="--", alpha=0.7)

# Save the figure
plt.savefig(os.path.join(output_dir, "real_wells_and_seismic_samples.png"), dpi=300, bbox_inches="tight")
plt.show()


# Save the extracted samples
seismic_samples_csv = os.path.join(output_dir, "seismic_samples.csv")
seismic_samples.to_csv(seismic_samples_csv, index=False)
print(f"等间距地震样本数据已保存至 {seismic_samples_csv}")

## 多线性模型一致性预测设置虚拟井


In [None]:
# Build the fused attribute and run the multi-model prediction
print("======== 创建融合属性和多模型预测 ========")
target_column = "Thickness of facies(1: Fine sand)"
min_corr_threshold = 0.2  # attributes whose |correlation| is below this are excluded

# Report, per selected attribute, how many wells are missing a value
print("检查属性在井点数据中的有效性:")
n_wells_total = len(data_H6_2_well_selected)
for feature in selected_features:
    nan_count = data_H6_2_well_selected[feature].isna().sum()
    nan_percent = nan_count / n_wells_total * 100
    print(f"属性 '{feature}' 在井点数据中的NaN值数量: {nan_count}/{n_wells_total} ({nan_percent:.1f}%)")

# Wells that have a value for every selected attribute and for the target
required_columns = selected_features + [target_column]
valid_wells = data_H6_2_well_selected.dropna(subset=required_columns)
print(f"\n所有属性都有有效值的井点数量: {len(valid_wells)} / {n_wells_total}")

# Labeled training data
X_labeled = valid_wells[selected_features].values
y_labeled = valid_wells[target_column].values

# Unlabeled data: seismic sample points with every selected attribute present.
# The NaN-free frame is built once and supplies both the values and the
# original row labels.
unlabeled_frame = seismic_samples[selected_features].dropna()
X_unlabeled = unlabeled_frame.values
unlabeled_indices = unlabeled_frame.index

# 1. Fused-attribute, linearly weighted prediction
print("\n=== 模型1: 融合属性线性加权 ===")

# Use each attribute's Spearman correlation with the target as its weight
correlation_weights = {}
for feature in selected_features:
    corr, _ = spearmanr(valid_wells[feature], valid_wells[target_column])
    meets_threshold = abs(corr) >= min_corr_threshold
    if not meets_threshold:
        print(f"属性 '{feature}' 与砂厚的相关性过低 ({corr:.4f})，不纳入融合")
        continue
    correlation_weights[feature] = corr
    print(f"属性 '{feature}' 与砂厚的Spearman相关性: {corr:.4f}")

# Fallback: when no attribute qualifies, use all of them with equal weight
if not correlation_weights:
    print("警告: 没有属性满足相关性阈值，将使用所有属性且权重相等")
    for feature in selected_features:
        correlation_weights[feature] = 1.0
        print(f"属性 '{feature}' 使用默认权重: 1.0")

# Standardize: fit on the labeled wells, apply the same transform to the seismic samples
scaler = StandardScaler()
X_labeled_scaled = scaler.fit_transform(X_labeled)
X_unlabeled_scaled = scaler.transform(X_unlabeled)


# Fused-attribute builder
def create_fused_attribute(X_scaled, features, weights):
    """
    Combine selected feature columns into a single weighted "fused" attribute.

    Args:
        X_scaled (ndarray): Standardized feature matrix; column ``i`` is the
            column for ``features[i]``.
        features (list): Feature names, in column order.
        weights (dict): Weight per feature name. Features that are not keys
            of this dict are skipped.

    Returns:
        ndarray: One value per row: the weighted sum of the used columns
        divided by the sum of absolute weights. The division is skipped when
        that sum is not positive (e.g. no feature has a weight).
    """
    fused = np.zeros(X_scaled.shape[0])
    abs_weight_total = 0

    # Accumulate the weighted columns, skipping features without a weight
    for column, name in enumerate(features):
        if name not in weights:
            continue
        w = weights[name]
        fused += X_scaled[:, column] * w
        abs_weight_total += abs(w)

    # Normalize by the total absolute weight
    if abs_weight_total > 0:
        fused /= abs_weight_total

    return fused


# Fused attribute for the labeled wells and for the unlabeled seismic samples
fused_attr_labeled = create_fused_attribute(X_labeled_scaled, selected_features, correlation_weights)
fused_attr_unlabeled = create_fused_attribute(X_unlabeled_scaled, selected_features, correlation_weights)

# The regressor takes 2-D inputs, so view each fused vector as a single column
fused_column_labeled = fused_attr_labeled.reshape(-1, 1)
fused_column_unlabeled = fused_attr_unlabeled.reshape(-1, 1)

# Linear regression of the target on the fused attribute
linear_model = LinearRegression()
linear_model.fit(fused_column_labeled, y_labeled)

# Predictions for both sets
fused_pred_labeled = linear_model.predict(fused_column_labeled)
fused_pred_unlabeled = linear_model.predict(fused_column_unlabeled)

# In-sample check: correlation between fitted and observed values
fused_corr = np.corrcoef(fused_pred_labeled, y_labeled)[0, 1]
print(f"融合属性预测结果与真实砂厚的相关性: {fused_corr:.4f}")

# 2. Lasso regression + bootstrap
print("\n=== 模型2: LassoCV + Bootstrap ===")

n_bootstrap = 100  # number of bootstrap resamples
alpha_values = np.logspace(-4, 1, 30)  # candidate alpha values

# Main Lasso model, fitted on all labeled wells
lasso_model = LassoCV(alphas=alpha_values, cv=5, max_iter=10000, tol=1e-3)
lasso_model.fit(X_labeled_scaled, y_labeled)
print(f"Lasso最优alpha值: {lasso_model.alpha_:.6f}")

# In-sample predictions and their correlation with the observed target
lasso_pred_labeled = lasso_model.predict(X_labeled_scaled)
lasso_corr = np.corrcoef(lasso_pred_labeled, y_labeled)[0, 1]
print(f"Lasso预测结果与真实砂厚的相关性: {lasso_corr:.4f}")

# Bootstrap predictions: one row per resample, one column per unlabeled point
lasso_bootstrap_preds = np.zeros((n_bootstrap, X_unlabeled_scaled.shape[0]))

# Draw the resamples from a dedicated, seeded generator. The original used the
# unseeded global NumPy state (the notebook only seeds it in a later cell), so
# the bootstrap, and therefore the set of pseudo wells, changed on every run.
lasso_rng = np.random.default_rng(42)
n_labeled = len(X_labeled_scaled)

for i in range(n_bootstrap):
    # Resample the labeled wells with replacement
    indices = lasso_rng.integers(0, n_labeled, size=n_labeled)
    X_boot, y_boot = X_labeled_scaled[indices], y_labeled[indices]

    # Refit on the resample
    lasso_boot = LassoCV(alphas=alpha_values, cv=5, max_iter=10000, tol=1e-3)
    lasso_boot.fit(X_boot, y_boot)

    # Predict the unlabeled points
    lasso_bootstrap_preds[i, :] = lasso_boot.predict(X_unlabeled_scaled)

# Bootstrap mean and 95% percentile interval per unlabeled point
lasso_pred_unlabeled = np.mean(lasso_bootstrap_preds, axis=0)
lasso_lower_ci = np.percentile(lasso_bootstrap_preds, 2.5, axis=0)
lasso_upper_ci = np.percentile(lasso_bootstrap_preds, 97.5, axis=0)

# 3. Ridge regression + bootstrap
print("\n=== 模型3: RidgeCV + Bootstrap ===")

alpha_values = np.logspace(-3, 3, 30)  # candidate alpha values

# Main Ridge model, fitted on all labeled wells
ridge_model = RidgeCV(alphas=alpha_values, cv=5)
ridge_model.fit(X_labeled_scaled, y_labeled)
print(f"Ridge最优alpha值: {ridge_model.alpha_:.6f}")

# In-sample predictions and their correlation with the observed target
ridge_pred_labeled = ridge_model.predict(X_labeled_scaled)
ridge_corr = np.corrcoef(ridge_pred_labeled, y_labeled)[0, 1]
print(f"Ridge预测结果与真实砂厚的相关性: {ridge_corr:.4f}")

# Bootstrap predictions: one row per resample, one column per unlabeled point
ridge_bootstrap_preds = np.zeros((n_bootstrap, X_unlabeled_scaled.shape[0]))

# Draw the resamples from a dedicated, seeded generator. The original used the
# unseeded global NumPy state (the notebook only seeds it in a later cell), so
# the bootstrap, and therefore the set of pseudo wells, changed on every run.
ridge_rng = np.random.default_rng(43)
n_labeled = len(X_labeled_scaled)

for i in range(n_bootstrap):
    # Resample the labeled wells with replacement
    indices = ridge_rng.integers(0, n_labeled, size=n_labeled)
    X_boot, y_boot = X_labeled_scaled[indices], y_labeled[indices]

    # Refit on the resample
    ridge_boot = RidgeCV(alphas=alpha_values, cv=5)
    ridge_boot.fit(X_boot, y_boot)

    # Predict the unlabeled points
    ridge_bootstrap_preds[i, :] = ridge_boot.predict(X_unlabeled_scaled)

# Bootstrap mean and 95% percentile interval per unlabeled point
ridge_pred_unlabeled = np.mean(ridge_bootstrap_preds, axis=0)
ridge_lower_ci = np.percentile(ridge_bootstrap_preds, 2.5, axis=0)
ridge_upper_ci = np.percentile(ridge_bootstrap_preds, 97.5, axis=0)

# 4. Consistency screening (with detailed logging)
print("\n=== 一致性筛选（详细分析）===")


def _percent(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0.

    Guards the report lines below against an empty region, which otherwise
    prints ``nan%`` and raises a NumPy runtime warning.
    """
    return count / total * 100 if total else 0.0


# Stack the three models' predictions: one row per point, one column per model
predictions = np.column_stack([fused_pred_unlabeled, lasso_pred_unlabeled, ridge_pred_unlabeled])

# Largest disagreement between the three predictions at each point
max_diffs = np.max(predictions, axis=1) - np.min(predictions, axis=1)

# Mean prediction, used as the reference for the relative difference
mean_preds = np.mean(predictions, axis=1)
n_points = len(mean_preds)

# Piecewise thresholds with just two ranges
value_threshold = 10.0  # boundary between the small-value and large-value ranges

# Masks for the small-value (< 10 m) and large-value (>= 10 m) ranges
small_value_mask = mean_preds < value_threshold
large_value_mask = mean_preds >= value_threshold

# Thresholds for each range
abs_threshold_small = 3.0  # absolute-difference threshold, small-value range
abs_threshold_large = 5.0  # absolute-difference threshold, large-value range
rel_threshold_small = 0.15  # relative-difference threshold, small-value range
rel_threshold_large = 0.25  # relative-difference threshold, large-value range

# Relative difference. The magnitude of the mean is used as the denominator:
# with a signed mean, any point whose mean prediction is negative gets a
# negative ratio and passes the "<= threshold" test no matter how strongly the
# models disagree. The small constant avoids division by zero.
rel_diffs = max_diffs / (np.abs(mean_preds) + 1e-10)

# Point counts per range before screening
total_small = np.sum(small_value_mask)
total_large = np.sum(large_value_mask)

print("\n=== 筛选前统计 ===")
print(f"总点数: {n_points}")
print(f"小值区域 (<{value_threshold}米) 点数: {total_small} ({_percent(total_small, n_points):.1f}%)")
print(f"大值区域 (>={value_threshold}米) 点数: {total_large} ({_percent(total_large, n_points):.1f}%)")

# Per-range consistency criteria; both conditions must hold ("and" logic)
consistent_small = (max_diffs <= abs_threshold_small) & (rel_diffs <= rel_threshold_small)
consistent_large = (max_diffs <= abs_threshold_large) & (rel_diffs <= rel_threshold_large)

# Number of points passing each criterion in each range
pass_abs_small = np.sum(small_value_mask & (max_diffs <= abs_threshold_small))
pass_rel_small = np.sum(small_value_mask & (rel_diffs <= rel_threshold_small))
pass_both_small = np.sum(small_value_mask & consistent_small)

pass_abs_large = np.sum(large_value_mask & (max_diffs <= abs_threshold_large))
pass_rel_large = np.sum(large_value_mask & (rel_diffs <= rel_threshold_large))
pass_both_large = np.sum(large_value_mask & consistent_large)

print("\n=== 一致性筛选统计 ===")
print(
    f"小值区域通过绝对差异 (<={abs_threshold_small}): {pass_abs_small}/{total_small} ({_percent(pass_abs_small, total_small):.1f}%)"
)
print(
    f"小值区域通过相对差异 (<={rel_threshold_small * 100:.1f}%): {pass_rel_small}/{total_small} ({_percent(pass_rel_small, total_small):.1f}%)"
)
print(f"小值区域同时满足两条件: {pass_both_small}/{total_small} ({_percent(pass_both_small, total_small):.1f}%)")

print(
    f"大值区域通过绝对差异 (<={abs_threshold_large}): {pass_abs_large}/{total_large} ({_percent(pass_abs_large, total_large):.1f}%)"
)
print(
    f"大值区域通过相对差异 (<={rel_threshold_large * 100:.1f}%): {pass_rel_large}/{total_large} ({_percent(pass_rel_large, total_large):.1f}%)"
)
print(f"大值区域同时满足两条件: {pass_both_large}/{total_large} ({_percent(pass_both_large, total_large):.1f}%)")

# Combine the per-range consistency masks
consistent_mask = (small_value_mask & consistent_small) | (large_value_mask & consistent_large)
consistent_count = np.sum(consistent_mask)
print(f"\n通过一致性筛选的总点数: {consistent_count}/{n_points} ({_percent(consistent_count, n_points):.1f}%)")

# Standard deviation of the bootstrap predictions, used as the uncertainty
lasso_std = np.std(lasso_bootstrap_preds, axis=0)
ridge_std = np.std(ridge_bootstrap_preds, axis=0)

# Piecewise uncertainty thresholds
uncertainty_threshold_small = 2.5  # standard-deviation threshold, small-value range
uncertainty_threshold_large = 5.0  # standard-deviation threshold, large-value range

# Per-range uncertainty criteria; both models must be below the threshold
uncertainty_small = (lasso_std <= uncertainty_threshold_small) & (ridge_std <= uncertainty_threshold_small)
uncertainty_large = (lasso_std <= uncertainty_threshold_large) & (ridge_std <= uncertainty_threshold_large)

# Number of points passing the uncertainty screening
pass_lasso_small = np.sum(small_value_mask & (lasso_std <= uncertainty_threshold_small))
pass_ridge_small = np.sum(small_value_mask & (ridge_std <= uncertainty_threshold_small))
pass_both_uncert_small = np.sum(small_value_mask & uncertainty_small)

pass_lasso_large = np.sum(large_value_mask & (lasso_std <= uncertainty_threshold_large))
pass_ridge_large = np.sum(large_value_mask & (ridge_std <= uncertainty_threshold_large))
pass_both_uncert_large = np.sum(large_value_mask & uncertainty_large)

print("\n=== 不确定性筛选统计 ===")
print(
    f"小值区域通过Lasso标准差 (<={uncertainty_threshold_small}): {pass_lasso_small}/{total_small} ({_percent(pass_lasso_small, total_small):.1f}%)"
)
print(
    f"小值区域通过Ridge标准差 (<={uncertainty_threshold_small}): {pass_ridge_small}/{total_small} ({_percent(pass_ridge_small, total_small):.1f}%)"
)
print(
    f"小值区域同时满足两条件: {pass_both_uncert_small}/{total_small} ({_percent(pass_both_uncert_small, total_small):.1f}%)"
)

print(
    f"大值区域通过Lasso标准差 (<={uncertainty_threshold_large}): {pass_lasso_large}/{total_large} ({_percent(pass_lasso_large, total_large):.1f}%)"
)
print(
    f"大值区域通过Ridge标准差 (<={uncertainty_threshold_large}): {pass_ridge_large}/{total_large} ({_percent(pass_ridge_large, total_large):.1f}%)"
)
print(
    f"大值区域同时满足两条件: {pass_both_uncert_large}/{total_large} ({_percent(pass_both_uncert_large, total_large):.1f}%)"
)

# Combine the per-range uncertainty masks
uncertainty_mask = (small_value_mask & uncertainty_small) | (large_value_mask & uncertainty_large)
uncertainty_count = np.sum(uncertainty_mask)
print(f"\n通过不确定性筛选的总点数: {uncertainty_count}/{n_points} ({_percent(uncertainty_count, n_points):.1f}%)")

# Final mask: predictions agree AND uncertainty is low
final_consistent_mask = consistent_mask & uncertainty_mask
final_consistent_indices = np.where(final_consistent_mask)[0]

# Final screening result per range
final_small_count = np.sum(small_value_mask & final_consistent_mask)
final_large_count = np.sum(large_value_mask & final_consistent_mask)

print("\n=== 最终筛选结果 ===")
print(f"小值区域通过最终筛选: {final_small_count}/{total_small} ({_percent(final_small_count, total_small):.1f}%)")
print(f"大值区域通过最终筛选: {final_large_count}/{total_large} ({_percent(final_large_count, total_large):.1f}%)")
print(
    f"总计通过筛选点数: {len(final_consistent_indices)}/{n_points} ({_percent(len(final_consistent_indices), n_points):.1f}%)"
)

# Optional: explain why points in the large-value range were screened out.
# Runs only when that range is non-empty and at least one of its points failed.
if total_large > 0 and final_large_count < total_large:
    # Count the failures in the large-value range per criterion
    large_indices = np.where(large_value_mask)[0]
    fail_abs_count = np.sum(large_value_mask & (max_diffs > abs_threshold_large))
    fail_rel_count = np.sum(large_value_mask & (rel_diffs > rel_threshold_large))
    fail_both_consist = np.sum(large_value_mask & ~consistent_large)

    fail_lasso_count = np.sum(large_value_mask & (lasso_std > uncertainty_threshold_large))
    fail_ridge_count = np.sum(large_value_mask & (ridge_std > uncertainty_threshold_large))
    fail_both_uncert = np.sum(large_value_mask & ~uncertainty_large)

    print("\n=== 大值区域筛选失败原因分析 ===")
    print(
        f"失败原因 - 绝对差异过大 (>{abs_threshold_large}): {fail_abs_count}/{total_large} ({fail_abs_count / total_large * 100:.1f}%)"
    )
    print(
        f"失败原因 - 相对差异过大 (>{rel_threshold_large * 100:.1f}%): {fail_rel_count}/{total_large} ({fail_rel_count / total_large * 100:.1f}%)"
    )
    print(
        f"失败原因 - 一致性检验不通过: {fail_both_consist}/{total_large} ({fail_both_consist / total_large * 100:.1f}%)"
    )

    print(
        f"失败原因 - Lasso标准差过大 (>{uncertainty_threshold_large}): {fail_lasso_count}/{total_large} ({fail_lasso_count / total_large * 100:.1f}%)"
    )
    print(
        f"失败原因 - Ridge标准差过大 (>{uncertainty_threshold_large}): {fail_ridge_count}/{total_large} ({fail_ridge_count / total_large * 100:.1f}%)"
    )
    print(
        f"失败原因 - 不确定性检验不通过: {fail_both_uncert}/{total_large} ({fail_both_uncert / total_large * 100:.1f}%)"
    )

    # Print details for a few randomly chosen large-value points (at most 5, without replacement)
    large_sample_indices = np.random.choice(large_indices, min(5, len(large_indices)), replace=False)
    print("\n大值区域样本详情 (随机5个):")
    for idx in large_sample_indices:
        print(
            f"  样本 #{idx}: 平均值={mean_preds[idx]:.2f}, 最大差异={max_diffs[idx]:.2f}, 相对差异={rel_diffs[idx] * 100:.1f}%, "
            f"Lasso标准差={lasso_std[idx]:.2f}, Ridge标准差={ridge_std[idx]:.2f}"
        )
        print(
            f"    模型预测值: Fused={fused_pred_unlabeled[idx]:.2f}, Lasso={lasso_pred_unlabeled[idx]:.2f}, Ridge={ridge_pred_unlabeled[idx]:.2f}"
        )
        print(f"    通过一致性检验: {consistent_large[idx]}, 通过不确定性检验: {uncertainty_large[idx]}")

# Pseudo-labeled data: features and mean prediction of the points that passed screening
X_pseudo = X_unlabeled[final_consistent_indices]
y_pseudo = mean_preds[final_consistent_indices]

# Translate positions in the NaN-free unlabeled arrays back to row labels of seismic_samples
consistent_orig_indices = unlabeled_indices[final_consistent_indices]

# Per-point results to write back, listed in output column order
prediction_columns = {
    "Fused_Pred": fused_pred_unlabeled,
    "Lasso_Pred": lasso_pred_unlabeled,
    "Ridge_Pred": ridge_pred_unlabeled,
    "Mean_Pred": mean_preds,
    "Max_Diff": max_diffs,
}

# Create every result column first (NaN / False) so rows without predictions keep a placeholder
for column in prediction_columns:
    seismic_samples[column] = np.nan
seismic_samples["Is_Consistent"] = False

# Fill in the rows that have predictions, then flag the consistent ones
for column, values in prediction_columns.items():
    seismic_samples.loc[unlabeled_indices, column] = values
seismic_samples.loc[consistent_orig_indices, "Is_Consistent"] = True

# Count and report negative mean predictions
neg_count = (seismic_samples["Mean_Pred"] < 0).sum()
if neg_count > 0:
    print(f"\n注意: 有 {neg_count} 个负的砂厚预测值已被替换为0")

# Replace negative Mean_Pred values with 0
seismic_samples["Mean_Pred"] = seismic_samples["Mean_Pred"].clip(lower=0)

# Distribution of the pseudo-well thickness values
if len(y_pseudo) > 0:
    bins = [0, 5, 10, 15, 20, np.inf]
    labels = ["0-5", "5-10", "10-15", "15-20", ">20"]
    # Bin the same clipped values that are stored in Mean_Pred and close the
    # first interval on the left. pd.cut uses right-closed intervals, so
    # without include_lowest a thickness of exactly 0 (every clipped negative
    # prediction) falls outside (0, 5], becomes NaN, and the printed
    # percentages no longer add up to 100.
    y_binned = pd.cut(np.clip(y_pseudo, 0, None), bins=bins, labels=labels, include_lowest=True)
    bin_counts = y_binned.value_counts().sort_index()

    print("\n=== 虚拟井砂厚分布统计 ===")
    # sort_index orders the counts by category, i.e. in the order of `labels`
    for label, count in zip(labels, bin_counts):
        print(f"砂厚范围 {label}米: {count}个 ({count / len(y_pseudo) * 100:.1f}%)")

# Save every sample point together with its predictions
predictions_csv = os.path.join(output_dir, "seismic_samples_with_predictions.csv")
seismic_samples.to_csv(predictions_csv, index=False)
print(f"\n预测结果已保存至 {predictions_csv}")

# Save only the consistent points: these are the pseudo wells
pseudo_wells_csv = os.path.join(output_dir, "consistent_pseudo_wells.csv")
consistent_samples = seismic_samples[seismic_samples["Is_Consistent"].eq(True)].copy()
consistent_samples.to_csv(pseudo_wells_csv, index=False)
print(f"一致性虚拟井数据已保存至 {pseudo_wells_csv}")


## 展示


In [None]:
# Work on the well-restricted seismic data (data_H6_2_attr_filtered)
# 1. Build the fused attribute for every seismic point
print("为整个小工区地震数据创建融合属性...")

# Standardize the selected attributes; a new scaler is fitted on the seismic data itself
scaler = StandardScaler()
attr_data = data_H6_2_attr_filtered[selected_features].copy()
attr_data_scaled = scaler.fit_transform(attr_data)

# Reuse create_fused_attribute with the correlation weights computed earlier
fused_attr = create_fused_attribute(attr_data_scaled, selected_features, correlation_weights)

# Store the fused attribute as a new column of the seismic table
data_H6_2_attr_filtered["Fused_Attribute"] = fused_attr

# 2. Real wells
real_wells = data_H6_2_well_selected

# 3. Pseudo wells: the sample points flagged as consistent
pseudo_wells = seismic_samples[seismic_samples["Is_Consistent"].eq(True)].copy()

# Map of the fused attribute with real and pseudo wells overlaid
map_style = {
    "class_thresholds": [0.1, 10],
    "figsize": (16, 14),
    "dpi": 300,
    "cmap": "viridis",
    "point_size": 140,
    "well_size": 200,
}
visualize_attribute_map(
    data_points=data_H6_2_attr_filtered,
    attribute_name="Fused_Attribute",
    attribute_label="地震融合属性值",
    real_wells=real_wells,
    pseudo_wells=pseudo_wells,
    target_column="Thickness of facies(1: Fine sand)",
    output_dir=output_dir,
    filename_prefix="fused_attribute",
    **map_style,
)

print("融合属性分布与井点位置可视化完成")

## SVR、随机森林（限制树深）、XGBoost（控制学习率和复杂度）


In [None]:
# Model on the real wells combined with the pseudo wells

# Fix the NumPy seed so results are repeatable
np.random.seed(42)

print("======== 真实井点与虚拟井点结合建模 ========")

# Step 1: assemble the data (real samples + pseudo samples)
# Pseudo wells are read back from the CSV written earlier
pseudo_wells_path = os.path.join(output_dir, "consistent_pseudo_wells.csv")
pseudo_wells = pd.read_csv(pseudo_wells_path)
print(f"加载了 {len(pseudo_wells)} 个虚拟井点数据")

# Real wells used for modelling
real_wells = data_H6_2_well_selected.copy()
print(f"真实井点数据数量: {len(real_wells)}")

# Shared feature columns: the best features selected earlier
common_features = selected_features.copy()
print(f"使用的特征: {common_features}")

# Keep only real wells that have every feature and the target
thickness_column = "Thickness of facies(1: Fine sand)"
real_wells_valid = real_wells.dropna(subset=common_features + [thickness_column])
print(f"有效真实井点数据数量: {len(real_wells_valid)} (丢弃了缺失值)")

# Features and target of the real wells
X_real = real_wells_valid[common_features].values
y_real = real_wells_valid[thickness_column].values

# Features and target of the pseudo wells (the mean prediction acts as their thickness)
X_pseudo = pseudo_wells[common_features].values
y_pseudo = pseudo_wells["Mean_Pred"].values

# Full combined set, real rows first
X_combined = np.vstack((X_real, X_pseudo))
y_combined = np.concatenate((y_real, y_pseudo))
print(f"合并后的样本数量: {len(X_combined)}")

# Sample weights: real rows (the leading ones) count twice as much as pseudo rows
sample_weights = np.where(np.arange(len(X_combined)) < len(X_real), 2.0, 1.0)

# Step 2: train/test split; only real samples are held out for testing
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(X_real, y_real, test_size=0.3, random_state=42)

# Training set = real training rows followed by all pseudo rows
X_train = np.vstack((X_real_train, X_pseudo))
y_train = np.concatenate((y_real_train, y_pseudo))

# Training weights, same 2:1 scheme
train_weights = np.where(np.arange(len(X_train)) < len(X_real_train), 2.0, 1.0)

print(f"训练集大小: {len(X_train)}, 测试集大小: {len(X_real_test)}")

# Standardize: fit on the training set, apply to the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_real_test)


# Step 3: model training and evaluation
# Shared evaluation helper
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name, sample_weights=None):
    """Fit ``model``, score it on the test set, plot the predictions and return the metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
        model_name (str): Name used in the printed report, the plot title and
            the saved figure's file name (lower-cased, spaces replaced by ``_``).
        sample_weights: Optional per-sample weights, forwarded to ``fit`` as
            ``sample_weight`` when given.

    Returns:
        dict: Keys ``model`` (the fitted estimator), ``rmse``, ``r2``, ``mae``
        and ``y_pred`` (the test-set predictions).
    """
    # Fit, passing the weights only when they were supplied
    fit_kwargs = {} if sample_weights is None else {"sample_weight": sample_weights}
    model.fit(X_train, y_train, **fit_kwargs)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"\n{model_name} 模型评估:")
    print(f"  - RMSE: {rmse:.4f}")
    print(f"  - R²: {r2:.4f}")
    print(f"  - MAE: {mae:.4f}")

    # Scatter of true vs. predicted values with the 1:1 reference line
    lowest, highest = min(y_test), max(y_test)
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot([lowest, highest], [lowest, highest], "r--")
    plt.xlabel("真实值")
    plt.ylabel("预测值")
    plt.title(f"{model_name}: 真实值 vs 预测值")
    plt.grid(True, alpha=0.3)

    figure_name = f"{model_name.lower().replace(' ', '_')}_pred_vs_true.png"
    plt.savefig(os.path.join(output_dir, figure_name), dpi=300, bbox_inches="tight")
    plt.show()

    return {"model": model, "rmse": rmse, "r2": r2, "mae": mae, "y_pred": y_pred}


# Cross-validation folds shared by the three grid searches.
# The training rows are stacked as "real wells first, then pseudo wells", and
# an integer `cv` makes GridSearchCV use unshuffled KFold for regressors, so
# each fold would be a contiguous slab of that stack (the real wells
# concentrated in the first fold, the rest made of pseudo wells only).
# Shuffled, seeded folds give every fold a mix of both and stay reproducible.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

# Model 1: support vector regression (SVR)
print("\n训练SVR模型...")
# SVR is sensitive to the number of features, so use few of them to avoid the
# curse of dimensionality; with many features, consider the 2-3 most important
svr_features = min(3, len(common_features))
X_train_svr = X_train_scaled[:, :svr_features]
X_test_svr = X_test_scaled[:, :svr_features]

# SVR parameter grid
param_grid_svr = {"C": [0.1, 1, 10], "gamma": ["scale", 0.01, 0.1], "epsilon": [0.1, 0.2], "kernel": ["rbf"]}

# Grid-search and fit the SVR model
svr = SVR()
grid_search_svr = GridSearchCV(svr, param_grid_svr, cv=cv_splitter, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search_svr.fit(X_train_svr, y_train, sample_weight=train_weights)
best_svr = grid_search_svr.best_estimator_

print(f"SVR最佳参数: {grid_search_svr.best_params_}")
svr_results = evaluate_model(best_svr, X_train_svr, y_train, X_test_svr, y_real_test, "SVR", train_weights)

# Model 2: random forest, with limited tree depth to curb overfitting
print("\n训练随机森林模型...")
param_grid_rf = {
    "n_estimators": [50, 100],
    "max_depth": [3, 4, 5],
    "min_samples_leaf": [3, 5],
    "max_features": ["sqrt", "log2"],
}

rf = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=cv_splitter, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search_rf.fit(X_train_scaled, y_train, sample_weight=train_weights)
best_rf = grid_search_rf.best_estimator_

print(f"随机森林最佳参数: {grid_search_rf.best_params_}")
rf_results = evaluate_model(best_rf, X_train_scaled, y_train, X_test_scaled, y_real_test, "随机森林", train_weights)

# Model 3: XGBoost, with controlled learning rate and complexity
print("\n训练XGBoost模型...")
param_grid_xgb = {
    "n_estimators": [50, 100],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "min_child_weight": [3, 5],
}

xgb = XGBRegressor(random_state=42)
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=cv_splitter, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search_xgb.fit(X_train_scaled, y_train, sample_weight=train_weights)
best_xgb = grid_search_xgb.best_estimator_

print(f"XGBoost最佳参数: {grid_search_xgb.best_params_}")
xgb_results = evaluate_model(best_xgb, X_train_scaled, y_train, X_test_scaled, y_real_test, "XGBoost", train_weights)

# Step 4: compare model performance
results_by_model = {"SVR": svr_results, "随机森林": rf_results, "XGBoost": xgb_results}
models_comparison = pd.DataFrame(
    {
        "模型": list(results_by_model),
        "RMSE": [result["rmse"] for result in results_by_model.values()],
        "R²": [result["r2"] for result in results_by_model.values()],
        "MAE": [result["mae"] for result in results_by_model.values()],
    }
)

print("\n模型性能比较:")
print(models_comparison)

# Save the comparison table
comparison_csv = os.path.join(output_dir, "model_comparison.csv")
models_comparison.to_csv(comparison_csv, index=False)

# Best model = highest R²
best_model_idx = models_comparison["R²"].idxmax()
best_model_name = models_comparison.loc[best_model_idx, "模型"]
print(f"\n最佳模型: {best_model_name}")

# Predict sand thickness for the whole (filtered) survey area
print("\n使用最佳模型为整个工区生成预测...")

# Seismic features; missing values are filled with the column mean
seismic_data = data_H6_2_attr_filtered.copy()
X_seismic = seismic_data[common_features].fillna(seismic_data[common_features].mean())

# Standardize with the scaler fitted on the training set. That scaler was
# fitted on a plain ndarray, so it is given an ndarray here as well: passing
# the DataFrame makes scikit-learn warn that X has feature names the scaler
# was fitted without. The numeric result is unchanged.
X_seismic_scaled = scaler.transform(X_seismic.to_numpy())

# Pick the best model and the inputs it was trained on
if best_model_name == "SVR":
    best_model = best_svr
    X_seismic_model = X_seismic_scaled[:, :svr_features]  # SVR used only the leading features
elif best_model_name == "随机森林":
    best_model = best_rf
    X_seismic_model = X_seismic_scaled
else:  # XGBoost
    best_model = best_xgb
    X_seismic_model = X_seismic_scaled
predictions = best_model.predict(X_seismic_model)

# Attach the predictions to the seismic table
seismic_data["Predicted_Sand_Thickness"] = predictions

# Negative thickness predictions are set to 0
neg_count = (predictions < 0).sum()
if neg_count > 0:
    print(f"注意: {neg_count} 个负的砂厚预测值已被替换为0")
    seismic_data["Predicted_Sand_Thickness"] = seismic_data["Predicted_Sand_Thickness"].clip(lower=0)

# Save the predictions
seismic_data.to_csv(os.path.join(output_dir, "seismic_with_predictions.csv"), index=False)

# Step 5: visualize the predictions
print("\n可视化预测结果...")

# Map the best model's predicted thickness, with the valid real wells overlaid
visualize_attribute_map(
    data_points=seismic_data,
    attribute_name="Predicted_Sand_Thickness",
    attribute_label=f"砂厚预测值(米) - {best_model_name}模型",
    real_wells=real_wells_valid,
    pseudo_wells=None,
    target_column="Thickness of facies(1: Fine sand)",
    output_dir=output_dir,
    filename_prefix=f"predicted_sand_thickness_{best_model_name.lower()}",
    class_thresholds=[0.1, 10],  # class thresholds: low (<0.1), medium (0.1-10), high (>10)
    figsize=(16, 14),
    dpi=300,
    cmap="viridis",
    point_size=150,
    well_size=200,
)

# Optional: feature-importance analysis (tree models only)
if best_model_name in ["随机森林", "XGBoost"]:
    print("\n分析特征重要性...")

    # Importances come from whichever tree model won
    tree_model = best_rf if best_model_name == "随机森林" else best_xgb
    importances = tree_model.feature_importances_

    # Importance table, most important feature first
    feature_importance_df = pd.DataFrame({"特征": common_features, "重要性": importances})
    feature_importance_df = feature_importance_df.sort_values("重要性", ascending=False)

    # Common path stem for the figure and the CSV
    importance_stem = os.path.join(output_dir, f"{best_model_name.lower()}_feature_importance")

    # Horizontal bar chart of the importances
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df["特征"], feature_importance_df["重要性"])
    plt.xlabel("特征重要性")
    plt.title(f"{best_model_name}模型特征重要性")
    plt.tight_layout()
    plt.savefig(f"{importance_stem}.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Save the importance table
    feature_importance_df.to_csv(f"{importance_stem}.csv", index=False)

print("\n建模与预测完成。所有结果已保存到输出目录。")