# 虚拟井生成


In [None]:
# 确保src目录在Python路径中
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    filter_seismic_by_wells,
    identify_attributes,
    parse_petrel_file,
)
from src.feature_selection import select_best_features
from src.visualization import visualize_attribute_map

# Directory that receives every figure and CSV written by this notebook.
output_dir = "output"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without the gap between a separate exists() check and the create.
os.makedirs(output_dir, exist_ok=True)


# Configure a font that can render the Chinese labels used in the figures
plt.rcParams["font.family"] = "SimHei"  # SimHei supports Chinese characters
plt.rcParams["axes.unicode_minus"] = False  # keep the minus sign displaying correctly with this font

## 导入地震数据


In [None]:
# Load the H6-2 seismic attribute file through the project's parse_petrel_file helper
data_H6_2_attr = parse_petrel_file("../data/H6-2_attr")

## 导入井震数据


In [None]:
file_H6_2_well = "../data/well_processed.xlsx"
data_H6_2_well = pd.read_excel(file_H6_2_well, sheet_name="Sheet1")

# Step 1: keep only the rows whose Surface is H6-2
on_target_surface = data_H6_2_well["Surface"] == "H6-2"
data_H6_2_well_selected = data_H6_2_well[on_target_surface]

# Step 2: leave out wells PH6, PH8, PH3 and PH2
data_H6_2_well_selected = data_H6_2_well_selected.query(
    "Well != 'PH6' and Well != 'PH8' and Well != 'PH3' and Well != 'PH2'"
)

# Step 3: turn the -999 placeholder (treated as the missing-value code) into NaN
data_H6_2_well_selected = data_H6_2_well_selected.replace(-999, np.nan)

# Step 4: drop rows without a sand thickness, then renumber the rows from 0
data_H6_2_well_selected = data_H6_2_well_selected.dropna(subset=["Thickness of facies(1: Fine sand)"])
data_H6_2_well_selected = data_H6_2_well_selected.reset_index(drop=True)

# Show the first rows of the filtered table
data_H6_2_well_selected.head()

## 提取共同属性


In [None]:
# Attribute names reported for the seismic attribute file
seismic_attr, _ = identify_attributes("../data/H6-2_attr")

# Attribute columns of the well spreadsheet: every column from the 8th onward
well_seismic_attr = data_H6_2_well.columns[7:].tolist()

# Intersection of the two lists, kept in the order of seismic_attr.
# list(set(a) & set(b)) yields an ordering that is not guaranteed to be the same
# from one run to the next, and this list is handed on to the attribute filter
# and the feature selection below. dict.fromkeys() removes duplicates while
# preserving first-seen order.
well_attr_lookup = set(well_seismic_attr)
common_attributes = [attr for attr in dict.fromkeys(seismic_attr) if attr in well_attr_lookup]

# Report the result
print(f"地震属性数量: {len(seismic_attr)}")
print(f"Excel属性数量: {len(well_seismic_attr)}")
print(f"共同属性数量: {len(common_attributes)}")
print("\n共同属性列表:")
for attr in common_attributes:
    print(f"- {attr}")

## 根据井点分布，缩小工区范围


In [None]:
# Restrict the survey area to the neighbourhood of the wells.
# The helper returns the reduced seismic table and a dictionary of bounds.
data_H6_2_attr_filtered, area_bounds = filter_seismic_by_wells(
    seismic_data=data_H6_2_attr,
    well_data=data_H6_2_well_selected,
    expansion_factor=1.5,  # expand by 50% (per the original note; exact meaning is defined by the helper)
    plot=True,
    output_dir=output_dir,
)

# area_bounds is reused by later cells; print its entries for reference
print("区域边界信息:")
for key, value in area_bounds.items():
    print(f"  {key}: {value}")

## 生成统计摘要


In [None]:
# Keep only the attributes judged to be of good quality.
# NOTE(review): how the three ratio thresholds are applied is defined inside
# filter_anomalous_attributes; the inline notes below are the original author's labels.
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=data_H6_2_attr_filtered,
    well_data=data_H6_2_well_selected,
    common_attributes=common_attributes,
    ratio_threshold=5.0,  # threshold on the ratio of means
    range_ratio_threshold=10.0,  # threshold on the ratio of value ranges
    std_ratio_threshold=10.0,  # threshold on the ratio of standard deviations
    output_dir=None,  # directory for output charts (None passed here)
    verbose=True,  # print detailed information
)

# List the attributes that passed the filter
print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")

## 随机森林重要性和相关性分析


In [None]:
# Rank features by random-forest importance and remove redundant ones
# (description from the original note; the logic lives in select_best_features).
selected_features = select_best_features(
    well_data=data_H6_2_well_selected,
    attribute_columns=good_attributes,
    target_column="Thickness of facies(1: Fine sand)",
    n_features=3,
    corr_threshold=0.85,
    output_dir=output_dir,
    verbose=True,
)

# Print the selected features as a numbered list
print("\n基于随机森林重要性和相关性分析的最佳特征:")
for i, feature in enumerate(selected_features):
    print(f"{i + 1}. {feature}")

## 提取样本，准备设置虚拟井


In [None]:
# Extract evenly spaced samples from the well-bounded seismic data
# (n_rows=40 and n_cols=40 are passed to the helper)
seismic_samples = extract_uniform_seismic_samples(
    seismic_data=data_H6_2_attr_filtered,
    n_rows=40,
    n_cols=40,
    area_bounds=area_bounds,  # pass the bounds dictionary straight through
)

# Plot the real wells together with the sampled points
plt.figure(figsize=(15, 10))

# Background layer: seismic data points, randomly subsampled to roughly 5000 at most
sample_ratio = min(1.0, 5000 / len(data_H6_2_attr_filtered))
seismic_sample = data_H6_2_attr_filtered.sample(frac=sample_ratio)
plt.scatter(seismic_sample["X"], seismic_sample["Y"], color="lightgray", alpha=0.3, s=10, label="地震数据(抽样)")

# Real well locations
plt.scatter(
    data_H6_2_well_selected["X"], data_H6_2_well_selected["Y"], color="red", s=100, marker="^", label="真实井点"
)

# Evenly spaced sample locations
plt.scatter(seismic_samples["X"], seismic_samples["Y"], color="blue", s=50, marker="o", label="等间距采样点")

# Title, axis labels, legend and grid
plt.title("真实井点与等间距采样点分布", fontsize=16)
plt.xlabel("X坐标", fontsize=14)
plt.ylabel("Y坐标", fontsize=14)
plt.legend(loc="upper right")
plt.grid(True, linestyle="--", alpha=0.7)

# Save the figure, then display it
plt.savefig(os.path.join(output_dir, "real_wells_and_seismic_samples.png"), dpi=300, bbox_inches="tight")
plt.show()


# Persist the extracted samples for later use
seismic_samples.to_csv(os.path.join(output_dir, "seismic_samples.csv"), index=False)
print(f"等间距地震样本数据已保存至 {os.path.join(output_dir, 'seismic_samples.csv')}")

## 多线性模型一致性预测设置虚拟井


In [None]:
# Prepare the data for the fused attribute and the multi-model predictions
print("======== 创建融合属性和多模型预测 ========")
target_column = "Thickness of facies(1: Fine sand)"
min_corr_threshold = 0.2  # attributes whose |Spearman correlation| is below this are left out of the fusion

# Report how many NaN values each selected attribute has at the wells
print("检查属性在井点数据中的有效性:")
for feature in selected_features:
    nan_count = data_H6_2_well_selected[feature].isna().sum()
    print(
        f"属性 '{feature}' 在井点数据中的NaN值数量: {nan_count}/{len(data_H6_2_well_selected)} ({nan_count / len(data_H6_2_well_selected) * 100:.1f}%)"
    )

# Keep only the wells where every selected attribute and the target are present
valid_wells = data_H6_2_well_selected.dropna(subset=selected_features + [target_column])
print(f"\n所有属性都有有效值的井点数量: {len(valid_wells)} / {len(data_H6_2_well_selected)}")

# Labeled (training) data taken from the wells
X_labeled = valid_wells[selected_features].values
y_labeled = valid_wells[target_column].values

# Unlabeled data (seismic sample points): drop incomplete rows once and reuse the
# same result for the value matrix and for the index, so the two always line up
unlabeled_rows = seismic_samples[selected_features].dropna()
X_unlabeled = unlabeled_rows.values
unlabeled_indices = unlabeled_rows.index

# 1. Linear weighting of the fused attribute
print("\n=== 模型1: 融合属性线性加权 ===")

# Use the signed Spearman correlation with the target as each attribute's weight.
# A NaN correlation fails the >= comparison and therefore lands in the "too low" branch.
correlation_weights = {}
for feature in selected_features:
    corr, _ = spearmanr(valid_wells[feature], valid_wells[target_column])
    if abs(corr) >= min_corr_threshold:
        correlation_weights[feature] = corr
        print(f"属性 '{feature}' 与砂厚的Spearman相关性: {corr:.4f}")
    else:
        print(f"属性 '{feature}' 与砂厚的相关性过低 ({corr:.4f})，不纳入融合")

# Fallback: when no attribute passes the threshold, use all of them with equal weight
if len(correlation_weights) == 0:
    print("警告: 没有属性满足相关性阈值，将使用所有属性且权重相等")
    for feature in selected_features:
        correlation_weights[feature] = 1.0
        print(f"属性 '{feature}' 使用默认权重: 1.0")

# Standardize: fit on the labeled wells, apply the same transform to the unlabeled samples
scaler = StandardScaler()
X_labeled_scaled = scaler.fit_transform(X_labeled)
X_unlabeled_scaled = scaler.transform(X_unlabeled)


# 创建融合属性函数
def create_fused_attribute(X_scaled, features, weights):
    """
    Combine the weighted feature columns into a single fused attribute.

    Parameters:
        X_scaled (ndarray): standardized feature matrix, one column per entry of features
        features (list): feature names, in column order
        weights (dict): weight per feature name; features without an entry are skipped

    Returns:
        ndarray: one fused value per row; the weighted sum divided by the sum of
        absolute weights (left as the plain sum when that total is not positive)
    """
    # Pair each participating column index with its weight, in column order
    active = [(col, weights[name]) for col, name in enumerate(features) if name in weights]

    # Accumulate the weighted columns one after another
    combined = np.zeros(X_scaled.shape[0])
    for col, w in active:
        combined += X_scaled[:, col] * w

    # Normalize by the total absolute weight
    total_abs_weight = sum(abs(w) for _, w in active)
    if total_abs_weight > 0:
        combined /= total_abs_weight

    return combined


# Fused attribute for the labeled wells and for the unlabeled seismic samples
fused_attr_labeled, fused_attr_unlabeled = (
    create_fused_attribute(matrix, selected_features, correlation_weights)
    for matrix in (X_labeled_scaled, X_unlabeled_scaled)
)

# The regression expects 2-D input, so view each fused attribute as a single column
fused_labeled_column = fused_attr_labeled.reshape(-1, 1)
fused_unlabeled_column = fused_attr_unlabeled.reshape(-1, 1)

# Fit a one-variable linear regression of sand thickness on the fused attribute
linear_model = LinearRegression()
linear_model.fit(fused_labeled_column, y_labeled)

# Predict for both sets
fused_pred_labeled = linear_model.predict(fused_labeled_column)
fused_pred_unlabeled = linear_model.predict(fused_unlabeled_column)

# In-sample check: correlation between the fitted values and the true thickness
fused_corr_matrix = np.corrcoef(fused_pred_labeled, y_labeled)
fused_corr = fused_corr_matrix[0, 1]
print(f"融合属性预测结果与真实砂厚的相关性: {fused_corr:.4f}")

def _bootstrap_predictions(make_model, X_train, y_train, X_new, n_rounds, rng):
    """Refit a fresh model on bootstrap resamples and predict X_new each time.

    Parameters:
        make_model (callable): zero-argument factory returning an unfitted estimator
        X_train (ndarray): training features
        y_train (ndarray): training targets
        X_new (ndarray): samples to predict in every round
        n_rounds (int): number of bootstrap resamples
        rng (numpy.random.Generator): source of the resampling indices

    Returns:
        ndarray: shape (n_rounds, X_new.shape[0]); row k holds the predictions of
        the model fitted on the k-th resample
    """
    n_train = len(X_train)
    preds = np.zeros((n_rounds, X_new.shape[0]))
    for round_idx in range(n_rounds):
        # Draw n_train row indices with replacement
        picks = rng.integers(0, n_train, size=n_train)
        model = make_model()
        model.fit(X_train[picks], y_train[picks])
        preds[round_idx, :] = model.predict(X_new)
    return preds


# 2. Lasso regression + bootstrap
print("\n=== 模型2: LassoCV + Bootstrap ===")

n_bootstrap = 100  # number of bootstrap resamples
# Dedicated seeded generator: the resampling previously drew from the unseeded
# global NumPy state, so the pseudo-labels changed on every run.
bootstrap_rng = np.random.default_rng(42)

lasso_alphas = np.logspace(-4, 1, 30)  # candidate alpha values for Lasso
alpha_values = lasso_alphas  # name kept from the original cell


def _new_lasso():
    """Return an unfitted LassoCV configured with this cell's settings."""
    return LassoCV(alphas=lasso_alphas, cv=5, max_iter=10000, tol=1e-3)


# Main Lasso model, fitted on all labeled wells
lasso_model = _new_lasso()
lasso_model.fit(X_labeled_scaled, y_labeled)
print(f"Lasso最优alpha值: {lasso_model.alpha_:.6f}")

# In-sample predictions and their correlation with the true thickness
lasso_pred_labeled = lasso_model.predict(X_labeled_scaled)
lasso_corr = np.corrcoef(lasso_pred_labeled, y_labeled)[0, 1]
print(f"Lasso预测结果与真实砂厚的相关性: {lasso_corr:.4f}")

# Bootstrap predictions for the unlabeled samples
lasso_bootstrap_preds = _bootstrap_predictions(
    _new_lasso, X_labeled_scaled, y_labeled, X_unlabeled_scaled, n_bootstrap, bootstrap_rng
)

# Bootstrap mean and 2.5% / 97.5% percentile bounds per sample
lasso_pred_unlabeled = np.mean(lasso_bootstrap_preds, axis=0)
lasso_lower_ci = np.percentile(lasso_bootstrap_preds, 2.5, axis=0)
lasso_upper_ci = np.percentile(lasso_bootstrap_preds, 97.5, axis=0)

# 3. Ridge regression + bootstrap
print("\n=== 模型3: RidgeCV + Bootstrap ===")

ridge_alphas = np.logspace(-3, 3, 30)  # candidate alpha values for Ridge
alpha_values = ridge_alphas  # the original cell rebinds alpha_values here


def _new_ridge():
    """Return an unfitted RidgeCV configured with this cell's settings."""
    return RidgeCV(alphas=ridge_alphas, cv=5)


# Main Ridge model, fitted on all labeled wells
ridge_model = _new_ridge()
ridge_model.fit(X_labeled_scaled, y_labeled)
print(f"Ridge最优alpha值: {ridge_model.alpha_:.6f}")

# In-sample predictions and their correlation with the true thickness
ridge_pred_labeled = ridge_model.predict(X_labeled_scaled)
ridge_corr = np.corrcoef(ridge_pred_labeled, y_labeled)[0, 1]
print(f"Ridge预测结果与真实砂厚的相关性: {ridge_corr:.4f}")

# Bootstrap predictions for the unlabeled samples
ridge_bootstrap_preds = _bootstrap_predictions(
    _new_ridge, X_labeled_scaled, y_labeled, X_unlabeled_scaled, n_bootstrap, bootstrap_rng
)

# Bootstrap mean and 2.5% / 97.5% percentile bounds per sample
ridge_pred_unlabeled = np.mean(ridge_bootstrap_preds, axis=0)
ridge_lower_ci = np.percentile(ridge_bootstrap_preds, 2.5, axis=0)
ridge_upper_ci = np.percentile(ridge_bootstrap_preds, 97.5, axis=0)

## 平衡区间分布与质量控制的伪样本筛选

In [None]:
# 4. Sampling setup that balances the thickness distribution against sample quality
print("\n=== 平衡分布与质量控制的优化采样 ===")

# Stack the three models' predictions as columns (one row per unlabeled sample)
predictions = np.column_stack([fused_pred_unlabeled, lasso_pred_unlabeled, ridge_pred_unlabeled])

# Largest disagreement between the three predictions at each sample.
# NOTE: this is taken from the raw predictions, before the negative values are clipped below.
max_diffs = np.max(predictions, axis=1) - np.min(predictions, axis=1)

# Mean of the three predictions; later used as the denominator of the relative difference
mean_preds = np.mean(predictions, axis=1)

# Replace negative predictions with 0 (in the mean and in each model's output)
mean_preds = np.maximum(mean_preds, 0)
fused_pred_unlabeled = np.maximum(fused_pred_unlabeled, 0)
lasso_pred_unlabeled = np.maximum(lasso_pred_unlabeled, 0)
ridge_pred_unlabeled = np.maximum(ridge_pred_unlabeled, 0)

# Spread of the bootstrap predictions per sample
lasso_std = np.std(lasso_bootstrap_preds, axis=0)
ridge_std = np.std(ridge_bootstrap_preds, axis=0)

# Sand-thickness bins: edges and display labels
bins = [0, 0.1, 5, 10, np.inf]
bin_labels = ["0-0.1", "0.1-5", "5-10", ">10"]

# Positions (into mean_preds) of the samples in each half-open bin [lower, upper)
bin_indices = []
for i in range(len(bins) - 1):
    mask = (mean_preds >= bins[i]) & (mean_preds < bins[i + 1])
    bin_indices.append(np.where(mask)[0])

# Number of candidate samples per bin
bin_counts = [len(indices) for indices in bin_indices]
print("各砂厚区间原始分布:")
for i, count in enumerate(bin_counts):
    print(f"  {bin_labels[i]}: {count}个 ({count / len(mean_preds) * 100:.1f}%)")

# Target share of each bin
# (author's note: chosen to be closer to the distribution of the real data)
target_distribution = np.array([0.40, 0.30, 0.20, 0.10])

# Target total: 30% of the candidates, but never fewer than 200
target_total_samples = max(200, int(0.3 * len(mean_preds)))
min_samples_per_bin = 15  # minimum number of samples per bin

# Target count per bin
target_bin_samples = np.round(target_distribution * target_total_samples).astype(int)
# Raise any bin that falls below the per-bin minimum
target_bin_samples = np.maximum(target_bin_samples, min_samples_per_bin)
# Rescale so the counts add up to about target_total_samples again
# (this rescale can bring a raised bin back below min_samples_per_bin)
target_bin_samples = np.round(target_bin_samples / sum(target_bin_samples) * target_total_samples).astype(int)

print("\n目标样本分布:")
for i, count in enumerate(target_bin_samples):
    print(f"  {bin_labels[i]}: 目标{count}个样本 ({count / sum(target_bin_samples) * 100:.1f}%)")


# 计算连续的质量分数函数
def calculate_quality_score(mean_pred, max_diff, rel_diff, lasso_std, ridge_std):
    """Return the quality score of one sample (0-1, higher is better).

    The score is 0.6 * consistency + 0.4 * uncertainty. Consistency averages the
    scores of the absolute and relative model disagreement; uncertainty averages
    the scores of the two bootstrap standard deviations. mean_pred is accepted
    but not used in the calculation.
    """

    def _headroom(value, limit):
        # 1 at value == 0, falling linearly to 0 at value == limit and staying 0 beyond
        return max(0, 1 - value / limit)

    # One set of limits for every sample (no small-value / large-value distinction)
    abs_limit = 4.0  # absolute-difference limit
    rel_limit = 0.2  # relative-difference limit
    std_limit = 3.5  # standard-deviation limit

    # Agreement between the models
    consistency = 0.5 * _headroom(max_diff, abs_limit) + 0.5 * _headroom(rel_diff, rel_limit)

    # Stability of the bootstrap predictions
    uncertainty = 0.5 * _headroom(lasso_std, std_limit) + 0.5 * _headroom(ridge_std, std_limit)

    # Weighted combination: 0.6 for consistency, 0.4 for uncertainty
    return 0.6 * consistency + 0.4 * uncertainty


# Relative disagreement; the small constant keeps the division defined when the mean is 0
rel_diffs = max_diffs / (mean_preds + 1e-10)

# Quality score of every sample, evaluated row by row
quality_scores = np.array(
    [
        calculate_quality_score(*row)
        for row in zip(mean_preds, max_diffs, rel_diffs, lasso_std, ridge_std)
    ],
    dtype=float,
)

# Baseline quality threshold: samples scoring below it are not accepted
min_quality_threshold = 0.5


# 动态阈值调整：为稀缺区间放宽标准
def get_dynamic_quality_threshold(bin_idx, sample_ratio, base_threshold=None):
    """
    Return the quality threshold for a bin, relaxed when the bin is short of samples.

    Parameters:
        bin_idx: index of the thickness bin
        sample_ratio: samples already selected / target samples for the bin
        base_threshold: starting threshold; when None (the default) the
            module-level min_quality_threshold is used

    Returns:
        The adjusted quality threshold. It is never higher than the (possibly
        bin-lowered) base threshold.
    """
    if base_threshold is None:
        base_threshold = min_quality_threshold

    # Bins 2 and 3 (the thicker-sand bins) start from a threshold 0.1 lower, floored at 0.4
    if bin_idx >= 2:
        base_threshold = max(0.4, base_threshold - 0.1)

    if sample_ratio < 0.5:
        # Badly under target: relax by 0.2, floored at 0.3
        relaxed = max(0.3, base_threshold - 0.2)
    elif sample_ratio < 0.8:
        # Slightly under target: relax by 0.1, floored at 0.4
        relaxed = max(0.4, base_threshold - 0.1)
    else:
        # Near or above target: no relaxation
        return base_threshold

    # The floors must not push the result above the base (only possible with a
    # caller-supplied base below 0.4; the default 0.5 is unaffected)
    return min(base_threshold, relaxed)


# 计算J评分函数
def calculate_j_score(
    selected_indices, quality_scores, mean_preds, target_dist, bins, min_samples=200, max_samples=1000
):
    """Return the J score of a selection: 0.5 * quality + 0.3 * distribution + 0.2 * quantity.

    Parameters:
        selected_indices: positions of the selected samples
        quality_scores (ndarray): quality score of every candidate
        mean_preds (ndarray): mean predicted thickness of every candidate
        target_dist (ndarray): target share of each bin
        bins: bin edges used for the histogram of the selection
        min_samples: selection size at which the quantity score starts rising from 0
        max_samples: selection size at which the quantity score reaches 1

    Returns:
        0 for an empty selection, otherwise the weighted J score.
    """
    from scipy.spatial.distance import jensenshannon

    if len(selected_indices) == 0:
        return 0

    # 1. Quality: mean quality score of the selection
    Q_total = np.mean(quality_scores[selected_indices])

    # 2. Distribution similarity: 1 - Jensen-Shannon distance to the target
    hist, _ = np.histogram(mean_preds[selected_indices], bins=bins)
    hist_total = np.sum(hist)
    if hist_total == 0 or not np.sum(target_dist) > 0:
        # Nothing fell inside the bins (or the target is empty): the distance is
        # undefined and would come back as NaN, turning J into NaN and making
        # every later "new_j > current_j" comparison False. Count it as no similarity.
        dist_similarity = 0.0
    else:
        current_dist = hist / (hist_total + 1e-10)
        jsd = jensenshannon(current_dist, target_dist)
        # With both inputs valid (and no NaN in target_dist), a NaN here can only
        # come from round-off on practically identical distributions, so it is
        # treated as distance 0.
        dist_similarity = 1.0 if np.isnan(jsd) else 1 - jsd

    # 3. Quantity: linear ramp from min_samples (0) to max_samples (1)
    n_samples = len(selected_indices)
    if max_samples > min_samples:
        quantity_score = min(1.0, max(0.0, (n_samples - min_samples) / (max_samples - min_samples)))
    else:
        # Degenerate range: step function at min_samples
        quantity_score = 1.0 if n_samples >= min_samples else 0.0

    # Quality carries the largest weight, then distribution, then quantity
    J = 0.5 * Q_total + 0.3 * dist_similarity + 0.2 * quantity_score

    return J


# Iteratively grow the selection, bin by bin, accepting additions that raise the J score
print("\n开始迭代优化样本选择...")

# Start with nothing selected
selected_indices = np.array([], dtype=int)  # an empty NumPy array rather than an empty list
max_iterations = 10
current_j = 0

for iteration in range(max_iterations):
    print(f"\n迭代 {iteration + 1}/{max_iterations}")

    # 1. Count the samples already selected in each bin
    current_bin_samples = [0] * len(bin_labels)
    if len(selected_indices) > 0:  # explicit length check (the selection is an array)
        selected_values = mean_preds[selected_indices]
        for i in range(len(bins) - 1):
            current_bin_samples[i] = np.sum((selected_values >= bins[i]) & (selected_values < bins[i + 1]))

    # 2. Fill ratio of each bin (selected / target); a bin with target 0 counts as full
    bin_ratios = np.zeros(len(bins) - 1)
    for i in range(len(bins) - 1):
        bin_ratios[i] = current_bin_samples[i] / target_bin_samples[i] if target_bin_samples[i] > 0 else 1.0

    # Visit the bins from the least filled to the most filled.
    # The counts and ratios above are computed once per outer iteration and are
    # not refreshed while the bins are processed below.
    bin_priority = np.argsort(bin_ratios)

    improved = False

    # 3. Process the bins one at a time
    for bin_idx in bin_priority:
        if bin_ratios[bin_idx] >= 1.0:  # this bin already has enough samples
            continue

        # 4. Quality threshold for this bin, relaxed according to its fill ratio
        quality_threshold = get_dynamic_quality_threshold(bin_idx, bin_ratios[bin_idx])

        # 5. Candidates: all samples whose mean prediction falls in this bin
        candidates = bin_indices[bin_idx]

        # Remove the ones that are already selected
        candidates = np.setdiff1d(candidates, selected_indices)

        if len(candidates) == 0:
            print(f"  区间 {bin_labels[bin_idx]} 没有更多可用样本")
            continue

        # 6. Sort by quality, best first
        candidates = candidates[np.argsort(-quality_scores[candidates])]

        # 7. Keep only the candidates that meet the quality threshold
        qualified_candidates = candidates[quality_scores[candidates] >= quality_threshold]

        if len(qualified_candidates) == 0:
            print(f"  区间 {bin_labels[bin_idx]} 没有符合质量要求(>{quality_threshold:.2f})的样本")
            continue

        # 8. Number of samples to add: the shortfall, capped by what is available
        needed = target_bin_samples[bin_idx] - current_bin_samples[bin_idx]
        to_add = min(needed, len(qualified_candidates))

        if to_add <= 0:
            continue

        print(f"  向区间 {bin_labels[bin_idx]} 添加 {to_add} 个样本 (质量阈值>{quality_threshold:.2f})")

        # 9. Build the tentative selection and score it
        new_selected = (
            np.concatenate([selected_indices, qualified_candidates[:to_add]])
            if len(selected_indices) > 0
            else qualified_candidates[:to_add]
        )
        new_j = calculate_j_score(new_selected, quality_scores, mean_preds, target_distribution, bins)

        # 10. Accept when J improves, or unconditionally when nothing was selected yet
        if new_j > current_j or len(selected_indices) == 0:
            selected_indices = new_selected
            current_j = new_j
            improved = True
            print(f"  J值提高到 {current_j:.4f}")

    # Stop early when a full pass over the bins accepted nothing
    if not improved:
        print("  本轮迭代无改进，提前结束")
        break

    # Report the state after this iteration
    selected_values = mean_preds[selected_indices]
    current_hist, _ = np.histogram(selected_values, bins=bins)
    print(f"  当前已选择 {len(selected_indices)} 个样本，J值 = {current_j:.4f}")
    for i in range(len(bins) - 1):
        print(f"    {bin_labels[i]}: {current_hist[i]}个 ({current_hist[i] / len(selected_indices) * 100:.1f}%)")

# Final set of selected sample positions
final_selected_indices = selected_indices

# Boolean mask over all candidates marking the selected ones
final_mask = np.zeros(len(mean_preds), dtype=bool)
final_mask[final_selected_indices] = True

# Summarize the final selection per thickness bin
print("\n=== 最终样本选择结果 ===")
final_hist, _ = np.histogram(mean_preds[final_selected_indices], bins=bins)
n_final = len(final_selected_indices)
print(f"总计选择样本: {n_final}个")
for label, original_count, final_count in zip(bin_labels, bin_counts, final_hist):
    # With an empty selection there is no share to compute; report 0.0%
    final_share = final_count / n_final * 100 if n_final > 0 else 0.0
    print(f"砂厚范围 {label}: {final_count}个 ({final_share:.1f}%) | 原始: {original_count}个")

# Quality statistics of the selection
final_quality = quality_scores[final_selected_indices]
print("\n样本质量统计:")
if n_final > 0:
    print(f"  平均质量分数: {np.mean(final_quality):.4f}")
    print(f"  最低质量分数: {np.min(final_quality):.4f}")
    print(f"  最高质量分数: {np.max(final_quality):.4f}")
else:
    # np.min / np.max raise ValueError on an empty array, so skip the statistics
    print("  未选中任何样本，无法统计质量分数")

# Pseudo-labeled data: features of the selected samples and their mean prediction as label
X_pseudo = X_unlabeled[final_selected_indices]
y_pseudo = mean_preds[final_selected_indices]

# Map the selected positions back to row labels of seismic_samples
optimized_orig_indices = unlabeled_indices[final_selected_indices]

# Create the result columns with empty defaults
seismic_samples["Fused_Pred"] = np.nan
seismic_samples["Lasso_Pred"] = np.nan
seismic_samples["Ridge_Pred"] = np.nan
seismic_samples["Mean_Pred"] = np.nan
seismic_samples["Max_Diff"] = np.nan
seismic_samples["Quality_Score"] = np.nan
seismic_samples["Is_Selected"] = False

# Fill in the rows that had complete attributes (unlabeled_indices); other rows stay NaN / False
seismic_samples.loc[unlabeled_indices, "Fused_Pred"] = fused_pred_unlabeled
seismic_samples.loc[unlabeled_indices, "Lasso_Pred"] = lasso_pred_unlabeled
seismic_samples.loc[unlabeled_indices, "Ridge_Pred"] = ridge_pred_unlabeled
seismic_samples.loc[unlabeled_indices, "Mean_Pred"] = mean_preds
seismic_samples.loc[unlabeled_indices, "Max_Diff"] = max_diffs
seismic_samples.loc[unlabeled_indices, "Quality_Score"] = quality_scores
seismic_samples.loc[optimized_orig_indices, "Is_Selected"] = True

# Count the negative ensemble means from the raw (unclipped) model outputs.
# mean_preds, and therefore the "Mean_Pred" column, was already clipped to >= 0
# earlier in this cell, so counting negatives in that column always gave 0 and
# the notice below never printed. `predictions` still holds the unclipped values.
neg_count = int((np.mean(predictions, axis=1) < 0).sum())
if neg_count > 0:
    print(f"\n注意: 有 {neg_count} 个负的砂厚预测值已被替换为0")

# Make sure the stored mean prediction has no negative values
seismic_samples["Mean_Pred"] = seismic_samples["Mean_Pred"].clip(lower=0)

# Bar chart comparing the bin distribution before and after the selection
plt.figure(figsize=(12, 6))

# Distribution of all candidates, in percent
bin_percents = np.array(bin_counts) / sum(bin_counts) * 100
plt.bar(np.arange(len(bin_labels)) - 0.2, bin_percents, width=0.4, label="原始分布", color="lightblue")

# Distribution of the selected samples, in percent
optimized_percents = final_hist / sum(final_hist) * 100
plt.bar(np.arange(len(bin_labels)) + 0.2, optimized_percents, width=0.4, label="优化后分布", color="orange")

# Target distribution drawn as a dashed red line
target_percents = target_distribution / sum(target_distribution) * 100
plt.plot(np.arange(len(bin_labels)), target_percents, "r--", label="目标分布", linewidth=2)

plt.xlabel("砂厚区间")
plt.ylabel("百分比 (%)")
plt.title("样本分布优化前后对比")
plt.xticks(np.arange(len(bin_labels)), bin_labels)
plt.legend()
plt.grid(True, linestyle="--", alpha=0.7)
plt.savefig(os.path.join(output_dir, "distribution_optimization.png"), dpi=300, bbox_inches="tight")
plt.show()

# Save every sample together with its predictions
seismic_samples.to_csv(os.path.join(output_dir, "seismic_samples_with_predictions.csv"), index=False)
print(f"\n预测结果已保存至 {os.path.join(output_dir, 'seismic_samples_with_predictions.csv')}")

# Save only the selected samples (the pseudo wells); this file is read back by the modeling cell
optimized_samples = seismic_samples[seismic_samples["Is_Selected"] == True].copy()
optimized_samples.to_csv(os.path.join(output_dir, "optimized_pseudo_wells.csv"), index=False)
print(f"优化后的虚拟井数据已保存至 {os.path.join(output_dir, 'optimized_pseudo_wells.csv')}")

# J score (quality / distribution / quantity) of the final selection
final_j = calculate_j_score(final_selected_indices, quality_scores, mean_preds, target_distribution, bins)
print(f"\n最终优化J评分: {final_j:.4f}")

## 展示


In [None]:
# Prepare the reduced-area seismic data (the already filtered data_H6_2_attr_filtered)
# 1. Build the fused attribute for every seismic point
print("为整个小工区地震数据创建融合属性...")

# Standardize the selected features.
# NOTE(review): this fits a new StandardScaler on the seismic data itself (and
# rebinds the name `scaler`), so the scaling differs from the well-fitted scaler
# used when the fusion regression was trained — confirm this is intended.
scaler = StandardScaler()
attr_data = data_H6_2_attr_filtered[selected_features].copy()
attr_data_scaled = scaler.fit_transform(attr_data)

# Reuse create_fused_attribute with the correlation weights computed earlier
fused_attr = create_fused_attribute(attr_data_scaled, selected_features, correlation_weights)

# Store the fused attribute as a new column of the seismic data
data_H6_2_attr_filtered["Fused_Attribute"] = fused_attr

# 2. Real wells
real_wells = data_H6_2_well_selected

# 3. Pseudo wells: the samples chosen by the selection step
pseudo_wells = seismic_samples[seismic_samples["Is_Selected"] == True].copy()

# Map of the fused attribute with real and pseudo well locations
visualize_attribute_map(
    data_points=data_H6_2_attr_filtered,
    attribute_name="Fused_Attribute",
    attribute_label="地震融合属性值",
    real_wells=real_wells,
    pseudo_wells=pseudo_wells,
    target_column="Thickness of facies(1: Fine sand)",
    output_dir=output_dir,
    filename_prefix="fused_attribute",
    class_thresholds=[0.1, 10],
    figsize=(16, 14),
    dpi=300,
    cmap="viridis",
    point_size=140,
    well_size=200,
)

print("融合属性分布与井点位置可视化完成")

## SVR、随机森林（限制树深）、XGBoost（控制学习率和复杂度）


In [None]:
# Seed NumPy's global random state so this cell is reproducible
np.random.seed(42)

print("======== 真实井点与虚拟井点结合建模 ========")

# Step 1: prepare the data - combine real samples and pseudo samples
pseudo_wells = pd.read_csv(os.path.join(output_dir, "optimized_pseudo_wells.csv"))
print(f"加载了 {len(pseudo_wells)} 个虚拟井点数据")

# Real wells used for modeling
real_wells = data_H6_2_well_selected.copy()
print(f"真实井点数据数量: {len(real_wells)}")

# Features shared by both sets: the best features selected earlier
common_features = selected_features.copy()
print(f"使用的特征: {common_features}")

# Keep only the real wells that have every feature and the target
real_wells_valid = real_wells.dropna(subset=common_features + ["Thickness of facies(1: Fine sand)"])
print(f"有效真实井点数据数量: {len(real_wells_valid)} (丢弃了缺失值)")

# Build the combined data set
# Features and target of the real wells
X_real = real_wells_valid[common_features].values
y_real = real_wells_valid["Thickness of facies(1: Fine sand)"].values

# Features and target of the pseudo wells (the mean prediction serves as their thickness)
X_pseudo = pseudo_wells[common_features].values
y_pseudo = pseudo_wells["Mean_Pred"].values  # mean prediction used as the target value

# Stack real and pseudo samples
X_combined = np.vstack((X_real, X_pseudo))
y_combined = np.concatenate((y_real, y_pseudo))
print(f"合并后的样本数量: {len(X_combined)}")

# Sample weights: real samples count more than pseudo samples
# (X_combined / y_combined / sample_weights are not used again in the visible part of this cell)
sample_weights = np.ones(len(X_combined))
# Real samples get twice the weight of pseudo samples
sample_weights[: len(X_real)] = 2.0

# Step 2: train/test split (only real samples are used for testing)
# Split the real samples first
# NOTE(review): the pseudo labels (Mean_Pred) come from models fitted on all valid
# real wells, including the ones that end up in X_real_test here, so the test
# metrics below are not fully independent of the training data.
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(X_real, y_real, test_size=0.3, random_state=42)

# Training set = real training samples followed by all pseudo samples
X_train = np.vstack((X_real_train, X_pseudo))
y_train = np.concatenate((y_real_train, y_pseudo))

# Training weights
train_weights = np.ones(len(X_train))
# Real samples (the leading rows) get twice the weight of pseudo samples
train_weights[: len(X_real_train)] = 2.0

print(f"训练集大小: {len(X_train)}, 测试集大小: {len(X_real_test)}")

# Standardize: fit on the training set, apply to the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_real_test)


# 步骤3: 模型训练与评估
# 创建评估函数
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name, sample_weights=None):
    """Fit the model, score it on the test data, plot predicted vs. true, and return the metrics.

    The model is fitted in place (with sample_weight when sample_weights is given).
    RMSE, R² and MAE are printed, and a scatter plot is saved under output_dir as
    "<model name lower-cased, spaces replaced by underscores>_pred_vs_true.png".

    Returns:
        dict with keys "model", "rmse", "r2", "mae" and "y_pred".
    """
    # Forward the weights only when the caller supplied them
    extra_fit_args = {} if sample_weights is None else {"sample_weight": sample_weights}
    model.fit(X_train, y_train, **extra_fit_args)

    # Predictions on the held-out data
    y_pred = model.predict(X_test)

    # Error metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    report_lines = (
        f"\n{model_name} 模型评估:",
        f"  - RMSE: {rmse:.4f}",
        f"  - R²: {r2:.4f}",
        f"  - MAE: {mae:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    # Scatter of true vs. predicted values with the y = x reference line
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.7)
    lowest, highest = min(y_test), max(y_test)
    plt.plot([lowest, highest], [lowest, highest], "r--")
    plt.xlabel("真实值")
    plt.ylabel("预测值")
    plt.title(f"{model_name}: 真实值 vs 预测值")
    plt.grid(True, alpha=0.3)
    figure_path = os.path.join(output_dir, f"{model_name.lower().replace(' ', '_')}_pred_vs_true.png")
    plt.savefig(figure_path, dpi=300, bbox_inches="tight")
    plt.show()

    return {"model": model, "rmse": rmse, "r2": r2, "mae": mae, "y_pred": y_pred}


# Model 1: support vector regression (SVR)
print("\n训练SVR模型...")
# Author's note: SVR is sensitive to the number of features, so only the leading
# columns are used; with more features, the 2-3 most important ones could be chosen.
# Here: at most the first 3 columns of the scaled matrices.
svr_features = min(3, len(common_features))
X_train_svr = X_train_scaled[:, :svr_features]
X_test_svr = X_test_scaled[:, :svr_features]

# SVR hyper-parameter grid
param_grid_svr = {"C": [0.1, 1, 10], "gamma": ["scale", 0.01, 0.1], "epsilon": [0.1, 0.2], "kernel": ["rbf"]}

# Grid search with 5-fold CV, scored by negative MSE, fitted with the sample weights
svr = SVR()
grid_search_svr = GridSearchCV(svr, param_grid_svr, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search_svr.fit(X_train_svr, y_train, sample_weight=train_weights)
best_svr = grid_search_svr.best_estimator_

# evaluate_model fits best_svr again on the training set before scoring it on the real test wells
print(f"SVR最佳参数: {grid_search_svr.best_params_}")
svr_results = evaluate_model(best_svr, X_train_svr, y_train, X_test_svr, y_real_test, "SVR", train_weights)

# Model 2: random forest - tree depth limited to curb overfitting
print("\n训练随机森林模型...")
param_grid_rf = {
    "n_estimators": [50, 100],
    "max_depth": [3, 4, 5],
    "min_samples_leaf": [3, 5],
    "max_features": ["sqrt", "log2"],
}

rf = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search_rf.fit(X_train_scaled, y_train, sample_weight=train_weights)
best_rf = grid_search_rf.best_estimator_

print(f"随机森林最佳参数: {grid_search_rf.best_params_}")
rf_results = evaluate_model(best_rf, X_train_scaled, y_train, X_test_scaled, y_real_test, "随机森林", train_weights)

# Model 3: XGBoost - learning rate and complexity kept under control
print("\n训练XGBoost模型...")
param_grid_xgb = {
    "n_estimators": [50, 100],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "min_child_weight": [3, 5],
}

xgb = XGBRegressor(random_state=42)
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
grid_search_xgb.fit(X_train_scaled, y_train, sample_weight=train_weights)
best_xgb = grid_search_xgb.best_estimator_

print(f"XGBoost最佳参数: {grid_search_xgb.best_params_}")
xgb_results = evaluate_model(best_xgb, X_train_scaled, y_train, X_test_scaled, y_real_test, "XGBoost", train_weights)

# Step 4: compare the three models on the real test wells
models_comparison = pd.DataFrame(
    {
        "模型": ["SVR", "随机森林", "XGBoost"],
        "RMSE": [svr_results["rmse"], rf_results["rmse"], xgb_results["rmse"]],
        "R²": [svr_results["r2"], rf_results["r2"], xgb_results["r2"]],
        "MAE": [svr_results["mae"], rf_results["mae"], xgb_results["mae"]],
    }
)

print("\n模型性能比较:")
print(models_comparison)

# Save the comparison table
models_comparison.to_csv(os.path.join(output_dir, "model_comparison.csv"), index=False)

# Pick the best model: the row with the highest R²
best_model_idx = models_comparison["R²"].idxmax()
best_model_name = models_comparison.loc[best_model_idx, "模型"]
print(f"\n最佳模型: {best_model_name}")

# Predict sand thickness for the whole survey with the best model
print("\n使用最佳模型为整个工区生成预测...")

# Work on a copy of the full seismic table; fill missing feature values with the column mean
seismic_data = data_H6_2_attr.copy()
X_seismic = seismic_data[common_features].fillna(seismic_data[common_features].mean())

# Apply the scaler fitted on the training set. That scaler was fitted on a plain
# NumPy array, so it is given an array here as well; passing the DataFrame
# directly makes scikit-learn warn about feature names it never saw during fit.
X_seismic_scaled = scaler.transform(X_seismic.to_numpy())

# Predict with whichever model won the comparison
if best_model_name == "SVR":
    # SVR was trained on the leading svr_features columns only
    X_seismic_model = X_seismic_scaled[:, :svr_features]
    predictions = best_svr.predict(X_seismic_model)
    best_model = best_svr
elif best_model_name == "随机森林":
    predictions = best_rf.predict(X_seismic_scaled)
    best_model = best_rf
else:  # XGBoost
    predictions = best_xgb.predict(X_seismic_scaled)
    best_model = best_xgb

# Attach the predictions to the seismic table
seismic_data["Predicted_Sand_Thickness"] = predictions

# Negative thickness is not meaningful: report how many there are and clip them to 0
neg_count = (predictions < 0).sum()
if neg_count > 0:
    print(f"注意: {neg_count} 个负的砂厚预测值已被替换为0")
    seismic_data["Predicted_Sand_Thickness"] = seismic_data["Predicted_Sand_Thickness"].clip(lower=0)

# Save the predictions
seismic_data.to_csv(os.path.join(output_dir, "seismic_with_predictions.csv"), index=False)

# Step 5: visualize the predictions
print("\n可视化预测结果...")

# Map of the best model's predicted thickness with the real wells overlaid (no pseudo wells)
visualize_attribute_map(
    data_points=seismic_data,
    attribute_name="Predicted_Sand_Thickness",
    attribute_label=f"砂厚预测值(米) - {best_model_name}模型",
    real_wells=real_wells_valid,
    pseudo_wells=None,
    target_column="Thickness of facies(1: Fine sand)",
    output_dir=output_dir,
    filename_prefix=f"predicted_sand_thickness_{best_model_name.lower()}",
    class_thresholds=[0.1, 10],  # class thresholds: low (<0.1), medium (0.1-10), high (>10)
    figsize=(16, 14),
    dpi=300,
    cmap="viridis",
    point_size=10,
    well_size=50,
)

# Optional: feature importance analysis (tree models only; skipped when SVR wins)
if best_model_name in ["随机森林", "XGBoost"]:
    print("\n分析特征重要性...")

    # Read the importances from the winning tree model
    if best_model_name == "随机森林":
        importances = best_rf.feature_importances_
    else:
        importances = best_xgb.feature_importances_

    # Table of features and importances, most important first
    feature_importance_df = pd.DataFrame({"特征": common_features, "重要性": importances})
    feature_importance_df = feature_importance_df.sort_values("重要性", ascending=False)

    # Horizontal bar chart of the importances
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df["特征"], feature_importance_df["重要性"])
    plt.xlabel("特征重要性")
    plt.title(f"{best_model_name}模型特征重要性")
    plt.tight_layout()
    plt.savefig(
        os.path.join(output_dir, f"{best_model_name.lower()}_feature_importance.png"), dpi=300, bbox_inches="tight"
    )
    plt.show()

    # Save the importance table
    feature_importance_df.to_csv(
        os.path.join(output_dir, f"{best_model_name.lower()}_feature_importance.csv"), index=False
    )

print("\n建模与预测完成。所有结果已保存到输出目录。")