# 虚拟井生成


In [None]:
# 确保src目录在Python路径中
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.preprocessing import StandardScaler

sys.path.append(os.path.abspath("../"))

# 导入模块
from src.data_utils import (
    extract_uniform_seismic_samples,
    filter_anomalous_attributes,
    identify_attributes,
    parse_petrel_file,
    preprocess_features,
)
from src.feature_selection import select_best_features
from src.visualization import visualize_attribute_map

# Directory that receives every figure and CSV written by this notebook.
output_dir = "output"
# exist_ok=True makes the call idempotent and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)


# Font configuration for Chinese text in matplotlib figures.
plt.rcParams["font.family"] = "SimHei"  # SimHei (a "Hei" typeface) supports Chinese glyphs
plt.rcParams["axes.unicode_minus"] = False  # display minus signs correctly

## 导入地震数据


In [None]:
# Load the seismic attribute export for H6-2. Later cells index the result by
# its "X" and "Y" columns and call .sample()/.copy() on it.
# NOTE(review): presumably a pandas DataFrame with one row per seismic point —
# confirm against src.data_utils.parse_petrel_file.
data_H6_2_attr = parse_petrel_file("../data/H6-2_attr")

## 导入井震数据


In [None]:
# Read the well table from the first sheet of the processed workbook.
file_H6_2_well = "../data/well_processed.xlsx"
data_H6_2_well = pd.read_excel(file_H6_2_well, sheet_name="Sheet1")

# Keep rows whose Surface is H6-2, leave out wells PH6/PH8/PH3/PH2, then drop
# rows that have no sand-thickness value.
data_H6_2_well_selected = (
    data_H6_2_well.loc[
        lambda table: table["Surface"].eq("H6-2") & ~table["Well"].isin(["PH6", "PH8", "PH3", "PH2"])
    ]
    .replace(-999, np.nan)  # -999 is the missing-value code in this table
    .dropna(subset=["Thickness of facies(1: Fine sand)"])  # sand thickness must be present
    .reset_index(drop=True)  # renumber rows from 0
)

# Show the first rows of the filtered table (notebook cell output).
data_H6_2_well_selected.head()

## 提取共同属性


In [None]:
# Attribute names found in the seismic export (the second return value is unused here).
seismic_attr, _ = identify_attributes("../data/H6-2_attr")

# Attribute columns of the Excel table: every column from the 8th onward.
well_seismic_attr = data_H6_2_well.columns[7:].tolist()

# Attributes present in both sources. The result is sorted because iteration
# order of a set of strings changes from one interpreter run to the next (hash
# randomisation), and this list decides the column order used by the later
# feature-selection step. key=str keeps the sort safe for non-string labels.
common_attributes = sorted(set(seismic_attr) & set(well_seismic_attr), key=str)

# Report the counts and the shared attribute names.
print(f"地震属性数量: {len(seismic_attr)}")
print(f"Excel属性数量: {len(well_seismic_attr)}")
print(f"共同属性数量: {len(common_attributes)}")
print("\n共同属性列表:")
for attr in common_attributes:
    print(f"- {attr}")

## 根据井点分布，缩小工区范围


In [None]:
# Restrict the seismic data to the neighbourhood of the wells.
# X/Y extent covered by the selected wells.
well_x_min = data_H6_2_well_selected["X"].min()
well_x_max = data_H6_2_well_selected["X"].max()
well_y_min = data_H6_2_well_selected["Y"].min()
well_y_max = data_H6_2_well_selected["Y"].max()

# Report the well extent.
print(f"井点数据X轴范围: {well_x_min:.2f} 到 {well_x_max:.2f}")
print(f"井点数据Y轴范围: {well_y_min:.2f} 到 {well_y_max:.2f}")

# Grow the well bounding box so the surroundings of the outermost wells are kept.
# The padding below is applied on each side, so the total extent becomes
# expansion_factor times the well extent.
expansion_factor = 1.5
x_padding = (well_x_max - well_x_min) * (expansion_factor - 1) / 2
y_padding = (well_y_max - well_y_min) * (expansion_factor - 1) / 2

# Bounds of the enlarged area.
well_area_x_min = well_x_min - x_padding
well_area_x_max = well_x_max + x_padding
well_area_y_min = well_y_min - y_padding
well_area_y_max = well_y_max + y_padding

# Seismic points inside the enlarged area. copy() gives an independent frame
# that later cells can add columns to.
data_H6_2_attr_filtered = data_H6_2_attr[
    (data_H6_2_attr["X"] >= well_area_x_min)
    & (data_H6_2_attr["X"] <= well_area_x_max)
    & (data_H6_2_attr["Y"] >= well_area_y_min)
    & (data_H6_2_attr["Y"] <= well_area_y_max)
].copy()

# Point counts before and after the spatial filter.
original_size = len(data_H6_2_attr)
filtered_size = len(data_H6_2_attr_filtered)
# Guarded ratio: an empty seismic table would otherwise raise ZeroDivisionError.
reduction_percent = (1 - filtered_size / original_size) * 100 if original_size else 0.0

print(f"原始地震数据点数: {original_size}")
print(f"缩小范围后的地震数据点数: {filtered_size}")
print(f"数据量减少了: {reduction_percent:.2f}%")

# Plot the original and the filtered point distributions.
plt.figure(figsize=(15, 10))

# Draw at most 5000 randomly chosen seismic points so rendering stays fast.
# Sampling by count avoids computing a fraction as 5000 / len(...), which
# fails with ZeroDivisionError on an empty frame.
seismic_sample = data_H6_2_attr.sample(n=min(original_size, 5000))
plt.scatter(seismic_sample["X"], seismic_sample["Y"], color="lightgray", alpha=0.3, s=10, label="原始地震数据(抽样)")

# Draw at most 3000 of the filtered points.
filtered_sample = data_H6_2_attr_filtered.sample(n=min(filtered_size, 3000))
plt.scatter(filtered_sample["X"], filtered_sample["Y"], color="blue", alpha=0.5, s=15, label="筛选后的地震数据(抽样)")

# Well locations.
plt.scatter(data_H6_2_well_selected["X"], data_H6_2_well_selected["Y"], color="red", s=80, marker="^", label="井点位置")

# Dashed lines marking the four edges of the filter box.
plt.axvline(x=well_area_x_min, color="red", linestyle="--", alpha=0.8)
plt.axvline(x=well_area_x_max, color="red", linestyle="--", alpha=0.8)
plt.axhline(y=well_area_y_min, color="red", linestyle="--", alpha=0.8)
plt.axhline(y=well_area_y_max, color="red", linestyle="--", alpha=0.8)

# Title, axis labels, legend and grid.
plt.title("地震数据与井点分布", fontsize=16)
plt.xlabel("X坐标", fontsize=14)
plt.ylabel("Y坐标", fontsize=14)
plt.legend(loc="upper right")
plt.grid(True, linestyle="--", alpha=0.7)

# Save the figure, then display it.
plt.savefig(os.path.join(output_dir, "seismic_well_distribution.png"), dpi=300, bbox_inches="tight")
plt.show()

## 生成统计摘要


In [None]:
# Split the common attributes into good and anomalous ones by comparing the
# seismic data with the well data. The three limits below are passed to
# filter_anomalous_attributes unchanged.
anomaly_thresholds = {
    "ratio_threshold": 5.0,  # mean-ratio threshold
    "range_ratio_threshold": 10.0,  # value-range-ratio threshold
    "std_ratio_threshold": 10.0,  # standard-deviation-ratio threshold
}
good_attributes, anomalous_attributes, attribute_stats = filter_anomalous_attributes(
    seismic_data=data_H6_2_attr_filtered,
    well_data=data_H6_2_well_selected,
    common_attributes=common_attributes,
    output_dir=None,  # no directory is given for charts
    verbose=True,  # print detailed information
    **anomaly_thresholds,
)

# List the attributes that were kept.
print("\n筛选后保留的质量良好属性:")
for kept_attribute in good_attributes:
    print(f"- {kept_attribute}")

## 随机森林重要性和相关性分析


In [None]:
# Pick the best features among the good attributes (random-forest importance
# plus removal of redundant, highly correlated features, per the original note).
selected_features = select_best_features(
    well_data=data_H6_2_well_selected,
    attribute_columns=good_attributes,
    target_column="Thickness of facies(1: Fine sand)",
    n_features=5,
    corr_threshold=0.85,
    output_dir=output_dir,
    verbose=True,
)

# Print the chosen features as a 1-based numbered list.
print("\n基于随机森林重要性和相关性分析的最佳特征:")
for rank, feature_name in enumerate(selected_features, start=1):
    print(f"{rank}. {feature_name}")

## 提取样本，准备设置虚拟井


In [None]:
# Extract evenly spaced samples from the filtered seismic area.
seismic_samples = extract_uniform_seismic_samples(
    seismic_data=data_H6_2_attr_filtered,
    n_rows=25,  # number of rows
    n_cols=25,  # number of columns
    area_bounds={
        "x_min": well_area_x_min,
        "x_max": well_area_x_max,
        "y_min": well_area_y_min,
        "y_max": well_area_y_max,
    },
)

# Plot the real wells together with the sampled points.
plt.figure(figsize=(15, 10))

# Background: at most 5000 randomly chosen seismic points. Sampling by count
# avoids computing a fraction as 5000 / len(...), which fails with
# ZeroDivisionError on an empty frame.
seismic_sample = data_H6_2_attr_filtered.sample(n=min(len(data_H6_2_attr_filtered), 5000))
plt.scatter(seismic_sample["X"], seismic_sample["Y"], color="lightgray", alpha=0.3, s=10, label="地震数据(抽样)")

# Real well locations.
plt.scatter(
    data_H6_2_well_selected["X"], data_H6_2_well_selected["Y"], color="red", s=100, marker="^", label="真实井点"
)

# Evenly spaced sample locations.
plt.scatter(seismic_samples["X"], seismic_samples["Y"], color="blue", s=50, marker="o", label="等间距采样点")

# Title, axis labels, legend and grid.
plt.title("真实井点与等间距采样点分布", fontsize=16)
plt.xlabel("X坐标", fontsize=14)
plt.ylabel("Y坐标", fontsize=14)
plt.legend(loc="upper right")
plt.grid(True, linestyle="--", alpha=0.7)

# Save the figure, then display it.
plt.savefig(os.path.join(output_dir, "real_wells_and_seismic_samples.png"), dpi=300, bbox_inches="tight")
plt.show()


# Write the extracted samples to CSV and report the path.
seismic_samples_csv = os.path.join(output_dir, "seismic_samples.csv")
seismic_samples.to_csv(seismic_samples_csv, index=False)
print(f"等间距地震样本数据已保存至 {seismic_samples_csv}")

## 多线性模型一致性预测设置虚拟井


In [None]:
# Build the fused attribute and run the multi-model prediction.
print("======== 创建融合属性和多模型预测 ========")
target_column = "Thickness of facies(1: Fine sand)"
min_corr_threshold = 0.2  # attributes with |correlation| below this are left out of the fusion

# Report, per selected attribute, how many wells have no value.
print("检查属性在井点数据中的有效性:")
n_wells = len(data_H6_2_well_selected)
for feature in selected_features:
    nan_count = data_H6_2_well_selected[feature].isna().sum()
    print(f"属性 '{feature}' 在井点数据中的NaN值数量: {nan_count}/{n_wells} ({nan_count / n_wells * 100:.1f}%)")

# Wells that have a value for every selected attribute and for the target.
# Unpacking instead of `selected_features + [target_column]` also works when
# selected_features is a tuple, Index or array rather than a list.
valid_wells = data_H6_2_well_selected.dropna(subset=[*selected_features, target_column])
print(f"\n所有属性都有有效值的井点数量: {len(valid_wells)} / {len(data_H6_2_well_selected)}")

# Training data (labelled wells).
X_labeled = valid_wells[selected_features].values
y_labeled = valid_wells[target_column].values

# Unlabelled data (seismic sample points). Rows with a missing feature are
# dropped once; values and index come from the same frame so they stay aligned.
unlabeled_features = seismic_samples[selected_features].dropna()
X_unlabeled = unlabeled_features.values
unlabeled_indices = unlabeled_features.index

# 1. Linear weighting of the fused attribute
print("\n=== 模型1: 融合属性线性加权 ===")

# Use each attribute's signed Spearman correlation with the target as its weight.
correlation_weights = {}
for feature in selected_features:
    corr, _ = spearmanr(valid_wells[feature], valid_wells[target_column])
    if abs(corr) >= min_corr_threshold:
        correlation_weights[feature] = corr
        print(f"属性 '{feature}' 与砂厚的Spearman相关性: {corr:.4f}")
    else:
        # A NaN correlation also lands here, because the comparison above is False for NaN.
        print(f"属性 '{feature}' 与砂厚的相关性过低 ({corr:.4f})，不纳入融合")

# Fallback: when no attribute passes the threshold, use all of them with equal weight.
if len(correlation_weights) == 0:
    print("警告: 没有属性满足相关性阈值，将使用所有属性且权重相等")
    for feature in selected_features:
        correlation_weights[feature] = 1.0
        print(f"属性 '{feature}' 使用默认权重: 1.0")

# Standardise: statistics are fitted on the wells only and then applied to the
# seismic samples.
scaler = StandardScaler()
X_labeled_scaled = scaler.fit_transform(X_labeled)
X_unlabeled_scaled = scaler.transform(X_unlabeled)


# Helper that builds the fused attribute
def create_fused_attribute(X_scaled, features, weights):
    """
    Combine standardized feature columns into a single weighted attribute.

    Column ``i`` of ``X_scaled`` is paired with ``features[i]``. Every column
    whose feature name has an entry in ``weights`` is multiplied by that weight
    and added to the result; features without an entry are skipped. The sum is
    divided by the total of the absolute weights when that total is positive.

    Args:
        X_scaled (ndarray): Standardized feature matrix, one column per feature.
        features (list): Feature names in column order.
        weights (dict): Weight of each feature, keyed by feature name.

    Returns:
        ndarray: One fused value per row of ``X_scaled``.
    """
    combined = np.zeros(X_scaled.shape[0])
    abs_weight_total = 0

    for column, name in enumerate(features):
        # Features that received no weight do not contribute
        if name not in weights:
            continue
        w = weights[name]
        combined += X_scaled[:, column] * w
        abs_weight_total += abs(w)

    # Normalize by the total absolute weight; skipped when nothing was weighted
    if abs_weight_total > 0:
        combined /= abs_weight_total

    return combined


# Fused attribute for the labelled wells and for the unlabelled seismic samples.
fused_attr_labeled = create_fused_attribute(X_labeled_scaled, selected_features, correlation_weights)
fused_attr_unlabeled = create_fused_attribute(X_unlabeled_scaled, selected_features, correlation_weights)

# The regression takes a 2-D input, so the single fused feature becomes one column.
fused_labeled_column = fused_attr_labeled.reshape(-1, 1)
fused_unlabeled_column = fused_attr_unlabeled.reshape(-1, 1)

# Fit a linear regression of the target on the fused attribute.
linear_model = LinearRegression()
linear_model.fit(fused_labeled_column, y_labeled)

# Predict for the wells and for the seismic samples.
fused_pred_labeled = linear_model.predict(fused_labeled_column)
fused_pred_unlabeled = linear_model.predict(fused_unlabeled_column)

# Correlation between the in-sample prediction and the true target values.
fused_corr = np.corrcoef(fused_pred_labeled, y_labeled)[0, 1]
print(f"融合属性预测结果与真实砂厚的相关性: {fused_corr:.4f}")

# 2./3. Lasso and Ridge regression, each with bootstrap resampling


def _bootstrap_predictions(model_cls, model_kwargs, X_train, y_train, X_new, n_rounds):
    """Refit a fresh model on bootstrap resamples and predict ``X_new`` each time.

    Args:
        model_cls: Estimator class providing ``fit`` and ``predict``.
        model_kwargs (dict): Keyword arguments used to build each estimator.
        X_train (ndarray): Training features, one row per sample.
        y_train (ndarray): Training targets aligned with ``X_train``.
        X_new (ndarray): Features to predict after every refit.
        n_rounds (int): Number of bootstrap resamples.

    Returns:
        ndarray: Array of shape ``(n_rounds, X_new.shape[0])``; row ``r`` holds
        the predictions of the model fitted on the ``r``-th resample.
    """
    n_train = len(X_train)
    preds = np.zeros((n_rounds, X_new.shape[0]))
    for round_idx in range(n_rounds):
        # Draw n_train row indices with replacement.
        # NOTE: this uses NumPy's global random state, so results vary between
        # runs unless the caller seeds it beforehand.
        resampled = np.random.choice(n_train, n_train, replace=True)
        model = model_cls(**model_kwargs)
        model.fit(X_train[resampled], y_train[resampled])
        preds[round_idx, :] = model.predict(X_new)
    return preds


print("\n=== 模型2: LassoCV + Bootstrap ===")

n_bootstrap = 100  # number of bootstrap resamples
alpha_values = np.logspace(-4, 1, 30)  # candidate alpha values for Lasso

# Main Lasso model, fitted on all labelled wells.
lasso_kwargs = {"alphas": alpha_values, "cv": 5, "max_iter": 10000, "tol": 1e-3}
lasso_model = LassoCV(**lasso_kwargs)
lasso_model.fit(X_labeled_scaled, y_labeled)
print(f"Lasso最优alpha值: {lasso_model.alpha_:.6f}")

# In-sample prediction and its correlation with the true target.
lasso_pred_labeled = lasso_model.predict(X_labeled_scaled)
lasso_corr = np.corrcoef(lasso_pred_labeled, y_labeled)[0, 1]
print(f"Lasso预测结果与真实砂厚的相关性: {lasso_corr:.4f}")

# Bootstrap predictions for the unlabelled samples.
lasso_bootstrap_preds = _bootstrap_predictions(
    LassoCV, lasso_kwargs, X_labeled_scaled, y_labeled, X_unlabeled_scaled, n_bootstrap
)

# Mean prediction and the 2.5th/97.5th percentile bounds across resamples.
lasso_pred_unlabeled = np.mean(lasso_bootstrap_preds, axis=0)
lasso_lower_ci = np.percentile(lasso_bootstrap_preds, 2.5, axis=0)
lasso_upper_ci = np.percentile(lasso_bootstrap_preds, 97.5, axis=0)

# 3. Ridge regression + bootstrap
print("\n=== 模型3: RidgeCV + Bootstrap ===")

alpha_values = np.logspace(-3, 3, 30)  # candidate alpha values for Ridge

# Main Ridge model, fitted on all labelled wells.
ridge_kwargs = {"alphas": alpha_values, "cv": 5}
ridge_model = RidgeCV(**ridge_kwargs)
ridge_model.fit(X_labeled_scaled, y_labeled)
print(f"Ridge最优alpha值: {ridge_model.alpha_:.6f}")

# In-sample prediction and its correlation with the true target.
ridge_pred_labeled = ridge_model.predict(X_labeled_scaled)
ridge_corr = np.corrcoef(ridge_pred_labeled, y_labeled)[0, 1]
print(f"Ridge预测结果与真实砂厚的相关性: {ridge_corr:.4f}")

# Bootstrap predictions for the unlabelled samples.
ridge_bootstrap_preds = _bootstrap_predictions(
    RidgeCV, ridge_kwargs, X_labeled_scaled, y_labeled, X_unlabeled_scaled, n_bootstrap
)

# Mean prediction and the 2.5th/97.5th percentile bounds across resamples.
ridge_pred_unlabeled = np.mean(ridge_bootstrap_preds, axis=0)
ridge_lower_ci = np.percentile(ridge_bootstrap_preds, 2.5, axis=0)
ridge_upper_ci = np.percentile(ridge_bootstrap_preds, 97.5, axis=0)

# 4. Consistency screening
print("\n=== 一致性筛选 ===")

# One column per model: fused-attribute regression, Lasso, Ridge.
predictions = np.column_stack([fused_pred_unlabeled, lasso_pred_unlabeled, ridge_pred_unlabeled])

# Spread between the largest and the smallest of the three predictions per point.
max_diffs = np.max(predictions, axis=1) - np.min(predictions, axis=1)

# Agreement is required in both absolute and relative terms.
abs_threshold = 3.0  # absolute spread limit, same unit as sand thickness
rel_threshold = 0.10  # relative spread limit, 10%

# Mean of the three predictions, used as the reference for the relative spread.
mean_preds = np.mean(predictions, axis=1)
# Divide by the magnitude of the mean. With a signed denominator a negative
# mean gives a negative ratio, which always passes `<= rel_threshold`.
rel_diffs = max_diffs / (np.abs(mean_preds) + 1e-10)  # epsilon avoids division by zero

# Standard deviation across bootstrap rounds, used as the uncertainty measure.
lasso_std = np.std(lasso_bootstrap_preds, axis=0)
ridge_std = np.std(ridge_bootstrap_preds, axis=0)

# Uncertainty limit.
uncertainty_threshold = 2.5  # same unit as sand thickness

# Masks: the models agree / both bootstrap spreads are small.
consistent_mask = (max_diffs <= abs_threshold) & (rel_diffs <= rel_threshold)
uncertainty_mask = (lasso_std <= uncertainty_threshold) & (ridge_std <= uncertainty_threshold)

# Final mask: predictions agree AND uncertainty is low.
final_consistent_mask = consistent_mask & uncertainty_mask
final_consistent_indices = np.where(final_consistent_mask)[0]

# Pseudo-labelled data. y_pseudo is the raw model mean; the clipping to zero
# further down is applied only to the "Mean_Pred" column.
X_pseudo = X_unlabeled[final_consistent_indices]
y_pseudo = mean_preds[final_consistent_indices]

# Map positions within the unlabelled array back to index labels of seismic_samples.
consistent_orig_indices = unlabeled_indices[final_consistent_indices]

# Result columns start empty; rows that were dropped for missing features keep
# NaN / False.
seismic_samples["Fused_Pred"] = np.nan
seismic_samples["Lasso_Pred"] = np.nan
seismic_samples["Ridge_Pred"] = np.nan
seismic_samples["Mean_Pred"] = np.nan
seismic_samples["Max_Diff"] = np.nan
seismic_samples["Is_Consistent"] = False

# Fill in the predictions for the rows that were actually predicted.
seismic_samples.loc[unlabeled_indices, "Fused_Pred"] = fused_pred_unlabeled
seismic_samples.loc[unlabeled_indices, "Lasso_Pred"] = lasso_pred_unlabeled
seismic_samples.loc[unlabeled_indices, "Ridge_Pred"] = ridge_pred_unlabeled
seismic_samples.loc[unlabeled_indices, "Mean_Pred"] = mean_preds
seismic_samples.loc[unlabeled_indices, "Max_Diff"] = max_diffs
seismic_samples.loc[consistent_orig_indices, "Is_Consistent"] = True

# Summary of the screening.
print(f"未标记样本总数: {X_unlabeled.shape[0]}")
print(f"满足一致性 + 高置信度条件的样本数: {X_pseudo.shape[0]} ({X_pseudo.shape[0] / X_unlabeled.shape[0] * 100:.2f}%)")
print(f"使用的绝对差异阈值: {abs_threshold}")
print(f"使用的相对差异阈值: {rel_threshold * 100:.1f}%")
print(f"使用的不确定性阈值（Lasso/Ridge预测标准差 ≤ {uncertainty_threshold}）")

# Count and report negative mean predictions.
neg_count = (seismic_samples["Mean_Pred"] < 0).sum()
if neg_count > 0:
    print(f"注意: 有 {neg_count} 个负的砂厚预测值已被替换为0")

# Replace negative Mean_Pred values with 0.
seismic_samples["Mean_Pred"] = seismic_samples["Mean_Pred"].clip(lower=0)

# Save all predictions.
predictions_csv = os.path.join(output_dir, "seismic_samples_with_predictions.csv")
seismic_samples.to_csv(predictions_csv, index=False)
print(f"\n预测结果已保存至 {predictions_csv}")

# Save only the consistent pseudo-wells. "Is_Consistent" is a boolean column
# and is used directly as the row mask.
consistent_samples = seismic_samples[seismic_samples["Is_Consistent"]].copy()
pseudo_wells_csv = os.path.join(output_dir, "consistent_pseudo_wells.csv")
consistent_samples.to_csv(pseudo_wells_csv, index=False)
print(f"一致性虚拟井数据已保存至 {pseudo_wells_csv}")


## 展示


In [None]:
# Fused-attribute map of the reduced survey area (data_H6_2_attr_filtered).
# 1. Compute the fused attribute for every seismic point.
print("为整个小工区地震数据创建融合属性...")

# Standardise the selected features of the seismic points.
# NOTE(review): this fits a new scaler on the seismic points and rebinds
# `scaler`, so the scaling here differs from the well-fitted scaling used for
# the model inputs — confirm this is intended for the map.
scaler = StandardScaler()
attr_data = data_H6_2_attr_filtered[selected_features].copy()
attr_data_scaled = scaler.fit_transform(attr_data)

# Fuse the scaled features with the correlation weights computed earlier.
fused_attr = create_fused_attribute(attr_data_scaled, selected_features, correlation_weights)

# Store the fused attribute as a new column of the seismic data.
data_H6_2_attr_filtered["Fused_Attribute"] = fused_attr

# 2. Real wells.
real_wells = data_H6_2_well_selected

# 3. Pseudo-wells: the samples flagged as consistent. "Is_Consistent" is a
# boolean column and is used directly as the row mask.
pseudo_wells = seismic_samples[seismic_samples["Is_Consistent"]].copy()

# Draw the fused-attribute map with both kinds of wells.
visualize_attribute_map(
    data_points=data_H6_2_attr_filtered,
    attribute_name="Fused_Attribute",
    attribute_label="地震融合属性值",
    real_wells=real_wells,
    pseudo_wells=pseudo_wells,
    target_column="Thickness of facies(1: Fine sand)",
    output_dir=output_dir,
    filename_prefix="fused_attribute",
    class_thresholds=[0.1, 25],  # class limits: low (<0.1), medium (0.1-25), high (>25)
    figsize=(16, 14),
    dpi=300,
    cmap="viridis",
    point_size=140,
    well_size=200,
)

print("融合属性分布与井点位置可视化完成")