## 地震属性融合：基线模型测试


In [None]:
# Core imports, output directory and plotting setup (the src path is only added in the prediction cell)
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Directory that receives every CSV and figure written by this notebook.
output_dir = "output"
# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# directory already exists and cannot fail if it appears between check and create.
os.makedirs(output_dir, exist_ok=True)

# Use a font with CJK glyphs so the Chinese titles and labels render
plt.rcParams["font.family"] = "SimHei"  # SimHei (black-body) supports Chinese
plt.rcParams["axes.unicode_minus"] = False  # render the minus sign correctly with this font

## 全井震数据


In [None]:
# Load the pre-processed well-point table from Excel
file_well = "data/well_processed.xlsx"
data_well = pd.read_excel(file_well)

# Print the table's shape and its column names
print(f"数据形状: {data_well.shape}")
print("数据列名:", data_well.columns.tolist())
# Trailing expression: as the last line of a notebook cell it displays the first rows
data_well.head()

## 分类函数（根据经验）


In [None]:
def classify_lithofacies(sand_thickness, sand_ratio):
    """Return an empirical lithofacies label for one sample.

    Rules, checked in order:
      * ``sand_thickness <= 0.1``  -> ``"Mudstone"``
      * ``sand_ratio >= 70``       -> ``"Sandstone"``
      * ``sand_ratio <= 30``       -> ``"Mudstone"``
      * anything else              -> ``"Interbedded"``

    Every comparison is False for NaN, so a sample whose inputs are both NaN
    falls through to ``"Interbedded"``.
    """
    # Practically no sand at all: mudstone regardless of the ratio.
    if sand_thickness <= 0.1:
        return "Mudstone"
    # Sand-dominated interval.
    if sand_ratio >= 70:
        return "Sandstone"
    # Mud-dominated interval, otherwise a mixed (interbedded) one.
    return "Mudstone" if sand_ratio <= 30 else "Interbedded"


## 数据预处理


In [None]:
# The two source columns are picked by position: the notebook treats the 6th
# column as sand thickness and the 7th as sand ratio.
sand_thickness_col, sand_ratio_col = data_well.columns[5], data_well.columns[6]


def _label_row(row):
    """Return the lithofacies label for one row of the well table."""
    return classify_lithofacies(row[sand_thickness_col], row[sand_ratio_col])


# Work on a copy so the raw table stays untouched, then label every row
data_well_processed = data_well.copy()
data_well_processed["Lithofacies"] = data_well_processed.apply(_label_row, axis=1)

# Integer code for each lithofacies label
lithofacies_mapping = {"Mudstone": 0, "Sandstone": 1, "Interbedded": 2}
data_well_processed["Lithofacies_Code"] = data_well_processed["Lithofacies"].map(lithofacies_mapping)

# Give the two target columns short names (columns not present are left as they are)
column_renames = {
    "Thickness of facies(1: Fine sand)": "Sand Thickness",
    "facies(1: Fine sand)": "Sand Ratio",
}
data_well_processed = data_well_processed.rename(columns=column_renames)

# Report the shape of the processed table
print(f"处理后的数据形状: {data_well_processed.shape}")

# Persist the processed table in the output directory
processed_csv_path = os.path.join(output_dir, "processed_well_data.csv")
data_well_processed.to_csv(processed_csv_path, index=False)

In [None]:
data_well_processed.head()

## 基线模型测试：回归任务


In [None]:
# 导入必要的库
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import warnings

# NOTE(review): this silences every warning from here on, including pandas /
# scikit-learn deprecation and convergence warnings - consider narrowing it.
warnings.filterwarnings("ignore")

# Distinct values of the "Surface" column; each one is modelled separately
unique_surfaces = data_well_processed["Surface"].unique()

# Column layout of the regression results table, which starts out empty
results_columns = [
    "Surface",
    "Target",
    "Model",
    "R2_mean",
    "RMSE_mean",
    "MAE_mean",
    "Top_Features",
]
results_df = pd.DataFrame(columns=results_columns)


# 定义模型评估函数
def evaluate_model(model, X, y, cv=5):
    """Cross-validate a regressor and return its mean test metrics.

    Args:
        model: Estimator passed to ``cross_validate``.
        X: Feature matrix.
        y: Regression target.
        cv: Anything ``cross_validate`` accepts as ``cv`` (default 5).

    Returns:
        Tuple ``(r2_mean, rmse_mean, mae_mean)`` averaged over the test folds,
        with RMSE and MAE reported as positive errors.
    """
    metric_scorers = {
        "r2": "r2",
        "neg_rmse": "neg_root_mean_squared_error",
        "neg_mae": "neg_mean_absolute_error",
    }
    fold_scores = cross_validate(model, X, y, cv=cv, scoring=metric_scorers)

    mean_r2 = fold_scores["test_r2"].mean()
    # The two error scorers are the "neg_*" variants; flip the sign back.
    mean_rmse, mean_mae = (-fold_scores[f"test_{key}"].mean() for key in ("neg_rmse", "neg_mae"))

    return mean_r2, mean_rmse, mean_mae


# 定义特征重要性获取函数
def get_top_features(X, rf_model, xgb_model, n_top=10):
    """Pick the features ranked best jointly by two fitted models.

    Each model's ``feature_importances_`` is turned into a ranking (0 = most
    important). The two ranks of every feature are summed and the ``n_top``
    features with the smallest sum are returned, best first. Ties keep the
    order of the first model's ranking.

    Args:
        X: Object whose ``columns`` give the feature names, in the order used
            to fit both models.
        rf_model: First fitted model exposing ``feature_importances_``.
        xgb_model: Second fitted model exposing ``feature_importances_``.
        n_top: Maximum number of feature names to return.

    Returns:
        List of feature names.
    """
    names = X.columns
    combined_rank = {}

    # The first model goes first so that dict insertion order - which the
    # stable sort below uses to break ties - follows its ranking.
    for fitted in (rf_model, xgb_model):
        best_first = np.argsort(fitted.feature_importances_)[::-1]
        for rank, position in enumerate(best_first):
            name = names[position]
            combined_rank[name] = combined_rank.get(name, 0) + rank

    ordered = sorted(combined_rank, key=combined_rank.get)
    return ordered[:n_top]


# 定义IQR方法移除离群值的函数
def remove_outliers_iqr(df, column, factor=3.0):
    """Split a frame into in-range rows and an outlier mask using the IQR rule.

    A value is in range when it lies inside
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive), where Q1 and
    Q3 are the 25% and 75% quantiles of ``df[column]``.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column to inspect.
        factor: Multiplier applied to the IQR (default 3.0).

    Returns:
        Tuple ``(clean_df, outlier_mask)``:
          * ``clean_df`` - rows of ``df`` whose value is in range (rows with a
            missing value are excluded as well);
          * ``outlier_mask`` - boolean Series aligned with ``df``, True only
            for present values outside the range.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    in_range = (values >= lower_bound) & (values <= upper_bound)
    # NaN fails both comparisons above, so simply negating `in_range` would
    # report every missing value as an outlier; only present values can be one.
    outlier_mask = values.notna() & ~in_range
    return df[in_range], outlier_mask


# Regression baselines: for every surface and both targets, cross-validate a
# random forest and an XGBoost model on all features, then an SVR on the
# jointly top-ranked features.

# Number of top-ranked features handed to the SVR model.
n_top_features = 5

# One dict per (surface, target, model); turned into results_df after the loop.
# Building the frame once avoids concatenating onto an empty DataFrame, whose
# dtype handling is deprecated in pandas.
result_rows = []

for surface in unique_surfaces:
    print(f"\n处理层位: {surface}")

    # Rows belonging to the current surface
    surface_data = data_well_processed[data_well_processed["Surface"] == surface].copy()
    print(f"该层位样本数: {len(surface_data)}")

    # Columns that are coordinates, identifiers, targets or labels - never features
    non_feature_cols = [
        "X",
        "Y",
        "Z",
        "Surface",
        "Well",
        "Sand Thickness",
        "Sand Ratio",
        "Lithofacies",
        "Lithofacies_Code",
    ]

    # Percentage of missing values per column
    missing_percent = surface_data.isnull().mean() * 100

    # Drop feature columns with more than 30% missing values
    cols_to_drop = missing_percent[missing_percent > 30].index.tolist()
    dropped_feature_cols = [col for col in cols_to_drop if col not in non_feature_cols]
    surface_data = surface_data.drop(columns=dropped_feature_cols)
    print(f"删除了缺失值超过30%的列: {dropped_feature_cols}")

    # Everything that is left and is not a non-feature column is a feature
    feature_cols = [col for col in surface_data.columns if col not in non_feature_cols]
    print(f"保留的特征数量: {len(feature_cols)}")

    # Fill missing feature values with the mean of the in-range (non-outlier) values
    for col in feature_cols:
        clean_data, outlier_mask = remove_outliers_iqr(surface_data, col)
        col_mean = clean_data[col].mean()

        # Count missing values and real outliers separately; a missing value
        # must never be reported as an outlier.
        missing_mask = surface_data[col].isnull()
        n_missing = int(missing_mask.sum())
        n_outliers = int((outlier_mask & ~missing_mask).sum())

        # Assign the filled column back: an in-place fillna on a column
        # selection is a chained assignment, which pandas deprecates and which
        # does not modify the frame under Copy-on-Write.
        surface_data[col] = surface_data[col].fillna(col_mean)

        # Outliers are only excluded from the mean; they stay in the data.
        if n_outliers > 0 or n_missing > 0:
            print(
                f"特征 '{col}': {n_outliers} 个离群值未参与均值计算，"
                f"{n_missing} 个缺失值已使用平均值 {col_mean:.4f} 填充"
            )

    # Make sure nothing is missing any more (this covers the non-feature columns too)
    assert surface_data.isnull().sum().sum() == 0, "数据中仍然存在缺失值!"

    # Model both targets separately
    for target in ["Sand Thickness", "Sand Ratio"]:
        print(f"\n目标变量: {target}")

        X = surface_data[feature_cols]
        y = surface_data[target]

        # Two groups by lithofacies: mudstone (code 0) versus everything else
        class_a_mask = surface_data["Lithofacies_Code"] == 0
        class_b_mask = surface_data["Lithofacies_Code"].isin([1, 2])

        # Positional row indices of each group
        class_a_indices = np.where(class_a_mask)[0]
        class_b_indices = np.where(class_b_mask)[0]

        print(f"泥岩(Lithofacies_Code=0)样本数: {len(class_a_indices)}")
        print(f"非泥岩(Lithofacies_Code=1,2)样本数: {len(class_b_indices)}")

        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Stratify by group only when both groups can fill five folds
        if len(class_a_indices) >= 5 and len(class_b_indices) >= 5:
            print("使用分层交叉验证（按岩性类别划分）")

            # Split each group on its own and merge the folds pairwise, mapping
            # the within-group fold indices back to positions in the surface data.
            cv_splits = []
            group_folds = zip(kf.split(class_a_indices), kf.split(class_b_indices))
            for (a_train_idx, a_test_idx), (b_train_idx, b_test_idx) in group_folds:
                train_indices = np.concatenate([class_a_indices[a_train_idx], class_b_indices[b_train_idx]])
                test_indices = np.concatenate([class_a_indices[a_test_idx], class_b_indices[b_test_idx]])
                cv_splits.append((train_indices, test_indices))
        else:
            print("警告：某个岩性类别样本数不足5个，使用标准交叉验证")
            cv_splits = list(kf.split(X))

        # 1. Random forest
        rf_model = RandomForestRegressor(n_estimators=50, max_depth=3, min_samples_leaf=3, random_state=42)

        r2_rf, rmse_rf, mae_rf = evaluate_model(rf_model, X, y, cv=cv_splits)
        print(f"随机森林 - R²: {r2_rf:.4f}, RMSE: {rmse_rf:.4f}, MAE: {mae_rf:.4f}")

        # Refit on all rows to obtain the feature importances
        rf_model.fit(X, y)

        # 2. XGBoost
        xgb_model = XGBRegressor(
            n_estimators=50,
            max_depth=3,
            learning_rate=0.1,
            min_child_weight=3,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        )

        r2_xgb, rmse_xgb, mae_xgb = evaluate_model(xgb_model, X, y, cv=cv_splits)
        print(f"XGBoost - R²: {r2_xgb:.4f}, RMSE: {rmse_xgb:.4f}, MAE: {mae_xgb:.4f}")

        # Refit on all rows to obtain the feature importances
        xgb_model.fit(X, y)

        # Features ranked best jointly by both models
        top_features = get_top_features(X, rf_model, xgb_model, n_top=n_top_features)
        print(f"Top {len(top_features)}特征: {top_features}")

        # 3. SVR on the top features
        # NOTE(review): the features are selected and the scaler is fitted on
        # all rows before cross-validation, and the grid search reuses the
        # evaluation folds, so the SVR scores are optimistic.
        X_top = X[top_features]

        scaler = StandardScaler()
        X_top_scaled = scaler.fit_transform(X_top)

        param_grid = {
            "C": [0.1, 1, 10],
            "epsilon": [0.01, 0.1],
            "kernel": ["rbf"],
            "gamma": ["scale", 0.01, 0.1],
        }

        svr = SVR()
        grid_search = GridSearchCV(svr, param_grid, cv=cv_splits, scoring="neg_mean_squared_error", n_jobs=-1)

        grid_search.fit(X_top_scaled, y)
        best_svr = grid_search.best_estimator_
        print(f"SVR最佳参数: {grid_search.best_params_}")

        r2_svr, rmse_svr, mae_svr = evaluate_model(best_svr, X_top_scaled, y, cv=cv_splits)
        print(f"SVR(Top {len(top_features)}特征) - R²: {r2_svr:.4f}, RMSE: {rmse_svr:.4f}, MAE: {mae_svr:.4f}")

        # The "SVR_Top10" key is kept unchanged so existing result files and
        # plots keep their model name, although n_top_features features are used.
        result_rows.extend(
            {
                "Surface": surface,
                "Target": target,
                "Model": model_name,
                "R2_mean": r2,
                "RMSE_mean": rmse,
                "MAE_mean": mae,
                "Top_Features": ",".join(top_features),
            }
            for model_name, r2, rmse, mae in [
                ("RandomForest", r2_rf, rmse_rf, mae_rf),
                ("XGBoost", r2_xgb, rmse_xgb, mae_xgb),
                ("SVR_Top10", r2_svr, rmse_svr, mae_svr),
            ]
        )

# Assemble the results table in the declared column order
results_df = pd.DataFrame(result_rows, columns=results_columns)

# Save the results
results_file = os.path.join(output_dir, "regression_results.csv")
results_df.to_csv(results_file, index=False)
print(f"\n结果已保存到: {results_file}")

In [None]:
# Bar chart comparing the models by mean cross-validated R² only.
# The figure is wide so that every (surface, target) pair fits on the x axis.
plt.figure(figsize=(20, 8))
r2_axes = plt.gca()

# One bar group per (surface, target), one bar per model
pivot_r2 = results_df.pivot_table(index=["Surface", "Target"], columns="Model", values="R2_mean")
pivot_r2.plot(kind="bar", ax=r2_axes)
r2_axes.set_title("各模型R^2对比", fontsize=14)
r2_axes.set_ylabel("R^2值", fontsize=12)
r2_axes.set_xlabel("层位和目标变量", fontsize=12)

# Fixed y range so that negative R² values stay visible
r2_axes.set_ylim([-1, 1])

# Faint reference line at zero
r2_axes.axhline(y=0, color="gray", linestyle="-", alpha=0.3)

# Legend placed outside the plotting area
r2_axes.legend(title="模型", bbox_to_anchor=(1.05, 1), loc="upper left")

# Vertical tick labels to avoid overlap
plt.setp(r2_axes.get_xticklabels(), rotation=90)
plt.tight_layout()

# Save, then show
r2_figure_path = os.path.join(output_dir, "r2_model_comparison.png")
plt.savefig(r2_figure_path, dpi=300, bbox_inches="tight")
plt.show()

## 基线模型测试：分类任务

In [None]:
# 导入分类所需的库
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
import warnings

# NOTE(review): this silences every warning from here on, including pandas /
# scikit-learn deprecation and small-class warnings - consider narrowing it.
warnings.filterwarnings("ignore")

# Column layout of the classification results table, which starts out empty
classification_results_columns = [
    "Surface",
    "Model",
    "Accuracy_mean",
    "F1_macro_mean",
    "Precision_macro_mean",
    "Recall_macro_mean",
    "Top_Features",
    "Classes_Present",  # lithofacies codes present on the surface
]
classification_results_df = pd.DataFrame(columns=classification_results_columns)


# 定义分类模型评估函数
def evaluate_classification_model(model, X, y, cv=5):
    """Cross-validate a classifier and return its mean test metrics.

    Args:
        model: Estimator passed to ``cross_validate``.
        X: Feature matrix.
        y: Class labels.
        cv: Anything ``cross_validate`` accepts as ``cv`` (default 5).

    Returns:
        Tuple ``(accuracy, f1_macro, precision_macro, recall_macro)``, each
        averaged over the test folds.
    """
    # Each metric is registered under its own scorer name.
    metrics = ("accuracy", "f1_macro", "precision_macro", "recall_macro")
    fold_scores = cross_validate(model, X, y, cv=cv, scoring={name: name for name in metrics})

    return tuple(fold_scores[f"test_{name}"].mean() for name in metrics)


# Lithofacies classification baselines: for every surface, cross-validate a
# random forest and an XGBoost classifier on all features, then an SVC on the
# jointly top-ranked features.

# Number of top-ranked features handed to the SVC model.
n_top_features = 5

# One dict per (surface, model); turned into classification_results_df after
# the loop. Building the frame once avoids concatenating onto an empty
# DataFrame, whose dtype handling is deprecated in pandas.
classification_rows = []

print("\n=============== 开始岩性分类任务 ===============")

for surface in unique_surfaces:
    print(f"\n处理层位: {surface}")

    # Rows belonging to the current surface
    surface_data = data_well_processed[data_well_processed["Surface"] == surface].copy()
    print(f"该层位样本数: {len(surface_data)}")

    # Class distribution on this surface
    class_counts = surface_data["Lithofacies_Code"].value_counts()
    print(f"岩性类别分布:\n{class_counts}")

    # Record which classes are present
    classes_present = sorted(class_counts.index.tolist())
    classes_present_str = ", ".join(map(str, classes_present))
    print(f"存在的类别: {classes_present_str}")

    # Need at least two classes with at least two samples each
    if len(class_counts) < 2 or (class_counts < 2).any():
        print(f"跳过层位 {surface}: 某些岩性类别样本数不足2个，无法进行分类")
        continue

    # Columns that are coordinates, identifiers, targets or labels - never features
    non_feature_cols = [
        "X",
        "Y",
        "Z",
        "Surface",
        "Well",
        "Sand Thickness",
        "Sand Ratio",
        "Lithofacies",
        "Lithofacies_Code",
    ]

    # Percentage of missing values per column
    missing_percent = surface_data.isnull().mean() * 100

    # Drop feature columns with more than 30% missing values
    cols_to_drop = missing_percent[missing_percent > 30].index.tolist()
    dropped_feature_cols = [col for col in cols_to_drop if col not in non_feature_cols]
    surface_data = surface_data.drop(columns=dropped_feature_cols)
    print(f"删除了缺失值超过30%的列: {dropped_feature_cols}")

    # Everything that is left and is not a non-feature column is a feature
    feature_cols = [col for col in surface_data.columns if col not in non_feature_cols]
    print(f"保留的特征数量: {len(feature_cols)}")

    # Fill missing feature values with the mean of the in-range (non-outlier) values
    for col in feature_cols:
        clean_data, outlier_mask = remove_outliers_iqr(surface_data, col)
        col_mean = clean_data[col].mean()

        # Count missing values and real outliers separately; a missing value
        # must never be reported as an outlier.
        missing_mask = surface_data[col].isnull()
        n_missing = int(missing_mask.sum())
        n_outliers = int((outlier_mask & ~missing_mask).sum())

        # Assign the filled column back: an in-place fillna on a column
        # selection is a chained assignment, which pandas deprecates and which
        # does not modify the frame under Copy-on-Write.
        surface_data[col] = surface_data[col].fillna(col_mean)

        # Outliers are only excluded from the mean; they stay in the data.
        if n_outliers > 0 or n_missing > 0:
            print(
                f"特征 '{col}': {n_outliers} 个离群值未参与均值计算，"
                f"{n_missing} 个缺失值已使用平均值 {col_mean:.4f} 填充"
            )

    # Make sure nothing is missing any more (this covers the non-feature columns too)
    assert surface_data.isnull().sum().sum() == 0, "数据中仍然存在缺失值!"

    X = surface_data[feature_cols]
    y_original = surface_data["Lithofacies_Code"]  # labels before re-encoding

    # Re-encode the labels so they are consecutive and start at 0
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_original)

    print(f"标签编码映射: {dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))}")

    # Stratified K-fold. The fold count is capped by the smallest class so that
    # every training fold still contains every class; with a fixed 5 folds and
    # a class of 2-4 samples some folds would miss that class altogether.
    smallest_class = int(class_counts.min())
    n_splits = min(5, smallest_class)
    if n_splits < 5:
        print(f"最小类别样本数为 {smallest_class}，交叉验证折数调整为 {n_splits}")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_splits = list(skf.split(X, y))

    # 1. Random forest classifier
    rf_clf = RandomForestClassifier(
        n_estimators=100, max_depth=5, min_samples_leaf=2, class_weight="balanced", random_state=42
    )

    accuracy_rf, f1_rf, precision_rf, recall_rf = evaluate_classification_model(rf_clf, X, y, cv=cv_splits)
    print(
        f"随机森林 - 准确率: {accuracy_rf:.4f}, F1(宏平均): {f1_rf:.4f}, "
        f"精确率: {precision_rf:.4f}, 召回率: {recall_rf:.4f}"
    )

    # Refit on all rows to obtain the feature importances
    rf_clf.fit(X, y)

    # 2. XGBoost classifier; the number of classes is passed explicitly
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        objective="multi:softprob",
        num_class=len(np.unique(y)),
        random_state=42,
    )

    accuracy_xgb, f1_xgb, precision_xgb, recall_xgb = evaluate_classification_model(xgb_clf, X, y, cv=cv_splits)
    print(
        f"XGBoost - 准确率: {accuracy_xgb:.4f}, F1(宏平均): {f1_xgb:.4f}, "
        f"精确率: {precision_xgb:.4f}, 召回率: {recall_xgb:.4f}"
    )

    # Refit on all rows to obtain the feature importances
    xgb_clf.fit(X, y)

    # Features ranked best jointly by both models
    top_features = get_top_features(X, rf_clf, xgb_clf, n_top=n_top_features)
    print(f"Top {len(top_features)}特征: {top_features}")

    # 3. SVC on the top features
    # NOTE(review): the features are selected and the scaler is fitted on all
    # rows before cross-validation, and the grid search reuses the evaluation
    # folds, so the SVC scores are optimistic.
    X_top = X[top_features]

    scaler = StandardScaler()
    X_top_scaled = scaler.fit_transform(X_top)

    param_grid = {"C": [0.1, 1, 10], "gamma": ["scale", 0.01, 0.1], "kernel": ["rbf"], "class_weight": ["balanced"]}

    svc = SVC(probability=True)
    grid_search = GridSearchCV(svc, param_grid, cv=cv_splits, scoring="f1_macro", n_jobs=-1)

    grid_search.fit(X_top_scaled, y)
    best_svc = grid_search.best_estimator_
    print(f"SVC最佳参数: {grid_search.best_params_}")

    accuracy_svc, f1_svc, precision_svc, recall_svc = evaluate_classification_model(
        best_svc, X_top_scaled, y, cv=cv_splits
    )
    print(
        f"SVC(Top {len(top_features)}特征) - 准确率: {accuracy_svc:.4f}, F1(宏平均): {f1_svc:.4f}, "
        f"精确率: {precision_svc:.4f}, 召回率: {recall_svc:.4f}"
    )

    # The "SVC_Top10" key is kept unchanged so existing result files and plots
    # keep their model name, although n_top_features features are used.
    classification_rows.extend(
        {
            "Surface": surface,
            "Model": model_name,
            "Accuracy_mean": accuracy,
            "F1_macro_mean": f1,
            "Precision_macro_mean": precision,
            "Recall_macro_mean": recall,
            "Top_Features": ",".join(top_features),
            "Classes_Present": classes_present_str,
        }
        for model_name, accuracy, f1, precision, recall in [
            ("RandomForest", accuracy_rf, f1_rf, precision_rf, recall_rf),
            ("XGBoost", accuracy_xgb, f1_xgb, precision_xgb, recall_xgb),
            ("SVC_Top10", accuracy_svc, f1_svc, precision_svc, recall_svc),
        ]
    )

# Assemble the results table in the declared column order
classification_results_df = pd.DataFrame(classification_rows, columns=classification_results_columns)

# Save the classification results
classification_results_file = os.path.join(output_dir, "classification_results.csv")
classification_results_df.to_csv(classification_results_file, index=False)
print(f"\n分类结果已保存到: {classification_results_file}")

In [None]:
# Bar chart comparing the classifiers by mean macro F1 per surface
plt.figure(figsize=(20, 8))
f1_axes = plt.gca()

# One bar group per surface, one bar per model
pivot_f1 = classification_results_df.pivot_table(index=["Surface"], columns="Model", values="F1_macro_mean")
pivot_f1.plot(kind="bar", ax=f1_axes)
f1_axes.set_title("各模型F1分数对比", fontsize=14)
f1_axes.set_ylabel("F1分数(宏平均)", fontsize=12)
f1_axes.set_xlabel("层位", fontsize=12)

# F1 lives in [0, 1]
f1_axes.set_ylim([0, 1])

# Faint dashed reference line at 0.5
f1_axes.axhline(y=0.5, color="gray", linestyle="--", alpha=0.3)

# Legend placed outside the plotting area
f1_axes.legend(title="模型", bbox_to_anchor=(1.05, 1), loc="upper left")

# Vertical tick labels to avoid overlap
plt.setp(f1_axes.get_xticklabels(), rotation=90)
plt.tight_layout()

# Save, then show
f1_figure_path = os.path.join(output_dir, "f1_classification_comparison.png")
plt.savefig(f1_figure_path, dpi=300, bbox_inches="tight")
plt.show()

## H6-2 预测

In [None]:
# 导入必要的库
import sys

sys.path.append("src")  # 确保src目录在Python路径中
from data_utils import parse_petrel_file, preprocess_features
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib import cm
import os

# Directory for the prediction CSV and figures, below the main output directory.
prediction_dir = os.path.join(output_dir, "predictions")
# exist_ok=True replaces the check-then-create pattern; missing parent
# directories are created as well.
os.makedirs(prediction_dir, exist_ok=True)


# 定义可视化函数
def plot_prediction_map(x, y, z, values, title, filename, cmap="viridis", vmin=None, vmax=None, classes=None):
    """Draw a scatter map of predicted values, save it and show it.

    Args:
        x, y: Coordinate arrays used for the horizontal and vertical axes.
        z: Third coordinate array; accepted for call compatibility but not
            used by the plot.
        values: Predicted value per point. When ``classes`` is given these are
            expected to be integer class codes ``0 .. len(classes) - 1``.
        title: Figure title (also the colour-bar label for continuous values).
        filename: File name of the saved image inside ``prediction_dir``.
        cmap: Colormap for continuous values; replaced by a discrete colormap
            when ``classes`` is given.
        vmin, vmax: Colour range for continuous values; ignored when
            ``classes`` is given.
        classes: Class labels, indexed by class code. ``None`` selects the
            continuous (regression) rendering.
    """
    plt.figure(figsize=(12, 10))

    if classes is not None:
        # Classification result: one fixed colour per class code
        n_classes = len(classes)
        if n_classes <= 3:  # the three lithofacies classes of this notebook
            colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # blue, orange, green
            cmap = ListedColormap(colors[:n_classes])
        else:
            # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
            # takes the same (name, lut) arguments.
            cmap = plt.get_cmap("tab10", n_classes)

        # Pin the colour scale to the full code range. Without explicit limits
        # the scale is stretched to the codes actually present in `values`, so
        # a missing class would shift the colours away from the legend.
        plt.scatter(x, y, c=values, cmap=cmap, s=10, alpha=0.7, vmin=-0.5, vmax=n_classes - 0.5)

        # Legend: one marker per class, in code order
        handles = [
            plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=cmap(i), markersize=8, label=classes[i])
            for i in range(n_classes)
        ]
        plt.legend(handles=handles, title="岩性", loc="upper right")

    else:
        # Regression result: continuous colormap with a colour bar
        scatter = plt.scatter(x, y, c=values, cmap=cmap, s=10, alpha=0.7, vmin=vmin, vmax=vmax)
        plt.colorbar(scatter, label=title)

    plt.title(title, fontsize=14)
    plt.xlabel("X坐标", fontsize=12)
    plt.ylabel("Y坐标", fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    # Save the image, then display it
    plt.savefig(os.path.join(prediction_dir, filename), dpi=300, bbox_inches="tight")
    plt.show()


# 1. Load the seismic attribute data for H6-2
print("正在加载H6-2地震属性数据...")
seismic_file = "data/H6-2_attr"
# parse_petrel_file comes from the project's data_utils module; a None result
# is treated as a parse failure. The uses below (.shape, .columns) imply it
# otherwise returns a DataFrame-like object.
seismic_data = parse_petrel_file(seismic_file)

if seismic_data is None:
    raise ValueError("无法解析地震属性文件")

print(f"地震数据形状: {seismic_data.shape}")
print(f"地震数据列: {seismic_data.columns.tolist()}")

# 2. Training rows of the H6-2 surface
print("\n加载H6-2层的训练数据...")
is_h6_2 = data_well_processed["Surface"] == "H6-2"
h6_2_data = data_well_processed[is_h6_2].copy()
print(f"H6-2训练数据样本数: {len(h6_2_data)}")

# 3. Feature columns
print("\n准备预测特征...")
# Coordinates, identifiers, targets and labels are never used as features
non_feature_cols = [
    "X",
    "Y",
    "Z",
    "Surface",
    "Well",
    "Sand Thickness",
    "Sand Ratio",
    "Lithofacies",
    "Lithofacies_Code",
]

# Every remaining column of the training table is a candidate feature
feature_cols = [col for col in h6_2_data.columns if col not in non_feature_cols]
print(f"特征数量: {len(feature_cols)}")

# Keep only the candidates that the seismic table provides as well, warning
# about each one that is missing there
seismic_feature_cols = []
for col in feature_cols:
    if col not in seismic_data.columns:
        print(f"警告: 特征 '{col}' 在地震数据中不存在")
        continue
    seismic_feature_cols.append(col)

print(f"地震数据中可用特征: {len(seismic_feature_cols)}")

# 4. Coordinates and feature matrix of the prediction points
X_coords, Y_coords, Z_coords = (seismic_data[axis].values for axis in ("X", "Y", "Z"))

# Fill missing seismic feature values with the column mean
seismic_features = seismic_data[seismic_feature_cols].copy()
for col in seismic_feature_cols:
    if not seismic_features[col].isnull().any():
        continue
    mean_val = seismic_features[col].mean()
    seismic_features[col] = seismic_features[col].fillna(mean_val)
    print(f"特征 '{col}' 中的缺失值已填充为均值: {mean_val:.4f}")

print("\n开始进行预测...")

# 5. Sand thickness prediction
print("\n预测砂厚...")

# Training matrix built from all H6-2 rows. The explicit copy makes X_train an
# independent frame, so the NaN fill below is not an assignment into a
# selection of h6_2_data (which pandas flags as a chained assignment).
X_train = h6_2_data[seismic_feature_cols].copy()
y_sand_thickness = h6_2_data["Sand Thickness"]

# Fill missing training values with the column mean
for col in seismic_feature_cols:
    if X_train[col].isnull().any():
        mean_val = X_train[col].mean()
        X_train[col] = X_train[col].fillna(mean_val)

# Fit the sand thickness model
rf_sand_thickness = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=2, random_state=42)
rf_sand_thickness.fit(X_train, y_sand_thickness)

# Predict for every seismic point
sand_thickness_pred = rf_sand_thickness.predict(seismic_features)
print(f"砂厚预测范围: {sand_thickness_pred.min():.4f} - {sand_thickness_pred.max():.4f}")

# Map of the predicted thickness; the colour range runs from 0 to 10% above the
# larger of the predicted and the observed maximum
plot_prediction_map(
    X_coords,
    Y_coords,
    Z_coords,
    sand_thickness_pred,
    "H6-2层砂厚预测 (m)",
    "H6-2_sand_thickness_prediction.png",
    cmap="plasma",
    vmin=0,
    vmax=max(sand_thickness_pred.max(), y_sand_thickness.max()) * 1.1,
)

# 6. Sand ratio prediction
print("\n预测砂地比...")

# Fit the sand ratio model on the same training matrix
y_sand_ratio = h6_2_data["Sand Ratio"]
rf_sand_ratio = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=2, random_state=42)
rf_sand_ratio.fit(X_train, y_sand_ratio)

# Raw predictions; their range is reported before clipping
raw_sand_ratio = rf_sand_ratio.predict(seismic_features)
raw_min, raw_max = raw_sand_ratio.min(), raw_sand_ratio.max()
print(f"砂地比预测范围: {raw_min:.4f} - {raw_max:.4f}")

# Keep the predictions inside the valid 0-100 % range
sand_ratio_pred = np.clip(raw_sand_ratio, 0, 100)

# Map of the predicted sand ratio
plot_prediction_map(
    X_coords,
    Y_coords,
    Z_coords,
    sand_ratio_pred,
    title="H6-2层砂地比预测 (%)",
    filename="H6-2_sand_ratio_prediction.png",
    cmap="YlOrBr",
    vmin=0,
    vmax=100,
)

# 7. Lithofacies prediction
print("\n预测岩相...")

# Fit the lithofacies classifier on the same training matrix
y_lithofacies = h6_2_data["Lithofacies_Code"]
rf_lithofacies = RandomForestClassifier(
    n_estimators=100, max_depth=5, min_samples_leaf=2, class_weight="balanced", random_state=42
)
rf_lithofacies.fit(X_train, y_lithofacies)

# Predicted class and class probabilities for every seismic point
lithofacies_pred = rf_lithofacies.predict(seismic_features)
lithofacies_prob = rf_lithofacies.predict_proba(seismic_features)

# Report how the predicted classes are distributed
code_to_name = {0: "泥岩", 1: "砂岩", 2: "砂泥互层"}
unique_lithofacies, counts = np.unique(lithofacies_pred, return_counts=True)
print("岩性预测分布:")
for lith, count in zip(unique_lithofacies, counts):
    # Codes outside 0-2 are reported as unknown
    lithofacies_name = code_to_name.get(lith, "未知")
    percentage = count / len(lithofacies_pred) * 100
    print(f"  - {lithofacies_name} (编码 {lith}): {count} 个点 ({percentage:.2f}%)")

# Map of the predicted lithofacies; the names are listed in class-code order
lithofacies_names = ["泥岩", "砂岩", "砂泥互层"]
plot_prediction_map(
    X_coords,
    Y_coords,
    Z_coords,
    lithofacies_pred,
    title="H6-2层岩相预测",
    filename="H6-2_lithofacies_prediction.png",
    classes=lithofacies_names,
)

# 8. Collect all predictions in one table and save it
prediction_df = pd.DataFrame(
    {
        "X": X_coords,
        "Y": Y_coords,
        "Z": Z_coords,
        "Sand_Thickness_Pred": sand_thickness_pred,
        "Sand_Ratio_Pred": sand_ratio_pred,
        "Lithofacies_Pred": lithofacies_pred,
    }
)

# Add one probability column per lithofacies. The columns of predict_proba
# follow rf_lithofacies.classes_, which holds only the codes seen in training,
# so each column is looked up by class code instead of by position; a class the
# model never saw gets probability 0.
proba_column_of = {int(code): idx for idx, code in enumerate(rf_lithofacies.classes_)}
for code, name in enumerate(lithofacies_names):
    if code in proba_column_of:
        prediction_df[f"{name}_Prob"] = lithofacies_prob[:, proba_column_of[code]]
    else:
        prediction_df[f"{name}_Prob"] = 0.0

# Save the prediction table
prediction_file = os.path.join(prediction_dir, "H6-2_predictions.csv")
prediction_df.to_csv(prediction_file, index=False)
print(f"\n预测结果已保存到: {prediction_file}")

# 9. Side-by-side maps of predicted sand thickness and sand ratio
plt.figure(figsize=(15, 7))

# (predicted values, colour-bar label, panel title, extra colour limits)
map_panels = [
    (sand_thickness_pred, "砂厚 (m)", "H6-2层砂厚预测", {}),
    (sand_ratio_pred, "砂地比 (%)", "H6-2层砂地比预测", {"vmin": 0, "vmax": 100}),
]

for panel_no, (panel_values, bar_label, panel_title, color_limits) in enumerate(map_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.scatter(X_coords, Y_coords, c=panel_values, cmap="viridis", s=10, alpha=0.7, **color_limits)
    plt.colorbar(label=bar_label)
    plt.title(panel_title, fontsize=14)
    plt.xlabel("X坐标", fontsize=12)
    plt.ylabel("Y坐标", fontsize=12)
    plt.grid(True, alpha=0.3)

plt.tight_layout()
combined_figure_path = os.path.join(prediction_dir, "H6-2_combined_predictions.png")
plt.savefig(combined_figure_path, dpi=300, bbox_inches="tight")
plt.show()

# 10. Predicted sand thickness against predicted sand ratio, coloured by lithofacies
lithofacies_colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # one colour per class code 0, 1, 2

plt.figure(figsize=(10, 8))
plt.scatter(
    sand_thickness_pred,
    sand_ratio_pred,
    c=lithofacies_pred,
    cmap=ListedColormap(lithofacies_colors),
    s=10,
    alpha=0.6,
    # Pin the colour scale to codes 0-2. Without explicit limits the scale is
    # stretched to the codes actually predicted, so a missing class would
    # shift the point colours away from the legend below.
    vmin=-0.5,
    vmax=len(lithofacies_colors) - 0.5,
)

# Legend: one marker per lithofacies, in class-code order
handles = [
    plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=c, markersize=8, label=l)
    for c, l in zip(lithofacies_colors, lithofacies_names)
]
plt.legend(handles=handles, title="岩性", loc="upper left")

plt.title("H6-2层砂厚与砂地比关系图", fontsize=14)
plt.xlabel("砂厚 (m)", fontsize=12)
plt.ylabel("砂地比 (%)", fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(prediction_dir, "H6-2_thickness_ratio_relationship.png"), dpi=300, bbox_inches="tight")
plt.show()

print("\nH6-2层预测与可视化完成！")