## 尝试删除一些明显异常的（或者不想要的）井或层位


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the "Sand Thickness" sheet of the well/horizon workbook into a DataFrame.
file_well = "../data/well_horizon.xlsx"
data_well = pd.read_excel(file_well, sheet_name="Sand Thickness")

# Echo the table's shape and its first five rows as a quick sanity check.
print("原始数据形状：", data_well.shape)
print("原始数据前5行：")
print(data_well.head())

In [None]:
# Worksheet column headers, named once so the processing cells never
# hard-code them.

# Keys that identify a record: which well, which horizon.
well_column = "Well"
surface_column = "Surface"

# Columns the cleaning steps read: the thickness value and the repeat count.
sand_thickness_column = "Sand Thickness"
count_column = "Count"

# Remaining headers (not referenced by the cells visible in this notebook).
xyz_columns = ["X", "Y", "Z"]
md_column, twt_column, pvd_column = "MD", "TWT auto", "PVD auto"

In [None]:
# Horizons (values of the "Surface" column) to drop. Empty by default;
# add names by hand when a horizon should be excluded.
horizons_to_delete = []

# Wells (values of the "Well" column) to drop. Edit by hand as needed.
# fmt: off
wells_to_delete = [
    "A10s", "A3", "A7",
    "BG2", "BG3", "BG5S",
    "BO2_H4B_B1", "BO2_H4B_M", "BO2_H4C_B1", "BO2_H4C_M",
    "PH11", "PH18",
    "SHX36-5-A4",
    "AA7_1", "AA7_M",
]
# fmt: on

In [None]:
# Filtering stage: work on a copy so the freshly loaded table stays intact
# for the before/after comparison printed later.
print("\n=== 开始数据处理 ===")
data_filtered = data_well.copy()

# Drop every row whose horizon is on the deletion list (skipped when empty).
if horizons_to_delete:
    before_count = len(data_filtered)
    in_deleted_horizon = data_filtered[surface_column].isin(horizons_to_delete)
    data_filtered = data_filtered[~in_deleted_horizon]
    after_count = len(data_filtered)
    print(f"删除层位 {horizons_to_delete}，删除了 {before_count - after_count} 行数据")

# Drop every row whose well is on the deletion list (skipped when empty).
if wells_to_delete:
    before_count = len(data_filtered)
    in_deleted_well = data_filtered[well_column].isin(wells_to_delete)
    data_filtered = data_filtered[~in_deleted_well]
    after_count = len(data_filtered)
    print(f"删除井点 {wells_to_delete}，删除了 {before_count - after_count} 行数据")



In [None]:
# Sand-thickness cells equal to -999 are treated as a placeholder and
# overwritten with 0.
#
# Disabled alternative (step 4 of an earlier version): remove those rows
# instead of zeroing them, i.e.
#     data_filtered = data_filtered[~missing_value_mask].reset_index(drop=True)

# Boolean selector for the placeholder rows, and how many there are.
missing_value_mask = data_filtered[sand_thickness_column] == -999
missing_count = missing_value_mask.sum()

if missing_count == 0:
    print("\n未发现砂厚值为-999的数据")
else:
    print(f"\n发现 {missing_count} 个砂厚值为-999的数据，将替换为0")

    # Preview up to five of the affected rows before they are modified.
    missing_data = data_filtered[missing_value_mask]
    print("要替换的数据示例（前5行）：")
    for _, row in missing_data.head().iterrows():
        print(f"  井: {row[well_column]}, 层位: {row[surface_column]}, 砂厚: {row[sand_thickness_column]}")

    # Overwrite the placeholder in place; every other cell is left untouched.
    data_filtered.loc[missing_value_mask, sand_thickness_column] = 0
    print(f"已将 {missing_count} 个砂厚值从-999替换为0")

In [None]:
# Resolve repeated (well, horizon) records.
#
# Rows whose Count column exceeds 1 are treated as repeated entries for the
# same well/horizon pair; within each pair only the row with the largest sand
# thickness is kept. Rows with Count <= 1 (or a missing Count) pass through
# unchanged. The result, `data_processed`, always has a fresh 0..n-1 index and
# keeps the rows in their original order.
duplicate_mask = data_filtered[count_column] > 1
duplicate_count = duplicate_mask.sum()

if duplicate_count > 0:
    print(f"\n发现 {duplicate_count} 行重复计数大于1的数据")

    duplicates = data_filtered[duplicate_mask]
    non_duplicates = data_filtered[~duplicate_mask]
    group_keys = [well_column, surface_column]

    # dropna=False keeps rows whose well or horizon is missing in a group of
    # their own; with the default they would silently vanish from the output.
    print("重复数据示例：")
    for (well, surface), group in duplicates.groupby(group_keys, dropna=False):
        thickness = group[sand_thickness_column]
        print(
            f"  井 {well}, 层位 {surface}: {len(group)} 行数据，砂厚范围 {thickness.min():.2f} - {thickness.max():.2f}"
        )

    # idxmax yields, per group, the index label of the thickest row
    # (the first one when several rows tie).
    thickest_labels = duplicates.groupby(group_keys, dropna=False)[sand_thickness_column].idxmax()
    max_thickness_duplicates = duplicates.loc[thickest_labels]

    # Re-assemble the table. The index labels still reflect the original row
    # positions, so sorting on them restores the input order before the
    # index is renumbered.
    data_processed = (
        pd.concat([non_duplicates, max_thickness_duplicates])
        .sort_index()
        .reset_index(drop=True)
    )

    removed_count = len(data_filtered) - len(data_processed)
    print(f"从重复数据中移除了 {removed_count} 行，保留了砂厚最大的行")
else:
    # Renumber here as well so both branches hand over the same index layout
    # (and a new object rather than an alias of data_filtered).
    data_processed = data_filtered.reset_index(drop=True)
    print("没有发现重复计数大于1的数据")

In [None]:
# ---------------------------------------------------------------------------
# Summary report for the cleaned table: row counts, per-well record counts,
# sand-thickness distribution, and which well/horizon combinations are absent.
# ---------------------------------------------------------------------------
print("\n=== 处理结果统计 ===")
print(f"原始数据: {len(data_well)} 行")
print(f"处理后数据: {len(data_processed)} 行")
unique_wells = data_processed[well_column].unique()
print(f"共有 {len(unique_wells)} 个不同的井点")
unique_surfaces = data_processed[surface_column].unique()
print(f"共有 {len(unique_surfaces)} 个不同的层位")

# Number of rows recorded for each well.
well_counts = data_processed[well_column].value_counts()
print("\n每个井的数据量统计:")
print(f"最多: {well_counts.max()} 行 (井: {well_counts.idxmax()})")
print(f"最少: {well_counts.min()} 行 (井: {well_counts.idxmin()})")
print(f"平均: {well_counts.mean():.1f} 行")

# Descriptive statistics of the sand-thickness column.
sand_thickness_stats = data_processed[sand_thickness_column].describe()
print("\n砂厚分布统计:")
print(sand_thickness_stats)

# Share of rows with zero versus strictly positive thickness.
zero_thickness_count = (data_processed[sand_thickness_column] == 0).sum()
positive_thickness_count = (data_processed[sand_thickness_column] > 0).sum()
zero_share = zero_thickness_count / len(data_processed) * 100
print(f"砂厚为0的样本: {zero_thickness_count} 个 ({zero_share:.1f}%)")
positive_share = positive_thickness_count / len(data_processed) * 100
print(f"砂厚大于0的样本: {positive_thickness_count} 个 ({positive_share:.1f}%)")

# Completeness check: which horizons are missing from which wells.
print("\n=== Surface 完整性分析 ===")

# Sorted lists of every distinct horizon and well left after cleaning.
all_surfaces = sorted(unique_surfaces)
all_wells = sorted(unique_wells)

print(f"共有 {len(all_surfaces)} 个不同的 Surface:")
print(f"  {all_surfaces}")

# Well x horizon table holding the count of thickness values per pair;
# pairs that never occur are filled with 0.
well_surface_matrix = data_processed.pivot_table(
    index=well_column,
    columns=surface_column,
    values=sand_thickness_column,
    aggfunc="count",
    fill_value=0,
)

# Per well: list the horizons whose count in the table above is 0.
print("\n每个井缺少的 Surface 统计:")
missing_info = []
for well in all_wells:
    if well not in well_surface_matrix.index:
        # A well absent from the table is missing every horizon.
        missing_surfaces = list(all_surfaces)
    else:
        well_row = well_surface_matrix.loc[well]
        missing_surfaces = [s for s in all_surfaces if well_row.loc[s] == 0]

    if not missing_surfaces:
        continue

    missing_info.append(
        {
            "Well": well,
            "Missing_Count": len(missing_surfaces),
            "Missing_Surfaces": ", ".join(missing_surfaces),
        }
    )
    print(f"  井 {well}: 缺少 {len(missing_surfaces)} 个 Surface - {missing_surfaces}")

# Per horizon: how many wells contain it, and which ones do not.
print("\n每个 Surface 的缺失情况:")
for surface in all_surfaces:
    on_this_surface = data_processed[surface_column] == surface
    wells_with_surface = data_processed.loc[on_this_surface, well_column].unique()
    missing_wells = [w for w in all_wells if w not in wells_with_surface]
    coverage = len(wells_with_surface) / len(all_wells) * 100
    print(
        f"  {surface}: 存在于 {len(wells_with_surface)}/{len(all_wells)} 个井 ({coverage:.1f}%), 缺失井: {missing_wells or '无'}"
    )

# Export the per-well gaps to a workbook, but only when at least one exists.
if missing_info:
    missing_df = pd.DataFrame(missing_info)
    missing_output = "../data/well_surface_missing.xlsx"
    missing_df.to_excel(missing_output, index=False)
    print(f"\n缺失信息已保存到: {missing_output}")

# Show the first five rows of the cleaned table.
print("\n处理后数据前5行:")
print(data_processed.head())

In [None]:
# 6. 保存处理后的数据
output_file = "../data/well_horizon_processed.xlsx"
data_processed.to_excel(output_file, index=False)
print(f"\n数据已保存到: {output_file}")