## 井点数据预处理


In [17]:
import numpy as np
import pandas as pd

In [18]:
# Relative path of the Excel workbook that holds the raw well-point table.
file_well = "data/well.xlsx"
# Read the worksheet named "Sheet1" into a DataFrame.
data_well = pd.read_excel(file_well, sheet_name="Sheet1")

# Show the first rows as the cell output to sanity-check the import.
data_well.head()

Unnamed: 0,X,Y,Z,TWT picked,TWT auto,Geological age,MD,PVD auto,Type,Surface,...,Used by geo mod,Zone log,Edited by user,Symbol,Locked to fault,"FLOAT,Thickness of facies(1: Fine sand)","FLOAT,facies(1: Fine sand)","FLOAT,Average instantaneous frequency","FLOAT,Average instantaneous frequency-s6","FLOAT,Average instantaneous frequency-x6"
0,686332.8,3217090.3,-2431.9,-999,-2054.65,-999,2505.7,-2431.9,Horizon,H4-1,...,True,13,False,0,0,14.41,87.31,20.5,34.2,28.3
1,686332.3,3217085.0,-2448.4,-999,-2063.95,-999,2523.03,-2448.4,Horizon,H4-2,...,True,14,False,0,0,14.16,66.18,20.1,33.7,27.1
2,686331.7,3217078.2,-2469.8,-999,-2076.06,-999,2545.5,-2469.8,Horizon,H4-3,...,True,15,False,0,0,4.19,16.35,19.6,33.6,25.6
3,686330.9,3217070.0,-2495.4,-999,-2090.63,-999,2572.41,-2495.4,Horizon,H4-4,...,True,16,False,0,0,18.36,45.73,18.9,33.4,23.7
4,686329.5,3217057.2,-2535.6,-999,-2113.26,-999,2614.6,-2535.6,Horizon,H5-1,...,True,17,False,0,0,0.0,0.0,15.7,33.2,22.4


In [19]:
# Keep only Excel columns A–C, J, K and AB–AC of the raw sheet.
# NOTE(review): an earlier note on this cell mentioned AB~AR, but only
# positions 27–28 (AB, AC) are selected — confirm which range is intended.
all_columns = data_well.columns
selected_columns = [
    *all_columns[0:3],  # columns A–C
    *all_columns[9:11],  # columns J, K
    *all_columns[27:29],  # columns AB, AC
]

# Project onto the selected columns, then drop every row whose horizon
# ("Surface") is P0.
filtered_data = data_well.loc[:, selected_columns]
filtered_data = filtered_data.loc[filtered_data["Surface"].ne("P0")]

print("筛选后的列：", filtered_data.columns.tolist())
print("筛选后的数据形状：", filtered_data.shape)
print("\n筛选后的数据前5行：")
print(filtered_data.head())

筛选后的列： ['X', 'Y', 'Z', 'Surface', 'Well', 'FLOAT,Thickness of facies(1: Fine sand)', 'FLOAT,facies(1: Fine sand)']
筛选后的数据形状： (1017, 7)

筛选后的数据前5行：
          X          Y       Z Surface Well  \
0  686332.8  3217090.3 -2431.9    H4-1   A1   
1  686332.3  3217085.0 -2448.4    H4-2   A1   
2  686331.7  3217078.2 -2469.8    H4-3   A1   
3  686330.9  3217070.0 -2495.4    H4-4   A1   
4  686329.5  3217057.2 -2535.6    H5-1   A1   

   FLOAT,Thickness of facies(1: Fine sand)  FLOAT,facies(1: Fine sand)  
0                                    14.41                       87.31  
1                                    14.16                       66.18  
2                                     4.19                       16.35  
3                                    18.36                       45.73  
4                                     0.00                        0.00  


In [20]:
# Per-well quality filter: drop wells that are explicitly blacklisted, have
# too few rows, or are dominated by anomalous sand-thickness / sand-ratio
# values, then save the remaining rows to a new workbook.
well_column = filtered_data.columns[4]  # well name (0-based position 4)
sand_thickness_column = filtered_data.columns[5]  # sand thickness (position 5)
sand_ratio_column = filtered_data.columns[6]  # sand-to-gross ratio (position 6)

# Thresholds used by the checks below.
missing_value = -999  # value counted as an anomaly in the two sand columns
min_rows_per_well = 10  # wells with fewer rows than this are dropped
max_anomaly_ratio = 0.7  # wells whose anomaly share exceeds this are dropped

unique_wells = filtered_data[well_column].unique()
print(f"\n共有 {len(unique_wells)} 个不同的井点值")

# Wells to remove unconditionally (none by default).
wells_to_delete = []

# Frames of the wells that pass every check. They are concatenated once after
# the loop: growing a DataFrame with pd.concat on each iteration, starting
# from an empty frame, copies the accumulated data every time and is the
# pattern pandas warns about for empty entries in concat.
kept_wells = []

for well in unique_wells:
    # 1. Skip wells that are on the explicit delete list.
    if well in wells_to_delete:
        print(f"\n井点 {well} 在指定删除列表中，已删除。")
        continue

    # Rows belonging to the current well.
    well_data = filtered_data[filtered_data[well_column] == well]

    # 2. Skip wells with too few rows.
    if len(well_data) < min_rows_per_well:
        print(f"\n井点 {well} 只有 {len(well_data)} 行数据，已删除。")
        continue

    # 3. Share of anomalous (-999) entries in each of the two sand columns.
    sand_thickness_anomaly_ratio = (well_data[sand_thickness_column] == missing_value).mean()
    sand_ratio_anomaly_ratio = (well_data[sand_ratio_column] == missing_value).mean()

    # Skip the well when either column is mostly anomalous.
    if sand_thickness_anomaly_ratio > max_anomaly_ratio or sand_ratio_anomaly_ratio > max_anomaly_ratio:
        print(
            f"\n井点 {well} 的砂厚异常值比例为 {sand_thickness_anomaly_ratio:.2%}，砂地比异常值比例为 {sand_ratio_anomaly_ratio:.2%}，已删除。"
        )
        continue

    # The well passed every check: keep its rows.
    kept_wells.append(well_data)
    print(f"\n井点 {well} 有 {len(well_data)} 行数据，已保留。前5行数据：")
    print(well_data.head())

# Build the result in a single concat; fall back to an empty frame with the
# same columns when no well survived the checks.
if kept_wells:
    final_data = pd.concat(kept_wells, ignore_index=True)
else:
    final_data = pd.DataFrame(columns=filtered_data.columns)

print(f"\n处理后总共保留了 {len(final_data)} 行数据，包含 {final_data[well_column].nunique()} 个井点")

# Save the filtered table without the index column.
final_data.to_excel("data/well_processed.xlsx", index=False)
print("\n处理后的数据已保存到 'data/well_processed.xlsx'")


共有 39 个不同的井点值

井点 A1 有 27 行数据，已保留。前5行数据：
          X          Y       Z Surface Well  \
0  686332.8  3217090.3 -2431.9    H4-1   A1   
1  686332.3  3217085.0 -2448.4    H4-2   A1   
2  686331.7  3217078.2 -2469.8    H4-3   A1   
3  686330.9  3217070.0 -2495.4    H4-4   A1   
4  686329.5  3217057.2 -2535.6    H5-1   A1   

   FLOAT,Thickness of facies(1: Fine sand)  FLOAT,facies(1: Fine sand)  
0                                    14.41                       87.31  
1                                    14.16                       66.18  
2                                     4.19                       16.35  
3                                    18.36                       45.73  
4                                     0.00                        0.00  

井点 A10 有 25 行数据，已保留。前5行数据：
           X          Y       Z Surface Well  \
14  686528.2  3217397.5 -2120.9    H1-1  A10   
15  686533.3  3217398.5 -2153.7    H1-2  A10   
16  686541.2  3217400.0 -2202.8    H1-3  A10   
17  686548.8  321

  final_data = pd.concat([final_data, well_data], ignore_index=True)
