## 数据质控


In [1]:
# 确保src目录在Python路径中
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the notebook's parent directory importable so that `src.*` resolves.
# The membership check keeps the cell idempotent: re-running it no longer
# piles up duplicate entries in sys.path.
project_root = os.path.abspath("../")
if project_root not in sys.path:
    sys.path.append(project_root)

# 导入模块
from src.data_utils import filter_anomalous_attributes, identify_attributes, parse_petrel_file

# Directory (relative to the notebook's working directory) that is later
# passed to the attribute screen as its output location.
output_dir = "output"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without the check-then-create race of exists() + makedirs().
os.makedirs(output_dir, exist_ok=True)


# Configure matplotlib for Chinese labels: SimHei is a typeface with CJK
# glyphs, and disabling axes.unicode_minus lets minus signs render normally
# with that font.
plt.rcParams.update(
    {
        "font.family": "SimHei",
        "axes.unicode_minus": False,
    }
)

## 导入地震数据


In [2]:
# Location of the H6-2 seismic attribute export read by the project parser.
file_H6_2_attr = "../data/H6-2_attr"
# Parse the export into a table; per the cell output it holds the
# X/Y/Z/column/row columns followed by one column per seismic attribute.
data_H6_2_attr = parse_petrel_file(file_H6_2_attr)

正在解析文件: ../data/H6-2_attr
正在识别文件属性: ../data/H6-2_attr
识别到 END ATTRIBUTES 位于第 31 行
识别到 15 个属性:
  - Average energy
  - Average envelope
  - Average instantaneous frequency
  - Average instantaneous phase
  - Average peak value
  - Geometric mean
  - Half energy
  - Harmonic mean
  - Maximum amplitude
  - Mean amplitude
  - Minimum amplitude
  - Most of
  - RMS amplitude
  - Sum of amplitudes
  - Sum of energy
识别到的列含义: ['X', 'Y', 'Z', 'column', 'row']
数据行有 20 列，列名列表有 20 个


  df = pd.read_csv(


成功读取数据，共 51714 行

数据预览:
               X             Y            Z  column  row  Average energy  \
0  688546.288809  3.211900e+06 -2472.896240   128.0  1.0      3538352.25   
1  688596.288809  3.211900e+06 -2475.649902   129.0  1.0      6365790.00   
2  688446.288809  3.211950e+06 -2465.910889   126.0  2.0      9748781.00   
3  688496.288809  3.211950e+06 -2469.487061   127.0  2.0      5062806.00   
4  688546.288809  3.211950e+06 -2472.695801   128.0  2.0      2440221.75   

   Average envelope  Average instantaneous frequency  \
0       3114.544678                        32.982338   
1       3344.748779                        33.155682   
2       4269.621094                        35.096291   
3       3648.766113                        31.971563   
4       2865.032471                        31.293999   

   Average instantaneous phase  Average peak value  Geometric mean  \
0                   -75.810104              -999.0          -999.0   
1                   -86.797714            

## 导入井震数据


In [3]:
# Workbook with the well-side data; only the sheet named "Sheet1" is read.
file_H6_2_well = "../data/well_processed.xlsx"
data_H6_2_well = pd.read_excel(file_H6_2_well, sheet_name="Sheet1")

# Step 1: keep only the rows recorded for the H6-2 surface.
surface_mask = data_H6_2_well["Surface"] == "H6-2"
h6_2_rows = data_H6_2_well.loc[surface_mask]

# Step 2: turn every -999 value into NaN
# (NOTE(review): -999 looks like a missing-value marker -- confirm).
h6_2_rows = h6_2_rows.replace(-999, np.nan)

# Step 3: drop rows whose fine-sand thickness is NaN and renumber the index.
data_H6_2_well_selected = h6_2_rows.dropna(
    subset=["Thickness of facies(1: Fine sand)"]
).reset_index(drop=True)

data_H6_2_well_selected.head()

Unnamed: 0,X,Y,Z,Surface,Well,Thickness of facies(1: Fine sand),facies(1: Fine sand),Average energy,Average envelope,Average instantaneous frequency,...,Average peak value,Half energy,Harmonic mean,Maximum amplitude,Mean amplitude,Minimum amplitude,Most of,RMS amplitude,Sum of amplitudes,Sum of energy
0,686325.6,3217019.1,-2649.7,H6-2,A1,0.0,0.0,89001976.0,11011.2,21.9,...,,5.2,-4573.6,-1763.2,-8415.1,-14124.2,-13431.1,9511.8,-66162.7,718160960
1,686616.5,3217415.2,-2633.0,H6-2,A10,7.87,45.82,76951152.0,11667.5,25.9,...,,4.0,-43402.7,4934.6,-4832.2,-15217.6,-13626.4,8764.0,-36986.6,724726848
2,686278.0,3217627.9,-2650.4,H6-2,A11,0.0,0.0,6199530.0,2127.4,49.7,...,,7.2,-1264.8,4551.3,623.9,-2246.4,-556.0,2459.4,2576.6,26151754
3,686149.5,3216665.5,-2642.5,H6-2,A2,0.75,5.16,88260688.0,12334.4,21.0,...,,4.0,-7747.0,1608.4,-7710.8,-15313.4,-14014.2,9393.7,-57804.2,667979712
4,685921.1,3216986.2,-2644.7,H6-2,A4,0.0,0.0,31338386.0,5349.9,25.9,...,,7.0,3644.1,7899.5,4837.1,624.7,4900.9,5553.1,15000.3,97357456


## 提取共同属性


In [4]:
# Attribute names reported by the project helper for the seismic file
# (the second return value is not needed here).
seismic_attr, _ = identify_attributes("../data/H6-2_attr")

# Attribute columns of the well workbook: every column from the 8th onward.
# NOTE(review): this positional slice assumes the first 7 columns are the
# non-attribute columns -- confirm if the workbook layout changes.
well_seismic_attr = data_H6_2_well.columns[7:].tolist()

# Intersection of the two lists, kept in seismic-file order.
# A plain `list(set(a) & set(b))` iterates in hash order, which for strings
# can differ between kernel sessions, so the attribute order (and anything
# derived from it) would not be reproducible. dict.fromkeys() removes
# duplicates while preserving first-seen order.
well_attr_lookup = set(well_seismic_attr)
common_attributes = [
    attr for attr in dict.fromkeys(seismic_attr) if attr in well_attr_lookup
]

# Report the sizes of both lists and of their intersection.
print(f"地震属性数量: {len(seismic_attr)}")
print(f"Excel属性数量: {len(well_seismic_attr)}")
print(f"共同属性数量: {len(common_attributes)}")
print("\n共同属性列表:")
for attr in common_attributes:
    print(f"- {attr}")

正在识别文件属性: ../data/H6-2_attr
识别到 END ATTRIBUTES 位于第 31 行
识别到 15 个属性:
  - Average energy
  - Average envelope
  - Average instantaneous frequency
  - Average instantaneous phase
  - Average peak value
  - Geometric mean
  - Half energy
  - Harmonic mean
  - Maximum amplitude
  - Mean amplitude
  - Minimum amplitude
  - Most of
  - RMS amplitude
  - Sum of amplitudes
  - Sum of energy
地震属性数量: 15
Excel属性数量: 14
共同属性数量: 14

共同属性列表:
- Maximum amplitude
- Mean amplitude
- Average peak value
- RMS amplitude
- Sum of energy
- Half energy
- Most of
- Minimum amplitude
- Average energy
- Harmonic mean
- Average envelope
- Average instantaneous frequency
- Average instantaneous phase
- Sum of amplitudes


## 筛选异常属性并生成统计摘要


In [5]:
# Thresholds for the attribute screen, grouped so they are easy to tune.
qc_thresholds = {
    "ratio_threshold": 5.0,  # threshold on the mean ratio
    "range_ratio_threshold": 10.0,  # threshold on the value-range ratio
    "std_ratio_threshold": 10.0,  # threshold on the standard-deviation ratio
}

# Screen the common attributes using the seismic table and the selected well
# rows. The helper returns three values: the retained attributes, the
# attributes flagged as anomalous, and per-attribute statistics.
qc_result = filter_anomalous_attributes(
    seismic_data=data_H6_2_attr,
    well_data=data_H6_2_well_selected,
    common_attributes=common_attributes,
    output_dir=output_dir,  # directory for the generated charts
    verbose=True,  # print detailed information
    **qc_thresholds,
)
good_attributes, anomalous_attributes, attribute_stats = qc_result

# List the attributes that passed the screen.
print("\n筛选后保留的质量良好属性:")
for attr in good_attributes:
    print(f"- {attr}")


共分析了 14 个共同属性
发现 3 个异常属性
保留 11 个质量良好的属性

异常属性及原因:
  - Mean amplitude: 均值比值为负 (-2.7440)
  - Harmonic mean: 标准差比值异常 (123.5894), 数值范围比值异常 (5348.7889)
  - Average peak value: 均值比值异常 (0.0963)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x="属性", y="质量分数", data=plot_df, palette=colors)



筛选后保留的质量良好属性:
- Sum of amplitudes
- Average instantaneous phase
- Sum of energy
- Maximum amplitude
- Minimum amplitude
- Half energy
- Most of
- Average energy
- RMS amplitude
- Average instantaneous frequency
- Average envelope
