本笔记本加载原始大庆数据集，存储为HDF5文件，以便在其他笔记本中使用。

大庆数据

1. 定义常量和配置

In [29]:
# constants.py
# files_descriptions.py

# Column names assigned to each well-log DataFrame after it is read.
WELLLOG_HEAD = [
    'DEPT', 'RMG', 'RMN', 'RMN-RMG', 'CAL',
    'SP', 'GR', 'HAC', 'BHC', 'DEN',
]

# Number of wells.
WELLLOG_NUM = 6

# Path template of the per-well CSV files; the placeholder is filled with the well number.
DATA_PREFIX = 'data/vertical_all_A{}.csv'

# List of test IDs.
TEST_ID = [6]

# Target columns.
COLUMNS_TARGET = ['HAC', 'BHC', 'DEN']

# Input columns.
COLUMNS_INPUT = ['DEPT', 'RMN-RMG', 'CAL', 'SP', 'GR']

# Training data length.
TRAIN_LEN = 200

# Window step.
WINDOW_STEP = 100

# Per-column metadata: measurement unit plus a human-readable description.
# Both values are copied verbatim into the HDF5 file as dataset attributes,
# so the strings below are runtime data, not comments.
# NOTE(review): the CAL text says '孔隙径' although CAL is presumably the
# caliper (borehole diameter) log, and the BHC text is itself marked as a
# guess -- confirm both descriptions with the data owner.
COLUMN_DESCRIPTIONS = {
    'DEPT': dict(unit='m', description='深度，单位为米 (.M)'),
    'RMG': dict(unit='ohmm', description='电阻率，单位为欧姆米 (.ohmm)'),
    'RMN': dict(unit='ohmm', description='电阻率，单位为欧姆米 (.ohmm)'),
    'RMN-RMG': dict(
        unit='ohmm',
        description='电阻率差值，没有明确单位，但由于是RMN与RMG的差值，其单位应该也是欧姆米 (.ohmm)',
    ),
    'CAL': dict(unit='cm', description='孔隙径，单位为厘米 (.cm)'),
    'SP': dict(unit='mv', description='自发电位，单位为毫伏 (.mv)'),
    'GR': dict(
        unit='API or unitless',
        description='伽马射线，单位未明确指出，但通常伽马射线的单位是API（美国石油学会单位）或者无单位',
    ),
    'HAC': dict(unit='us/m', description='声波时差，单位为微秒每米 (.us/m)'),
    'BHC': dict(
        unit='us/m',
        description='表面声波时差，没有明确单位，可能是微秒每米 (.us/m)，因为与HAC类似',
    ),
    'DEN': dict(unit='g/cm3', description='密度，单位为克/立方厘米 (.g/cm3)'),
}

# Free-text description stored as the `file_description` attribute of the HDF5 root group.
# NOTE(review): the text says "North Sea region" while this notebook is about
# the Daqing dataset, and the contact address looks like a placeholder --
# confirm the wording with the data owner before distributing the file.
file_description = (
    "This HDF5 file contains oil well log data collected from various wells "
    "in the North Sea region. "
    "The data includes measurements of depth, resistivity, caliper, sound "
    "velocity, gamma ray, and density, among others. "
    "The measurements were taken using a combination of logging tools and "
    "techniques, including electrical resistivity logging (ERL), caliper "
    "logging, and gamma ray logging. "
    "The data has been processed to ensure consistency and accuracy, "
    "including normalization and interpolation where necessary. "
    "This file is intended for use in geological and geophysical analyses, "
    "particularly in the study of oil and gas reservoirs. "
    "For any questions or further information, please contact the data "
    "provider at [email protected]"
)

2. 数据读取模块

In [30]:
# data_reader.py
# from constants import WELLLOG_HEAD

import pandas as pd

def read_file(path):
    """
    Load one well-log CSV file and relabel its columns with WELLLOG_HEAD.

    Parameters:
    - path (str): Path of the CSV file to read.

    Returns:
    - DataFrame: The parsed file with its columns renamed to WELLLOG_HEAD.
    """
    well_log = pd.read_csv(path)
    # Positional rename: the names in the file's own header row are discarded.
    well_log.columns = WELLLOG_HEAD
    return well_log

3. 数据处理模块

In [31]:
# data_processing.py

import os
import pandas as pd
# from constants import DATA_PREFIX, WELLLOG_NUM
# from data_reader import read_file

def process_data():
    """
    Read every well's CSV file, tag its rows with the well name, and stack them.

    Files are located with DATA_PREFIX for well numbers 1..WELLLOG_NUM. The
    well name is the part of the file name (without extension) that follows
    the last underscore.

    Returns:
    - DataFrame: All wells concatenated row-wise with a fresh integer index
      and an extra 'WellName' column.
    """
    frames = []

    for well_number in range(1, WELLLOG_NUM + 1):
        csv_path = DATA_PREFIX.format(well_number)
        well_frame = read_file(csv_path)
        stem, _extension = os.path.splitext(csv_path)
        well_frame['WellName'] = stem.rsplit('_', 1)[-1]
        frames.append(well_frame)

    return pd.concat(frames, axis=0, ignore_index=True)

4. 将数据保存到文件

保存

In [32]:
# data_saver.py

import h5py
import numpy as np
# from columns_description import COLUMN_DESCRIPTIONS

def save_data_to_h5(data, filename, file_description):
    """
    Write every column of a DataFrame to an HDF5 file as its own dataset.

    The file is opened in 'w' mode, so an existing file with the same name is
    replaced. `file_description` is stored as the 'file_description' attribute
    of the root group, and each column found in COLUMN_DESCRIPTIONS gets
    'unit' and 'description' attributes on its dataset.

    Text columns (NumPy object or unicode dtype, such as 'WellName') are
    written as variable-length UTF-8 strings: h5py has no native HDF5 type
    for a plain object array and would otherwise raise TypeError.

    Parameters:
    - data (DataFrame): Table to save; one dataset is created per column.
    - filename (str): Output file name, including path and extension.
    - file_description (str): Description of the whole file.
    """
    with h5py.File(filename, 'w') as h5f:
        # File-level description lives on the root group.
        h5f.attrs['file_description'] = file_description

        for key in data.columns:
            values = np.asarray(data[key])

            if values.dtype.kind in ('O', 'U'):
                # Text column: give h5py an explicit string type. Non-string
                # items are not converted here, so h5py still reports them.
                dataset = h5f.create_dataset(
                    key,
                    data=values.astype(object),
                    dtype=h5py.string_dtype(encoding='utf-8'),
                )
            else:
                dataset = h5f.create_dataset(key, data=values)

            # Attach unit and description metadata when the column is documented.
            if key in COLUMN_DESCRIPTIONS:
                dataset.attrs['unit'] = COLUMN_DESCRIPTIONS[key]['unit']
                dataset.attrs['description'] = COLUMN_DESCRIPTIONS[key]['description']

读取

In [35]:
import h5py
import pandas as pd

def load_data_from_h5(filename):
    """
    Read all datasets of an HDF5 file into a pandas DataFrame.

    The file description and each dataset's 'unit' and 'description'
    attributes are printed while loading. Columns appear in the order in
    which `h5f.keys()` yields the dataset names.

    String datasets are decoded to `str` where h5py offers
    `Dataset.asstr()`; without it h5py 3 would hand back `bytes` objects.

    Parameters:
    - filename (str): File name, including path and extension.

    Returns:
    - df (DataFrame): One column per dataset found in the file.
    """
    with h5py.File(filename, 'r') as h5f:
        # Report the file-level description.
        file_description = h5f.attrs.get('file_description', 'No description')
        print(f"File Description: {file_description}")

        df = pd.DataFrame()

        for key in h5f.keys():
            dataset = h5f[key]

            is_text = h5py.check_string_dtype(dataset.dtype) is not None
            if is_text and hasattr(dataset, 'asstr'):
                # Decode stored strings so the column holds str, not bytes.
                df[key] = dataset.asstr()[:]
            else:
                df[key] = dataset[:]

            # Report the per-dataset metadata.
            unit = dataset.attrs.get('unit', 'No unit')
            description = dataset.attrs.get('description', 'No description')
            print(f"{key}: Unit = {unit}, Description = {description}")

        return df
    
# Test: load the HDF5 file and preview its first rows.
# The file must already exist; main() writes 'well_log_daqing.h5'.
filename = './well_log_daqing.h5'
df = load_data_from_h5(filename)
print(df.head())

File Description: This HDF5 file contains oil well log data collected from various wells in the North Sea region. The data includes measurements of depth, resistivity, caliper, sound velocity, gamma ray, and density, among others. The measurements were taken using a combination of logging tools and techniques, including electrical resistivity logging (ERL), caliper logging, and gamma ray logging. The data has been processed to ensure consistency and accuracy, including normalization and interpolation where necessary. This file is intended for use in geological and geophysical analyses, particularly in the study of oil and gas reservoirs. For any questions or further information, please contact the data provider at [email protected]
BHC: Unit = us/m, Description = 表面声波时差，没有明确单位，可能是微秒每米 (.us/m)，因为与HAC类似
CAL: Unit = cm, Description = 孔隙径，单位为厘米 (.cm)
DEN: Unit = g/cm3, Description = 密度，单位为克/立方厘米 (.g/cm3)
DEPT: Unit = m, Description = 深度，单位为米 (.M)
GR: Unit = API or unitless, Description = 伽

5. 主程序

In [33]:
# main.py

# from data_processing import process_data

def main():
    """
    Build the combined well dataset, print a preview, and save it to HDF5.
    """
    wells = process_data()
    print(f"数据集长度: {len(wells)}")

    # Show the first and last rows under their respective headings.
    for heading, preview in (
        ("数据集头部:", wells.head()),
        ("数据集尾部:", wells.tail()),
    ):
        print(heading)
        print(preview)

    save_data_to_h5(wells, 'well_log_daqing.h5', file_description)
    print("数据已保存为well_log_daqing.h5")


if __name__ == "__main__":
    main()

数据集长度: 38734
数据集头部:
     DEPT    RMG    RMN  RMN-RMG     CAL       SP       GR      HAC      BHC  \
0  780.60  2.260  2.265    0.005  23.492  121.845  102.405  402.244  405.716   
1  780.65  2.241  2.281    0.040  23.453  121.845  103.093  397.115  404.701   
2  780.70  2.405  2.474    0.069  23.403  121.656  102.995  394.872  403.953   
3  780.75  2.598  2.640    0.042  23.363  121.325  102.405  397.009  404.434   
4  780.80  2.533  2.538    0.005  23.333  120.994  101.128  404.060  405.021   

     DEN WellName  
0  2.269       A1  
1  2.274       A1  
2  2.284       A1  
3  2.274       A1  
4  2.284       A1  
数据集尾部:
           DEPT    RMG    RMN  RMN-RMG     CAL      SP      GR      HAC  \
38729  1006.079  6.191  8.237    2.046  21.125  66.313  81.215  335.470   
38730  1006.129  5.867  8.042    2.175  21.095  66.248  82.754  335.470   
38731  1006.179  6.723  9.164    2.441  21.080  66.150  84.678  335.150   
38732  1006.229  7.263  9.530    2.267  21.118  66.248  87.660  335.150 

6. 技术文档

    在每个模块和函数的文档字符串中，我们已经提供了关于其功能和用法的描述。这些描述可以作为技术文档的一部分，帮助其他开发者理解和使用代码。

    通过这种方式，代码结构更加清晰，模块化，并且易于维护和扩展。同时，提供的文档字符串为代码的使用和维护提供了指导。

## 代码笔记

重新整理和组织代码结构：
1. 模块化：将代码分解为可重用的模块，提高代码可读性和可维护性。
2. 文档化：为每个模块和函数添加文档字符串（docstrings），便于理解其功能和用法。
3. 错误处理：确保代码中具备适当的错误处理机制，便于调试和维护。
4. 代码风格：遵循 Python 的 PEP 8 风格指南，确保代码的一致性和可读性。

元数据信息：
- 数据来源：大庆数据
- 数据格式：csv
- 数据量：38734 行
- 特征数量：10，['DEPT', 'RMG', 'RMN', 'RMN-RMG', 'CAL', 'SP', 'GR', 'HAC', 'BHC', 'DEN']
- 井号： A1-A6,共6口井

HDF5:
- 标签
- 描述信息
- 数据