In [2]:
# constants.py

# Column names passed to DataStandardizer(columns=...) further down in this notebook
COLUMNS = ['DEPT', 'RMN-RMG', 'CAL', 'SP', 'GR', 'HAC', 'BHC', 'DEN']


In [3]:
# data_read.py

import h5py
import pandas as pd

def load_data_from_h5(filename):
    """
    Read every top-level dataset of an HDF5 file into a pandas DataFrame.

    While reading, the file-level ``file_description`` attribute and each
    dataset's ``unit`` and ``description`` attributes are printed.

    Parameters:
    - filename (str): Path of the .h5 file, including its extension.

    Returns:
    - df (DataFrame): One column per dataset, in the order h5py lists the keys.
      If a ``WellName`` column exists, its ``bytes`` values are decoded to
      ``str`` with UTF-8; a file without that dataset no longer raises KeyError.
    """
    columns = {}
    with h5py.File(filename, 'r') as h5f:
        # File-level description stored as a root attribute
        file_description = h5f.attrs.get('file_description', 'No description')
        print(f"File Description: {file_description}")

        # Collect every dataset first and build the frame once afterwards
        for key in h5f.keys():
            print(f"Reading {key}...")
            dataset = h5f[key]
            columns[key] = dataset[:]
            # Report the per-dataset metadata attributes
            unit = dataset.attrs.get('unit', 'No unit')
            description = dataset.attrs.get('description', 'No description')
            print(f"{key}: Unit = {unit}, Description = {description}")

    df = pd.DataFrame(columns)

    # Decode only when the column is present and the value really is bytes,
    # so files without WellName or with already-decoded names still load.
    if 'WellName' in df.columns:
        df['WellName'] = df['WellName'].apply(
            lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
        )
    return df

# Load the raw well-log file and print the first rows of the resulting frame
filename = './well_log_daqing.h5'
df = load_data_from_h5(filename)
print(df.head())

File Description: This HDF5 file contains oil well log data collected from various wells in the North Sea region. The data includes measurements of depth, resistivity, caliper, sound velocity, gamma ray, and density, among others. The measurements were taken using a combination of logging tools and techniques, including electrical resistivity logging (ERL), caliper logging, and gamma ray logging. The data has been processed to ensure consistency and accuracy, including normalization and interpolation where necessary. This file is intended for use in geological and geophysical analyses, particularly in the study of oil and gas reservoirs. For any questions or further information, please contact the data provider at [email protected]
Reading BHC...
BHC: Unit = us/m, Description = 表面声波时差，没有明确单位，可能是微秒每米 (.us/m)，因为与HAC类似
Reading CAL...
CAL: Unit = cm, Description = 孔隙径，单位为厘米 (.cm)
Reading DEN...
DEN: Unit = g/cm3, Description = 密度，单位为克/立方厘米 (.g/cm3)
Reading DEPT...
DEPT: Unit = m, Descripti

标准化器：

1. standardize
2. minmax（MinMaxScaler，最小-最大归一化）

## standardize

In [4]:
# data_process.py

import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
# from constants import COLUMNS

class DataStandardizer:
    """
    Column-wise scaling helper built on sklearn's StandardScaler.

    Only the columns listed in ``columns`` are scaled; every other column of
    the input frame is carried through unchanged.
    """
    def __init__(self, columns=None):
        """
        Parameters:
        - columns: Column labels to scale. If None, all columns of the frame
          passed to ``fit`` are used.
        """
        self.scaler = StandardScaler()
        self.columns = columns
        self.fitted = False

    def fit(self, df):
        """
        Fit the scaler on ``df[self.columns]``.
        """
        if self.columns is None:
            self.columns = df.columns
        self.scaler.fit(df[self.columns])
        self.fitted = True

    def _apply(self, df, scale_fn):
        """
        Run ``scale_fn`` on the selected columns and return a new frame.

        The scaled values are re-attached using ``df``'s own index. Building
        them with a fresh 0..n-1 index and assigning by label (as before)
        produced NaN for any frame whose index was not the default one.
        The input frame itself is left untouched.
        """
        cols = list(self.columns)
        scaled = pd.DataFrame(scale_fn(df[cols]), index=df.index, columns=cols)
        result = df.copy()
        for col in cols:
            result[col] = scaled[col]
        return result

    def transform(self, df):
        """
        Return a copy of ``df`` whose selected columns are standardized.

        Raises:
        - ValueError: if neither ``fit`` nor ``load`` has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler is not fitted yet. Call 'fit' with appropriate data before transforming.")
        return self._apply(df, self.scaler.transform)

    def inverse_transform(self, df):
        """
        Return a copy of ``df`` whose selected columns are mapped back to the
        original scale.

        Raises:
        - ValueError: if neither ``fit`` nor ``load`` has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler is not fitted yet. Call 'fit' with appropriate data before inverse transforming.")
        return self._apply(df, self.scaler.inverse_transform)

    def save(self, filename):
        """
        Dump the underlying scaler object to ``filename`` with joblib.

        Only the scaler is stored; ``columns`` is not part of the file.
        """
        joblib.dump(self.scaler, filename)

    def load(self, filename):
        """
        Replace the scaler with the one stored in ``filename`` and mark this
        object as fitted. ``columns`` is not restored and must already be set.
        """
        self.scaler = joblib.load(filename)
        self.fitted = True

# Example usage
standardizer = DataStandardizer(columns= COLUMNS) # columns to standardize
standardizer.fit(df) # fit the scaler on df
df_standardized = standardizer.transform(df) # standardize df

# Save the fitted scaler
standardizer.save('scaler.joblib')

# # Load the scaler
# standardizer.load('scaler.joblib')

# # Undo the standardization
# df_original = standardizer.inverse_transform(df_standardized)

In [5]:
# Display the frame returned by standardizer.transform(df)
df_standardized

Unnamed: 0,BHC,CAL,DEN,DEPT,GR,HAC,RMG,RMN,RMN-RMG,SP,WellName
0,1.354331,1.382265,0.021540,-1.575040,-0.076729,1.203110,2.260,2.265,-0.687548,1.389699,A1
1,1.331241,1.345440,0.073887,-1.574611,-0.067284,1.092743,2.241,2.281,-0.658715,1.389699,A1
2,1.314226,1.298230,0.178583,-1.574182,-0.068630,1.044477,2.405,2.474,-0.634824,1.378332,A1
3,1.325168,1.260461,0.073887,-1.573753,-0.076729,1.090462,2.598,2.640,-0.657067,1.358426,A1
4,1.338521,1.232135,0.178583,-1.573324,-0.094258,1.242187,2.533,2.538,-0.687548,1.338520,A1
...,...,...,...,...,...,...,...,...,...,...,...
38729,-0.231540,-0.852690,-1.580308,0.359415,-0.367605,-0.233752,6.191,8.237,0.993849,-1.949959,A6
38730,-0.282565,-0.881016,-1.287160,0.359844,-0.346479,-0.233752,5.867,8.042,1.100121,-1.953868,A6
38731,-0.332362,-0.895179,-1.077768,0.360273,-0.320068,-0.240638,6.723,9.164,1.319255,-1.959762,A6
38732,-0.373673,-0.859299,-0.931194,0.360702,-0.279134,-0.240638,7.263,9.530,1.175912,-1.953868,A6


In [6]:
# data_save.py

import h5py
import numpy as np

def save_data_to_h5(data, filename, file_description):
    """
    Write each column of ``data`` into its own dataset of an HDF5 file.

    The file is opened in 'w' mode and ``file_description`` is stored as the
    ``file_description`` attribute on the file root.

    Parameters:
    - data: Table-like object exposing ``.columns`` and ``data[key].values``
      (a pandas DataFrame in this notebook).
    - filename (str): Output path, including the extension.
    - file_description (str): Free-text description of the whole file.
    """
    with h5py.File(filename, 'w') as out_file:
        # Root-level attribute describing the file as a whole
        out_file.attrs['file_description'] = file_description

        # One dataset per column, named after the column
        for column_name in data.columns:
            out_file.create_dataset(column_name, data=data[column_name].values)

In [7]:
import h5py
import numpy as np
import pandas as pd

def load_data_from_h5(filename):
    """
    Read all top-level datasets and the file description from an HDF5 file.

    Note: this definition shadows the earlier single-return
    ``load_data_from_h5`` in this notebook; after this cell runs, callers
    receive a ``(DataFrame, description)`` tuple.

    Parameters:
    - filename (str): Path of the .h5 file, including its extension.

    Returns:
    - data (DataFrame): One column per dataset. If a ``WellName`` column is
      present, its ``bytes`` values are decoded to ``str`` with UTF-8.
    - file_description (str): Value of the root ``file_description`` attribute,
      or 'No description available' when the attribute is missing.
    """
    with h5py.File(filename, 'r') as h5f:
        file_description = h5f.attrs.get('file_description', 'No description available')

        # Read every dataset completely before the file is closed
        data = {key: h5f[key][()] for key in h5f.keys()}

    data_df = pd.DataFrame(data)

    # Decode only when the column exists and the value is bytes, so files
    # without WellName (or with str names) load instead of raising.
    if 'WellName' in data_df.columns:
        data_df['WellName'] = data_df['WellName'].apply(
            lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
        )

    return data_df, file_description



In [8]:
# Write the standardized frame to disk together with a file-level description
file_description = "将标准化后的数据保存为.h5文件，以便后续使用。标准化器为StandardScaler。"
save_data_to_h5(df_standardized, 'well_log_daqing_standardized.h5', file_description)
print("数据已保存为well_log_daqing_standardized.h5。还有一个标准化器scaler.joblib。使用的标准化器为StandardScaler。")

数据已保存为well_log_daqing_standardized.h5。还有一个标准化器scaler.joblib。使用的标准化器为StandardScaler。


In [10]:
# Example usage: read the standardized file back (frame plus description)
filename = './well_log_daqing_standardized.h5' # replace with your own file name
data, description = load_data_from_h5(filename)
print("File Description:", description)
print("Data:", data)

File Description: 将标准化后的数据保存为.h5文件，以便后续使用。标准化器为StandardScaler。
Data:             BHC       CAL       DEN      DEPT        GR       HAC    RMG  \
0      1.354331  1.382265  0.021540 -1.575040 -0.076729  1.203110  2.260   
1      1.331241  1.345440  0.073887 -1.574611 -0.067284  1.092743  2.241   
2      1.314226  1.298230  0.178583 -1.574182 -0.068630  1.044477  2.405   
3      1.325168  1.260461  0.073887 -1.573753 -0.076729  1.090462  2.598   
4      1.338521  1.232135  0.178583 -1.573324 -0.094258  1.242187  2.533   
...         ...       ...       ...       ...       ...       ...    ...   
38729 -0.231540 -0.852690 -1.580308  0.359415 -0.367605 -0.233752  6.191   
38730 -0.282565 -0.881016 -1.287160  0.359844 -0.346479 -0.233752  5.867   
38731 -0.332362 -0.895179 -1.077768  0.360273 -0.320068 -0.240638  6.723   
38732 -0.373673 -0.859299 -0.931194  0.360702 -0.279134 -0.240638  7.263   
38733 -0.496356 -0.798870 -0.920724  0.361131 -0.248756 -0.587599  6.874   

         RMN   RMN

## minmax（MinMaxScaler）

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scaler object that the DataStandardizer defined in the next cell reads as a global
preproc = MinMaxScaler()

In [None]:
# data_process.py

import pandas as pd
import joblib
# from constants import COLUMNS

class DataStandardizer:
    """
    Column-wise scaling helper that uses the notebook-level ``preproc`` scaler.

    Only the columns listed in ``columns`` are scaled; every other column of
    the input frame is carried through unchanged.
    """
    def __init__(self, columns=None):
        """
        Parameters:
        - columns: Column labels to scale. If None, all columns of the frame
          passed to ``fit`` are used.
        """
        # `preproc` is bound to a scaler *instance* in this notebook, and an
        # instance is not callable, so `preproc()` raised TypeError.
        # Accept both forms: instantiate a class, clone an instance.
        if isinstance(preproc, type):
            self.scaler = preproc()
        else:
            from sklearn.base import clone
            # clone() yields an unfitted estimator with the same parameters,
            # so the shared global object is never fitted by this wrapper.
            self.scaler = clone(preproc)
        self.columns = columns
        self.fitted = False

    def fit(self, df):
        """
        Fit the scaler on ``df[self.columns]``.
        """
        if self.columns is None:
            self.columns = df.columns
        self.scaler.fit(df[self.columns])
        self.fitted = True

    def _apply(self, df, scale_fn):
        """
        Run ``scale_fn`` on the selected columns and return a new frame.

        The scaled values are re-attached using ``df``'s own index. Building
        them with a fresh 0..n-1 index and assigning by label (as before)
        produced NaN for any frame whose index was not the default one.
        The input frame itself is left untouched.
        """
        cols = list(self.columns)
        scaled = pd.DataFrame(scale_fn(df[cols]), index=df.index, columns=cols)
        result = df.copy()
        for col in cols:
            result[col] = scaled[col]
        return result

    def transform(self, df):
        """
        Return a copy of ``df`` whose selected columns are scaled.

        Raises:
        - ValueError: if neither ``fit`` nor ``load`` has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler is not fitted yet. Call 'fit' with appropriate data before transforming.")
        return self._apply(df, self.scaler.transform)

    def inverse_transform(self, df):
        """
        Return a copy of ``df`` whose selected columns are mapped back to the
        original scale.

        Raises:
        - ValueError: if neither ``fit`` nor ``load`` has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler is not fitted yet. Call 'fit' with appropriate data before inverse transforming.")
        return self._apply(df, self.scaler.inverse_transform)

    def save(self, filename):
        """
        Dump the underlying scaler object to ``filename`` with joblib.

        Only the scaler is stored; ``columns`` is not part of the file.
        """
        joblib.dump(self.scaler, filename)

    def load(self, filename):
        """
        Replace the scaler with the one stored in ``filename`` and mark this
        object as fitted. ``columns`` is not restored and must already be set.
        """
        self.scaler = joblib.load(filename)
        self.fitted = True

# Example usage
standardizer = DataStandardizer(columns= COLUMNS) # columns to scale
standardizer.fit(df) # fit the scaler on df
df_MinMaxScaler = standardizer.transform(df) # scale df

# Save the fitted scaler
standardizer.save('scaler_minmaxscalr.joblib')

# # Load the scaler
# standardizer.load('scaler_minmaxscalr.joblib')

# # Undo the scaling
# df_original = standardizer.inverse_transform(df_MinMaxScaler)

In [None]:
file_description = "将标准化后的数据保存为.h5文件，以便后续使用。标准化器为MinMaxScaler。"
# Save the frame produced by the MinMax cell above (df_MinMaxScaler); this cell
# previously wrote df_standardized into the MinMaxScaler file.
save_data_to_h5(df_MinMaxScaler, 'well_log_daqing_MinMaxScaler.h5', file_description)
# The message now names the files that are actually written in this section.
print("数据已保存为well_log_daqing_MinMaxScaler.h5。还有一个标准化器scaler_minmaxscalr.joblib。使用的标准化器为MinMaxScaler。")

In [None]:
# Example usage: read the MinMax-scaled file back (frame plus description)
filename = './well_log_daqing_MinMaxScaler.h5' # replace with your own file name
data, description = load_data_from_h5(filename)
print("File Description:", description)
print("Data:", data)

# 开始训练

In [9]:
import sys
import warnings
import time
from tqdm import tqdm
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan



import xgboost as xgb

from IPython.display import HTML, display
import tabulate

# Silence all warnings unless warning options were supplied to the interpreter
# (sys.warnoptions is non-empty in that case and the user's choice is kept)
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [13]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin

# Create the Imputers object and list the imputer names it knows about
imputers = Imputers()
imputers.list()

['most_frequent',
 'sinkhorn',
 'softimpute',
 'EM',
 'sklearn_missforest',
 'miracle',
 'nop',
 'hyperimpute',
 'mice',
 'sklearn_ice',
 'median',
 'ice',
 'missforest',
 'miwae',
 'mean',
 'gain']

In [14]:
# Name under which the custom imputer is registered
custom_ice_plugin = "custom_ice"


class NewPlugin(ImputerPlugin):
    """
    Imputer plugin that wraps sklearn's IterativeImputer with a
    LinearRegression estimator.

    Fitting and transforming are delegated unchanged to the wrapped model.
    Serialization is not implemented.
    """
    def __init__(self):
        super().__init__()
        lr = LinearRegression()
        self._model = IterativeImputer(
            estimator=lr, max_iter=500, tol=1e-10, imputation_order="roman"
        )

    @staticmethod
    def name():
        """Return the plugin name (the module-level ``custom_ice_plugin``)."""
        return custom_ice_plugin

    @staticmethod
    def hyperparameter_space():
        """Return an empty list: this plugin declares no hyperparameters."""
        return []

    def _fit(self, *args, **kwargs) -> "NewPlugin":
        """Fit the wrapped IterativeImputer and return this plugin."""
        self._model.fit(*args, **kwargs)
        return self

    def _transform(self, *args, **kwargs):
        """Return the wrapped IterativeImputer's ``transform`` result."""
        return self._model.transform(*args, **kwargs)

    def save(self) -> bytes:
        """Not supported.

        Raises:
        - NotImplementedError: always.
        """
        # `NotImplemented` is a constant, not an exception class; calling it
        # raised TypeError instead of signalling a missing implementation.
        raise NotImplementedError("placeholder")

    @classmethod
    def load(cls, buff: bytes) -> "NewPlugin":
        """Not supported.

        Raises:
        - NotImplementedError: always.
        """
        raise NotImplementedError("placeholder")


# Register the plugin class under its name and verify it can be retrieved
imputers.add(custom_ice_plugin, NewPlugin)

assert imputers.get(custom_ice_plugin) is not None

In [15]:
# List the imputers again; the recorded output now includes 'custom_ice'
imputers.list()

['most_frequent',
 'sinkhorn',
 'custom_ice',
 'softimpute',
 'EM',
 'sklearn_missforest',
 'miracle',
 'nop',
 'hyperimpute',
 'mice',
 'sklearn_ice',
 'median',
 'ice',
 'missforest',
 'miwae',
 'mean',
 'gain']

In [19]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Module-level MinMaxScaler instance; dataset() and ampute() below do not reference it
preproc = MinMaxScaler()


def dataset(test_size=0.2, random_state=None):
    """
    Split the selected columns of the notebook-level ``data`` frame into a
    train part and a test part.

    Parameters:
    - test_size: Forwarded to ``train_test_split``; defaults to the previously
      hard-coded 0.2.
    - random_state: Forwarded to ``train_test_split``. The default None keeps
      the earlier unseeded behaviour; pass an int for a reproducible split.

    Returns:
    - The ``(train, test)`` pair returned by ``train_test_split``.
    """
    columns = ['DEPT', 'RMN-RMG', 'CAL', 'SP', 'GR', 'HAC', 'BHC', 'DEN']

    # `data` is read from the notebook namespace (set by a load cell above)
    X = data[columns]

    return train_test_split(X, test_size=test_size, random_state=random_state)


def ampute(x, mechanism, p_miss):
    """
    Create an incomplete version of ``x`` with ``simulate_nan``.

    ``x`` is converted to an array and handed to ``simulate_nan`` together
    with ``p_miss`` and ``mechanism``; the "mask" and "X_incomp" entries of
    its result are used.

    Returns:
    - A 3-tuple of DataFrames: ``x`` itself, the incomplete data, and the mask.
    """
    simulated = simulate_nan(np.asarray(x), p_miss, mechanism)

    missing_mask = simulated["mask"]
    incomplete = simulated["X_incomp"]

    frames = (x, incomplete, missing_mask)
    return tuple(pd.DataFrame(part) for part in frames)

['most_frequent',
 'sinkhorn',
 'custom_ice',
 'softimpute',
 'EM',
 'sklearn_missforest',
 'miracle',
 'nop',
 'hyperimpute',
 'mice',
 'sklearn_ice',
 'median',
 'ice',
 'missforest',
 'miwae',
 'mean',
 'gain']

In [25]:
# datasets[mechanism][p_miss] -> tuple returned by ampute() for that setting
datasets = {}
# Column headers for the result tables; one entry per (mechanism, p_miss) is appended below
headers = ["Plugin"]

# Missing-value ratio passed to ampute() as p_miss
pct = 0.3

mechanisms = ["MAR", "MNAR", "MCAR"]
percentages = [pct]


# Imputers to benchmark. NOTE(review): 'custom_ice' registered above is not in this list.
plugins = ['most_frequent',
 'sinkhorn',
 'softimpute',
 'EM',
 'sklearn_missforest',
 'miracle',
 'nop',
 'hyperimpute',
 'mice',
 'sklearn_ice',
 'median',
 'ice',
 'missforest',
 'miwae',
 'mean',
 'gain']

# Only X_train is used below; X_test is not referenced in this cell
X_train, X_test = dataset()

for ampute_mechanism in mechanisms:
    for p_miss in percentages:
        if ampute_mechanism not in datasets:
            datasets[ampute_mechanism] = {}

        # Header label looks like "MAR-0.3"
        headers.append(ampute_mechanism + "-" + str(p_miss))
        datasets[ampute_mechanism][p_miss] = ampute(X_train, ampute_mechanism, p_miss)

In [21]:
import pprint

# Show the contents of `datasets`, one section per missingness mechanism
for key, value in datasets.items():
    display(f"--- {key} ---") # separator line with the mechanism name as title
    for sub_key, sub_value in value.items():
        display(f"{sub_key}:")
        display(sub_value)
    display("------") # closing separator

'--- MAR ---'

'0.3:'

(           DEPT   RMN-RMG       CAL        SP        GR       HAC       BHC  \
 34159 -1.600958  0.011043 -0.701616  0.835695 -0.073063  0.272014  0.103910   
 31675  1.385416 -1.863126  0.202941 -0.760403  1.026007 -0.656500 -0.193709   
 13948 -1.670484 -0.623291  0.118906  0.609210 -0.015231  1.616928  1.557271   
 18242  0.171494 -0.009552 -1.067971 -1.615043 -0.097443 -0.645246 -0.927806   
 10671  0.086087 -0.463472 -1.310634  1.324207  0.403362 -0.624567 -0.424721   
 ...         ...       ...       ...       ...       ...       ...       ...   
 10588  0.050483  0.657735 -2.144375  1.013347 -0.006226 -0.569394 -0.355452   
 12939  1.058981 -0.330014 -0.586422  0.369555 -0.157128 -0.075119 -0.265526   
 34790 -1.330280  0.283725 -0.402300  0.608609  0.423595 -0.442953 -0.481820   
 12900  1.042251 -0.148776 -0.248393  0.517979 -0.237198 -0.677459 -0.974008   
 2925  -0.320315  1.025154  0.278479  0.198098 -0.242661 -0.836070 -0.710602   
 
             DEN  
 34159  1.518691  


'------'

'--- MNAR ---'

'0.3:'

(           DEPT   RMN-RMG       CAL        SP        GR       HAC       BHC  \
 34159 -1.600958  0.011043 -0.701616  0.835695 -0.073063  0.272014  0.103910   
 31675  1.385416 -1.863126  0.202941 -0.760403  1.026007 -0.656500 -0.193709   
 13948 -1.670484 -0.623291  0.118906  0.609210 -0.015231  1.616928  1.557271   
 18242  0.171494 -0.009552 -1.067971 -1.615043 -0.097443 -0.645246 -0.927806   
 10671  0.086087 -0.463472 -1.310634  1.324207  0.403362 -0.624567 -0.424721   
 ...         ...       ...       ...       ...       ...       ...       ...   
 10588  0.050483  0.657735 -2.144375  1.013347 -0.006226 -0.569394 -0.355452   
 12939  1.058981 -0.330014 -0.586422  0.369555 -0.157128 -0.075119 -0.265526   
 34790 -1.330280  0.283725 -0.402300  0.608609  0.423595 -0.442953 -0.481820   
 12900  1.042251 -0.148776 -0.248393  0.517979 -0.237198 -0.677459 -0.974008   
 2925  -0.320315  1.025154  0.278479  0.198098 -0.242661 -0.836070 -0.710602   
 
             DEN  
 34159  1.518691  


'------'

'--- MCAR ---'

'0.3:'

(           DEPT   RMN-RMG       CAL        SP        GR       HAC       BHC  \
 34159 -1.600958  0.011043 -0.701616  0.835695 -0.073063  0.272014  0.103910   
 31675  1.385416 -1.863126  0.202941 -0.760403  1.026007 -0.656500 -0.193709   
 13948 -1.670484 -0.623291  0.118906  0.609210 -0.015231  1.616928  1.557271   
 18242  0.171494 -0.009552 -1.067971 -1.615043 -0.097443 -0.645246 -0.927806   
 10671  0.086087 -0.463472 -1.310634  1.324207  0.403362 -0.624567 -0.424721   
 ...         ...       ...       ...       ...       ...       ...       ...   
 10588  0.050483  0.657735 -2.144375  1.013347 -0.006226 -0.569394 -0.355452   
 12939  1.058981 -0.330014 -0.586422  0.369555 -0.157128 -0.075119 -0.265526   
 34790 -1.330280  0.283725 -0.402300  0.608609  0.423595 -0.442953 -0.481820   
 12900  1.042251 -0.148776 -0.248393  0.517979 -0.237198 -0.677459 -0.974008   
 2925  -0.320315  1.025154  0.278479  0.198098 -0.242661 -0.836070 -0.710602   
 
             DEN  
 34159  1.518691  


'------'

In [26]:
from hyperimpute import logger

# One row per plugin: [plugin name, RMSE for each (mechanism, p_miss)]
results = []
# One row per plugin: [plugin name, fit_transform time in ms for each (mechanism, p_miss)]
duration = []

# Enable DEBUG logging to stderr to see training time and details of each model.
# This is done once, before the loops: logger.add attaches a sink, and calling
# it inside the innermost loop attached another stderr sink on every iteration.
logger.add(sink=sys.stderr, level="DEBUG")

for plugin in tqdm(plugins):
    plugin_results = [plugin]
    plugin_duration = [plugin]

    for ampute_mechanism in mechanisms:
        for p_miss in percentages:
            print(f"-------Running {plugin} on {ampute_mechanism} with {p_miss} missing values---------")

            ctx = imputers.get(plugin)

            # Tuple stored by the setup cell: (original, incomplete, mask)
            x, x_miss, mask = datasets[ampute_mechanism][p_miss]

            # Wall-clock time of fit_transform, in milliseconds
            start = time.time() * 1000
            x_imp = ctx.fit_transform(x_miss)

            plugin_duration.append(round(time.time() * 1000 - start, 4))
            plugin_results.append(RMSE(x_imp.values, x.values, mask.values))
            print(f"-------{plugin} on {ampute_mechanism} with {p_miss} missing values finished---------")

    results.append(plugin_results)
    duration.append(plugin_duration)

 31%|███▏      | 5/16 [1:10:26<2:49:36, 925.17s/it] 

Instructions for updating:
non-resource variables are not supported in the long term


100%|██████████| 16/16 [1:55:45<00:00, 434.08s/it] 


In [27]:
# First table: RMSE per plugin and setting; second table: fit_transform duration in milliseconds
display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))
display(HTML(tabulate.tabulate(duration, headers=headers, tablefmt="html")))

Plugin,MAR-0.3,MNAR-0.3,MCAR-0.3
most_frequent,1.01982,1.01598,1.07754
sinkhorn,0.899713,0.861064,0.917585
softimpute,1.29684,1.50383,0.917716
EM,0.731373,0.746211,0.756631
sklearn_missforest,0.705665,0.728864,0.799713
miracle,0.68799,1.38333,0.793778
nop,,,
hyperimpute,0.534954,0.626855,0.653863
mice,1.0951,1.07816,1.06903
sklearn_ice,0.730121,0.929051,0.854112


Plugin,MAR-0.3,MNAR-0.3,MCAR-0.3
most_frequent,25.0005,19.9939,19.0002
sinkhorn,351510.0,348465.0,342343.0
softimpute,145530.0,143592.0,138369.0
EM,203784.0,1964590.0,357597.0
sklearn_missforest,2399.51,114315.0,113232.0
miracle,124503.0,154919.0,167396.0
nop,0.5105,0.0,0.0
hyperimpute,18448.2,32396.5,31318.4
mice,9683.53,12874.5,12935.4
sklearn_ice,328.017,5987.63,3408.72


上面第二张表中的耗时以毫秒为单位。

## 代码优化

优化缩放器的代码，使其更加简洁，可扩展，可更换缩放器

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

class DataStandardizer:
    """
    Column-wise scaling helper with a selectable scaler.

    ``scaler_type`` picks the sklearn scaler: 'standard' (StandardScaler) or
    'minmax' (MinMaxScaler). Only the columns listed in ``columns`` are
    scaled; all other columns of the input frame are carried through unchanged.
    """
    def __init__(self, scaler_type='standard', columns=None):
        """
        Parameters:
        - scaler_type (str): 'standard' or 'minmax'. It is validated when
          ``fit`` creates the scaler, not here.
        - columns: Column labels to scale. If None, all columns of the frame
          passed to ``fit`` are used.
        """
        self.scaler_type = scaler_type
        self.scaler = None
        self.columns = columns
        self.fitted = False

    def _get_scaler(self):
        """
        Create a new scaler object for ``self.scaler_type``.

        Raises:
        - ValueError: for any type other than 'standard' or 'minmax'.
        """
        if self.scaler_type == 'standard':
            return StandardScaler()
        elif self.scaler_type == 'minmax':
            return MinMaxScaler()
        else:
            raise ValueError(f"Unsupported scaler type: {self.scaler_type}")

    def fit(self, df):
        """
        Create a fresh scaler and fit it on ``df[self.columns]``.
        """
        if self.columns is None:
            self.columns = df.columns
        self.scaler = self._get_scaler()
        self.scaler.fit(df[self.columns])
        self.fitted = True

    def _apply(self, df, scale_fn):
        """
        Run ``scale_fn`` on the selected columns and return a new frame.

        The scaled values are re-attached using ``df``'s own index. Building
        them with a fresh 0..n-1 index and assigning by label (as before)
        produced NaN for any frame whose index was not the default one.
        The input frame itself is left untouched.
        """
        cols = list(self.columns)
        scaled = pd.DataFrame(scale_fn(df[cols]), index=df.index, columns=cols)
        result = df.copy()
        for col in cols:
            result[col] = scaled[col]
        return result

    def transform(self, df):
        """
        Return a copy of ``df`` whose selected columns are scaled.

        Raises:
        - ValueError: if neither ``fit`` nor ``load`` has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler is not fitted yet. Call 'fit' with appropriate data before transforming.")
        return self._apply(df, self.scaler.transform)

    def inverse_transform(self, df):
        """
        Return a copy of ``df`` whose selected columns are mapped back to the
        original scale.

        Raises:
        - ValueError: if neither ``fit`` nor ``load`` has been called.
        """
        if not self.fitted:
            raise ValueError("Scaler is not fitted yet. Call 'fit' with appropriate data before inverse transforming.")
        return self._apply(df, self.scaler.inverse_transform)

    def save(self, filename):
        """
        Dump this whole object (scaler, type, columns, fitted flag) to
        ``filename`` with joblib.
        """
        joblib.dump(self, filename)

    def load(self, filename):
        """
        Copy scaler, scaler type, columns and fitted flag from the object
        stored in ``filename`` onto this instance.
        """
        loaded_obj = joblib.load(filename)
        self.scaler = loaded_obj.scaler
        self.scaler_type = loaded_obj.scaler_type
        self.columns = loaded_obj.columns
        self.fitted = loaded_obj.fitted

# # 示例使用
# standardizer = DataStandardizer(columns=COLUMNS) # 指定需要标准化的列
# standardizer.fit(df) # 使用df数据进行标准化
# df_standardized = standardizer.transform(df) # 标准化df数据

# # 保存标准化器
# standardizer.save('scaler.joblib')

# # 加载标准化器
# standardizer.load('scaler.joblib')

# # 反标准化
# df_original = standardizer.inverse_transform(df_standardized)

这部分流程可以改用 sklearn 的 Pipeline 来构建标准的数据处理流程，这样更简单、更方便，可读性更强，功能也更完善。

In [None]:
file_description = "将标准化后的数据保存为.h5文件，以便后续使用。标准化器为MinMaxScaler。"
# NOTE(review): df_standardized is whatever an earlier cell left in the namespace;
# confirm it was produced with the MinMax scaler before trusting this file.
save_data_to_h5(df_standardized, 'well_log_daqing_MinMaxScaler.h5', file_description)
# The message now reports the file name that is actually written above.
print("数据已保存为well_log_daqing_MinMaxScaler.h5。还有一个标准化器scaler.joblib。使用的标准化器为MinMaxScaler。")

In [None]:
# Example usage: read the MinMax-scaled file back (frame plus description)
filename = './well_log_daqing_MinMaxScaler.h5' # replace with your own file name
data, description = load_data_from_h5(filename)
print("File Description:", description)
print("Data:", data)