In [1]:
! ls

[31mData_pretreatment.ipynb[m[m [31mModel_trainning.ipynb[m[m   [31mREADME.md[m[m


In [59]:
import os

# configuration
class Conf:
    '''
    Pre-defined parameters for ML tranning
    '''
    # Paths
    root                      = './'
    path_to_Database          = os.path.join(root, 'DataBase')
    path_to_inputs            = os.path.join(path_to_Database, 'LammpsDataFile')
    path_to_temp_outputs      = os.path.join(path_to_Database, '3DGrid')
    grid_DataBase_name        = '3D_Grids.h5'
    path_to_grid_DataBase     = os.path.join(root, grid_DataBase_name)

    # Supercelling: new boundary in X/Y/Z directions
    GP_box_bounds_info_new = {'x': [600, 800],
                              'y': [600, 800],
                              'z': [600, 800]}

    # 3D grid: number of grides in X/Y/Z directions
    grid_size = [400, 400, 400]

    # 3D grid: switch
    grid_switch = True

    # 指定温度区间
    temp_list  = [373, 363, 353, 343]
    sigma_list = range(8,19,1)
    len_list   = range(2,21,1)

# 1.初始化

In [2]:
# create folders for OUTPUT
os.makedirs(Conf.path_to_inputs,  exist_ok=True)
os.makedirs(Conf.path_to_temp_outputs, exist_ok=True)

In [48]:
# basic import
import  os, math, psutil, h5py, shutil
import  concurrent.futures
from    concurrent.futures      import ProcessPoolExecutor
from    tqdm                    import  tqdm
from    pathlib                 import  Path  
from    multiprocessing         import  Pool, Lock
import  numpy                                       as np
import  pandas                                      as pd
import  torch

In [4]:
# check hardware
print(f"CPU 核心数:\t{os.cpu_count()}")
print(f"CPU 线程数:\t{psutil.cpu_count(logical=True)}")
print(f"总内存:\t\t{psutil.virtual_memory().total / (1024**3):.2f} GB")
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"可用 GPU 数量:\t{gpu_count}")
    for i in range(gpu_count):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i+1}:\t\t{gpu_name}")
else:
    print("没有可用的 GPU")

CPU 核心数:	24
CPU 线程数:	24
总内存:		187.05 GB
没有可用的 GPU


# 2.Def Functions

In [5]:
## 读取 lammps data file ##

def fetch(len_i:int,
          sigma_i:int,
          temp_i:int,
          item_i:str,
          original_file_path:str
          ):
    '''
    Fetch data from EXCEL
    '''
    # 加载Excel文件
    df = pd.read_excel(original_file_path)
    # 查找行号
    row_number = df[
        (df['len\n(A)'] == len_i) &
        (df['sigma\n(A)'] == sigma_i) &
        (df['temp\n(k)'] == temp_i)
    ].index
    # 读取文件
    return df.at[row_number[0], item_i]
    
class LmpGP_file_reader():
    '''
    read lammps data file
    '''
    def __init__(self, len: int, sigma: int, temp: int, path:int):
        ## terminology of parameters
        self.sigma          = sigma
        self.len            = len
        self.temp           = temp
        ## working path
        self.path           =  path
        self.data_file      =  f"data.3_len_{self.len}_sigma_{self.sigma}_pretreated"
        self.DataFile_path  =  os.path.join(self.path, self.data_file)
        ## auto excecuting
        self.read_GP_data_file()
    def read_GP_data_file(self):
        ### initialize
        self.GP_atom_count, self.GP_box_bounds_info, self.GP_atom_data = None, {}, []
        atom_section = False

        ### extract info
        with open(self.DataFile_path, 'r') as file:
            for line in file:

                ## Get the number of atoms
                if line.strip().endswith("atoms"):
                    self.GP_atom_count = int(line.split()[0])

                ## Get the box info
                # boudnds
                if 'xlo xhi' in line:
                    self.GP_box_bounds_info['x'] = list(map(float, line.strip().split()[:2]))
                if 'ylo yhi' in line:
                    self.GP_box_bounds_info['y'] = list(map(float, line.strip().split()[:2]))
                if 'zlo zhi' in line:
                    self.GP_box_bounds_info['z'] = list(map(float, line.strip().split()[:2]))

                ## Read atom data
                # Check title
                if 'Atoms' in line and '#' in line:
                    atom_section = True
                    continue
                # read
                if atom_section and line.strip():
                    parts = line.split()
                    if len(parts) >= 6:  # valid line Ensured
                        atom_id, atom_type, x, y, z = int(parts[0]), int(parts[1]), float(parts[4]), float(parts[5]), float(parts[6])
                        self.GP_atom_data.append([atom_id, atom_type, x, y, z])

            # calculate side length
            x_length = float(self.GP_box_bounds_info['x'][1]) - float(self.GP_box_bounds_info['x'][0])
            y_length = float(self.GP_box_bounds_info['y'][1]) - float(self.GP_box_bounds_info['y'][0])
            z_length = float(self.GP_box_bounds_info['z'][1]) - float(self.GP_box_bounds_info['z'][0])
            self.GP_box_lengths_info = np.array([x_length, y_length, z_length])

        ### post-treatment
        ## convert
        self.GP_atom_data = np.array(self.GP_atom_data)
        ## sorting
        # 首先按照 type 排序，然后在 type 相同的情况下按 ID 排序
        if hasattr(self, 'GP_atom_data') and self.GP_atom_data.size > 0:
            # 获取排序后的索引
            sorted_indices = np.lexsort((self.GP_atom_data[:, 0], self.GP_atom_data[:, 1]))  # 先按 ID，再按 type 排序
            # 应用排序
            self.GP_atom_data = self.GP_atom_data[sorted_indices]

def extract_info(len_i:int,sigma_i:int,temp_i:int)->list:
    '''
    Extract info from lammps data file
    '''
    # 读取数据文件信息
    GP                  = LmpGP_file_reader(len=len_i, sigma=sigma_i, temp=temp_i, path=Conf.path_to_inputs)
    GP_atom_count       = GP.GP_atom_count
    GP_box_bounds_info  = GP.GP_box_bounds_info

    # 计算最大边界长度
    x_length            = GP_box_bounds_info['x'][1] - GP_box_bounds_info['x'][0]
    y_length            = GP_box_bounds_info['y'][1] - GP_box_bounds_info['y'][0]
    z_length            = GP_box_bounds_info['z'][1] - GP_box_bounds_info['z'][0]
    max_BoundLength_i   = np.amax([x_length, y_length, z_length])

    # 计算最小边界长度
    min_Bound_i  = np.amin([GP_box_bounds_info['x'][0],
                            GP_box_bounds_info['x'][1],
                            GP_box_bounds_info['y'][0],
                            GP_box_bounds_info['y'][1],
                            GP_box_bounds_info['z'][0],
                            GP_box_bounds_info['z'][1]
                            ])

    # 存储信息
    return [int(len_i),                   # 0
            int(sigma_i),                 # 1
            int(temp_i),                  # 2
            GP_atom_count,                # 3
            GP_box_bounds_info['x'][0],   # 4
            GP_box_bounds_info['x'][1],   # 5
            GP_box_bounds_info['y'][0],   # 6
            GP_box_bounds_info['y'][1],   # 7
            GP_box_bounds_info['z'][0],   # 8
            GP_box_bounds_info['z'][1],   # 9
            min_Bound_i,                  # 10
            x_length,                     # 11
            y_length,                     # 12
            z_length,                     # 13
            max_BoundLength_i]            # 14

def find_info(mode: str, info_matrix:np.ndarray):
    '''
    找到拥有最大/最小 max_BoundLength_i 的那一行
    '''
    row             = []
    bounds_info_i   = None
    direction       = None

    if mode == 'min':
        row = info_matrix[np.argmin(info_matrix[:, 14])]
    if mode == 'max':
        row = info_matrix[np.argmax(info_matrix[:, 14])]
    else:
        print('select mode')

    # 判断最{mode}的 'max_BoundLength_i' 所在方向
    if row[14] == abs(row[5]-row[4]):
        direction = 'x'
        bounds_info_i = (row[4], row[5])
    elif row[14] == abs(row[7]-row[6]):
        direction = 'y'
        bounds_info_i = (row[6], row[7])
    elif row[14] == abs(row[9]-row[8]):
        direction = 'z'
        bounds_info_i = (row[8], row[9])

    print(f"拥有最{mode} 'x/y/z 最大边长' 的那一行是: {row}")
    print(f'起止坐标； {bounds_info_i}')
    print(f"最{mode}的方向是: {direction}，长度是: {row[14]}\n\n")

In [6]:
## Supercelling 并存为 HDF5 文件 ##

class DataFile_to_grid():
    '''
    1. Convert lammps data file to grids/ 3D numpy matrix
    2. Supercelling to the desired size/ identical size
    '''
    def __init__(self,
                 GP_data,
                 grid_size: list,
                 limits: dict,
                 limits_new: dict,
                 output_filename: str
                 ):
        '''
        Input:
        GP_data:          data file 原始数据
        gride_size:       网格划分的尺寸
        limits:           实际的 x/y/z 边界上下限
        limits_new:       需要补全到的 x/y/z 边界上下限
        output_filename:  输出文件名
        '''
        self.output_filename = output_filename
        self.GP_data = GP_data
        self.grid_size = grid_size
        self.xmin, self.xmax = limits["x"]
        self.ymin, self.ymax = limits["y"]
        self.zmin, self.zmax = limits["z"]
        self.xmin_new, self.xmax_new = limits_new["x"]
        self.ymin_new, self.ymax_new = limits_new["y"]
        self.zmin_new, self.zmax_new = limits_new["z"]

        # 计算原始边界和新边界的长度
        self.xlength = self.xmax - self.xmin
        self.ylength = self.ymax - self.ymin
        self.zlength = self.zmax - self.zmin
        self.xlength_new = self.xmax_new - self.xmin_new
        self.ylength_new = self.ymax_new - self.ymin_new
        self.zlength_new = self.zmax_new - self.zmin_new

        # 自动执行
        self.GP_data_pretreat()
        self.GP_data_analysis()
        self.GP_data_aligning()
        self.GP_data_to_grid()
  
    def GP_data_pretreat(self):
        # 删除 atom type 列
        self.GP_data = np.delete(self.GP_data, 1, axis=1)

    def GP_data_analysis(self):
        # 计算原子数
        self.GP_atom_count = self.GP_data.shape[0]

    def GP_data_aligning(self):
        '''
        对data file 进行补全（超胞 supercelling)
        '''
        # 计算需要复制的倍数，包括正方向和负方向
        x_repeats_pos = int(np.ceil((self.xmax_new - self.xmax) / self.xlength))
        x_repeats_neg = int(np.ceil((self.xmin - self.xmin_new) / self.xlength))
        y_repeats_pos = int(np.ceil((self.ymax_new - self.ymax) / self.ylength))
        y_repeats_neg = int(np.ceil((self.ymin - self.ymin_new) / self.ylength))
        z_repeats_pos = int(np.ceil((self.zmax_new - self.zmax) / self.zlength))
        z_repeats_neg = int(np.ceil((self.zmin - self.zmin_new) / self.zlength))

        # 扩展原子坐标，包括正方向和负方向
        extended_atoms  = []
        new_atom_id     = 1
        for i in range(-x_repeats_neg, x_repeats_pos + 1):
            for j in range(-y_repeats_neg, y_repeats_pos + 1):
                for k in range(-z_repeats_neg, z_repeats_pos + 1):
                    for atom in self.GP_data:
                        new_atom = atom.copy()
                        new_atom[0] = new_atom_id  # 更新原子ID
                        new_atom[1] = atom[1] + i * self.xlength
                        new_atom[2] = atom[2] + j * self.ylength
                        new_atom[3] = atom[3] + k * self.zlength
                        extended_atoms.append(new_atom)
                        new_atom_id += 1

        # 转换为numpy数组
        extended_atoms = np.array(extended_atoms)

        # 过滤掉超出新边界的原子
        extended_atoms = extended_atoms[
            (extended_atoms[:, 1] >= self.xmin_new) & (extended_atoms[:, 1] < self.xmax_new) &
            (extended_atoms[:, 2] >= self.ymin_new) & (extended_atoms[:, 2] < self.ymax_new) &
            (extended_atoms[:, 3] >= self.zmin_new) & (extended_atoms[:, 3] < self.zmax_new)
        ]

        # 更新GP_data，并重新分配ID以确保连续
        self.GP_data                = extended_atoms
        self.GP_data[:, 0]          = np.arange(1, self.GP_data.shape[0] + 1)
        self.GP_atom_count_extended = self.GP_data.shape[0]
    
    def GP_data_to_grid(self)-> np.ndarray:
        '''
        将原子数据转换为网格数据
        '''
        # N 格子, N+1 个点
        x_bins = np.linspace(self.xmin_new, self.xmax_new, self.grid_size[0] + 1)
        y_bins = np.linspace(self.ymin_new, self.ymax_new, self.grid_size[1] + 1)
        z_bins = np.linspace(self.zmin_new, self.zmax_new, self.grid_size[2] + 1)

        # 生成 X * Y * Z 形状的 3D 矩阵 (grid_size=(XY,Z))
        grid_counts = np.zeros(self.grid_size)

        for atom in self.GP_data:
            x_idx = np.digitize(atom[1], x_bins) - 1    # digitize() 从1开始
            y_idx = np.digitize(atom[2], y_bins) - 1
            z_idx = np.digitize(atom[3], z_bins) - 1
            if x_idx < self.grid_size[0] and y_idx < self.grid_size[1] and z_idx < self.grid_size[2]:
                grid_counts[x_idx, y_idx, z_idx] += 1

        self.grid_counts = grid_counts

        # 打印结果以检查
        #print(f"Grid counts shape: {self.grid_counts.shape}")
        #print(self.grid_counts)

        return grid_counts

def process_single_DataFile_to_grid(len_i:int, sigma_i:int, temp_i:int)->np.ndarray:
    """
    1. 处理单个 len_i sigma_i temp_i 写成 3D grid, 即 np.ndarray
    2. 复制成4个温度
    3. 返回 np.ndarray
    """
    # 读取第一个温度的数据
    GP                  = LmpGP_file_reader(len=len_i, sigma=sigma_i, temp=temp_i, path=Conf.path_to_inputs)
    GP_atom_count       = GP.GP_atom_count
    GP_box_bounds_info  = GP.GP_box_bounds_info
    GP_atom_data        = GP.GP_atom_data

    # Supercelling 处理
    GP = DataFile_to_grid(GP_data           = GP_atom_data,                 # 原始的 data file info
                          grid_size         = Conf.grid_size,               # 网格划分的尺寸
                          limits            = GP_box_bounds_info,           # 实际的 x/y/z 边界上下限
                          limits_new        = Conf.GP_box_bounds_info_new,  # 需要补全到的 x/y/z 边界上下限
                          output_filename   = os.path.join(Conf.path_to_temp_outputs,f'CHECK_data_4_len_{len_i}_sigma_{sigma_i}_temp_i_{temp_i}.txt')
                          )

    # 提取 3D grid
    GP_grid = GP.grid_counts
    return GP_grid

def process_AllSigma_DataFile_to_grid(len_i, sigma_i)->h5py:
    '''
    1. 处理 单个 len_i_Sigma_i 下 所有 temp_i 的 3D grid/ np.ndarray 
    2. 压缩保存为 h5py 文件
    '''
    # 指定 hdf5 文件位置
    hdf5_file_path = os.path.join(Conf.path_to_temp_outputs, f"len_{len_i}_sigma_{sigma_i}.h5")
    
    # 读取标准的 grid file
    grid_data_i   =  process_single_DataFile_to_grid(len_i,sigma_i,Conf.temp_list[0])

    # 如果文件已经存在，跳过任务，避免并发冲突
    if os.path.exists(hdf5_file_path):
        print(f"❌ Skipping {hdf5_file_path}, file already exists.")
        return 
    
    ## 存储不同温度下的相同结构信息到 HDF5
    # 打开 hdf5 file
    with h5py.File(hdf5_file_path, 'w-') as hf:
        for temp_i in Conf.temp_list:
            # 定义名称
            grid_name_i  =  f"len_{len_i}_sigma_{sigma_i}_{temp_i}"
            # 存储
            hf.create_dataset(grid_name_i, data=grid_data_i, compression="gzip")  # 进行压缩，节省空间

def process_parallel_DataFile_to_grid(len_i:int):
    '''
    1. 并行 处理 单 len_i 下 所有 Sigma_i 的 3D grid/ np.ndarray 
    2. 压缩保存为 h5py 文件
    '''
    # 获取 CPU 线程数
    num_workers = min(psutil.cpu_count(logical=True),len(Conf.sigma_list))

    # 进度条 + 多线程执行任务
    with concurrent.futures.ThreadPoolExecutor(num_workers) as executor:
        futures = {
            executor.submit(process_AllSigma_DataFile_to_grid, len_i, sigma_i): 
            (len_i, sigma_i)
            for sigma_i in Conf.sigma_list
        }

        # 开始并行写入
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing DataFiles"):
            len_i, sigma_i = futures[future]  # 获取任务参数
            try:
                future.result()     # 确保任务成功完成
            except Exception as e:
                print(f"❌ len_i={len_i}_sihma_{sigma_i}: {e}")
                
        print(f"✅ len_i={len_i}_sihma_{sigma_i}: HDF5 save completed")

In [54]:
## 检查 hdf5 文件完整性 ##

def check_hdf5_integrity(len_i,sigma_i):
     '''
     检查所有 hdf5 文件完整性
     '''
     file_name = f'len_{len_i}_sigma_{sigma_i}.h5'
     file_path = os.path.join(Conf.path_to_temp_outputs,file_name)
     try:
          with h5py.File(file_path, "r") as h5f:
               pass
     except OSError as e:
          print(f"❌ {file_name} HDF5 文件可能损坏:", e)

def parallel_check_hdf5_integrity(len_i:int):
    '''
    并行检查完整性
    '''
    # 获取 CPU 线程数
    num_workers = min(psutil.cpu_count(logical=True),len(Conf.sigma_list))

    # 进度条 + 多线程执行任务
    with concurrent.futures.ThreadPoolExecutor(num_workers) as executor:
        futures = {
            executor.submit(check_hdf5_integrity, len_i, sigma_i): 
            (len_i, sigma_i)
            for sigma_i in Conf.sigma_list
        }

        # 开始并行写入
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing DataFiles"):
            len_i, sigma_i = futures[future]  # 获取任务参数
            try:
                future.result()     # 确保任务成功完成
            except Exception as e:
                print(f"❌ len_i_{len_i}_sigma_{sigma_i}: {e}")
                
        print(f"✅ len_i={len_i} HDF5 integrity check completed")

def check_hdf5_content(file_path=Conf.path_to_grid_DataBase):
     
     # 初始化数据集计数器
     dataset_count = 0  

     # 定义一个内部函数用于遍历 HDF5 文件
     def count_datasets(name, obj):
          nonlocal dataset_count
          if isinstance(obj, h5py.Dataset):  # 判断是否为数据集
               dataset_count += 1
          elif isinstance(obj, h5py.Group):  # 判断是否为组
               pass  # 如果是组，不计数

     with h5py.File(file_path, "r") as h5f:
          # 打印
          print(f"文件结构:")
          # 遍历文件内容以计数
          h5f.visititems(count_datasets)
          # 打印所有内变量名字
          h5f.visit(print) 

     print(f'文件数: {dataset_count}')

     return dataset_count # 文件数



## 汇总所有 hdf5 文件 ##

def extract_single_hdf5_file(file_name:str)-> dict:
     '''
     读取单个 HDF5 文件中的所有数据集
     :param len_i: len_i 参数
     :param sigma_i: sigma_i 参数
     :return: 字典，键为数据集名称，值为数据集数据
     '''
     file_path   = os.path.join(Conf.path_to_temp_outputs, f'{file_name}.h5')
     datasets_i  = {}
          
     with h5py.File(file_path, "r") as h5f:
          # 遍历文件中的所有数据集
          def collect_datasets(name, obj):
               if isinstance(obj, h5py.Dataset):
                    # 将数据集数据存储到字典中
                    datasets_i[name] = obj[()] 

          h5f.visititems(collect_datasets)

     return datasets_i



def merge_hdf5_files(output_file=Conf.path_to_grid_DataBase):
    """
    并行读取所有 HDF5 文件并将提取出的矩阵添加到目标 HDF5 文件中。
    每个矩阵的变量名等于字典中的键值。
    """

    # 创建或更新 HDF5 文件
    with h5py.File(output_file, 'a') as hf:  # 'a' 模式：追加数据
          for len_i in Conf.len_list:
               for sigma_i in tqdm(Conf.sigma_list, total=len(Conf.sigma_list), desc=f"len_{len_i}\t"):
                    # 提取单个 HDF5 文件中的数据
                    datasets_i = extract_single_hdf5_file(f'len_{len_i}_sigma_{sigma_i}')                   
                    try: 
                         # 遍历提取出的数据字典并存入 HDF5 文件
                         for dataset_name, dataset_data in datasets_i.items():
                              # 若已存在相同名称的数据集，删除后重新写入
                              if dataset_name in hf:
                                   del hf[dataset_name]  

                              hf.create_dataset(dataset_name, data=dataset_data, compression="gzip")
                    
                    except Exception as e:
                         print(f"❌ len_i_{len_i}_sigma_{sigma_i}: {e}")
          
    print(f"✅ All extracted matrices have been merged into {output_file}")

# 3. CHECK
找到合适的supercelling 参数

In [8]:
## Collect data files info ##

# 初始化一个主列表用于存储所有信息
info_mtx = np.empty((0, 15))

# 创建一个线程池
with concurrent.futures.ThreadPoolExecutor(psutil.cpu_count(logical=True)) as executor:
    # 生成所有任务
    futures = {executor.submit(extract_info, len_i, sigma_i, temp_i):
                        (len_i, sigma_i, temp_i)
                        for len_i in range(2, 21, 1)
                        for sigma_i in range(8, 19, 1)
                        for temp_i in range(373, 333, -10)}

    # 收集所有返回的数据
    results = [future.result() for future in tqdm(concurrent.futures.as_completed(futures),total=len(futures),desc="Processing DataFiles")]

# 转换为 NumPy 数组
info_mtx = np.array(results)

# 打印结果以检查
print(f'info list shape:\t{info_mtx.shape}')
print(f'reference length:\t{len(Conf.len_list)*len(Conf.sigma_list)*len(Conf.temp_list)},{15}')

Processing DataFiles: 100%|██████████| 836/836 [01:47<00:00,  7.78it/s]

info list shape:	(836, 15)
reference length:	836,15





In [9]:
## 寻找最大和最短的 box 长度 ##
find_info('min', info_mtx) # 最小
find_info('max', info_mtx) # 最小

select mode
拥有最min 'x/y/z 最大边长' 的那一行是: [3.00000000e+00 8.00000000e+00 3.63000000e+02 3.83600000e+03
 6.51675695e+02 7.08250305e+02 6.52167825e+02 7.08467175e+02
 6.51941735e+02 7.08693265e+02 6.51675695e+02 5.65746094e+01
 5.62993504e+01 5.67515307e+01 5.67515307e+01]
起止坐标； (np.float64(651.9417346706942), np.float64(708.6932653293076))
最min的方向是: z，长度是: 56.75153065861332


拥有最max 'x/y/z 最大边长' 的那一行是: [4.00000000e+00 1.80000000e+01 3.43000000e+02 4.27600000e+03
 6.00449820e+02 7.59476180e+02 6.00133917e+02 7.60501083e+02
 6.00572822e+02 7.60062178e+02 6.00133917e+02 1.59026359e+02
 1.60367165e+02 1.59489356e+02 1.60367165e+02]
起止坐标； (np.float64(600.1339173401502), np.float64(760.5010826598495))
最max的方向是: y，长度是: 160.36716531969932




# 4.EXE
并行执行 lammps data file -> supercelling -> HDF5

In [10]:
len_i = 2
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [02:50<00:00, 15.51s/it]


✅ len_i=2_sihma_10: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 2817.03it/s]

✅ len_i=2 HDF5 integrity check completed





In [11]:
len_i = 3
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [02:46<00:00, 15.11s/it]


✅ len_i=3_sihma_8: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 2170.76it/s]

✅ len_i=3 HDF5 integrity check completed





In [12]:
len_i = 4
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [02:45<00:00, 15.03s/it]


✅ len_i=4_sihma_8: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 1516.88it/s]

✅ len_i=4 HDF5 integrity check completed





In [13]:
len_i = 5
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [02:53<00:00, 15.74s/it]


✅ len_i=5_sihma_9: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3892.46it/s]

✅ len_i=5 HDF5 integrity check completed





In [14]:
len_i = 6
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [02:57<00:00, 16.10s/it]


✅ len_i=6_sihma_12: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 1723.54it/s]

✅ len_i=6 HDF5 integrity check completed





In [15]:
len_i = 7
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:05<00:00, 16.83s/it]


✅ len_i=7_sihma_11: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3960.63it/s]

✅ len_i=7 HDF5 integrity check completed





In [16]:
len_i = 8
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:07<00:00, 17.05s/it]


✅ len_i=8_sihma_15: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 2516.49it/s]

✅ len_i=8 HDF5 integrity check completed





In [17]:
len_i = 9
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:16<00:00, 17.85s/it]


✅ len_i=9_sihma_15: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3012.76it/s]

✅ len_i=9 HDF5 integrity check completed





In [18]:
len_i = 10
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:20<00:00, 18.21s/it]


✅ len_i=10_sihma_13: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 4193.54it/s]

✅ len_i=10 HDF5 integrity check completed





In [19]:
len_i = 11
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:24<00:00, 18.55s/it]


✅ len_i=11_sihma_12: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 2980.64it/s]

✅ len_i=11 HDF5 integrity check completed





In [20]:
len_i = 12
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:29<00:00, 19.05s/it]


✅ len_i=12_sihma_8: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 4242.90it/s]

✅ len_i=12 HDF5 integrity check completed





In [21]:
len_i = 13
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:35<00:00, 19.57s/it]


✅ len_i=13_sihma_13: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 2169.64it/s]

✅ len_i=13 HDF5 integrity check completed





In [22]:
len_i = 14
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:41<00:00, 20.11s/it]


✅ len_i=14_sihma_14: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 1573.36it/s]

✅ len_i=14 HDF5 integrity check completed





In [23]:
len_i = 15
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:48<00:00, 20.73s/it]


✅ len_i=15_sihma_12: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3089.21it/s]

✅ len_i=15 HDF5 integrity check completed





In [24]:
len_i = 16
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:51<00:00, 21.07s/it]


✅ len_i=16_sihma_10: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 4127.88it/s]

✅ len_i=16 HDF5 integrity check completed





In [25]:
len_i = 17
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [03:59<00:00, 21.81s/it]


✅ len_i=17_sihma_14: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3067.85it/s]

✅ len_i=17 HDF5 integrity check completed





In [26]:
len_i = 18
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [04:06<00:00, 22.41s/it]


✅ len_i=18_sihma_11: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 2022.95it/s]

✅ len_i=18 HDF5 integrity check completed





In [27]:
len_i = 19
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [04:14<00:00, 23.12s/it]


✅ len_i=19_sihma_9: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3678.63it/s]

✅ len_i=19 HDF5 integrity check completed





In [28]:
len_i = 20
process_parallel_DataFile_to_grid(len_i)
parallel_check_hdf5_integrity(len_i)

Processing DataFiles: 100%|██████████| 11/11 [04:21<00:00, 23.74s/it]


✅ len_i=20_sihma_8: HDF5 save completed


Processing DataFiles: 100%|██████████| 11/11 [00:00<00:00, 3225.03it/s]


✅ len_i=20 HDF5 integrity check completed


# 5. 合并
合并所有HDF5文件为一个整体

In [56]:
merge_hdf5_files()

len_2	: 100%|██████████| 11/11 [03:33<00:00, 19.38s/it]
len_3	: 100%|██████████| 11/11 [03:25<00:00, 18.69s/it]
len_4	: 100%|██████████| 11/11 [03:23<00:00, 18.49s/it]
len_5	: 100%|██████████| 11/11 [03:22<00:00, 18.43s/it]
len_6	: 100%|██████████| 11/11 [03:23<00:00, 18.49s/it]
len_7	: 100%|██████████| 11/11 [03:23<00:00, 18.49s/it]
len_8	: 100%|██████████| 11/11 [03:23<00:00, 18.50s/it]
len_9	: 100%|██████████| 11/11 [03:24<00:00, 18.55s/it]
len_10	: 100%|██████████| 11/11 [03:23<00:00, 18.50s/it]
len_11	: 100%|██████████| 11/11 [03:24<00:00, 18.62s/it]
len_12	: 100%|██████████| 11/11 [03:24<00:00, 18.55s/it]
len_13	: 100%|██████████| 11/11 [03:28<00:00, 18.95s/it]
len_14	: 100%|██████████| 11/11 [03:25<00:00, 18.72s/it]
len_15	: 100%|██████████| 11/11 [03:22<00:00, 18.40s/it]
len_16	: 100%|██████████| 11/11 [03:22<00:00, 18.37s/it]
len_17	: 100%|██████████| 11/11 [03:22<00:00, 18.45s/it]
len_18	: 100%|██████████| 11/11 [03:22<00:00, 18.43s/it]
len_19	: 100%|██████████| 11/11 [03:23<

✅ All extracted matrices have been merged into 3D_Grids.h5





# 6. 后处理
检查完整性并删除临时文件

In [61]:
# 检验 hdf5 GA结构数据库 存在
if Path(Conf.path_to_grid_DataBase).exists():
    print("数据库存在 ✅")
    # 检查 hdf5 GA结构数据库 内含矩阵数量
    if check_hdf5_content(Conf.path_to_grid_DataBase) == int(len(Conf.len_list) * len( Conf.sigma_list) * len( Conf.temp_list)):
        print("数据库内容 OK 👌🏻")
        # 删除临时文件
        shutil.rmtree(Conf.path_to_temp_outputs)

数据库存在 ✅
文件结构:
len_10_sigma_10_343
len_10_sigma_10_353
len_10_sigma_10_363
len_10_sigma_10_373
len_10_sigma_11_343
len_10_sigma_11_353
len_10_sigma_11_363
len_10_sigma_11_373
len_10_sigma_12_343
len_10_sigma_12_353
len_10_sigma_12_363
len_10_sigma_12_373
len_10_sigma_13_343
len_10_sigma_13_353
len_10_sigma_13_363
len_10_sigma_13_373
len_10_sigma_14_343
len_10_sigma_14_353
len_10_sigma_14_363
len_10_sigma_14_373
len_10_sigma_15_343
len_10_sigma_15_353
len_10_sigma_15_363
len_10_sigma_15_373
len_10_sigma_16_343
len_10_sigma_16_353
len_10_sigma_16_363
len_10_sigma_16_373
len_10_sigma_17_343
len_10_sigma_17_353
len_10_sigma_17_363
len_10_sigma_17_373
len_10_sigma_18_343
len_10_sigma_18_353
len_10_sigma_18_363
len_10_sigma_18_373
len_10_sigma_8_343
len_10_sigma_8_353
len_10_sigma_8_363
len_10_sigma_8_373
len_10_sigma_9_343
len_10_sigma_9_353
len_10_sigma_9_363
len_10_sigma_9_373
len_11_sigma_10_343
len_11_sigma_10_353
len_11_sigma_10_363
len_11_sigma_10_373
len_11_sigma_11_343
len_11_sigma_1