In [9]:
import lmdb
import pickle
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
from collections import defaultdict

def analyze_adsorbate_types(lmdb_path):

    env = lmdb.open(
        lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False
    )

    adsorbate_dict = defaultdict(list)
    
    print(f"Analyzing adsorbate types from: {lmdb_path}")
    
    with env.begin(write=False) as txn:
        cursor = txn.cursor()
        total_entries = env.stat()['entries']
        
        for key, value in tqdm(cursor, total=total_entries, desc="Processing"):
            try:
                data = pickle.loads(value)
                
                if "ads_atomic_numbers" not in data:
                    continue
                
                ads_nums = data["ads_atomic_numbers"]
                
                if hasattr(ads_nums, 'tolist'):
                    ads_list = ads_nums.tolist()
                else:
                    ads_list = list(ads_nums)
                
                sorted_nums = sorted(ads_list)

                adsorbate_key = tuple(sorted_nums)
                
                try:
                    index = int(key.decode('ascii'))
                except (ValueError, UnicodeDecodeError):
                    index = key.decode('ascii')
                
                adsorbate_dict[adsorbate_key].append(index)
                
            except Exception as e:
                print(f"Error processing key {key}: {e}")
                continue
    
    env.close()
    
    def get_adsorbate_name(atomic_nums):
        element_map = {
            1: 'H', 2: 'He', 3: 'Li', 4: 'Be', 5: 'B', 6: 'C', 7: 'N', 8: 'O',
            9: 'F', 10: 'Ne', 11: 'Na', 12: 'Mg', 13: 'Al', 14: 'Si', 15: 'P',
            16: 'S', 17: 'Cl', 18: 'Ar'
        }
        
        elements = [element_map.get(num, f"X{num}") for num in atomic_nums]
        
        common_names = {
            (8,): 'O',
            (1, 8): 'OH',
            (1, 8, 8): 'OOH',
            (1, 1, 8): 'H2O',
            (6, 8): 'CO',
            (6, 8, 8): 'CO2',
        }
        
        if tuple(atomic_nums) in common_names:
            return common_names[tuple(atomic_nums)]
        else:
            return '-'.join(elements)
    
    named_dict = {}
    for atomic_nums_tuple, indices in adsorbate_dict.items():
        name = get_adsorbate_name(list(atomic_nums_tuple))
        named_dict[name] = sorted(indices)
    
    count_dict = {name: len(indices) for name, indices in named_dict.items()}
    
    return named_dict, count_dict

source_lmdb = "dataset/val_id/dataset.lmdb"
adsorbate_dict, count_dict = analyze_adsorbate_types(source_lmdb)

print("\n" + "="*50)
print("       Adsorbate Types Analysis       ")
print("="*50)
print(f"Total unique adsorbate types: {len(count_dict)}")
print("\nAdsorbate type counts:")
for name, count in sorted(count_dict.items(), key=lambda x: x[1], reverse=True):
    print(f"  {name}: {count} samples")

Analyzing adsorbate types from: dataset/val_id/dataset.lmdb


Processing: 100%|██████████| 24898/24898 [00:00<00:00, 45537.30it/s]

Error processing key b'length': argument of type 'int' is not iterable

       Adsorbate Types Analysis       
Total unique adsorbate types: 46

Adsorbate type counts:
  H-H-H-H-C-C-O: 1272 samples
  H-H-C-C-O-O: 1226 samples
  H-H-C-C-O: 1207 samples
  H-H-H-H-C-C-O-O: 1207 samples
  H-H-H-H-H-C-C-O: 1094 samples
  H-H-H-C-C-O: 1071 samples
  H-H-H-C-C: 855 samples
  H-H-C-C: 807 samples
  H-H-C-O: 791 samples
  H-H-H-C-O: 732 samples
  H-C-C-O: 652 samples
  O: 622 samples
  H: 608 samples
  C: 597 samples
  H-H-H-C-C-O-O: 581 samples
  N: 573 samples
  H-H-C: 525 samples
  OH: 493 samples
  H-N: 469 samples
  C-C: 461 samples
  H-H-H-C: 437 samples
  H2O: 427 samples
  N-O: 411 samples
  H-N-O: 408 samples
  H-H-H-N: 398 samples
  N-O-O: 396 samples
  H-H-N-N: 383 samples
  H-H-H-H-H-C-C: 375 samples
  N-O-O-O: 367 samples
  H-H-H-H-C: 362 samples
  H-N-N: 359 samples
  N-N-O: 356 samples
  H-C-C-O-O: 353 samples
  H-H-H-N-O: 352 samples
  H-H-H-H-C-O: 345 samples
  H-C-C: 339 sampl




In [None]:
output_json_path = "val_id_adsorbate_indices.json"

with open(output_json_path, 'w') as f:
    json.dump(adsorbate_dict, f, indent=2)

print(f"Saved adsorbate indices to '{output_json_path}'")
print(f"\nExample entries:")

for i, (name, indices) in enumerate(sorted(adsorbate_dict.items(), key=lambda x: len(x[1]), reverse=True)[:5]):
    print(f"  {name}: {len(indices)} samples, first 10 indices: {indices[:10]}")

Saved adsorbate indices to 'val_id_adsorbate_indices.json'

Example entries:
  H-H-H-H-C-C-O: 1272 samples, first 10 indices: [10, 33, 71, 77, 90, 93, 104, 112, 121, 189]
  H-H-C-C-O-O: 1226 samples, first 10 indices: [44, 50, 53, 58, 59, 87, 108, 171, 202, 204]
  H-H-C-C-O: 1207 samples, first 10 indices: [8, 17, 34, 69, 70, 101, 105, 116, 168, 183]
  H-H-H-H-C-C-O-O: 1207 samples, first 10 indices: [5, 21, 28, 52, 73, 95, 107, 120, 126, 141]
  H-H-H-H-H-C-C-O: 1094 samples, first 10 indices: [47, 51, 56, 67, 100, 152, 211, 227, 248, 255]


In [None]:
import lmdb
import pickle
from tqdm import tqdm
import json
import os

def split_dataset_by_adsorbate(src_lmdb_path, json_path, output_dir):

    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Loading adsorbate indices from: {json_path}")
    with open(json_path, 'r') as f:
        adsorbate_indices = json.load(f)
    
    print(f"Found {len(adsorbate_indices)} adsorbate types")
    
    env_src = lmdb.open(
        src_lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False
    )
    
    map_size_3gb = 3 * 1024 * 1024 * 1024
    
    for adsorbate_name, indices in adsorbate_indices.items():
        if not indices: 
            continue
        
        safe_name = adsorbate_name.replace('/', '_').replace('\\', '_')
        output_filename = f"val_id_{safe_name}_subset.lmdb"
        output_path = os.path.join(output_dir, output_filename)
        
        print(f"\nProcessing {adsorbate_name}: {len(indices)} samples")
        print(f"  Saving to: {output_path}")
        
        env_dst = lmdb.open(
            output_path,
            map_size=map_size_3gb,
            subdir=False,
            readonly=False,
            meminit=False,
            map_async=True
        )
        
        new_index_counter = 0
        
        with env_src.begin(write=False) as txn_src:
            with env_dst.begin(write=True) as txn_dst:
                sorted_indices = sorted(indices)
                
                for original_index in tqdm(sorted_indices, desc=f"  Processing {adsorbate_name}"):
                    try:
                        key = str(original_index).encode('ascii')
                        
                        value = txn_src.get(key)
                        
                        if value is None:
                            print(f"  Warning: Index {original_index} not found in source LMDB")
                            continue
                        
                        new_key = str(new_index_counter).encode('ascii')
                        txn_dst.put(new_key, value)
                        new_index_counter += 1
                            
                    except Exception as e:
                        print(f"  Error processing index {original_index}: {e}")
                        continue
        
        env_dst.close()
        
        print(f"  Saved {new_index_counter} samples to {output_filename}")
    
    env_src.close()
    
    print("\n" + "="*50)
    print("       Splitting Complete       ")
    print("="*50)
    print(f"All adsorbate subsets saved to: {output_dir}")

source_lmdb = "dataset/val_id/dataset.lmdb"
json_file = "val_id_adsorbate_indices.json"
output_directory = "dataset_per_adsorbate"

split_dataset_by_adsorbate(source_lmdb, json_file, output_directory)

Loading adsorbate indices from: val_id_adsorbate_indices.json
Found 46 adsorbate types

Processing H-H-N-N: 383 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-N-N_subset.lmdb


  Processing H-H-N-N: 100%|██████████| 383/383 [00:00<00:00, 69376.74it/s]


  Saved 383 samples to val_id_H-H-N-N_subset.lmdb

Processing H-H-H-H-H-C-C: 375 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-H-C-C_subset.lmdb


  Processing H-H-H-H-H-C-C: 100%|██████████| 375/375 [00:00<00:00, 212491.76it/s]


  Saved 375 samples to val_id_H-H-H-H-H-C-C_subset.lmdb

Processing H-H-H-H-C-C-O: 1272 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-C-C-O_subset.lmdb


  Processing H-H-H-H-C-C-O: 100%|██████████| 1272/1272 [00:00<00:00, 274258.71it/s]


  Saved 1272 samples to val_id_H-H-H-H-C-C-O_subset.lmdb

Processing H-H-H-H-H-C-C-O: 1094 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-H-C-C-O_subset.lmdb


  Processing H-H-H-H-H-C-C-O: 100%|██████████| 1094/1094 [00:00<00:00, 361674.83it/s]


  Saved 1094 samples to val_id_H-H-H-H-H-C-C-O_subset.lmdb

Processing C-N: 324 samples
  Saving to: dataset_per_adsorbate/val_id_C-N_subset.lmdb


  Processing C-N: 100%|██████████| 324/324 [00:00<00:00, 330967.97it/s]


  Saved 324 samples to val_id_C-N_subset.lmdb

Processing H-H-C-C-O: 1207 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-C-C-O_subset.lmdb


  Processing H-H-C-C-O: 100%|██████████| 1207/1207 [00:00<00:00, 428265.37it/s]


  Saved 1207 samples to val_id_H-H-C-C-O_subset.lmdb

Processing H-H-H-C-C-O: 1071 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-C-C-O_subset.lmdb


  Processing H-H-H-C-C-O: 100%|██████████| 1071/1071 [00:00<00:00, 454160.31it/s]


  Saved 1071 samples to val_id_H-H-H-C-C-O_subset.lmdb

Processing N-N-O-O-O-O: 249 samples
  Saving to: dataset_per_adsorbate/val_id_N-N-O-O-O-O_subset.lmdb


  Processing N-N-O-O-O-O: 100%|██████████| 249/249 [00:00<00:00, 358032.81it/s]


  Saved 249 samples to val_id_N-N-O-O-O-O_subset.lmdb

Processing H-H-H-N: 398 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-N_subset.lmdb


  Processing H-H-H-N: 100%|██████████| 398/398 [00:00<00:00, 406955.87it/s]


  Saved 398 samples to val_id_H-H-H-N_subset.lmdb

Processing H-H-C-C: 807 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-C-C_subset.lmdb


  Processing H-H-C-C: 100%|██████████| 807/807 [00:00<00:00, 455130.20it/s]


  Saved 807 samples to val_id_H-H-C-C_subset.lmdb

Processing H-H-H-C-C: 855 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-C-C_subset.lmdb


  Processing H-H-H-C-C: 100%|██████████| 855/855 [00:00<00:00, 460055.15it/s]


  Saved 855 samples to val_id_H-H-H-C-C_subset.lmdb

Processing O: 622 samples
  Saving to: dataset_per_adsorbate/val_id_O_subset.lmdb


  Processing O: 100%|██████████| 622/622 [00:00<00:00, 452926.58it/s]


  Saved 622 samples to val_id_O_subset.lmdb

Processing H-H-C: 525 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-C_subset.lmdb


  Processing H-H-C: 100%|██████████| 525/525 [00:00<00:00, 434664.35it/s]


  Saved 525 samples to val_id_H-H-C_subset.lmdb

Processing C: 597 samples
  Saving to: dataset_per_adsorbate/val_id_C_subset.lmdb


  Processing C: 100%|██████████| 597/597 [00:00<00:00, 451659.36it/s]


  Saved 597 samples to val_id_C_subset.lmdb

Processing H-H-C-C-O-O: 1226 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-C-C-O-O_subset.lmdb


  Processing H-H-C-C-O-O: 100%|██████████| 1226/1226 [00:00<00:00, 482429.56it/s]


  Saved 1226 samples to val_id_H-H-C-C-O-O_subset.lmdb

Processing H-H-H-C-C-O-O: 581 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-C-C-O-O_subset.lmdb


  Processing H-H-H-C-C-O-O: 100%|██████████| 581/581 [00:00<00:00, 440826.81it/s]


  Saved 581 samples to val_id_H-H-H-C-C-O-O_subset.lmdb

Processing H-H-H-H-C-C-O-O: 1207 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-C-C-O-O_subset.lmdb


  Processing H-H-H-H-C-C-O-O: 100%|██████████| 1207/1207 [00:00<00:00, 490554.74it/s]


  Saved 1207 samples to val_id_H-H-H-H-C-C-O-O_subset.lmdb

Processing H-H-H-C-O: 732 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-C-O_subset.lmdb


  Processing H-H-H-C-O: 100%|██████████| 732/732 [00:00<00:00, 460028.55it/s]


  Saved 732 samples to val_id_H-H-H-C-O_subset.lmdb

Processing H-N: 469 samples
  Saving to: dataset_per_adsorbate/val_id_H-N_subset.lmdb


  Processing H-N: 100%|██████████| 469/469 [00:00<00:00, 441952.05it/s]


  Saved 469 samples to val_id_H-N_subset.lmdb

Processing H2O: 427 samples
  Saving to: dataset_per_adsorbate/val_id_H2O_subset.lmdb


  Processing H2O: 100%|██████████| 427/427 [00:00<00:00, 429282.79it/s]


  Saved 427 samples to val_id_H2O_subset.lmdb

Processing N: 573 samples
  Saving to: dataset_per_adsorbate/val_id_N_subset.lmdb


  Processing N: 100%|██████████| 573/573 [00:00<00:00, 448550.99it/s]


  Saved 573 samples to val_id_N_subset.lmdb

Processing N-O-O: 396 samples
  Saving to: dataset_per_adsorbate/val_id_N-O-O_subset.lmdb


  Processing N-O-O: 100%|██████████| 396/396 [00:00<00:00, 409099.60it/s]


  Saved 396 samples to val_id_N-O-O_subset.lmdb

Processing C-C: 461 samples
  Saving to: dataset_per_adsorbate/val_id_C-C_subset.lmdb


  Processing C-C: 100%|██████████| 461/461 [00:00<00:00, 433342.48it/s]


  Saved 461 samples to val_id_C-C_subset.lmdb

Processing H-N-O: 408 samples
  Saving to: dataset_per_adsorbate/val_id_H-N-O_subset.lmdb


  Processing H-N-O: 100%|██████████| 408/408 [00:00<00:00, 417996.10it/s]


  Saved 408 samples to val_id_H-N-O_subset.lmdb

Processing H-H-C-O: 791 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-C-O_subset.lmdb


  Processing H-H-C-O: 100%|██████████| 791/791 [00:00<00:00, 523791.36it/s]


  Saved 791 samples to val_id_H-H-C-O_subset.lmdb

Processing H-C-C: 339 samples
  Saving to: dataset_per_adsorbate/val_id_H-C-C_subset.lmdb


  Processing H-C-C: 100%|██████████| 339/339 [00:00<00:00, 396726.86it/s]


  Saved 339 samples to val_id_H-C-C_subset.lmdb

Processing H: 608 samples
  Saving to: dataset_per_adsorbate/val_id_H_subset.lmdb


  Processing H: 100%|██████████| 608/608 [00:00<00:00, 448967.75it/s]


  Saved 608 samples to val_id_H_subset.lmdb

Processing C-C-O: 272 samples
  Saving to: dataset_per_adsorbate/val_id_C-C-O_subset.lmdb


  Processing C-C-O: 100%|██████████| 272/272 [00:00<00:00, 371612.60it/s]


  Saved 272 samples to val_id_C-C-O_subset.lmdb

Processing N-O-O-O: 367 samples
  Saving to: dataset_per_adsorbate/val_id_N-O-O-O_subset.lmdb


  Processing N-O-O-O: 100%|██████████| 367/367 [00:00<00:00, 422191.32it/s]


  Saved 367 samples to val_id_N-O-O-O_subset.lmdb

Processing N-N-O: 356 samples
  Saving to: dataset_per_adsorbate/val_id_N-N-O_subset.lmdb


  Processing N-N-O: 100%|██████████| 356/356 [00:00<00:00, 387635.57it/s]


  Saved 356 samples to val_id_N-N-O_subset.lmdb

Processing H-H-H-N-O: 352 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-N-O_subset.lmdb


  Processing H-H-H-N-O: 100%|██████████| 352/352 [00:00<00:00, 400541.24it/s]


  Saved 352 samples to val_id_H-H-H-N-O_subset.lmdb

Processing N-O: 411 samples
  Saving to: dataset_per_adsorbate/val_id_N-O_subset.lmdb


  Processing N-O: 100%|██████████| 411/411 [00:00<00:00, 427438.37it/s]


  Saved 411 samples to val_id_N-O_subset.lmdb

Processing H-H-H-H-H-H-C-C-N-N-O: 328 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-H-H-C-C-N-N-O_subset.lmdb


  Processing H-H-H-H-H-H-C-C-N-N-O: 100%|██████████| 328/328 [00:00<00:00, 400271.08it/s]


  Saved 328 samples to val_id_H-H-H-H-H-H-C-C-N-N-O_subset.lmdb

Processing H-C-C-O: 652 samples
  Saving to: dataset_per_adsorbate/val_id_H-C-C-O_subset.lmdb


  Processing H-C-C-O: 100%|██████████| 652/652 [00:00<00:00, 466988.77it/s]

  Saved 652 samples to val_id_H-C-C-O_subset.lmdb

Processing OH: 493 samples
  Saving to: dataset_per_adsorbate/val_id_OH_subset.lmdb



  Processing OH: 100%|██████████| 493/493 [00:00<00:00, 453065.70it/s]


  Saved 493 samples to val_id_OH_subset.lmdb

Processing H-N-N: 359 samples
  Saving to: dataset_per_adsorbate/val_id_H-N-N_subset.lmdb


  Processing H-N-N: 100%|██████████| 359/359 [00:00<00:00, 406192.38it/s]


  Saved 359 samples to val_id_H-N-N_subset.lmdb

Processing H-H-H-H-H-C-C-O-O: 314 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-H-C-C-O-O_subset.lmdb


  Processing H-H-H-H-H-C-C-O-O: 100%|██████████| 314/314 [00:00<00:00, 497736.76it/s]


  Saved 314 samples to val_id_H-H-H-H-H-C-C-O-O_subset.lmdb

Processing H-H-H-C: 437 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-C_subset.lmdb


  Processing H-H-H-C: 100%|██████████| 437/437 [00:00<00:00, 415437.64it/s]


  Saved 437 samples to val_id_H-H-H-C_subset.lmdb

Processing H-N-N-O: 334 samples
  Saving to: dataset_per_adsorbate/val_id_H-N-N-O_subset.lmdb


  Processing H-N-N-O: 100%|██████████| 334/334 [00:00<00:00, 416066.98it/s]


  Saved 334 samples to val_id_H-N-N-O_subset.lmdb

Processing H-H-H-H-C-O: 345 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-C-O_subset.lmdb


  Processing H-H-H-H-C-O: 100%|██████████| 345/345 [00:00<00:00, 500357.84it/s]


  Saved 345 samples to val_id_H-H-H-H-C-O_subset.lmdb

Processing H-H-N-N-O: 292 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-N-N-O_subset.lmdb


  Processing H-H-N-N-O: 100%|██████████| 292/292 [00:00<00:00, 378238.66it/s]


  Saved 292 samples to val_id_H-H-N-N-O_subset.lmdb

Processing H-H-H-H-C: 362 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-C_subset.lmdb


  Processing H-H-H-H-C: 100%|██████████| 362/362 [00:00<00:00, 502927.48it/s]


  Saved 362 samples to val_id_H-H-H-H-C_subset.lmdb

Processing H-C-C-O-O: 353 samples
  Saving to: dataset_per_adsorbate/val_id_H-C-C-O-O_subset.lmdb


  Processing H-C-C-O-O: 100%|██████████| 353/353 [00:00<00:00, 411389.08it/s]


  Saved 353 samples to val_id_H-C-C-O-O_subset.lmdb

Processing H-H-H-H-C-N-N-O: 251 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-C-N-N-O_subset.lmdb


  Processing H-H-H-H-C-N-N-O: 100%|██████████| 251/251 [00:00<00:00, 379924.32it/s]


  Saved 251 samples to val_id_H-H-H-H-C-N-N-O_subset.lmdb

Processing N-N: 302 samples
  Saving to: dataset_per_adsorbate/val_id_N-N_subset.lmdb


  Processing N-N: 100%|██████████| 302/302 [00:00<00:00, 393012.66it/s]


  Saved 302 samples to val_id_N-N_subset.lmdb

Processing H-H-H-H-H-H-C-C-O: 320 samples
  Saving to: dataset_per_adsorbate/val_id_H-H-H-H-H-H-C-C-O_subset.lmdb


  Processing H-H-H-H-H-H-C-C-O: 100%|██████████| 320/320 [00:00<00:00, 495085.68it/s]


  Saved 320 samples to val_id_H-H-H-H-H-H-C-C-O_subset.lmdb

       Splitting Complete       
All adsorbate subsets saved to: dataset_per_adsorbate


In [1]:
import lmdb
import pickle
from tqdm import tqdm
import json
import os
import glob

def extract_ref_energy_from_lmdb(lmdb_path, output_json_path):

    if not os.path.exists(lmdb_path):
        print(f"File not found: {lmdb_path}")
        return []

    try:
        env = lmdb.open(
            lmdb_path,
            subdir=False,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False
        )
    except Exception as e:
        print(f"Failed to open LMDB {lmdb_path}: {e}")
        return []
    
    ref_energies = []
    
    print(f"\nProcessing: {os.path.basename(lmdb_path)}")
    
    with env.begin(write=False) as txn:
        cursor = txn.cursor()
        total_entries = env.stat()['entries']
        
        file_name = os.path.basename(lmdb_path)
        
        for key, value in tqdm(cursor, total=total_entries, desc=f"Extracting {file_name}", leave=False):
            try:
                key_str = key.decode('ascii')
                if key_str == 'length':
                    continue
                
                data = pickle.loads(value)
                
                if "ref_energy" in data:
                    ref_energy = data["ref_energy"]
                    if hasattr(ref_energy, 'item'):
                        ref_energy = ref_energy.item()
                    elif hasattr(ref_energy, 'tolist'):
                        ref_energy = ref_energy.tolist()
                    ref_energies.append(ref_energy)
                else:
                    ref_energies.append(None)
                
            except Exception as e:
                continue
    
    env.close()
    
    output_data = {
        "lmdb_source": os.path.basename(lmdb_path),
        "ref_energies": ref_energies,
        "total_samples": len(ref_energies),
        "samples_with_ref_energy": sum(1 for e in ref_energies if e is not None)
    }
    
    with open(output_json_path, 'w') as f:
        json.dump(output_data, f, indent=2)
    
    print(f" -> Saved to: {output_json_path}")
    print(f" -> Valid Samples: {output_data['samples_with_ref_energy']} / {output_data['total_samples']}")
    
    return ref_energies

def process_all_lmdbs(target_directory):

    search_pattern = os.path.join(target_directory, "*.lmdb")
    lmdb_files = glob.glob(search_pattern)

    print(f"{'='*50}")
    print(f"Found {len(lmdb_files)} LMDB files in '{target_directory}'")
    print(f"{'='*50}")

    for lmdb_path in lmdb_files:

        base_name = os.path.splitext(lmdb_path)[0] 
        output_json_path = f"{base_name}_ref_energy.json"
        
        extract_ref_energy_from_lmdb(lmdb_path, output_json_path)

    print(f"\n{'='*50}")
    print("All processing complete.")
    print(f"{'='*50}")

target_dir = "/home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id"

process_all_lmdbs(target_dir)

Found 46 LMDB files in '/home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id'

Processing: val_id_C-C_subset.lmdb


                                                                                  

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_C-C_subset_ref_energy.json
 -> Valid Samples: 461 / 461

Processing: val_id_H-H-H-H-C-N-N-O_subset.lmdb


                                                                                      

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-C-N-N-O_subset_ref_energy.json
 -> Valid Samples: 251 / 251

Processing: val_id_N-N-O-O-O-O_subset.lmdb


                                                                                  

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N-N-O-O-O-O_subset_ref_energy.json
 -> Valid Samples: 249 / 249

Processing: val_id_N-O-O-O_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N-O-O-O_subset_ref_energy.json
 -> Valid Samples: 367 / 367

Processing: val_id_N-N_subset.lmdb


                                                                          

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N-N_subset_ref_energy.json
 -> Valid Samples: 302 / 302

Processing: val_id_H-H-H-N-O_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-N-O_subset_ref_energy.json
 -> Valid Samples: 352 / 352

Processing: val_id_C_subset.lmdb


                                                                        

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_C_subset_ref_energy.json
 -> Valid Samples: 597 / 597

Processing: val_id_H-H-H-C-C-O-O_subset.lmdb


                                                                                    

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-C-C-O-O_subset_ref_energy.json
 -> Valid Samples: 581 / 581

Processing: val_id_N-O-O_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N-O-O_subset_ref_energy.json
 -> Valid Samples: 396 / 396

Processing: val_id_H-C-C_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-C-C_subset_ref_energy.json
 -> Valid Samples: 339 / 339

Processing: val_id_N-O_subset.lmdb


                                                                          

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N-O_subset_ref_energy.json
 -> Valid Samples: 411 / 411

Processing: val_id_H-H-H-H-H-C-C-O-O_subset.lmdb


                                                                                        

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-H-C-C-O-O_subset_ref_energy.json
 -> Valid Samples: 314 / 314

Processing: val_id_N-N-O_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N-N-O_subset_ref_energy.json
 -> Valid Samples: 356 / 356

Processing: val_id_N_subset.lmdb


                                                                        

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_N_subset_ref_energy.json
 -> Valid Samples: 573 / 573

Processing: val_id_OH_subset.lmdb


                                                                         

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_OH_subset_ref_energy.json
 -> Valid Samples: 493 / 493

Processing: val_id_H-H-H-H-H-H-C-C-O_subset.lmdb


                                                                                        

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-H-H-C-C-O_subset_ref_energy.json
 -> Valid Samples: 320 / 320

Processing: val_id_H-H-N-N_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-N-N_subset_ref_energy.json
 -> Valid Samples: 383 / 383

Processing: val_id_H-C-C-O-O_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-C-C-O-O_subset_ref_energy.json
 -> Valid Samples: 353 / 353

Processing: val_id_H-H-H-C-O_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-C-O_subset_ref_energy.json
 -> Valid Samples: 732 / 732

Processing: val_id_H-H-N-N-O_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-N-N-O_subset_ref_energy.json
 -> Valid Samples: 292 / 292

Processing: val_id_H-H-H-H-C-O_subset.lmdb


                                                                                  

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-C-O_subset_ref_energy.json
 -> Valid Samples: 345 / 345

Processing: val_id_H2O_subset.lmdb


                                                                          

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H2O_subset_ref_energy.json
 -> Valid Samples: 427 / 427

Processing: val_id_H-H-H-H-C-C-O-O_subset.lmdb


                                                                                       

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-C-C-O-O_subset_ref_energy.json
 -> Valid Samples: 1207 / 1207

Processing: val_id_H-H-H-H-C_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-C_subset_ref_energy.json
 -> Valid Samples: 362 / 362

Processing: val_id_H-H-H-C_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-C_subset_ref_energy.json
 -> Valid Samples: 437 / 437

Processing: val_id_H-H-C-O_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-C-O_subset_ref_energy.json
 -> Valid Samples: 791 / 791

Processing: val_id_C-N_subset.lmdb


                                                                          

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_C-N_subset_ref_energy.json
 -> Valid Samples: 324 / 324

Processing: val_id_H-H-H-H-C-C-O_subset.lmdb


                                                                                     

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-C-C-O_subset_ref_energy.json
 -> Valid Samples: 1272 / 1272

Processing: val_id_H-C-C-O_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-C-C-O_subset_ref_energy.json
 -> Valid Samples: 652 / 652

Processing: val_id_O_subset.lmdb


                                                                        

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_O_subset_ref_energy.json
 -> Valid Samples: 622 / 622

Processing: val_id_H-H-H-C-C_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-C-C_subset_ref_energy.json
 -> Valid Samples: 855 / 855

Processing: val_id_H-H-C-C-O_subset.lmdb


                                                                                

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-C-C-O_subset_ref_energy.json
 -> Valid Samples: 997 / 997

Processing: val_id_H-H-H-H-H-C-C-O_subset.lmdb


                                                                                       

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-H-C-C-O_subset_ref_energy.json
 -> Valid Samples: 1094 / 1094

Processing: val_id_H-H-C-C-O-O_subset.lmdb


                                                                                   

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-C-C-O-O_subset_ref_energy.json
 -> Valid Samples: 1226 / 1226

Processing: val_id_C-C-O_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_C-C-O_subset_ref_energy.json
 -> Valid Samples: 272 / 272

Processing: val_id_H-N-N_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-N-N_subset_ref_energy.json
 -> Valid Samples: 359 / 359

Processing: val_id_H_subset.lmdb


                                                                        

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H_subset_ref_energy.json
 -> Valid Samples: 608 / 608

Processing: val_id_H-H-H-H-H-H-C-C-N-N-O_subset.lmdb


                                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-H-H-C-C-N-N-O_subset_ref_energy.json
 -> Valid Samples: 328 / 328

Processing: val_id_H-N-N-O_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-N-N-O_subset_ref_energy.json
 -> Valid Samples: 334 / 334

Processing: val_id_H-H-H-H-H-C-C_subset.lmdb


                                                                                    

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-H-H-C-C_subset_ref_energy.json
 -> Valid Samples: 375 / 375

Processing: val_id_H-H-C-C_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-C-C_subset_ref_energy.json
 -> Valid Samples: 807 / 807

Processing: val_id_H-H-H-C-C-O_subset.lmdb


                                                                                   

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-C-C-O_subset_ref_energy.json
 -> Valid Samples: 1071 / 1071

Processing: val_id_H-H-C_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-C_subset_ref_energy.json
 -> Valid Samples: 525 / 525

Processing: val_id_H-N_subset.lmdb


                                                                          

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-N_subset_ref_energy.json
 -> Valid Samples: 469 / 469

Processing: val_id_H-N-O_subset.lmdb


                                                                            

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-N-O_subset_ref_energy.json
 -> Valid Samples: 408 / 408

Processing: val_id_H-H-H-N_subset.lmdb


                                                                              

 -> Saved to: /home/jovyan/mk-catgen-data/dataset_per_adsorbate/val_id/val_id_H-H-H-N_subset_ref_energy.json
 -> Valid Samples: 398 / 398

All processing complete.
