In [1]:
import numpy as np
import os

from cclib.io import ccread
from cclib.parser.utils import convertor as conv

from ase.io import read as aseread
from ase.units import kcal, mol, eV, Hartree, Bohr, Angstrom

from pathlib import Path
from natsort import natsorted

In [2]:
def list_files(path, ext):
    """List the files in a directory that carry a given extension.

    Only regular files located directly inside ``path`` are considered;
    sub-directories are neither returned nor descended into.

    Args:
        path (str or Path): directory to scan
        ext (str): extension, with or without the leading dot (log, .out, ...)

    Returns:
        list: matching file paths as strings, ordered by natsorted
    """
    # Normalise so that both 'log' and '.log' select the same files.
    suffix = f".{ext.lstrip('.')}"
    # Convert to str before sorting: pathlib objects could not be natsorted directly.
    matches = [
        str(entry)
        for entry in Path(path).iterdir()
        if entry.is_file() and entry.suffix == suffix
    ]

    return natsorted(matches)

In [3]:
def get_E(file, parser):
    """Read the final energy from a calculation output file.

    No unit conversion is applied here: the value is returned exactly as
    the chosen parser reports it (presumably eV for ASE and for cclib 1.x,
    not kcal/mol - TODO confirm for the installed versions).

    Args:
        file (str): full path to file
        parser (str): ase or cclib

    Raises:
        ValueError: wrong parser

    Returns:
        E: energy in the parser's native units
    """
    if parser == 'cclib':
        # scfenergies is indexed per step; keep the last entry
        return ccread(file).scfenergies[-1]
    if parser == 'ase':
        return aseread(file).get_potential_energy()

    raise ValueError('Unknown parser')

In [4]:
def get_forces(file, parser):
    """Read the per-atom forces of the final step from an output file.

    No unit conversion is applied here; the array is returned as the
    chosen parser reports it.
    NOTE(review): the cclib branch returns ``grads``, which may differ from
    ASE forces in sign convention and units - confirm before mixing parsers.

    Args:
        file (str): full path to file
        parser (str): ase or cclib

    Raises:
        ValueError: wrong parser

    Returns:
        F: forces (ase) or last entry of grads (cclib), in native units
    """
    if parser == 'cclib':
        # grads is indexed per step; keep the last entry
        return ccread(file).grads[-1]
    if parser == 'ase':
        return aseread(file).get_forces()

    raise ValueError('Unknown parser')

In [19]:
def get_positions(file, parser):
    """Read the atomic coordinates of the final geometry from an output file.

    Args:
        file (str): full path to file
        parser (str): ase or cclib

    Raises:
        ValueError: wrong parser

    Returns:
        R: atomic positions as reported by the parser (no unit conversion)
    """
    if parser == 'cclib':
        # cclib exposes geometries as `atomcoords`, one entry per step;
        # keep the last one.
        return ccread(file).atomcoords[-1]
    if parser == 'ase':
        return aseread(file).get_positions()

    # Reached only when neither parser matched. The branches above return
    # early, so a valid 'cclib' request can no longer fall through to here.
    raise ValueError('Unknown parser')

In [20]:
def get_atomic_numbers(file, parser):
    """Read the atomic numbers of all atoms from an output file.

    Args:
        file (str): full path to file
        parser (str): ase or cclib

    Raises:
        ValueError: wrong parser

    Returns:
        Z: atomic numbers, one per atom
    """
    if parser == 'cclib':
        # cclib exposes atomic numbers as `atomnos`. It is a per-atom array,
        # so it is returned whole; indexing with [-1] would keep only the
        # last atom.
        return ccread(file).atomnos
    if parser == 'ase':
        return aseread(file).get_atomic_numbers()

    # Reached only when neither parser matched. The branches above return
    # early, so a valid 'cclib' request can no longer fall through to here.
    raise ValueError('Unknown parser')

In [21]:
def save_npy(array, filename):
    """Write ``array`` to ``filename`` with ``np.save``.

    Args:
        array: data to store
        filename: destination passed to ``np.save`` as its file argument
    """
    np.save(file=filename, arr=array)

In [9]:
def convert_array(array, new_units):
    """Scale ``array`` by the conversion factor ``new_units``.

    Args:
        array: values to convert
        new_units: multiplicative factor applied to every value

    Returns:
        the product ``array * new_units``; the caller's name is not rebound
    """
    return array * new_units

In [10]:
def _read_single(file, parser):
    """Parse one output file once and return its (E, F, R, Z) values."""
    if parser == 'cclib':
        data = ccread(file)
        # Per-step attributes keep their last entry; `atomnos` is per-atom
        # and is returned whole.
        return data.scfenergies[-1], data.grads[-1], data.atomcoords[-1], data.atomnos
    if parser == 'ase':
        atoms = aseread(file)
        return (
            atoms.get_potential_energy(),
            atoms.get_forces(),
            atoms.get_positions(),
            atoms.get_atomic_numbers(),
        )

    raise ValueError('Unknown parser')


def parse_output(path, ext, parser):
    """Collect energies, forces, positions and atomic numbers from a directory.

    Every file returned by ``list_files(path, ext)`` is parsed exactly once.
    Values are returned as the parser reports them; no unit conversion is
    applied here.

    Args:
        path (str or Path): directory holding the output files
        ext (str): file extension forwarded to ``list_files``
        parser (str): ase or cclib

    Raises:
        ValueError: wrong parser (only raised if at least one file is found)

    Returns:
        tuple: ``(E, F, R, Z, p)`` as numpy arrays with one entry per file,
        in ``list_files`` order. ``p`` holds the file paths as
        ``pathlib.Path`` objects.
    """
    files = list_files(path, ext)

    E, F, R, Z = [], [], [], []
    for file in files:
        energy, forces, positions, numbers = _read_single(file, parser)
        E.append(energy)
        F.append(forces)
        R.append(positions)
        Z.append(numbers)

    p = [Path(file) for file in files]

    return np.array(E), np.array(F), np.array(R), np.array(Z), np.array(p)


In [16]:
cd ..

/vast/projects/ml4chem/NikitaFedik/DATA/ml-tps-ad-az


In [17]:
pwd

'/vast/projects/ml4chem/NikitaFedik/DATA/ml-tps-ad-az'

### save azobenzene data

In [None]:
# Parse every .log file of the azobenzene (AZ) run with the ASE reader.
# The path is relative, so the result depends on the current working directory.
az_path = 'data/AZ/thermal_MD_10k/logs'
az_parsed = parse_output(path=az_path, ext='log', parser='ase')
az_E, az_F, az_R, az_Z, az_p = az_parsed

In [None]:
# Switch into the AZ run directory so the relative file names used by the
# save_npy calls further down are written there.
# NOTE: relative path - this cell is not safe to re-run once the working
# directory has already changed.
os.chdir('data/AZ/thermal_MD_10k')

In [24]:
# Conversion factor built from ase.units. This assumes the parsed values are
# in ASE's default energy unit (presumably eV, eV/A for forces) - TODO confirm.
to_kcal_mol = 1/(kcal/mol)
az_E_kcal = convert_array(az_E, to_kcal_mol)
az_F_kcal = convert_array(az_F, to_kcal_mol)

In [25]:
# Centre the energies on the dataset mean (the shifted values average to zero),
# then write every array into the current working directory.
az_E_shifted = az_E_kcal - np.mean(az_E_kcal)
for fname, arr in (
    ('az_E_QM_kcal_mol.npy', az_E_shifted),
    ('az_F_QM_kcal_mol_A.npy', az_F_kcal),
    ('az_Z.npy', az_Z),
    ('az_R.npy', az_R),
    ('az_paths.npy', az_p),
):
    save_npy(arr, fname)

In [None]:
# Parse every .log file of the AD run with the ASE reader.
# The path is relative to the current working directory (changed by the
# os.chdir cell above), so run the notebook top to bottom.
ad_parsed = parse_output(path='../../AD/thermal_MD_10k/', ext='log', parser='ase')
ad_E, ad_F, ad_R, ad_Z, ad_p = ad_parsed

In [None]:
# Switch into the AD run directory so the relative file names used by the
# save_npy calls further down are written there.
# The working directory at this point is data/AZ/thermal_MD_10k, so the AD run
# sits two levels up. This is the same directory the AD logs were parsed from
# ('../../AD/thermal_MD_10k/'); '../AD/...' would resolve inside data/AZ.
# NOTE: relative path - this cell is not safe to re-run once the working
# directory has already changed.
os.chdir('../../AD/thermal_MD_10k')

In [30]:
pwd

'/vast/projects/ml4chem/NikitaFedik/DATA/ml-tps-ad-az/data/AD'

In [31]:
# Conversion factor built from ase.units. This assumes the parsed values are
# in ASE's default energy unit (presumably eV, eV/A for forces) - TODO confirm.
to_kcal_mol = 1/(kcal/mol)
ad_E_kcal = convert_array(ad_E, to_kcal_mol)
ad_F_kcal = convert_array(ad_F, to_kcal_mol)

In [32]:
# Centre the energies on the dataset mean (the shifted values average to zero),
# then write every array into the current working directory.
ad_E_shifted = ad_E_kcal - np.mean(ad_E_kcal)
for fname, arr in (
    ('ad_E_QM_kcal_mol.npy', ad_E_shifted),
    ('ad_F_QM_kcal_mol_A.npy', ad_F_kcal),
    ('ad_Z.npy', ad_Z),
    ('ad_R.npy', ad_R),
    ('ad_paths.npy', ad_p),
):
    save_npy(arr, fname)

In [33]:
ad_E_shifted

array([-7.92819498,  5.66827054,  8.72458429, ..., -1.1404432 ,
        4.58517161,  2.59057314])