In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import json
import gzip
import multiprocessing as mp
from tqdm.autonotebook import tqdm
from atomate2.lobster.schemas import LobsterTaskDocument, CondensedBondingAnalysis
from pathlib import Path
from pymatgen.core import Structure
from pymatgen.electronic_structure.cohp import Cohp
from pymatgen.electronic_structure.plotter import CohpPlotter
from pymatgen.electronic_structure.cohp import CompleteCohp
from pymatgen.electronic_structure.dos import LobsterCompleteDos
from multiprocessing.pool import ThreadPool

#### Change to the directory containing the raw calculation files (these will be provided at the time of final publication)

In [None]:
# Remember the directory the notebook is running in; a later cell changes back to it.
parent=os.getcwd()
# Directory holding the LOBSTER output folders. The commented-out alternative in the
# original cell was the relative path 'Results/'; switch to it (or to your own location)
# when working with a local copy of the data.
RESULTS_DIR = '/hpc-user/AG-JGeorge/anaik/Phonon_dataset_LSO/Results/'
# The following cell lists the current working directory, so move into the results folder.
os.chdir(RESULTS_DIR)

In [None]:
# Sub-directories of the current working directory, ignoring every entry whose name
# starts with 't', '.' or '__'.
mpids = [
    entry
    for entry in os.listdir()
    if not entry.startswith(('t', '.', '__')) and os.path.isdir(entry)
]
# Unique identifiers: the part of each folder name in front of the first underscore, sorted.
# NOTE(review): these shortened names are later used as folder names again; confirm that the
# folders either carry no '_suffix' or that a folder with the bare name exists as well.
mats = sorted({folder_name.split('_')[0] for folder_name in mpids})
# Return to the directory stored in `parent`.
os.chdir(parent)

In [None]:
def replace_inf_values(data):
    """
    Walk through a nested structure of dictionaries and lists and swap every value equal to
    float('-inf') for the string '-Infinity'.

    The structure is modified in place and nothing is returned. Only dictionaries and lists
    are descended into; any other kind of input is left as it is.
    """
    if isinstance(data, dict):
        slots = data.items()
    elif isinstance(data, list):
        slots = enumerate(data)
    else:
        return  # neither a dict nor a list: nothing to traverse

    minus_infinity = float('-inf')
    for position, entry in slots:
        if isinstance(entry, (dict, list)):
            replace_inf_values(entry)  # descend into the nested container
        elif entry == minus_infinity:
            data[position] = '-Infinity'  # overwrite the slot with the string form

In [None]:
def _field_to_json_entry(attribute, value):
    """
    Build the ``{attribute: payload}`` dictionary that is written to the JSON file for one
    field of the task document.

    Values offering a ``dict()`` method are converted with it, values offering ``as_dict()``
    are converted with that, and every other value is stored unchanged. For fields whose name
    contains 'lobsterpy_data', the entries of 'cohp_plot_data' that offer ``as_dict()`` are
    converted as well and -inf values are replaced via ``replace_inf_values``.
    """
    if hasattr(value, 'dict'):
        payload = value.dict()
        if 'lobsterpy_data' in attribute:
            # Tolerate a missing or empty 'cohp_plot_data' entry instead of failing on it.
            plot_data = payload.get('cohp_plot_data') or {}
            for key, item in plot_data.items():
                if hasattr(item, 'as_dict'):  # item offers `as_dict` (i.e. it is a pymatgen object)
                    plot_data[key] = item.as_dict()
            entry = {attribute: payload}
            replace_inf_values(entry)
            return entry
        return {attribute: payload}
    if hasattr(value, 'as_dict'):
        return {attribute: value.as_dict()}
    return {attribute: value}


def create_jsons(mpid, json_save_dir='/path/to/store/computationaldatajsons/',
                 results_dir='/hpc-user/AG-JGeorge/anaik/Phonon_dataset_LSO/Results/'):
    """
    This function will generate a LobsterTaskDocument object for one calculation folder and
    save it as a gzipped json file named ``<mpid>.json.gz`` inside ``json_save_dir``.

    The file contains a JSON list with one single-key dictionary per field of the document;
    the fields 'dir_name' and 'last_updated' are left out.

    The working directory of the calling process is not changed, so a relative
    ``json_save_dir`` is resolved against the caller's current directory.

    Args:
        mpid: name of the calculation folder inside ``results_dir``.
        json_save_dir: existing directory in which the json file is written.
        results_dir: directory that contains the calculation folders.

    Returns:
        The string ``mpid + ' Done'``.

    Raises:
        FileNotFoundError: if ``results_dir/mpid`` is not an existing directory.
    """
    calc_dir = os.path.abspath(os.path.join(results_dir, mpid))
    if not os.path.isdir(calc_dir):
        raise FileNotFoundError('Calculation directory not found: {}'.format(calc_dir))

    output = LobsterTaskDocument.from_directory(dir_name=calc_dir, save_cohp_plots=False, store_lso_dos=True)

    json_path = os.path.join(json_save_dir, "{}.json.gz".format(mpid))
    with gzip.open(json_path, 'wt', encoding='UTF-8') as f:
        f.write('[')
        wrote_entry = False
        for attribute in output.__fields__.keys():
            if attribute in ('dir_name', 'last_updated'):
                continue
            # Put the separator in front of every entry but the first one. Deciding by
            # "is this the last declared field" leaves a dangling comma (invalid JSON)
            # whenever the last declared field is one of the skipped ones.
            if wrote_entry:
                f.write(',')
            json.dump(_field_to_json_entry(attribute, getattr(output, attribute)), f)
            wrote_entry = True
        f.write(']')
    return mpid+' Done'

In [None]:
# Caution: adjust the number of parallel processes to your system. The tasks are memory
# intensive, so it is better to submit this as a job on an HPC system.
items=mats
with mp.Pool(processes=4,maxtasksperchild=1) as pool:
    # Tasks are handed out one at a time (chunksize=1) and every worker process is replaced
    # after a single task (maxtasksperchild=1); results arrive in completion order.
    results = tqdm(
        pool.imap_unordered(create_jsons, items, chunksize=1),
        total=len(items),
    )
    # Gather the status strings returned by create_jsons while the progress bar advances.
    row=[]
    row.extend(results)