In [None]:
import os
import json
import gzip
import multiprocessing as mp
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from __future__ import annotations
from tqdm.autonotebook import tqdm
from atomate2.lobster.schemas import LobsterTaskDocument, CondensedBondingAnalysis
from pydantic import create_model
from pymatgen.io.lobster import Charge,MadelungEnergies, Lobsterout, Lobsterin,Doscar, Bandoverlaps
from pymatgen.analysis.bond_valence import BVAnalyzer
from pymatgen.core.structure import Structure
from pymatgen.io.vasp.outputs import Vasprun

#### Change to the directory containing the raw calculation files (these will be provided at the time of final publication)

In [None]:
# Record the directory the kernel is currently running in and (re-)select it as
# the working directory. As written this is a no-op; it is the hook where the
# directory holding the LOBSTER outputs can be selected instead (see the
# commented-out line below).
parent = os.getcwd()
os.chdir(parent)
# os.chdir('Results/')  # Directory consisting of lobster outputs

In [None]:
# Calculation directories found in the current working directory (at the time
# this cell runs). Entries whose names start with 't', '.' or '__' are skipped,
# as is anything that is not a directory.
mpids = [
    entry
    for entry in os.listdir()
    if not entry.startswith(('t', '.', '__')) and os.path.isdir(entry)
]

# Unique, sorted identifiers: the part of each directory name before the first '_'.
mats = sorted({dir_name.split('_')[0] for dir_name in mpids})

In [None]:
# Directory that contains one sub-directory of LOBSTER outputs per calculation.
# The original cluster path stays the default; set the LOBSTER_RESULTS_DIR
# environment variable to run the notebook on a different machine without
# editing this cell.
RESULTS_DIR = os.environ.get(
    'LOBSTER_RESULTS_DIR',
    '/hpc-user/AG-JGeorge/anaik/Phonon_dataset_LSO/Results/',
)
os.chdir(RESULTS_DIR)

In [None]:
def _charge_signs(values) -> list:
    """Label every value as "POS" (value >= 0) or "NEG" (value < 0)."""
    return ["POS" if value >= 0 else "NEG" for value in values]


def get_lobster_calc_quality_summary(
    path_to_calc_dir: str,
    dos_comparison: bool = False,
    e_range: list = [-15, 0],
    n_bins: int | None = None,
    bva_comp: bool = False,
) -> dict:
    """
    Analyze the quality of a LOBSTER calculation stored in one directory.

    The directory is expected to contain the gzipped files POSCAR.gz,
    POTCAR.gz, lobsterin.gz and lobsterout.gz. bandOverlaps.lobster.gz is
    optional. CHARGE.lobster.gz is only read when ``bva_comp`` is True;
    DOSCAR.LSO.lobster.gz and vasprun.xml.gz only when ``dos_comparison``
    is True.

    Args:
        path_to_calc_dir: directory containing the calculation files.
        dos_comparison: if True, compare VASP and LOBSTER projected DOS via
            Tanimoto similarity of DOS fingerprints.
        e_range: [min, max] energy window for the DOS fingerprints. The first
            and last entries are used. If the requested minimum (maximum) lies
            outside the energies available in either DOS, -5 (0) is used
            instead. The default list is never mutated.
        n_bins: number of fingerprint bins; 56 is used when falsy.
        bva_comp: if True, compare the signs of Mulliken / Loewdin charges
            with the signs of BVAnalyzer valences.

    Returns:
        A dict with the keys "minimal_basis", "charge_spilling" and
        "band_overlaps", plus "charges" (if ``bva_comp``) and
        "dos_comparisons" (if ``dos_comparison``). The dict is returned
        regardless of which optional comparisons were requested.
    """
    path_to_poscar = os.path.join(path_to_calc_dir, 'POSCAR.gz')
    path_to_lobsterout = os.path.join(path_to_calc_dir, 'lobsterout.gz')
    path_to_potcar = os.path.join(path_to_calc_dir, 'POTCAR.gz')
    path_to_lobsterin = os.path.join(path_to_calc_dir, 'lobsterin.gz')
    path_to_bandoverlaps = os.path.join(path_to_calc_dir, 'bandOverlaps.lobster.gz')
    path_to_charge = os.path.join(path_to_calc_dir, 'CHARGE.lobster.gz')
    path_to_doscar = os.path.join(path_to_calc_dir, 'DOSCAR.LSO.lobster.gz')
    path_to_vasprun = os.path.join(path_to_calc_dir, 'vasprun.xml.gz')

    quality_dict = {}

    # ---- basis set: compare the basis used in lobsterin with the reference ----
    potcar_names = Lobsterin._get_potcar_symbols(POTCAR_input=path_to_potcar)
    struct = Structure.from_file(path_to_poscar)
    ref_bases = Lobsterin.get_all_possible_basis_functions(
        structure=struct, potcar_symbols=potcar_names
    )

    lobs_in = Lobsterin.from_file(path_to_lobsterin)
    # Drop the first whitespace-separated token of every entry (presumably the
    # element label) so only the orbital list is compared.
    calc_basis = [
        " ".join(basis.split()[1:]) for basis in lobs_in["basisfunctions"]
    ]

    # The first entry of ref_bases is the one treated as the minimal basis.
    quality_dict["minimal_basis"] = calc_basis == list(ref_bases[0].values())
    if not quality_dict["minimal_basis"]:
        warnings.warn(
            "Consider rerunning the calc with the minimum basis as well. Choosing is "
            "larger basis set is recommended if you see a significant improvement of "
            "the charge spilling and material has non-zero band gap."
        )

    # ---- charge spilling from lobsterout ----
    lob_out = Lobsterout(path_to_lobsterout)
    quality_dict["charge_spilling"] = {
        "abs_charge_spilling": round((sum(lob_out.charge_spilling) / 2) * 100, 4)
    }

    # ---- band overlaps: only analysed when bandOverlaps.lobster.gz exists ----
    if os.path.isfile(path_to_bandoverlaps):
        limit_max_deviation = 0.1  # pymatgen default limit
        band_overlaps = Bandoverlaps(filename=path_to_bandoverlaps)

        # The total number of k-points is parsed from the lobsterout warning;
        # it stays None when no such warning line is present.
        total_kpoints = None
        for line in lob_out.warning_lines:
            if 'k-points could not be orthonormalized' in line:
                total_kpoints = int(line.split(' ')[2])

        n_above_limit = sum(
            1 for dev in band_overlaps.max_deviation if dev > limit_max_deviation
        )
        if total_kpoints:
            percent_above_limit = (n_above_limit / total_kpoints) * 100
        else:
            # Without a (non-zero) total the percentage cannot be computed.
            percent_above_limit = None

        quality_dict["band_overlaps"] = {
            "file_exists": True,
            "limit_maxDeviation": limit_max_deviation,
            "has_good_quality_maxDeviation": band_overlaps.has_good_quality_maxDeviation(
                limit_maxDeviation=limit_max_deviation
            ),
            "max_deviation": max(band_overlaps.max_deviation),
            "percent_kpoints_abv_limit": percent_above_limit,
        }
    else:
        quality_dict["band_overlaps"] = {
            "file_exists": False,
            "limit_maxDeviation": None,
            "has_good_quality_maxDeviation": True,
            "max_deviation": None,
            "percent_kpoints_abv_limit": None,
        }

    # ---- compare Mulliken / Loewdin charge signs with BVA valence signs ----
    if bva_comp:
        try:
            bond_valence = BVAnalyzer()
            lobs_charge = Charge(filename=path_to_charge)

            bva_oxi = _charge_signs(bond_valence.get_valences(structure=struct))
            mull_oxi = _charge_signs(lobs_charge.Mulliken)
            loew_oxi = _charge_signs(lobs_charge.Loewdin)

            quality_dict["charges"] = {
                "BVA_Mulliken_agree": mull_oxi == bva_oxi,
                # Compare the Loewdin signs here, not the Mulliken ones.
                "BVA_Loewdin_agree": loew_oxi == bva_oxi,
            }
        except ValueError:
            quality_dict["charges"] = {}
            warnings.warn(
                "Oxidation states from BVA analyzer cannot be determined. "
                "Thus BVA charge comparison will be skipped"
            )

    # ---- Tanimoto similarity of VASP vs. LOBSTER DOS fingerprints ----
    if dos_comparison:
        if "LSO" not in str(path_to_doscar).split("."):
            warnings.warn(
                "Consider using DOSCAR.LSO.lobster, as non LSO DOS from LOBSTER can have "
                "negative DOS values"
            )
        doscar_lobster = Doscar(
            doscar=path_to_doscar, structure_file=path_to_poscar, dftprogram="Vasp"
        )
        dos_lobster = doscar_lobster.completedos

        vasprun = Vasprun(path_to_vasprun)
        dos_vasp = vasprun.complete_dos

        # The energy window, bin count and grid check do not depend on the
        # orbital, so they are determined once, before the per-orbital loop.
        if e_range[0] >= min(dos_vasp.energies) and e_range[0] >= min(
            dos_lobster.energies
        ):
            min_e = e_range[0]
        else:
            warnings.warn(
                "Mimimum energy range requested for DOS comparisons is not available "
                "in VASP or LOBSTER calculation. Thus, setting min_e to -5 eV"
            )
            min_e = -5

        if e_range[-1] <= max(dos_vasp.energies) and e_range[-1] <= max(
            dos_lobster.energies
        ):
            max_e = e_range[-1]
        else:
            warnings.warn(
                "Maximum energy range requested for DOS comparisons is not available "
                "in VASP or LOBSTER calculation. Thus, setting max_e to 0 eV"
            )
            max_e = 0

        if (
            np.diff(dos_vasp.energies)[0] >= 0.1
            and np.diff(dos_lobster.energies)[0] >= 0.1
        ):
            warnings.warn(
                "Input DOS files have very few points in the energy interval and thus "
                "comparisons will not be reliable. Please rerun the calculations with "
                "higher number of DOS points. Set NEDOS and COHPSteps tags to >= 2000 in VASP and LOBSTER "
                "calculations, respectively."
            )

        if not n_bins:
            n_bins = 56

        dos_comparisons = {}

        # One Tanimoto index per orbital type present in the LOBSTER DOS.
        for orb in dos_lobster.get_spd_dos():
            fp_lobster_orb = dos_lobster.get_dos_fp(
                min_e=min_e,
                max_e=max_e,
                n_bins=n_bins,
                normalize=True,
                type=orb.name,
            )
            fp_vasp_orb = dos_vasp.get_dos_fp(
                min_e=min_e,
                max_e=max_e,
                n_bins=n_bins,
                normalize=True,
                type=orb.name,
            )
            dos_comparisons["tanimoto_orb_{}".format(orb.name)] = round(
                dos_vasp.get_dos_fp_similarity(
                    fp_lobster_orb, fp_vasp_orb, tanimoto=True
                ),
                4,
            )

        # Tanimoto index of the summed projected DOS.
        fp_lobster = dos_lobster.get_dos_fp(
            min_e=min_e,
            max_e=max_e,
            n_bins=n_bins,
            normalize=True,
            type="summed_pdos",
        )
        fp_vasp = dos_vasp.get_dos_fp(
            min_e=min_e,
            max_e=max_e,
            n_bins=n_bins,
            normalize=True,
            type="summed_pdos",
        )
        dos_comparisons["tanimoto_summed"] = round(
            dos_vasp.get_dos_fp_similarity(fp_lobster, fp_vasp, tanimoto=True), 4
        )
        dos_comparisons["e_range"] = [min_e, max_e]
        dos_comparisons["n_bins"] = n_bins

        quality_dict["dos_comparisons"] = dos_comparisons

    # Always return the summary, not only when the DOS comparison ran.
    return quality_dict

In [None]:
def get_lobster_lightweight_json(mpid, json_save_dir='/path/to/store/lobsterlightweightjsons/'):
    """
    Write a gzipped "lightweight" JSON summary of one LOBSTER calculation.

    The output file ``<json_save_dir>/<mpid>.json.gz`` contains a JSON list of
    five single-key dicts, in this order: "cation_anion_bonds", "all_bonds",
    "madelung_energies", "charges", "calc_quality_summary".

    Args:
        mpid: path of the calculation directory (relative to the current
            working directory); also used as the output file stem.
        json_save_dir: existing directory in which the JSON file is stored.

    Returns:
        The string ``mpid + ' Done'``.

    Raises:
        FileNotFoundError: if ``json_save_dir`` is not an existing directory.
            Checked up front so no analysis time is spent on an output that
            cannot be written.
    """
    if not os.path.isdir(json_save_dir):
        raise FileNotFoundError(
            "Output directory does not exist: {}".format(json_save_dir)
        )

    which_bonds = ['cation-anion', 'all']
    entries = []

    for which_bond in which_bonds:
        # Derive the key before the try block so that the except branch always
        # labels the (empty) entry with the bond type of *this* iteration.
        bond_key = which_bond.replace('-', '_') + '_bonds'
        try:
            (lobsterpy_data,
             lobsterpy_text,
             sb_icobi,
             sb_icohp,
             sb_icoop) = CondensedBondingAnalysis.from_directory(dir_name=mpid, which_bonds=which_bond,
                                                                 save_cohp_plots=False)

            bond_entry = {
                'lobsterpy_data': lobsterpy_data.dict(),
                'lobsterpy_text': ["".join(lobsterpy_text.text)],
                'sb_icobi': sb_icobi.dict(),
                'sb_icohp': sb_icohp.dict(),
                'sb_icoop': sb_icoop.dict(),
            }

            cohp_plot_data = bond_entry['lobsterpy_data']['cohp_plot_data']
            for key, value in cohp_plot_data.items():
                # Values exposing `as_dict` (i.e. pymatgen objects) are not
                # JSON serializable as-is; replace them by their dict form.
                if hasattr(value, 'as_dict'):
                    cohp_plot_data[key] = value.as_dict()
        except AttributeError:
            # Best effort: an analysis that cannot be completed is stored as
            # an empty entry instead of aborting the whole summary.
            warnings.warn(
                "Bonding analysis '{}' failed for {}; storing an empty entry".format(which_bond, mpid)
            )
            bond_entry = {}

        entries.append({bond_key: bond_entry})

    madelung_energies_path = os.path.join(mpid, "MadelungEnergies.lobster.gz")
    charge_path = os.path.join(mpid, "CHARGE.lobster.gz")

    madelung_obj = MadelungEnergies(filename=madelung_energies_path)
    entries.append({'madelung_energies': {
        "Mulliken": madelung_obj.madelungenergies_Mulliken,
        "Loewdin": madelung_obj.madelungenergies_Loewdin,
        "Ewald_splitting": madelung_obj.ewald_splitting,
    }})

    charge = Charge(charge_path)
    entries.append({"charges": {"Mulliken": charge.Mulliken, "Loewdin": charge.Loewdin}})

    entries.append({"calc_quality_summary": get_lobster_calc_quality_summary(path_to_calc_dir=mpid,
                                                                             dos_comparison=True,
                                                                             bva_comp=True, n_bins=256)})

    # Serialize everything before touching the output file, so a failure during
    # analysis or serialization never leaves a truncated JSON file behind.
    # Items are joined with a bare ',' inside '[' ... ']'.
    payload = '[' + ','.join(json.dumps(entry) for entry in entries) + ']'
    with gzip.open(os.path.join(json_save_dir, "{}.json.gz".format(mpid)), 'wt', encoding='UTF-8') as f:
        f.write(payload)

    return mpid + ' Done'

In [None]:
# Caution: adjust the number of parallel worker processes to suit your system.
# Each material in `mats` is handed to a worker one at a time (chunksize=1) and
# results arrive in completion order, not in the order of `mats`.
with mp.Pool(processes=12, maxtasksperchild=1) as pool:
    results = tqdm(
        pool.imap_unordered(get_lobster_lightweight_json, mats, chunksize=1),
        total=len(mats),
    )
    # Draining the progress-bar iterator inside the `with` block is what waits
    # for the workers; `row` collects the per-material status strings.
    row = list(results)