In [None]:
import os
import gzip
import json
import numpy as np
import pandas as pd
import multiprocessing as mp
import warnings
from tqdm.notebook import tqdm

In [None]:
from lobsterpy.cohp.analyze import Analysis
from pymatgen.electronic_structure.core import Spin
from lobsterpy.cohp.describe import Description

In [None]:
# Switch the working directory to the folder of LobsterPy JSON files; the cells
# below list, read and write files relative to it.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel resolves '../Lobsterpy_json/' from the already-changed directory.
os.chdir('../Lobsterpy_json/')

In [None]:
# Keep only the gzipped JSON inputs. The summary CSV is written into this same
# directory further down, so an unfiltered listing would hand it (and any other
# stray entry) to gzip.open on a second run. Sorted so the list is reproducible.
mpids = sorted(f for f in os.listdir(".") if f.endswith(".json.gz"))

In [None]:
#with gzip.open('mp-632319.json.gz', 'r') as f:
#    data = json.loads(f.read().decode('utf-8'))

In [None]:
def featurize_lobsterpy_icohp(mpid):
    """
    Load one gzipped LobsterPy JSON file and summarise its bond data as a one-row DataFrame.

    Parameters
    ----------
    mpid : str
        Path (or bare file name) of a gzip-compressed, UTF-8 encoded JSON file.
        The row label is the file's base name up to its first '.', e.g.
        'mp-632319.json.gz' -> 'mp-632319'.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by that label. On success it holds avg/max/min/std
        columns for ICOHP_mean, ICOHP_sum, bonding perc and antibonding perc taken
        over every bond of every site, plus 'Madelung_Mull'. If the file has no
        bonds, or an ICOHP value cannot be converted to float, the file name is
        printed and the row is returned without feature columns (it becomes an
        all-NaN row once concatenated with the other rows).

    Raises
    ------
    KeyError
        If an expected key is missing from the JSON; this is not caught here.
    """
    # Use the base name so a directory component in `mpid` never leaks into the label.
    label = os.path.basename(mpid).split('.')[0]

    with gzip.open(mpid, 'rt', encoding='utf-8') as f:
        data = json.load(f)

    # (column prefix, order of the statistic suffixes) - the order fixes the
    # column layout of the resulting frame.
    layout = (
        ('Icohp_mean', ('avg', 'max', 'min', 'std')),
        ('Icohp_sum', ('avg', 'max', 'min', 'std')),
        ('bonding_perc', ('avg', 'max', 'min', 'std')),
        ('antibonding_perc', ('avg', 'min', 'max', 'std')),
    )
    reducers = {'avg': np.mean, 'max': np.max, 'min': np.min, 'std': np.std}

    try:
        samples = {prefix: [] for prefix, _ in layout}
        for site in data['sites'].values():
            for bond_info in site['bonds'].values():
                samples['Icohp_mean'].append(float(bond_info['ICOHP_mean']))
                samples['Icohp_sum'].append(float(bond_info['ICOHP_sum']))
                samples['bonding_perc'].append(bond_info['bonding']['perc'])
                samples['antibonding_perc'].append(bond_info['antibonding']['perc'])

        if not samples['Icohp_mean']:
            # Fail before np.mean([]) emits a RuntimeWarning and a half-filled
            # row gets written; handled like any other unusable file below.
            raise ValueError('no bonds found in %s' % mpid)

        features = {}
        for prefix, suffixes in layout:
            for suffix in suffixes:
                features['%s_%s' % (prefix, suffix)] = reducers[suffix](samples[prefix])
        features['Madelung_Mull'] = data['madelung_energy']
    except ValueError:
        print(mpid)
        return pd.DataFrame(index=[label])

    # Build the row in one go instead of enlarging the frame cell by cell.
    return pd.DataFrame([features], index=[label])

In [None]:
# Featurize every file in a pool of 14 workers; each worker process is replaced
# after a single task. Results arrive in completion order, not input order.
with mp.Pool(processes=14, maxtasksperchild=1) as pool:
    results = tqdm(
        pool.imap_unordered(featurize_lobsterpy_icohp, mpids, chunksize=1),
        desc="Generating summarized LobsterPy dataframe",
        total=len(mpids),
    )
    # Draining the progress-wrapped iterator collects the one-row frames.
    row = list(results)

In [None]:
# Stack the per-file one-row frames collected in `row` into a single table.
# Note: pd.concat raises ValueError if `row` is empty.
df = pd.concat(row)

In [None]:
# Write the summary table as CSV. The path is relative, so the file lands in the
# current working directory, i.e. the input directory selected by os.chdir above.
df.to_csv('Small_basis_sum_stat.csv')