In [3]:
import os
import json
import pandas as pd
import numpy as np
import time

In [4]:
def process_material(file_path, data_dict, efermi_limit, max_points=1999):
    """Read one material JSON file and append its DOS record to ``data_dict``.

    The appended record is the list
    ``[bravais_lattice, name, icsd, fermi_energy, is_magnetic, dos_grid]``,
    where ``bravais_lattice`` is the name of the folder containing the file.

    Args:
        file_path (str): Path to the material data file (JSON).
        data_dict (dict): Maps a Bravais-lattice folder name to the list of
            records collected for it. The entry is created if it is missing.
        efermi_limit (float): Only samples whose energy lies in
            ``[-efermi_limit, efermi_limit]`` are kept.
        max_points (int, optional): Maximum number of DOS samples kept per
            material after windowing. Defaults to 1999.

    Raises:
        KeyError: If the file lacks ``name``, ``Efermi``, ``tDOS_data`` or the
            expected DOS arrays inside ``tDOS_data``.
    """

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        material_data = json.load(f)

    bravais_lattice = os.path.basename(os.path.dirname(file_path))

    # 'name' is split on '_': the first token is stored as the material name
    # and the last token as the ICSD identifier.
    name_parts = material_data['name'].split('_')
    tdos = material_data['tDOS_data']

    # NOTE(review): the energy axis is assumed to be already referenced to the
    # Fermi level, so the window is symmetric around 0 — confirm against the data.
    energy = np.asarray(tdos['energy'])
    in_window = (energy >= -efermi_limit) & (energy <= efermi_limit)

    # A file without a 'tDOS' array is treated as magnetic (spin-resolved DOS).
    is_magnetic = 'tDOS' not in tdos
    if is_magnetic:
        # NOTE(review): the difference of the two spin channels is stored, not
        # their sum — confirm this is the intended quantity for magnetic materials.
        dos = np.asarray(tdos['spin_majority']) - np.asarray(tdos['spin_minority'])
    else:
        dos = np.asarray(tdos['tDOS'])

    dos_grid = dos[in_window][:max_points]

    record = [
        bravais_lattice,
        name_parts[0],             # Name
        name_parts[-1],            # ICSD
        material_data['Efermi'],
        is_magnetic,
        dos_grid,
    ]
    # setdefault lets the function work even if the caller did not pre-create the key.
    data_dict.setdefault(bravais_lattice, []).append(record)

def convert_to_df(data_dict):
    """Converts the processed data dictionary into a Pandas DataFrame.

    Each record ``[bravais_lattice, material_name, ICSD, fermi_energy,
    is_magnetic, dos_grid]`` becomes one row. The ``dos_grid`` sequence is
    expanded into columns ``DOS_0 .. DOS_{k-1}``; rows with fewer samples than
    the widest grid are padded with NaN.

    Args:
        data_dict (dict): Dictionary containing processed data for each Bravais
            lattice (lattice name -> list of records). Lattices with an empty
            list are skipped.

    Returns:
        pd.DataFrame: DataFrame with all the processed material data, with a
        fresh 0..n-1 index. An empty DataFrame if there are no records.
    """

    meta_columns = ['bravais_lattice', 'material_name', 'ICSD', 'fermi_energy', 'is_magnetic']
    frames = []

    for bravais_data in data_dict.values():
        # An empty lattice has nothing to expand; skipping it avoids building a
        # DOS block from zero rows.
        if not bravais_data:
            continue

        temp_df = pd.DataFrame(bravais_data, columns=meta_columns + ['dos_grid'])

        # Build the DOS block as one NaN-padded matrix instead of creating a
        # pd.Series per row, which is very slow for thousands of columns.
        grids = [np.asarray(grid, dtype=float) for grid in temp_df['dos_grid']]
        width = max(len(grid) for grid in grids)
        dos_matrix = np.full((len(grids), width), np.nan)
        for row_idx, grid in enumerate(grids):
            dos_matrix[row_idx, :len(grid)] = grid

        df_dos = pd.DataFrame(dos_matrix, columns=[f'DOS_{i}' for i in range(width)])
        frames.append(pd.concat([temp_df[meta_columns], df_dos], axis=1))

    if not frames:
        return pd.DataFrame()

    # Concatenate once: growing a DataFrame inside the loop copies all
    # previously accumulated rows on every iteration.
    return pd.concat(frames, ignore_index=True)

def main(data_raw_path, efermi_limit=15, save_csv=False, dos_csv_path=r"D:\tfg\data\dos_data.csv"):
    """Processes material data files, extracts DOS information, and creates a DataFrame.

    ``data_raw_path`` is expected to contain one sub-folder per Bravais lattice,
    each holding the material data files. Entries of ``data_raw_path`` that are
    not directories, and entries of a lattice folder that are not regular
    files, are skipped.

    Args:
        data_raw_path (str): Path to the directory containing material data files.
        efermi_limit (float, optional): Energy limit around the Fermi level for DOS extraction. Defaults to 15.
        save_csv (bool, optional): If True, write the resulting table to
            ``dos_csv_path``. Defaults to False.
        dos_csv_path (str, optional): Destination CSV file used when
            ``save_csv`` is True.

    Returns:
        pd.DataFrame: DataFrame with all the processed material data, with
        missing values replaced by 0.0.
    """

    data_dict = {}

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting table reproducible across machines.
    for bravais_lattice in sorted(os.listdir(data_raw_path)):
        folder_path = os.path.join(data_raw_path, bravais_lattice)
        if not os.path.isdir(folder_path):
            # A stray file next to the lattice folders is not a lattice.
            continue

        start = time.perf_counter()
        print(f'· Reading data files of: {bravais_lattice}...')
        data_dict[bravais_lattice] = []

        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            if not os.path.isfile(file_path):
                continue
            process_material(file_path, data_dict, efermi_limit)

        elapsed_time = time.perf_counter() - start
        print(f'  {bravais_lattice} completed. Time: {elapsed_time:.2f} s')

    # Drop lattices that yielded no records so the conversion step never
    # receives an empty list.
    data_dict = {lattice: records for lattice, records in data_dict.items() if records}

    # fillna pads DOS grids shorter than the widest one with 0.0.
    # NOTE(review): it applies to every column, so a missing metadata value
    # would also become 0.0.
    all_data_df = convert_to_df(data_dict).fillna(0.0)

    if save_csv:
        all_data_df.to_csv(dos_csv_path, index=False)

    return all_data_df


# Example usage: build the DOS table from the raw data folder, keeping only the
# samples whose energy lies within 15 of the Fermi level (efermi_limit=15).
# NOTE(review): the path is an absolute local Windows path — adjust it before
# re-running on another machine.
all_data_df = main(r"D:\tfg\data\data_raw", efermi_limit=15)

· Reading data files of: BCC...
  BCC completed. Time: 15.14 s
· Reading data files of: BCT...
  BCT completed. Time: 97.48 s
· Reading data files of: CUB...
  CUB completed. Time: 75.16 s
· Reading data files of: FCC...
  FCC completed. Time: 153.20 s
· Reading data files of: HEX...
  HEX completed. Time: 188.62 s
· Reading data files of: MCL...
  MCL completed. Time: 81.62 s
· Reading data files of: MCLC...
  MCLC completed. Time: 89.28 s
· Reading data files of: ORC...
  ORC completed. Time: 144.25 s
· Reading data files of: ORCC...
  ORCC completed. Time: 57.85 s
· Reading data files of: ORCF...
  ORCF completed. Time: 5.49 s
· Reading data files of: ORCI...
  ORCI completed. Time: 16.52 s
· Reading data files of: RHL...
  RHL completed. Time: 67.84 s
· Reading data files of: TET...
  TET completed. Time: 81.08 s
· Reading data files of: TRI...
  TRI completed. Time: 56.73 s


In [33]:
# Scratch inspection of the assembled table. The commented-out lines below are
# earlier experiments kept for reference.
# all_data_df.DOS_1998.describe()
# all_data_df.sort_values(by='DOS_1998', ascending = False)

# all_data_df[all_data_df.isnull().any(axis=1)][:,:-62]

# print(all_data_df.isnull().sum()[-290:-260])

# NOTE: only the last expression of a cell is rendered, so this sorted preview
# is computed but not displayed.
all_data_df.sort_values(by='DOS_1998', ascending = False).head(20)


# Null count per column. main() already replaced NaN with 0.0, so every count is 0.
all_data_df.isnull().sum()

bravais_lattice    0
material_name      0
ICSD               0
fermi_energy       0
is_magnetic        0
                  ..
DOS_1994           0
DOS_1995           0
DOS_1996           0
DOS_1997           0
DOS_1998           0
Length: 2004, dtype: int64

In [None]:
# Percentage of rows whose DOS_1996 value is null.
# main() fills NaN with 0.0 before returning, so for a frame produced by main()
# this evaluates to 0.
x = len(all_data_df[all_data_df['DOS_1996'].isnull()])
x/len(all_data_df)*100