In [3]:
import os
import json
import pandas as pd
import numpy as np
import time

In [4]:
def process_material(file_path, data_dict, efermi_limit):
    """Processes a single material file and extracts relevant DOS information.

    Args:
        file_path (str): Path to the material data file.
        data_dict (dict): Dictionary to store the processed data.
        efermi_limit (float): Energy limit around the Fermi level for DOS extraction.
    """

    with open(file_path, 'r') as f:
        material_data = json.load(f)

    material_list = []
    bravais_lattice = os.path.basename(os.path.dirname(file_path))
    material_list.append(bravais_lattice)
    
    name_parts = material_data['name'].split('_')
    material_list.append(name_parts[0])  # Name
    material_list.append(name_parts[-1]) # ICSD
    material_list.append(material_data['Efermi'])

    energy_respect_fermi = np.array(material_data['tDOS_data']['energy'])
    
    if 'tDOS' in material_data['tDOS_data']:
        material_list.append(False)  # No magnetic
        DOS_data = np.array(material_data['tDOS_data']['tDOS'])
        desired_indices = (energy_respect_fermi >= -efermi_limit) & (energy_respect_fermi <= efermi_limit)
        DOS_grid = DOS_data[desired_indices][:1999]
    else:
        material_list.append(True)   # Magnetic
        DOS_spin_majority = np.array(material_data['tDOS_data']['spin_majority'])
        DOS_spin_minority = np.array(material_data['tDOS_data']['spin_minority'])
        desired_indices = (energy_respect_fermi >= -efermi_limit) & (energy_respect_fermi <= efermi_limit)
        DOS_grid = (DOS_spin_majority - DOS_spin_minority)[desired_indices][:1999]

    material_list.append(DOS_grid)
    
    data_dict[bravais_lattice].append(material_list)

def convert_to_df(data_dict):
    """Converts the processed data dictionary into a Pandas DataFrame.

    Args:
        data_dict (dict): Dictionary containing processed data for each Bravais lattice.

    Returns:
        pd.DataFrame: DataFrame with all the processed material data.
    """
    
    all_data_df = pd.DataFrame()

    for bravais_key, bravais_data in data_dict.items():
        temp_df = pd.DataFrame(bravais_data, columns=['bravais_lattice', 'material_name', 'ICSD', 'fermi_energy', 'is_magnetic', 'dos_grid'])
        df_dos = temp_df['dos_grid'].apply(pd.Series)
        df_dos.columns = [f'DOS_{i}' for i in range(len(df_dos.columns))]
        df_formatted = pd.concat([temp_df.drop('dos_grid', axis=1), df_dos], axis=1)
        all_data_df = pd.concat([all_data_df, df_formatted], ignore_index=True)

    return all_data_df

def main(data_raw_path, efermi_limit=15, save_csv= False, dos_csv_path= r"D:\tfg\data\dos_data.csv"):
    """Processes material data files, extracts DOS information, and creates a DataFrame.

    Args:
        data_raw_path (str): Path to the directory containing material data files.
        efermi_limit (float, optional): Energy limit around the Fermi level for DOS extraction. Defaults to 15.

    Returns:
        pd.DataFrame: DataFrame with all the processed material data.
    """
    
    data_dict = {}

    for bravais_lattice in os.listdir(data_raw_path):
        start = time.time()
        print(f'· Reading data files of: {bravais_lattice}...')
        folder_path = os.path.join(data_raw_path, bravais_lattice)
        data_dict[bravais_lattice] = []
        
        for file in os.listdir(folder_path):
            process_material(os.path.join(folder_path, file), data_dict, efermi_limit)
        
        end = time.time()
        elapsed_time = end - start
        print(f'  {bravais_lattice} completed. Time: {elapsed_time:.2f} s')

    all_data_df = convert_to_df(data_dict)
    all_data_df.fillna(0.0, inplace=True)

    if save_csv:
        all_data_df.to_csv(dos_csv_path, index=False)

    return all_data_df


# Example usage
all_data_df = main(r"D:\tfg\data\data_raw", efermi_limit=15)

· Reading data files of: BCC...
  BCC completed. Time: 15.14 s
· Reading data files of: BCT...
  BCT completed. Time: 97.48 s
· Reading data files of: CUB...
  CUB completed. Time: 75.16 s
· Reading data files of: FCC...
  FCC completed. Time: 153.20 s
· Reading data files of: HEX...
  HEX completed. Time: 188.62 s
· Reading data files of: MCL...
  MCL completed. Time: 81.62 s
· Reading data files of: MCLC...
  MCLC completed. Time: 89.28 s
· Reading data files of: ORC...
  ORC completed. Time: 144.25 s
· Reading data files of: ORCC...
  ORCC completed. Time: 57.85 s
· Reading data files of: ORCF...
  ORCF completed. Time: 5.49 s
· Reading data files of: ORCI...
  ORCI completed. Time: 16.52 s
· Reading data files of: RHL...
  RHL completed. Time: 67.84 s
· Reading data files of: TET...
  TET completed. Time: 81.08 s
· Reading data files of: TRI...
  TRI completed. Time: 56.73 s


In [33]:
# all_data_df.DOS_1998.describe()
# all_data_df.sort_values(by='DOS_1998', ascending = False)

# all_data_df[all_data_df.isnull().any(axis=1)][:,:-62]

# print(all_data_df.isnull().sum()[-290:-260])

all_data_df.sort_values(by='DOS_1998', ascending = False).head(20)


all_data_df.isnull().sum()

bravais_lattice    0
material_name      0
ICSD               0
fermi_energy       0
is_magnetic        0
                  ..
DOS_1994           0
DOS_1995           0
DOS_1996           0
DOS_1997           0
DOS_1998           0
Length: 2004, dtype: int64

In [None]:
x = len(all_data_df[all_data_df['DOS_1996'].isnull()])
x/len(all_data_df)*100

In [1]:
import os
import sys
import json
import pandas as pd
import numpy as np
import time

from utils import tools
sys.path.append('./../')
import config

class MaterialDataProcessor:
    """Class to process material data files and extract DOS information."""

    def __init__(self, run_results_path):
        self.run_results_path = run_results_path
        self.data_folder_path = config.DATA_FOLDER_PATH
        self.data_raw_path = os.path.join(self.data_folder_path, r'data_raw')
        self.dos_csv_path = os.path.join(self.data_folder_path, r'dos_data.csv')
        self.data_dict = {}

    @staticmethod
    def extract_material_data(file_path):
        """Reads material data from a JSON file."""
        with open(file_path, 'r') as f:
            material_data = json.load(f)
        return material_data

    @staticmethod
    def get_material_info(material_data, bravais_lattice):
        """Extracts basic material information."""
        material_list = [bravais_lattice]
        name_parts = material_data['name'].split('_')
        material_list.extend([name_parts[0], name_parts[-1], material_data['Efermi']])
        return material_list

    @staticmethod
    def extract_dos_data(material_data):
        """Extracts DOS data from the material dictionary."""
        energy_respect_fermi = np.array(material_data['tDOS_data']['energy'])
        is_magnetic = 'tDOS' not in material_data['tDOS_data']
        if is_magnetic:
            DOS_grid = (np.array(material_data['tDOS_data']['spin_majority']) - 
                        np.array(material_data['tDOS_data']['spin_minority']))[
                (energy_respect_fermi >= -config.EFERMI_LIMIT) & (
                    energy_respect_fermi <= config.EFERMI_LIMIT
                )
            ][:config.EFERMI_GRID_POINTS]
        else:
            DOS_grid = np.array(material_data['tDOS_data']['tDOS'])[
                (energy_respect_fermi >= -config.EFERMI_LIMIT) & (
                    energy_respect_fermi <= config.EFERMI_LIMIT
                )
            ][:config.EFERMI_GRID_POINTS]
        return is_magnetic, DOS_grid

    def process_material(self, file_path):
        """Processes a single material file and extracts relevant DOS information."""
        bravais_lattice = os.path.basename(os.path.dirname(file_path))
        material_data = self.extract_material_data(file_path)
        material_list = self.get_material_info(material_data, bravais_lattice)
        is_magnetic, DOS_grid = self.extract_dos_data(material_data)
        material_list.extend([is_magnetic, DOS_grid])
        self.data_dict[bravais_lattice].append(material_list)

    def process_all_materials(self):
        """Processes all material data files in the specified directory."""
        for bravais_lattice in os.listdir(self.data_raw_path):
            start = time.time()
            print(f'· Reading data files of: {bravais_lattice}...')

            folder_path = os.path.join(self.data_raw_path, bravais_lattice)
            self.data_dict[bravais_lattice] = []
            for file in os.listdir(folder_path):
                self.process_material(os.path.join(folder_path, file))

            elapsed_time = time.time() - start
            print(f'  {bravais_lattice} completed. Time: {elapsed_time:.2f} s')

    @staticmethod
    def create_dos_df(bravais_data):
        """Creates a DataFrame for DOS data of a single bravais lattice."""
        temp_df = pd.DataFrame(
            bravais_data,
            columns=[
                'bravais_lattice',
                'material_name',
                'ICSD',
                'fermi_energy',
                'is_magnetic',
                'dos_grid',
            ],
        )
        df_dos = temp_df['dos_grid'].apply(pd.Series)

        energy_values = np.linspace(-config.EFERMI_LIMIT, config.EFERMI_LIMIT, config.EFERMI_GRID_POINTS)

        def format_energy(energy):
            if energy == 0:
                return 'DOS_0'
            sign = 'm' if energy < 0 else 'p'
            return f'DOS_{sign}{abs(energy):.2f}'.replace('.', '_')

        df_dos.columns = [format_energy(energy) for energy in energy_values]

        return pd.concat([temp_df.drop('dos_grid', axis=1), df_dos], axis=1)

    def convert_to_df(self):
        """Converts the processed data dictionary into a Pandas DataFrame."""
        all_data_df = pd.DataFrame()
        for _, bravais_data in self.data_dict.items():
            df_formatted = self.create_dos_df(bravais_data)
            all_data_df = pd.concat([all_data_df, df_formatted], ignore_index=True)
        return all_data_df

    def data_raw_read_workflow(self):
        """Processes material data files, extracts DOS information, and creates a DataFrame."""
        self.process_all_materials()
        all_data_df = self.convert_to_df()
        all_data_df.fillna(0.0, inplace=True)
        all_data_df.to_csv(self.dos_csv_path, index=False)

raw_data_reading = MaterialDataProcessor('rbnfjn')
raw_data_reading.data_raw_read_workflow()

· Reading data files of: BCC...
  BCC completed. Time: 15.20 s


In [38]:
df = pd.read_csv(r'D:\tfg\data\dos_data.csv')

In [40]:
df.columns

Index(['bravais_lattice', 'material_name', 'ICSD', 'fermi_energy',
       'is_magnetic', 'DOS_m15_00', 'DOS_m14_98', 'DOS_m14_97', 'DOS_m14_95',
       'DOS_m14_94',
       ...
       'DOS_p14_86', 'DOS_p14_88', 'DOS_p14_89', 'DOS_p14_91', 'DOS_p14_92',
       'DOS_p14_94', 'DOS_p14_95', 'DOS_p14_97', 'DOS_p14_98', 'DOS_p15_00'],
      dtype='object', length=2004)

In [21]:
energy_values = np.linspace(-config.EFERMI_LIMIT, config.EFERMI_LIMIT, 1999)
df_dos = [f'DOS{int(energy):03d}' for energy in energy_values]

In [23]:
energy_values = np.linspace(-config.EFERMI_LIMIT, config.EFERMI_LIMIT, 1999)

def format_energy(energy):
    sign = 'm' if energy < 0 else 'p'
    return f'DOS_{sign}{abs(energy):.2f}'.replace('.', '_')

df_dos = [format_energy(energy) for energy in energy_values]

In [28]:
df_dos

['DOS_m0_02',
 'DOS_m0_03',
 'DOS_m0_05',
 'DOS_m0_06',
 'DOS_m0_08',
 'DOS_m0_09',
 'DOS_m0_11',
 'DOS_m0_12',
 'DOS_m0_14',
 'DOS_m0_15',
 'DOS_m0_17',
 'DOS_m0_18',
 'DOS_m0_20',
 'DOS_m0_21',
 'DOS_m0_23',
 'DOS_m0_24',
 'DOS_m0_26',
 'DOS_m0_27',
 'DOS_m0_29',
 'DOS_m0_30',
 'DOS_m0_32',
 'DOS_m0_33',
 'DOS_m0_35',
 'DOS_m0_36',
 'DOS_m0_38',
 'DOS_m0_39',
 'DOS_m0_41',
 'DOS_m0_42',
 'DOS_m0_44',
 'DOS_m0_45',
 'DOS_m0_47',
 'DOS_m0_48',
 'DOS_m0_50',
 'DOS_m0_51',
 'DOS_m0_53',
 'DOS_m0_54',
 'DOS_m0_56',
 'DOS_m0_57',
 'DOS_m0_59',
 'DOS_m0_60',
 'DOS_m0_62',
 'DOS_m0_63',
 'DOS_m0_65',
 'DOS_m0_66',
 'DOS_m0_68',
 'DOS_m0_69',
 'DOS_m0_71',
 'DOS_m0_72',
 'DOS_m0_74',
 'DOS_m0_75',
 'DOS_m0_77',
 'DOS_m0_78',
 'DOS_m0_80',
 'DOS_m0_81',
 'DOS_m0_83',
 'DOS_m0_84',
 'DOS_m0_86',
 'DOS_m0_87',
 'DOS_m0_89',
 'DOS_m0_90',
 'DOS_m0_92',
 'DOS_m0_93',
 'DOS_m0_95',
 'DOS_m0_96',
 'DOS_m0_98',
 'DOS_m0_99',
 'DOS_m10_00',
 'DOS_m10_02',
 'DOS_m10_03',
 'DOS_m10_05',
 'DOS_m10_06',
 