In [8]:
import pandas as pd
import numpy as np
from pymatgen.core.structure import Structure
from pymatgen.analysis.magnetism.analyzer import CollinearMagneticStructureAnalyzer

In [2]:
df = pd.read_csv('../data/xas_manual_query.csv')
df['elements'] = df['elements'].apply(eval)
df['spectra'] = df['spectra'].apply(eval)
print('total spectra:', len(df))

total spectra: 32639


In [3]:
# remove entries without xas spectra
df = df[df['spectra'].str.len()>0].reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 19921


In [4]:
# remove entries with NaNs in spectra
nan_ids = []
for i, entry in enumerate(df.itertuples()):
    for _, xas in entry.spectra.items():
        if np.any(np.isnan(xas['x'])) | np.any(np.isnan(xas['y'])):
            nan_ids += [i]
            break
        
df = df[~df.index.isin(nan_ids)].reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 19921


In [5]:
# remove flat spectra
tol = 1e-3
flat_ids = []
for i, entry in enumerate(df.itertuples()):
    for _, xas in entry.spectra.items():
        if np.all(np.array(xas['y']) < tol):
            flat_ids += [i]
            break
        
df = df[~df.index.isin(flat_ids)].reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 19903


In [6]:
# standardize class representation
def band_to_class(x):
    if x < 1:
        # trivial
        c = 3
    elif x < 3:
        # topological insulator
        c = 1
    else:
        # topological semimetal
        c = 2
    return c
df['class'] = df['band'].map(lambda x: band_to_class(x))

# standardize formula representation
df['structure'] = df['structure'].apply(eval).apply(Structure.from_dict)
df['structure'] = df['structure'].map(lambda x: x.get_primitive_structure())
df['formula'] = df['structure'].map(lambda x: x.formula)

In [9]:
# apply structural filters
drop_ids = []

# remove structures with more than N atoms in primitive cell and alloys
N = 30
for entry in df.itertuples():
    if (entry.structure.num_sites > N) or not (entry.structure.is_ordered):
        drop_ids += [entry.Index]
print('remove:', len(drop_ids))

# remove certain f-electron atoms
atoms = ['Pu', 'Lu', 'Tm', 'Er', 'Pr', 'Nd', 'Pm', 'Eu', 'Gd', 'Tb', 'Dy', 'Xe', 'Sm']
for entry in df.itertuples():
    if any(k for k in entry.elements if k in atoms):
        drop_ids += [entry.Index]
print('remove:', len(drop_ids))

# remove magnetic atoms or compounds       
for entry in df.itertuples():
    if CollinearMagneticStructureAnalyzer(entry.structure).is_magnetic:
        drop_ids += [entry.Index]
print('remove:', len(drop_ids))

remove: 6560
remove: 9458
remove: 9492


In [12]:
# drop filtered entries
df = df.drop(drop_ids).reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 11245


In [13]:
# save data
df['structure'] = df['structure'].map(lambda x: x.as_dict())
columns = ['icsd', 'formula', 'spacegroup', 'class', 'mp_id', 'structure', 'spectra']
df.to_csv('../data/data_manual.csv', columns=columns, index=False)