In [17]:
import pandas as pd
import numpy as np
from pymatgen.core.structure import Structure

In [18]:
# Load the raw query table and turn the stringified `elements` / `spectra`
# columns back into Python objects.
# NOTE(review): eval() executes whatever expression the CSV cell contains, so
# this is only acceptable while the CSV is a trusted, locally generated file.
# If the cells hold pure literals, ast.literal_eval would be the safer
# choice -- confirm against the data before switching.
df = pd.read_csv('../data/xas_query.csv')
for stringified_col in ('elements', 'spectra'):
    df[stringified_col] = df[stringified_col].apply(eval)
# len(df) is the number of rows (entries) in the table.
print('total spectra:', len(df))

total spectra: 46452


In [19]:
# Keep only the rows whose `spectra` container holds at least one item,
# then renumber the index from 0.
has_spectra = df['spectra'].str.len().gt(0)
df = df.loc[has_spectra].reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 32493


In [20]:
# Drop every row in which at least one spectrum has a NaN among its 'x' or
# 'y' values. A single offending spectrum is enough to discard the whole row.
# The collected values are row positions that are later matched against index
# labels, so this relies on df carrying a default 0..n-1 index at this point.
nan_ids = [
    pos
    for pos, spectra in enumerate(df['spectra'])
    if any(
        np.isnan(xas['x']).any() | np.isnan(xas['y']).any()
        for xas in spectra.values()
    )
]

df = df.loc[~df.index.isin(nan_ids)].reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 32493


In [21]:
# Drop every row in which at least one spectrum is "flat", i.e. all of its
# 'y' values lie below `tol`. The comparison is one-sided (y < tol), and a
# spectrum with an empty 'y' also counts as flat because all() of nothing is
# True. One flat spectrum is enough to discard the whole row.
# As in the NaN filter, row positions are matched against index labels, which
# relies on df carrying a default 0..n-1 index at this point.
tol = 1e-3
flat_ids = [
    pos
    for pos, spectra in enumerate(df['spectra'])
    if any((np.asarray(xas['y']) < tol).all() for xas in spectra.values())
]

df = df.loc[~df.index.isin(flat_ids)].reset_index(drop=True)
print('total spectra:', len(df))

total spectra: 32472


In [22]:
# Use the long column name for the space group.
df = df.rename({'sg': 'spacegroup'}, axis='columns')

# Rebuild each structure from its stringified dict, reduce it to the
# primitive structure, and take the formula from that reduced structure so
# every row's formula is derived the same way.
# NOTE(review): eval() executes whatever expression the CSV cell contains, so
# this is only acceptable while the CSV is a trusted, locally generated file.
parsed_structures = df['structure'].apply(eval).apply(Structure.from_dict)
df['structure'] = parsed_structures.map(lambda s: s.get_primitive_structure())
df['formula'] = df['structure'].map(lambda s: s.formula)

  % self.symbol)
  % self.symbol)
  % self.symbol)


In [23]:
# save data
# Convert each structure object to a plain dict so the column can be written
# to CSV. The hasattr guard keeps this cell re-runnable: after the first run
# the column already holds dicts (which have no as_dict method), and an
# unguarded second run would raise AttributeError. Already-converted values
# are passed through unchanged, so the resulting column and file are the same
# as on a single run.
df['structure'] = df['structure'].map(
    lambda s: s.as_dict() if hasattr(s, 'as_dict') else s
)

# Only these columns are written, in this order; the index is not saved.
columns = ['icsd', 'formula', 'spacegroup', 'class', 'mp_id', 'structure', 'spectra']
df.to_csv('../data/data.csv', columns=columns, index=False)