In [1]:
from pymatgen.core.composition import Composition
import numpy as np
import pandas as pd
import ase.db # https://wiki.fysik.dtu.dk/ase/ase/db/db.html

In [2]:
def extract_elements(elements, atomic_table):
    return atomic_table[atomic_table['Element'].isin(elements)]

def construct_features_row(dict,atomic_table):
    df = extract_elements(dict.keys(),atomic_table)
    array = df.drop(columns=df.columns[:1], axis=1).to_numpy()

    mean = np.mean(array,axis=0)
    copy = np.copy(array)

    natoms = sum(dict.values())

    for (n,w) in enumerate(dict.values()):
        copy[n,:] *= w / natoms
            
    
    wmean = np.sum(copy,axis=0) 

    max = np.max(array,axis=0)
    min = np.min(array,axis=0)

    std = np.std(array,axis=0)

    copy = np.copy(array)
    
    for (n,m) in enumerate(wmean):
        copy[:,n] -= m

    wstd = np.mean(copy**2, axis = 0)

    return np.hstack((mean,wmean,max,min,std,wstd))


def construct_features(data, atomic_table):

    columns=['Material', 'Space Group', 'gap']
    for term in ('_mean', '_wmean', '_max', '_min', '_std', '_wstd'):
        columns.extend((atomic_table.columns[1:] + term))
    
    df = pd.DataFrame(columns=columns)

    for row in data:
        new_entry = [row.formula, row.spacegroup, row['gap']]
        dict = Composition(row.formula).as_dict()
        new_entry.extend(construct_features_row(dict,atomic_table))            
        df.loc[len(df)] = new_entry

    return df
    

In [3]:
atomic_table = pd.read_csv('Schleder2019_AtomicTable.csv')
atomic_table.fillna(0,inplace=True)
atomic_table.head()

Unnamed: 0,Element,Z,Electronegativity,IonizationPotential,ElectronAffinity,HOMO,LUMO,r_s_orbital,r_p_orbital,r_d_orbital,r_atomic_nonbonded,r_valence_lastorbital,r_covalent,Valence,PeriodicColumn,PeriodicColumn_upto18,NumberUnfilledOrbitals,Polarizability
0,H,1,2.2,-12.6833,-1.5273,-6.4925,0.725,0.3865,0.0,0.0,0.37,0.3865,0.31,1.0,1.0,1.0,1.0,4.507107
1,He,2,0.0,-26.7499,3.0204,-15.761,1.5714,0.2964,1.0292,0.4176,0.32,0.2964,0.28,2.0,8.0,18.0,0.0,1.383746
2,Li,3,0.98,-5.3606,-0.5863,-2.8744,-0.9074,1.6578,1.8874,2.0869,1.34,1.6578,1.28,1.0,1.0,1.0,1.0,164.0
3,Be,4,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,1.2128,1.9594,0.9,1.0805,0.96,2.0,2.0,2.0,0.0,37.71
4,B,5,2.04,-8.1261,0.0312,-3.6067,2.4547,0.8025,0.8348,1.3619,0.82,0.8348,0.84,3.0,3.0,13.0,5.0,20.53


In [4]:
data = ase.db.connect('c2db-2021-06-24.db').select(is_magnetic=False)

In [5]:
df = construct_features(data, atomic_table)

In [6]:
df.head()

Unnamed: 0,Material,Space Group,gap,Z_mean,Electronegativity_mean,IonizationPotential_mean,ElectronAffinity_mean,HOMO_mean,LUMO_mean,r_s_orbital_mean,...,r_p_orbital_wstd,r_d_orbital_wstd,r_atomic_nonbonded_wstd,r_valence_lastorbital_wstd,r_covalent_wstd,Valence_wstd,PeriodicColumn_wstd,PeriodicColumn_upto18_wstd,NumberUnfilledOrbitals_wstd,Polarizability_wstd
0,Be4,Pbcm,0.0,4.0,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AlTe4,Cm,0.0,32.5,1.855,-7.7068,-1.2054,-4.33185,3.5525,1.027,...,0.020793,0.557753,0.009826,0.020793,0.009826,57.46,3.06,3.06,3.06,115.1104
2,As4O6,P2_1,3.876239,20.5,2.81,-12.7017,-2.2098,-7.11345,4.203,0.6583,...,0.099108,0.175934,0.055016,0.099108,0.073034,21.06,0.26,0.26,0.26,156.830336
3,As4S6,Pc,2.271995,24.5,2.38,-10.4111,-2.1007,-6.1024,4.41855,0.80155,...,0.010546,0.338489,0.007514,0.010546,0.005096,21.06,0.26,0.26,0.26,28.284074
4,B2N,P-3m1,0.0,6.0,2.54,-10.84585,-0.82285,-5.34825,3.30365,0.6693,...,0.028338,0.006076,0.001361,0.028338,0.004694,1.111111,1.111111,1.111111,1.111111,46.44025


In [7]:
df.to_csv('gap_prediction.csv')