In [2]:
from pymatgen.core.composition import Composition
import numpy as np
import pandas as pd
import ase.db # https://wiki.fysik.dtu.dk/ase/ase/db/db.html

In [47]:
def extract_elements(elements, atomic_table):
    return atomic_table[atomic_table['Element'].isin(elements)]

def construct_features_row(dict,atomic_table):
    df = extract_elements(dict.keys(),atomic_table)
    array = df.drop(columns=df.columns[:1], axis=1).to_numpy()

    mean = np.mean(array,axis=0)
    copy = np.copy(array)

    natoms = sum(dict.values())

    for (n,w) in enumerate(dict.values()):
        copy[n,:] *= w / natoms
            
    
    wmean = np.sum(copy,axis=0) 

    max = np.max(array,axis=0)
    min = np.min(array,axis=0)

    std = np.std(array,axis=0)

    copy = np.copy(array)
    
    for (n,m) in enumerate(wmean):
        copy[:,n] -= m

    wstd = np.mean(copy**2, axis = 0)

    return np.hstack((mean,wmean,max,min,std,wstd))


def construct_features(data, atomic_table):

    columns=['Material', 'Space Group', 'gap']
    for term in ('_mean', '_wmean', '_max', '_min', '_std', '_wstd'):
        columns.extend((atomic_table.columns[1:] + term))
    
    df = pd.DataFrame(columns=columns)

    for row in data:
        new_entry = [row.formula, row.spacegroup, row['gap']]
        dict = Composition(row.formula).as_dict()
        new_entry.extend(construct_features_row(dict,atomic_table))            
        df.loc[len(df)] = new_entry

    return df

In [14]:
atomic_table = pd.read_csv('Schleder2019_AtomicTable.csv')
atomic_table.fillna(0,inplace=True)
atomic_table.head()

Unnamed: 0,Element,Z,Electronegativity,IonizationPotential,ElectronAffinity,HOMO,LUMO,r_s_orbital,r_p_orbital,r_d_orbital,r_atomic_nonbonded,r_valence_lastorbital,r_covalent,Valence,PeriodicColumn,PeriodicColumn_upto18,NumberUnfilledOrbitals,Polarizability
0,H,1,2.2,-12.6833,-1.5273,-6.4925,0.725,0.3865,0.0,0.0,0.37,0.3865,0.31,1.0,1.0,1.0,1.0,4.507107
1,He,2,0.0,-26.7499,3.0204,-15.761,1.5714,0.2964,1.0292,0.4176,0.32,0.2964,0.28,2.0,8.0,18.0,0.0,1.383746
2,Li,3,0.98,-5.3606,-0.5863,-2.8744,-0.9074,1.6578,1.8874,2.0869,1.34,1.6578,1.28,1.0,1.0,1.0,1.0,164.0
3,Be,4,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,1.2128,1.9594,0.9,1.0805,0.96,2.0,2.0,2.0,0.0,37.71
4,B,5,2.04,-8.1261,0.0312,-3.6067,2.4547,0.8025,0.8348,1.3619,0.82,0.8348,0.84,3.0,3.0,13.0,5.0,20.53


In [3]:
data = ase.db.connect('c2db-2021-06-24.db').select(is_magnetic=False)

In [5]:
df = construct_features(data, atomic_table)

In [6]:
df.head()

Unnamed: 0,Material,Space Group,gap,Z_mean,Electronegativity_mean,IonizationPotential_mean,ElectronAffinity_mean,HOMO_mean,LUMO_mean,r_s_orbital_mean,...,r_p_orbital_wstd,r_d_orbital_wstd,r_atomic_nonbonded_wstd,r_valence_lastorbital_wstd,r_covalent_wstd,Valence_wstd,PeriodicColumn_wstd,PeriodicColumn_upto18_wstd,NumberUnfilledOrbitals_wstd,Polarizability_wstd
0,Be4,Pbcm,0.0,4.0,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AlTe4,Cm,0.0,32.5,1.855,-7.7068,-1.2054,-4.33185,3.5525,1.027,...,0.020793,0.557753,0.009826,0.020793,0.009826,57.46,3.06,3.06,3.06,115.1104
2,As4O6,P2_1,3.876239,20.5,2.81,-12.7017,-2.2098,-7.11345,4.203,0.6583,...,0.099108,0.175934,0.055016,0.099108,0.073034,21.06,0.26,0.26,0.26,156.830336
3,As4S6,Pc,2.271995,24.5,2.38,-10.4111,-2.1007,-6.1024,4.41855,0.80155,...,0.010546,0.338489,0.007514,0.010546,0.005096,21.06,0.26,0.26,0.26,28.284074
4,B2N,P-3m1,0.0,6.0,2.54,-10.84585,-0.82285,-5.34825,3.30365,0.6693,...,0.028338,0.006076,0.001361,0.028338,0.004694,1.111111,1.111111,1.111111,1.111111,46.44025


In [7]:
df.to_csv('gap_prediction.csv')

# Preparation of a random dataset

In [118]:
metals = ['Sc','Ti','Cu','Zn','Zr','Nb','Mo','Ru','Rh','Pd','Ag','Cd','Hf','Ta','W','Re','Os','Ir','Pt','Au']
alkali_metals = ['Li', 'Na', 'K', 'Rb', 'Cs', 'Fr']
alkaline_earth_metals = ['Be', 'Mg', 'Ca', 'Sr', 'Ba', 'Ra']
halogens = ['F','Cl','Br','I','S', 'At']
chalcogens = ['O', 'S', 'Se', 'Te', 'Po']
pnictogens = ['N', 'P', 'As', 'Sb', 'Bi']
nonmetals = np.hstack((halogens,chalcogens,pnictogens,['H']))
othermetals = np.hstack((alkali_metals,alkaline_earth_metals))

def generate_random_formula(space_groups):
    nmetals = np.random.randint(1,3)
    nnonmetals = np.random.randint(1,5)
    nother_metals = np.random.randint(2)

    metal = np.random.choice(metals,size=nmetals)
    nonmetal = np.random.choice(nonmetals,size=nnonmetals)
    other_metal = np.random.choice(othermetals,size=nother_metals)

    keys = np.hstack((metal,nonmetal,other_metal))
    values = np.random.choice(range(1,5),size = nmetals + nnonmetals + nother_metals)
    return {keys[i]: values[i] for i in range(len(keys))}, np.random.choice(space_groups)

def construct_random_features(data,atomic_table,N_materials):

    columns=['Material', 'Space Group']
    for term in ('_mean', '_wmean', '_max', '_min', '_std', '_wstd'):
        columns.extend((atomic_table.columns[1:] + term))
    _spacegroups = np.array([row.spacegroup for row in data])
    spacegroups = np.unique(_spacegroups)

    df = pd.DataFrame(columns=columns)

    for _ in range(N_materials):
        formula,spacegroup = generate_random_formula(spacegroups)
        new_entry = [Composition(formula).reduced_formula, spacegroup]
        new_entry.extend(construct_features_row(formula,atomic_table))            
        df.loc[len(df)] = new_entry

    return df

In [119]:
data = ase.db.connect('c2db-2021-06-24.db').select(is_magnetic=False)
random_data = construct_random_features(data,atomic_table,5000)

In [120]:
random_data

Unnamed: 0,Material,Space Group,Z_mean,Electronegativity_mean,IonizationPotential_mean,ElectronAffinity_mean,HOMO_mean,LUMO_mean,r_s_orbital_mean,r_p_orbital_mean,...,r_p_orbital_wstd,r_d_orbital_wstd,r_atomic_nonbonded_wstd,r_valence_lastorbital_wstd,r_covalent_wstd,Valence_wstd,PeriodicColumn_wstd,PeriodicColumn_upto18_wstd,NumberUnfilledOrbitals_wstd,Polarizability_wstd
0,Ca4WI2,Pmma,49.000000,2.006667,-8.735433,-0.850100,-5.112633,0.641800,1.369533,1.569300,...,0.154737,0.027169,0.031489,0.222409,0.023514,74.755102,5.176871,40.278912,8.795918,2975.246600
1,Cs3Ti2P2(S2I)2,P2/m,32.200000,1.952000,-8.739540,-1.546640,-5.187260,2.533320,1.312900,1.437760,...,0.502055,0.300624,0.201184,0.487616,0.265811,30.530178,4.376331,48.115976,7.178698,21427.942230
2,Hf2Ta2Te2P2Cl3,Pmna,45.800000,2.050000,-9.481840,-1.348640,-5.268600,2.436940,1.092980,1.254620,...,0.148839,0.131669,0.039076,0.017869,0.094091,34.206612,1.056198,32.692562,7.879339,1397.835646
3,Rb4SbW3SN3,P-3,37.000000,2.170000,-9.311760,-1.353260,-5.128540,2.605620,1.201700,1.345560,...,0.492235,0.282555,0.215189,0.397098,0.263287,49.306944,3.473611,40.306944,2.973611,13678.420200
4,Hf2Zn3Po3At3(H2Se)2,Cm,51.000000,1.983333,-9.915933,-1.112683,-5.659450,2.001400,0.947167,1.117283,...,0.323487,0.080155,0.385099,0.076545,0.204352,107.221453,4.889273,39.918108,7.129181,1049.137070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Fr2Ir4Pt2Se3S4,P222_1,58.400000,2.062000,-8.896920,-2.084540,-5.337920,2.088920,0.706300,0.949740,...,0.299298,0.241416,0.252880,0.113023,0.314051,84.351111,10.200000,31.000000,0.404444,13023.502100
4996,Re2RhN4,Pc,42.333333,2.406667,-10.429333,-1.133700,-5.322667,0.189900,1.117633,1.190233,...,0.231781,0.066061,0.132147,0.010811,0.131073,55.510204,2.748299,12.136054,1.115646,670.306667
4997,Zn3ReAt2SBr4,P2/m,48.200000,2.258000,-10.796280,-1.654140,-6.576260,2.628560,0.973760,1.216480,...,0.109686,0.202766,0.312757,0.023854,0.033006,72.023140,3.824793,14.965289,3.024793,275.185530
4998,Po3Ru2Cl3,P-3,48.333333,2.453333,-10.190533,-2.347633,-5.936633,3.235000,1.037167,1.174733,...,0.108092,0.135153,0.294300,0.061724,0.039123,113.666667,0.666667,16.395833,1.583333,444.081540


In [121]:
random_data.to_csv('gap_prediction_random.csv')