# Dataset Preparation

In this notebook, we will prepare our datasets in a form that is suitable to the application of machine learning algorithms.

In [1]:
from pymatgen.core.composition import Composition
import numpy as np
import pandas as pd
import ase.db # https://wiki.fysik.dtu.dk/ase/ase/db/db.html
from itertools import product

In [2]:
def extract_elements(elements, atomic_table):
    # Extracts the rows of `atomic_table` corresponding to `elements`
    return atomic_table[atomic_table['Element'].isin(elements)]

def construct_features_row(dict,atomic_table):
    """
    Constructs a feature row that will be used in the machine learning algorithms;
    The features are statistical functions involving the properties of all atoms of a molecule;
    `dict` is a dictionary that represents the chemical formula of the material;
    """
    
    #Extracts the atomic properties of relevant elements
    df = extract_elements(dict.keys(),atomic_table)

    #Converts them to a numpy array
    array = df.drop(columns=df.columns[:1], axis=1).to_numpy()

    #Mean
    mean = np.mean(array,axis=0)

    #Weighted Mean
    copy = np.copy(array)

    natoms = sum(dict.values())

    for (n,w) in enumerate(dict.values()):
        copy[n,:] *= w / natoms
            
    wmean = np.sum(copy,axis=0) 

    #Maximum
    max = np.max(array,axis=0)

    #Minimum
    min = np.min(array,axis=0)

    #Standard deviation with respect to mean
    std = np.std(array,axis=0)

    #Standard deviation with respect to weighted mean
    copy = np.copy(array)
    
    for (n,m) in enumerate(wmean):
        copy[:,n] -= m

    wstd = np.mean(copy**2, axis = 0)

    return np.hstack((mean,wmean,max,min,std,wstd))


def construct_features(data, atomic_table):
    # Constructs the entire dataset one row at a time, simply iterating through the previous function

    #Builds the name of all columns
    columns=['Material', 'Space Group', 'gap']
    for term in ('_mean', '_wmean', '_max', '_min', '_std', '_wstd'):
        columns.extend((atomic_table.columns[1:] + term))
    
    df = pd.DataFrame(columns=columns)

    for row in data:
        new_entry = [row.formula, row.spacegroup, row['gap']]
        dict = Composition(row.formula).as_dict()
        new_entry.extend(construct_features_row(dict,atomic_table))            
        df.loc[len(df)] = new_entry

    return df

In [3]:
atomic_table = pd.read_csv('Schleder2019_AtomicTable.csv')
atomic_table.fillna(0,inplace=True)
atomic_table.head()

Unnamed: 0,Element,Z,Electronegativity,IonizationPotential,ElectronAffinity,HOMO,LUMO,r_s_orbital,r_p_orbital,r_d_orbital,r_atomic_nonbonded,r_valence_lastorbital,r_covalent,Valence,PeriodicColumn,PeriodicColumn_upto18,NumberUnfilledOrbitals,Polarizability
0,H,1,2.2,-12.6833,-1.5273,-6.4925,0.725,0.3865,0.0,0.0,0.37,0.3865,0.31,1.0,1.0,1.0,1.0,4.507107
1,He,2,0.0,-26.7499,3.0204,-15.761,1.5714,0.2964,1.0292,0.4176,0.32,0.2964,0.28,2.0,8.0,18.0,0.0,1.383746
2,Li,3,0.98,-5.3606,-0.5863,-2.8744,-0.9074,1.6578,1.8874,2.0869,1.34,1.6578,1.28,1.0,1.0,1.0,1.0,164.0
3,Be,4,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,1.2128,1.9594,0.9,1.0805,0.96,2.0,2.0,2.0,0.0,37.71
4,B,5,2.04,-8.1261,0.0312,-3.6067,2.4547,0.8025,0.8348,1.3619,0.82,0.8348,0.84,3.0,3.0,13.0,5.0,20.53


In [4]:
data = ase.db.connect('c2db-2021-06-24.db').select(is_magnetic=False)

In [5]:
df = construct_features(data, atomic_table)

In [6]:
df.head()

Unnamed: 0,Material,Space Group,gap,Z_mean,Electronegativity_mean,IonizationPotential_mean,ElectronAffinity_mean,HOMO_mean,LUMO_mean,r_s_orbital_mean,...,r_p_orbital_wstd,r_d_orbital_wstd,r_atomic_nonbonded_wstd,r_valence_lastorbital_wstd,r_covalent_wstd,Valence_wstd,PeriodicColumn_wstd,PeriodicColumn_upto18_wstd,NumberUnfilledOrbitals_wstd,Polarizability_wstd
0,Be4,Pbcm,0.0,4.0,1.57,-9.5007,0.7972,-5.6097,-2.0104,1.0805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AlTe4,Cm,0.0,32.5,1.855,-7.7068,-1.2054,-4.33185,3.5525,1.027,...,0.020793,0.557753,0.009826,0.020793,0.009826,57.46,3.06,3.06,3.06,115.1104
2,As4O6,P2_1,3.876239,20.5,2.81,-12.7017,-2.2098,-7.11345,4.203,0.6583,...,0.099108,0.175934,0.055016,0.099108,0.073034,21.06,0.26,0.26,0.26,156.830336
3,As4S6,Pc,2.271995,24.5,2.38,-10.4111,-2.1007,-6.1024,4.41855,0.80155,...,0.010546,0.338489,0.007514,0.010546,0.005096,21.06,0.26,0.26,0.26,28.284074
4,B2N,P-3m1,0.0,6.0,2.54,-10.84585,-0.82285,-5.34825,3.30365,0.6693,...,0.028338,0.006076,0.001361,0.028338,0.004694,1.111111,1.111111,1.111111,1.111111,46.44025


In [7]:
df.to_csv('gap_prediction.csv')

# Preparation of a novel dataset

In [18]:
#List with all possible space groups
space_groups=set(pd.read_csv('gap_prediction.csv')["Space Group"])

metals=['Sc','Ti','Cr','Mn','Fe','Co','Ni','Cu','Zn']
halogens=['F','Cl','Br','I']
stechiometries=[[2,3]]

def construct_novel_dataset(atomic_table):
    columns=['Material', 'Space Group']
    for term in ('_mean', '_wmean', '_max', '_min', '_std', '_wstd'):
        columns.extend((atomic_table.columns[1:] + term))

    df = pd.DataFrame(columns=columns)

    for (space_group,metal,halogen,stechiometry) in product(space_groups,metals,halogens,stechiometries):
        formula = dict(zip([metal,halogen],stechiometry))
        new_entry = [Composition(formula).reduced_formula, space_group]
        new_entry.extend(construct_features_row(formula,atomic_table))            
        df.loc[len(df)] = new_entry

    return df

In [20]:
data = ase.db.connect('c2db-2021-06-24.db').select(is_magnetic=False)
novel_dataset = construct_novel_dataset(atomic_table)

In [21]:
novel_dataset

Unnamed: 0,Material,Space Group,Z_mean,Electronegativity_mean,IonizationPotential_mean,ElectronAffinity_mean,HOMO_mean,LUMO_mean,r_s_orbital_mean,r_p_orbital_mean,...,r_p_orbital_wstd,r_d_orbital_wstd,r_atomic_nonbonded_wstd,r_valence_lastorbital_wstd,r_covalent_wstd,Valence_wstd,PeriodicColumn_wstd,PeriodicColumn_upto18_wstd,NumberUnfilledOrbitals_wstd,Polarizability_wstd
0,Sc2F3,P-3m1,15.0,2.670,-13.15365,-1.62310,-7.17485,0.76665,1.02750,0.99850,...,0.405600,0.061058,0.138554,0.011130,0.331994,4.16,4.16,50.96,16.64,2774.431400
1,Sc2Cl3,P-3m1,19.0,2.260,-10.44055,-1.42390,-5.91520,2.36990,1.16455,1.18985,...,0.195124,0.153354,0.052650,0.008035,0.120224,4.16,4.16,50.96,16.64,2221.259274
2,Sc2Br3,P-3m1,28.0,2.160,-9.77930,-1.33820,-5.54710,2.61745,1.20100,1.25320,...,0.142222,0.032362,0.023400,0.023792,0.065000,50.96,4.16,50.96,16.64,1887.350400
3,Sc2I3,P-3m1,37.0,2.010,-9.07290,-1.23945,-5.15980,2.02120,1.27650,1.34895,...,0.078108,0.007426,0.003146,0.063449,0.024986,50.96,4.16,50.96,16.64,1362.857600
4,Ti2F3,P-3m1,15.5,2.760,-13.29680,-1.89805,-7.68275,0.77590,0.97495,0.95595,...,0.352212,0.077595,0.109850,0.005482,0.275834,2.34,2.34,43.94,12.74,2027.191400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,Cu2I3,P422,41.0,2.280,-9.69130,-2.37515,-5.84605,2.79185,1.05230,1.16205,...,0.007899,0.002244,0.000650,0.004213,0.001274,9.36,9.36,9.36,0.00,92.285856
1976,Zn2F3,P422,19.5,2.815,-14.57370,-1.04400,-8.54560,0.46595,0.75560,1.00875,...,0.419024,0.152477,0.093600,0.139619,0.109850,6.50,6.50,6.50,0.26,317.881501
1977,Zn2Cl3,P422,23.5,2.405,-11.86060,-0.84480,-7.28595,2.06920,0.89265,1.20010,...,0.204468,0.286213,0.026624,0.031868,0.010400,6.50,6.50,6.50,0.26,150.960476
1978,Zn2Br3,P422,32.5,2.305,-11.19935,-0.75910,-6.91785,2.31675,0.92910,1.26345,...,0.150216,0.001333,0.007514,0.012976,0.000104,6.50,6.50,6.50,0.26,73.960109


In [121]:
novel_dataset.to_csv('gap_prediction_random.csv')