In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import sys
import os
from catGP import preprocess_data, OMGP
from catSC import Linear_Scaling
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
def get_data(A):
    """Load the fingerprint matrix and target energies for one adsorbate.

    Every record stored in ``../Delta_FP/<A>_data_delta.npy`` follows this
    layout::

        [metadata, slab_d0, slab_d1, bimetal_fp,
         ads_d0_unscaled, ads_d1_unscaled, ads_d2_unscaled, SS_unscaled,
         ads_d0_scaled, ads_d1_scaled, ads_d2_scaled, SS_scaled,
         energy, energy_A]

    Records whose last entry (``energy_A``) is ``None`` are skipped.

    Parameters
    ----------
    A : str
        Adsorbate label used to build the file name.

    Returns
    -------
    X : numpy.ndarray
        One row per kept record: entries 1..11 of the record, each flattened
        and concatenated in order.
    y : numpy.ndarray
        Second-to-last entry (``energy``) of every kept record.
    y_sc : numpy.ndarray
        Last entry (``energy_A``) of every kept record.

    Raises
    ------
    ValueError
        Propagated from ``np.vstack`` when no record survives the filter.
    """
    # The records are Python objects, so the file is pickled; NumPy >= 1.16.3
    # refuses to load it unless allow_pickle is set explicitly.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    data = np.load('../Delta_FP/{}_data_delta.npy'.format(A),
                   allow_pickle=True)[()]

    # Filter once so that X, y and y_sc are guaranteed to stay row-aligned.
    kept = [rec for rec in data if rec[-1] is not None]

    X = np.vstack([np.hstack([rec[k].reshape(-1) for k in range(1, 12)])
                   for rec in kept])
    y = np.array([rec[-2] for rec in kept])
    y_sc = np.array([rec[-1] for rec in kept])

    return X, y, y_sc

In [3]:
# Prefixes of the ten per-site descriptor sets.
# NOTE(review): these presumably correspond to slab_d0 ... SS_scaled in the
# record layout -- confirm against the fingerprint generator.
d1 = [
    'SD0', 'SD1',
    'AD0U', 'AD1U', 'AD2U', 'SSU',
    'AD0S', 'AD1S', 'AD2S', 'SSS',
]

# Properties combined with every prefix in d1.
p1 = [
    'atomic_number',
    'atomic_radius',
    'dband_center_slab',
    'dband_width_slab',
    'dband_skewness_slab',
    'dband_kurtosis_slab',
    'dipole_polarizability',
    'electron_affinity',
    'heat_of_formation',
    'specific_heat',
]

# Properties that get the fixed 'BF_' prefix.
p2 = [
    'atomic_number',
    'atomic_radius',
    'dband_center_slab',
    'dband_width_slab',
    'dipole_polarizability',
    'electron_affinity',
    'heat_of_formation',
    'specific_heat',
    'en_allen',
]

# Property-major ordering: for each property in p1, one name per prefix in d1.
fp_list = ['{}_{}'.format(prefix, prop) for prop in p1 for prefix in d1]
fp_list.extend('BF_' + prop for prop in p2)
print(len(fp_list))



109


In [4]:
adsorbates = ['CH', 'CH2', 'CH3', 'OH', 'NH', 'SH']

# Seeds for the repeated 80/20 train/test splits; identical for every
# adsorbate, so defined once outside the loop.
r_state = [10, 20, 42]

for ads in adsorbates:
    # Names of the features to keep for this adsorbate.
    # allow_pickle=True is required by NumPy >= 1.16.3 if the file holds
    # Python objects and is harmless otherwise.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    mask_fp = np.load('../GP_d2_FS_LASSOCV/{}_fp.npy'.format(ads),
                      allow_pickle=True)[()]
    # Column indices of the selected features within the full fingerprint.
    mask = [i for i, j in enumerate(fp_list) if j in mask_fp]
    print('Working on adsorbates: {}'.format(ads))
    X, y, y_sc = get_data(ads)
    X = X[:, mask]
    print(X.shape)
    data = preprocess_data(X, y)
    data.clean_data()
    X, y = data.get_data()
    # NOTE(review): this only checks lengths; if clean_data() ever drops rows,
    # y_sc would also need the same filtering -- confirm in catGP.
    assert len(y) == len(y_sc)

    sc = Linear_Scaling(y_sc, y, ads[0], ads, 'eV')
    sc.get_coeff()
    a = sc.plot_scaling()

    scaling_dict = sc.__dict__

    # Rebuilt for every adsorbate (as in the original flow) so that nothing
    # done to it by one OMGP run can carry over to the next adsorbate.
    kernel_recipe = {'ConstantKernel' : [{'RBF' : [1.0,
                                                   {'length_scale' : 1.0}]},
                                      {'constant_value' : 1.0,
                                       'constant_value_bounds' : (3e-7, 3e7)}],
                     'WhiteKernel' : {'noise_level' : 0.1,
                                      'noise_level_bounds' : (1e-5, 1e5)}}

    r_data = {rs : {} for rs in r_state}
    for rs in r_state:
        print('    Working on random state: {}'.format(rs))
        X_train, X_test, y_train, y_test, y_sc_train, y_sc_test = train_test_split(
            X, y, y_sc,
            train_size=0.80,
            random_state=rs)

        MLGP = OMGP(X_train=X_train,
                    X_test=X_test,
                    y_train=y_train,
                    y_test=y_test,
                    kernel_recipe=kernel_recipe,
                    scaling=True,
                    scaling_params={'alpha' : sc.slope,
                                    'gamma' : sc.intercept},
                    scaling_y_train=y_sc_train,
                    scaling_y_test=y_sc_test)

        MLGP.run_GP()
        # Keep the full state of this run, keyed by its seed.
        r_data[rs] = MLGP.__dict__

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    run_dir = 'run_{}'.format(ads)
    os.makedirs(run_dir, exist_ok=True)

    a.savefig('{}/scaling_plot.png'.format(run_dir))
    np.save('{}/scaling_data.npy'.format(run_dir), scaling_dict)
    np.save('{0}/{1}_rsdata.npy'.format(run_dir, ads), r_data)

    # Parity plots come from the model of the LAST random state only, because
    # MLGP is rebound on every pass of the loop above.
    # err_bar is passed only where the original call passed it, so the
    # library default still applies to the plain plots.
    parity_variants = [('train', {}, ''),
                       ('test', {}, ''),
                       ('train', {'err_bar' : True}, '_err_bar'),
                       ('test', {'err_bar' : True}, '_err_bar')]
    for split, extra_kwargs, suffix in parity_variants:
        PP = MLGP.parity_plot(data=split, **extra_kwargs)
        PP.savefig('{0}/{1}_parity_plot_{2}{3}.png'.format(run_dir, ads,
                                                            split, suffix))

Working on adsorbates: CH
(901, 31)
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
Working on adsorbates: CH2
(613, 8)
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
Working on adsorbates: CH3
(489, 37)
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
Working on adsorbates: OH
(594, 18)
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
Working on adsorbates: NH
(918, 28)
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42




Working on adsorbates: SH
(682, 8)
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42




In [5]:
# Summarise train/test MAE over the random states of every adsorbate run:
# the same report is written to run_<ads>/out.txt and echoed to stdout.
for ads in adsorbates:
    # The file holds a pickled dict keyed by random state (written with
    # np.save), so NumPy >= 1.16.3 needs allow_pickle=True to read it back.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    data = np.load('run_{0}/{0}_rsdata.npy'.format(ads),
                   allow_pickle=True)[()]
    MAE_train = np.array([data[k]['MAE_train'] for k in data])
    MAE_test = np.array([data[k]['MAE_test'] for k in data])

    # Format the report once so the file and the console cannot drift apart.
    report = ''.join([
        'MAE_train: {}\n'.format(MAE_train),
        'MAE_train mean: {}\n'.format(MAE_train.mean()),
        'MAE_train std: {}\n'.format(MAE_train.std()),
        'MAE_test: {}\n'.format(MAE_test),
        'MAE_test mean: {}\n'.format(MAE_test.mean()),
        'MAE_test std: {}\n'.format(MAE_test.std()),
    ])

    with open('run_{}/out.txt'.format(ads), 'w') as f:
        f.write(report)

    print('\nData for adsorbate: {}'.format(ads))
    # report already ends with a newline; print() adds the blank separator
    # line between adsorbates.
    print(report)


Data for adsorbate: CH
MAE_train: [0.03845378 0.02331749 0.02396364]
MAE_train mean: 0.028578302048917772
MAE_train std: 0.006988000755588839
MAE_test: [0.11364615 0.1082941  0.11349598]
MAE_test mean: 0.11181207842776032
MAE_test std: 0.002488341119064285


Data for adsorbate: CH2
MAE_train: [0.16017625 0.16365636 0.09251548]
MAE_train mean: 0.13878269618970843
MAE_train std: 0.03274669418174944
MAE_test: [0.19146444 0.17252968 0.24415545]
MAE_test mean: 0.2027165254547543
MAE_test std: 0.030304232766507875


Data for adsorbate: CH3
MAE_train: [0.09815495 0.10682018 0.10055821]
MAE_train mean: 0.10184444625432632
MAE_train std: 0.003652610538604174
MAE_test: [0.1696629  0.14380469 0.1631748 ]
MAE_test mean: 0.15888079506662814
MAE_test std: 0.01098455486887664


Data for adsorbate: OH
MAE_train: [0.11808628 0.10776778 0.10773225]
MAE_train mean: 0.11119543703651419
MAE_train std: 0.004872583800027916
MAE_test: [0.15062094 0.12593715 0.14566477]
MAE_test mean: 0.14074095259104102
MAE_

In [6]:
# Reference note: layout of one record (14 entries). It is a bare string
# expression, so the cell simply echoes it as its output.
'''data_format = [metadata, slab_d0, slab_d1, bimetal_fp, 
                  ads_d0_unscaled, ads_d1_unscaled, ads_d2_unscaled, SS_unscaled,
                  ads_d0_scaled, ads_d1_scaled, ads_d2_scaled, SS_scaled,
                  energy, energy_A]'''

'data_format = [metadata, slab_d0, slab_d1, bimetal_fp, \n                  ads_d0_unscaled, ads_d1_unscaled, ads_d2_unscaled, SS_unscaled,\n                  ads_d0_scaled, ads_d1_scaled, ads_d2_scaled, SS_scaled,\n                  energy, energy_A]'