In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import sys
import os
from catGP import preprocess_data, OMGP
from catSC import Linear_Scaling
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
def get_data(A):
    """Load the delta-fingerprint records of one adsorbate as GP inputs.

    Record layout, as noted alongside this loader (list index = position)::

        [metadata, slab_d0, slab_d1, bimetal_fp,
         ads_d0_unscaled, ads_d1_unscaled, ads_d2_unscaled, SS_unscaled,
         ads_d0_scaled, ads_d1_scaled, ads_d2_scaled, SS_scaled,
         energy, energy_A]

    Records whose last field (``energy_A``) is ``None`` are skipped.

    Parameters
    ----------
    A : str
        Adsorbate label, substituted into
        ``'../Delta_FP/{A}_data_delta.npy'``.

    Returns
    -------
    X : numpy.ndarray
        One row per kept record: fields 1, 2, 3, 4, 5, 8 and 9 of the
        record, each flattened and concatenated in that order.
    y : numpy.ndarray
        Second-to-last field (``energy``) of every kept record.
    y_sc : numpy.ndarray
        Last field (``energy_A``) of every kept record.

    Raises
    ------
    ValueError
        If no record has a non-``None`` last field, so no feature matrix
        can be built.
    """
    path = '../Delta_FP/{}_data_delta.npy'.format(A)
    # The file stores Python objects, so numpy >= 1.16.3 refuses to read it
    # unless allow_pickle=True is given explicitly.
    # NOTE(security): unpickling can execute arbitrary code -- only load
    # files that this project produced itself.
    data = np.load(path, allow_pickle=True)[()]

    # Filter once instead of repeating the same condition for X, y and y_sc.
    records = [rec for rec in data if rec[-1] is not None]
    if not records:
        raise ValueError(
            'no usable records (last field is None everywhere) in {}'.format(path))

    feature_fields = (1, 2, 3, 4, 5, 8, 9)
    X = np.vstack([np.hstack([rec[k].reshape(-1) for k in feature_fields])
                   for rec in records])
    y = np.array([rec[-2] for rec in records])
    y_sc = np.array([rec[-1] for rec in records])

    return X, y, y_sc

In [3]:
# Train a scaling-assisted GP for every adsorbate, over several train/test
# splits, and write the scaling fit, the per-split results and parity plots
# into a 'run_<adsorbate>' directory.
adsorbates = ['CH', 'CH2', 'CH3', 'OH', 'NH', 'SH']

# Seeds of the repeated 80/20 train/test splits. They are the same for every
# adsorbate, so the list is defined once rather than on each loop pass.
r_state = [10, 20, 42, 80]

for ads in adsorbates:
    print('Working on adsorbates: {}'.format(ads))
    X, y, y_sc = get_data(ads)
    data = preprocess_data(X, y)
    data.clean_data()
    X, y = data.get_data()
    # y_sc does not go through preprocess_data, so it must still line up
    # sample-for-sample with the cleaned targets.
    assert len(y) == len(y_sc), (
        'y ({}) and y_sc ({}) lengths differ after preprocessing for {}'
        .format(len(y), len(y_sc), ads))

    # Linear scaling fit; ads[0] is the first character of the adsorbate label.
    sc = Linear_Scaling(y_sc, y, ads[0], ads, 'eV')
    sc.get_coeff()
    a = sc.plot_scaling()

    scaling_dict = sc.__dict__

    # Kernel description handed to OMGP. It is rebuilt for each adsorbate so
    # every adsorbate starts from a fresh dict (how OMGP interprets or
    # modifies the recipe is not visible in this notebook).
    kernel_recipe = {'ConstantKernel' : [{'RBF' : [1.0,
                                                   {'length_scale' : 1.0}]},
                                      {'constant_value' : 1.0,
                                       'constant_value_bounds' : (3e-7, 3e7)}],
                     'WhiteKernel' : {'noise_level' : 0.1,
                                      'noise_level_bounds' : (1e-5, 1e5)}}

    # Attribute dict of the fitted model, keyed by split seed.
    r_data = {}
    for rs in r_state:
        print('    Working on random state: {}'.format(rs))
        X_train, X_test, y_train, y_test, y_sc_train, y_sc_test = train_test_split(
            X, y, y_sc,
            train_size=0.80,
            random_state=rs)

        MLGP = OMGP(X_train=X_train,
                    X_test=X_test,
                    y_train=y_train,
                    y_test=y_test,
                    kernel_recipe=kernel_recipe,
                    scaling=True,
                    scaling_params={'alpha' : sc.slope,
                                    'gamma' : sc.intercept},
                    scaling_y_train=y_sc_train,
                    scaling_y_test=y_sc_test)

        MLGP.run_GP()
        r_data[rs] = MLGP.__dict__

    # exist_ok avoids the check-then-create race and makes re-runs idempotent.
    os.makedirs('run_{}'.format(ads), exist_ok=True)

    a.savefig('run_{}/scaling_plot.png'.format(ads))
    np.save('run_{}/scaling_data.npy'.format(ads), scaling_dict)

    np.save('run_{0}/{0}_rsdata.npy'.format(ads), r_data)

    # Parity plots come from the model of the LAST split seed only, because
    # MLGP still refers to the final iteration of the loop above.
    # The first two calls deliberately omit err_bar so the library default
    # is used, exactly as before.
    parity_variants = [('train', {}),
                       ('test', {}),
                       ('train', {'err_bar' : True}),
                       ('test', {'err_bar' : True})]
    for split, extra in parity_variants:
        suffix = split + ('_err_bar' if extra else '')
        PP = MLGP.parity_plot(data=split, **extra)
        PP.savefig('run_{0}/{0}_parity_plot_{1}.png'.format(ads, suffix))

Working on adsorbates: CH
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
    Working on random state: 80
Working on adsorbates: CH2
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
    Working on random state: 80
Working on adsorbates: CH3
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
    Working on random state: 80
Working on adsorbates: OH
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
    Working on random state: 80
Working on adsorbates: NH
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
    Working on random state: 80




Working on adsorbates: SH
    Working on random state: 10




    Working on random state: 20
    Working on random state: 42
    Working on random state: 80




In [4]:
# Summarise train/test MAE over the split seeds of each adsorbate: the report
# is written to 'run_<adsorbate>/out.txt' and echoed to the notebook output.
for ads in adsorbates:
    # The file holds a pickled dict {seed: model attribute dict}; numpy
    # >= 1.16.3 only loads it when allow_pickle=True is passed explicitly.
    # NOTE(security): unpickling can execute arbitrary code -- only load
    # files produced by this notebook.
    data = np.load('run_{0}/{0}_rsdata.npy'.format(ads), allow_pickle=True)[()]
    MAE_train = np.array([data[k]['MAE_train'] for k in data])
    MAE_test = np.array([data[k]['MAE_test'] for k in data])

    # Format every line once and reuse it for both the file and stdout.
    report = ['MAE_train: {}'.format(MAE_train),
              'MAE_train mean: {}'.format(MAE_train.mean()),
              'MAE_train std: {}'.format(MAE_train.std()),
              'MAE_test: {}'.format(MAE_test),
              'MAE_test mean: {}'.format(MAE_test.mean()),
              'MAE_test std: {}'.format(MAE_test.std())]

    with open('run_{}/out.txt'.format(ads), 'w') as f:
        f.write('\n'.join(report) + '\n')

    # The header goes to stdout only; the trailing '\n' keeps a blank line
    # between adsorbates in the printed output.
    print('\nData for adsorbate: {}'.format(ads))
    print('\n'.join(report) + '\n')


Data for adsorbate: CH
MAE_train: [0.07702328 0.07315896 0.07320689 0.07728236]
MAE_train mean: 0.07516787307039073
MAE_train std: 0.001987130585298205
MAE_test: [0.13346903 0.12912852 0.1329131  0.1323896 ]
MAE_test mean: 0.1319750636272218
MAE_test std: 0.0016871919176643841


Data for adsorbate: CH2
MAE_train: [0.12440014 0.12872057 0.12913712 0.11992504]
MAE_train mean: 0.1255457180131942
MAE_train std: 0.003737719697014643
MAE_test: [0.18836695 0.203455   0.23062715 0.22236705]
MAE_test mean: 0.2112040354250598
MAE_test std: 0.016457901229868205


Data for adsorbate: CH3
MAE_train: [0.12118255 0.12411492 0.12435277 0.1216301 ]
MAE_train mean: 0.12282008453291307
MAE_train std: 0.0014250725197625493
MAE_test: [0.17386516 0.15516223 0.1549704  0.14907004]
MAE_test mean: 0.1582669588216394
MAE_test std: 0.00933265628195246


Data for adsorbate: OH
MAE_train: [0.10544093 0.10488722 0.09974031 0.10495725]
MAE_train mean: 0.10375642960571077
MAE_train std: 0.002328486530771459
MAE_test

In [5]:
# Reference note kept as a bare string so the notebook displays it as the cell
# output; presumably the per-record field order of the '*_data_delta.npy'
# files read by get_data -- TODO confirm against the code that writes them.
'''data_format = [metadata, slab_d0, slab_d1, bimetal_fp, 
                  ads_d0_unscaled, ads_d1_unscaled, ads_d2_unscaled, SS_unscaled,
                  ads_d0_scaled, ads_d1_scaled, ads_d2_scaled, SS_scaled,
                  energy, energy_A]'''

'data_format = [metadata, slab_d0, slab_d1, bimetal_fp, \n                  ads_d0_unscaled, ads_d1_unscaled, ads_d2_unscaled, SS_unscaled,\n                  ads_d0_scaled, ads_d1_scaled, ads_d2_scaled, SS_scaled,\n                  energy, energy_A]'