In [4]:
import keras
from keras.layers import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import multiprocessing as mp
from sklearn.decomposition import PCA

# --- Load the featurized dataset and build the feature / target frames -------
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source.
# Non-finite entries (+/-inf, NaN) are replaced by 0 right after loading.
df = pd.read_pickle('df_S300_featurized').replace([np.inf, -np.inf, np.nan], 0)

# Features: every column except the target column.
df_X = df.drop(columns='Input Data|S_300_atom')

# Shorten the names of the Gaussian symmetry-function columns. 39 is the
# length of the matched text 'ChemEnvSiteFingerprint|GaussianSymmFunc', so
# x[39:] is the remainder of the name when that text is at the very start
# (presumably always the case -- TODO confirm against the column list).
a = {}
for x in df_X.columns:
    if 'ChemEnvSiteFingerprint|GaussianSymmFunc' not in x:
        continue
    a[x] = 'GaussianSymmFunc|' + x[39:]
df_X = df_X.rename(columns=a)

# Target: kept as a single-column DataFrame (not a Series).
df_Y = df[['Input Data|S_300_atom']]

# Min-max scaling was tried as an alternative and is left disabled:
#df_X=(df_X-df_X.min())/(df_X.max()-df_X.min())
#df_Y=(df_Y-df_Y.min())/(df_Y.max()-df_Y.min())

from sklearn.preprocessing import StandardScaler

# Standardize the features, then wrap the resulting array back into a frame
# carrying the original row index and column names.
x_scaled = StandardScaler().fit_transform(df_X)
df_X = pd.DataFrame(x_scaled, index=df_X.index, columns=df_X.columns)

# Clean up any non-finite values that are present after scaling.
df_X = df_X.replace([np.inf, -np.inf, np.nan], 0)

def get_features(n):
    """Return the labels "pls 1" ... "pls n" as a list of strings.

    Numbering is 1-based; ``n == 0`` yields an empty list.
    """
    labels = []
    for number in range(1, n + 1):
        labels.append("pls {}".format(number))
    return labels

def CV_NN(model,df_X,df_Y,n_feat,cv=5,epochs=200,batch_size=100,plot = False):
    """Cross-validate a compiled Keras model on PCA-reduced features.

    ``df_X`` and ``df_Y`` are shuffled together, then ``cv`` folds of
    ``len(df_X) // cv`` rows are held out one after another. For each fold a
    PCA with ``n_feat`` components is fitted on the training rows only, the
    model is reset to the weights it had when this function was called, and it
    is trained with the held-out fold passed as validation data.

    Parameters
    ----------
    model : compiled Keras model
        Its training history must expose the keys 'mean_absolute_error',
        'val_mean_absolute_error' and 'val_mean_absolute_percentage_error'.
    df_X : pandas.DataFrame
        Feature matrix, one row per sample.
    df_Y : pandas.DataFrame
        Targets, row-aligned with ``df_X``.
    n_feat : int
        Number of PCA components fed to the model.
    cv : int
        Number of folds.
    epochs, batch_size : int
        Forwarded to ``model.fit``.
    plot : bool
        If true, draw one training/validation MAE curve per fold.

    Returns
    -------
    tuple(float, float)
        ``(mae, mape)``: the fold-averaged scores, where each fold's score is
        the mean of the 40 lowest per-epoch validation values (fewer if the
        run has fewer than 40 epochs).
    """
    df_X, df_Y = shuffle(df_X, df_Y)
    fold_size = df_X.shape[0] // cv
    mae_score = np.zeros(cv)
    map_score = np.zeros(cv)

    # Snapshot taken once so every fold starts from the same initialization.
    # NOTE(review): only the layer weights are restored; the optimizer state
    # presumably carries over between folds -- confirm whether that matters.
    initial_weights = model.get_weights()

    for i in range(cv):
        # Split by position: the first `fold_size` rows are the validation
        # fold, the rest is training data. Positional slicing (instead of
        # dropping by index label) keeps the split exact even when the index
        # contains duplicate labels.
        df_X_val, df_X_train = df_X.iloc[:fold_size], df_X.iloc[fold_size:]
        df_Y_val, df_Y_train = df_Y.iloc[:fold_size], df_Y.iloc[fold_size:]

        # Fit the projection on training rows only so that no information
        # from the validation fold leaks into the features.
        pca = PCA(n_components=n_feat)
        pca.fit(df_X_train.values)
        x_train = pca.transform(df_X_train.values)
        # Plain arrays on both sides: the PCA was fitted on an array, so the
        # validation fold is passed as an array too.
        x_val = pca.transform(df_X_val.values)

        fit_params = {
            'x': x_train,
            'y': df_Y_train.values,
            'epochs': epochs,
            'batch_size': batch_size,
            'verbose': 0,
            'validation_data': (x_val, df_Y_val.values),
        }

        model.set_weights(initial_weights)
        history = model.fit(**fit_params)
        mae = history.history['mean_absolute_error']
        val_map = history.history['val_mean_absolute_percentage_error']
        val_mae = history.history['val_mean_absolute_error']

        # Fold score: mean of the 40 best (lowest) validation epochs.
        mae_score[i] = np.sort(val_mae)[:40].mean()
        map_score[i] = np.sort(val_map)[:40].mean()

        # Rotate: move the fold just used to the end so that the next
        # iteration holds out the following `fold_size` rows.
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df_X = pd.concat([df_X_train, df_X_val])
        df_Y = pd.concat([df_Y_train, df_Y_val])

        if plot:
            fig, ax = plt.subplots(figsize=(6,6))
            ax.plot(mae,label='mae')
            ax.plot(val_mae,label='val_mae')
            ax.set_title('entropy @ 300K')
            ax.set_ylabel('MAE [J/Kmol]')
            # The epoch axis is the x-axis; labelling it via set_ylabel would
            # overwrite the MAE label set just above.
            ax.set_xlabel('epochs')
            ax.legend()
            ax.grid()

    return (mae_score.mean(), map_score.mean())

def gen_model(n_features,bias=2):
    """Build and compile a regression network with one hidden ReLU layer.

    Parameters
    ----------
    n_features : int
        Width of the input vector.
    bias : int
        Extra units added to the hidden layer, whose size is
        ``int(n_features / 2) + bias``. (This is a layer-width offset, not a
        bias term of the network.)

    Returns
    -------
    keras.models.Model
        Model compiled with MAE loss and Adam (learning rate 0.01), tracking
        mean absolute error and mean absolute percentage error.
    """
    f_input = Input(shape=(n_features,))
    hidden1 = Dense(int(n_features / 2) + bias, activation='relu')(f_input)
    # Linear output unit: the prediction is unbounded in both directions.
    out = Dense(1, activation='linear')(hidden1)
    model = keras.models.Model(f_input, out)
    # The metric is given by its full name rather than the 'mae' alias:
    # CV_NN reads the history keys 'mean_absolute_error' and
    # 'val_mean_absolute_error', and the key used for the alias differs
    # between Keras versions, whereas the full name is reported as-is.
    model.compile(loss='mae',
                  metrics=['mean_absolute_error', 'mean_absolute_percentage_error'],
                  optimizer=keras.optimizers.Adam(0.01))
    return model
  
    
def grid_search(df_X,df_Y,n_features):
    """Cross-validate one freshly built model per entry of ``n_features``.

    For every component count ``n`` a new model is created with
    ``gen_model(n, bias=16)`` and scored with ``CV_NN`` using its default
    settings. Progress and the scores are printed as the search advances.

    Returns
    -------
    tuple(list, list)
        ``(mae_score, map_score)``, each holding one value per entry of
        ``n_features``, in the same order.
    """
    total = len(n_features)
    mae_score, map_score = [], []
    for step, n in enumerate(n_features, start=1):
        print('{}/{}'.format(step, total))
        net = gen_model(n, bias=16)
        fold_mae, fold_map = CV_NN(net, df_X, df_Y, n)
        print("{} features, {:.2f}% map/ {:.2f} mae".format(n, fold_map, fold_mae))
        mae_score.append(fold_mae)
        map_score.append(fold_map)
    return (mae_score, map_score)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [2]:
# Numbers of PCA components to scan: dense at the low end, coarser further up.
# NOTE(review): 996 is presumably the total number of feature columns -- confirm.
n_features = [
    2, 3, 4, 5, 6, 7, 8, 9, 10,
    12, 14, 16, 18, 20,
    24, 28, 32, 35,
    40, 45, 50, 55, 60,
    70, 80, 90, 100,
    150, 200, 250, 300,
    400, 500, 600, 700, 800, 900,
    996,
]
# One cross-validated (MAE, MAPE) pair per entry of n_features.
mae_err, map_err = grid_search(df_X, df_Y, n_features)

1/38
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
2 features, 58.38% map/ 9.13 mae
2/38
3 features, 39.85% map/ 7.25 mae
3/38
4 features, 37.14% map/ 6.60 mae
4/38
5 features, 34.65% map/ 6.20 mae
5/38
6 features, 32.07% map/ 5.92 mae
6/38
7 features, 31.08% map/ 5.72 mae
7/38
8 features, 30.27% map/ 5.64 mae
8/38
9 features, 30.35% map/ 5.75 mae
9/38
10 features, 29.00% map/ 5.39 mae
10/38
12 features, 29.64% map/ 5.03 mae
11/38
14 features, 28.24% map/ 4.94 mae
12/38
16 features, 26.82% map/ 4.69 mae
13/38
18 features, 25.49% map/ 4.59 mae
14/38
20 features, 25.64% map/ 4.55 mae
15/38
24 features, 25.43% map/ 4.41 mae
16/38
28 features, 24.57% map/ 4.40 mae
17/38
32 features, 23.97% map/ 4.26 mae
18/38
35 features, 25.10% map/ 4.15 mae
19/38
40 features, 24.26% map/ 4.21 mae
20/38
45 features, 23.17% map/ 4.20 mae
21/38
50 features, 23.66% map/ 4.24 mae
22/38
55 features, 22.96% map/ 4.16 mae
23/38
60 features

In [5]:
# Persist the MAE curve of the grid search; np.save adds the ".npy" suffix.
# Only mae_err is written here -- map_err is not saved by this cell.
err = np.asarray(mae_err)
np.save("pca_fit_results", err)