In [1]:
import keras
from keras.layers import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import multiprocessing as mp
from sklearn.cross_decomposition import PLSRegression as PLS

# Load the featurized table and replace every inf / -inf / NaN entry by 0.
# NOTE(review): read_pickle executes pickled code; only load trusted files.
df = pd.read_pickle('df_S300_featurized')
df = df.replace([np.inf, -np.inf, np.nan], 0)

# Target: the single column 'Input Data|S_300_atom', kept as a one-column
# DataFrame and left in its original units (no scaling is applied to it).
df_Y = df[['Input Data|S_300_atom']]

# Features: every remaining column.
df_X = df.drop('Input Data|S_300_atom', axis=1)

# Shorten the long fingerprint column names. 39 is the length of
# 'ChemEnvSiteFingerprint|GaussianSymmFunc', so the slice keeps whatever
# follows that prefix when the name starts with it.
# NOTE(review): the match uses `in`, not startswith — confirm the prefix
# always sits at position 0 of the column name.
a = {
    x: 'GaussianSymmFunc|' + x[39:]
    for x in df_X.columns
    if 'ChemEnvSiteFingerprint|GaussianSymmFunc' in x
}
df_X = df_X.rename(a, axis=1)

# Column-wise min-max scaling. Columns with max == min divide by zero; the
# resulting non-finite values are set to 0 by the replace that follows.
df_X = df_X.sub(df_X.min()).div(df_X.max() - df_X.min())
df_X = df_X.replace([np.inf, -np.inf, np.nan], 0)

def get_features(n):
    """Return the labels 'pls 1', 'pls 2', ..., 'pls n' as a list of strings.

    An ``n`` of zero (or a negative ``n``) yields an empty list.
    """
    labels = []
    for k in range(1, n + 1):
        labels.append("pls " + str(k))
    return labels

def CV_NN(model,df_X,df_Y,n_feat,cv=5,epochs=200,batch_size=100,plot = False):
  """Cross-validate a Keras model on PLS-reduced features.

  The rows are shuffled once and cut into ``cv`` contiguous folds of
  ``len(df_X) // cv`` rows each; leftover rows (when the length is not a
  multiple of ``cv``) are only ever used for training. For every fold:

  1. a PLS projection with ``n_feat`` components is fitted on the training
     rows only and applied to both training and validation rows,
  2. the model is reset to the weights it had when this function was called,
  3. the model is trained on the projected training rows and evaluated on
     the projected validation rows after every epoch.

  Parameters
  ----------
  model : compiled Keras model
      Must accept ``n_feat`` inputs and report the history keys
      'mean_absolute_error', 'val_mean_absolute_error' and
      'val_mean_absolute_percentage_error'. It is trained in place and ends
      up with the weights of the last fold.
  df_X, df_Y : array-like (DataFrames in this notebook)
      Features and targets with the same number of rows.
  n_feat : int
      Number of PLS components.
  cv : int
      Number of folds.
  epochs, batch_size : int
      Forwarded to ``model.fit``.
  plot : bool
      When true, draw one training/validation MAE curve per fold.

  Returns
  -------
  tuple of float
      ``(mae, mape)``: each is the mean over folds of the fold score, where
      a fold score is the mean of the 40 lowest per-epoch validation values
      (all epochs when fewer than 40 were run).

  Raises
  ------
  ValueError
      If ``cv`` is smaller than 1 or larger than the number of rows.
  """
  if cv < 1:
    raise ValueError("cv must be a positive integer, got {}".format(cv))

  df_X,df_Y = shuffle(df_X,df_Y)

  # Work on plain arrays and positional indices: this does not depend on the
  # index labels being unique and keeps PLS fit/transform inputs consistent.
  X_all = np.asarray(df_X)
  Y_all = np.asarray(df_Y)
  size = X_all.shape[0]
  fold_size = size//cv
  if fold_size == 0:
    raise ValueError(
        "cannot build {} folds from {} rows".format(cv, size))

  positions = np.arange(size)
  mae_score = np.zeros(cv)
  map_score = np.zeros(cv)

  # Snapshot of the starting weights so every fold starts from the same point.
  # NOTE(review): set_weights restores layer weights only; the optimizer
  # state presumably carries over between folds — confirm this is acceptable.
  weights = model.get_weights()

  for i in range(cv):
    start, stop = i*fold_size, (i+1)*fold_size
    val_pos = positions[start:stop]
    # Training rows: everything after the fold followed by everything before.
    train_pos = np.concatenate([positions[stop:], positions[:start]])

    x_train_raw, y_train = X_all[train_pos], Y_all[train_pos]
    x_val_raw, y_val = X_all[val_pos], Y_all[val_pos]

    print('start pls')
    # Fit the projection on training rows only so the validation fold does
    # not influence the components.
    pls = PLS(n_components=n_feat)
    pls.fit(x_train_raw, y_train)
    x_train = pls.transform(x_train_raw)
    x_val = pls.transform(x_val_raw)
    print('done')

    fit_params = {
            'x': x_train,
            'y': y_train,
            'epochs': epochs,
            'batch_size': batch_size,
            'verbose': 0,
            'validation_data': (x_val, y_val)
    }

    model.set_weights(weights)
    history = model.fit(**fit_params)
    # NOTE(review): these key names depend on how the model was compiled and
    # possibly on the Keras version — confirm after any upgrade.
    mae = history.history['mean_absolute_error']
    val_map = history.history['val_mean_absolute_percentage_error']
    val_mae = history.history['val_mean_absolute_error']

    # Fold score: mean of the 40 best (lowest) validation epochs.
    mae_score[i] = np.sort(val_mae)[:40].mean()
    map_score[i] = np.sort(val_map)[:40].mean()

    if plot:
      fig, ax = plt.subplots(figsize=(6,6))
      ax.plot(mae,label='mae')
      ax.plot(val_mae,label='val_mae')
      ax.set_title('entropy @ 300K')
      ax.set_ylabel('MAE [J/Kmol]')
      ax.set_xlabel('epochs')
      ax.legend()
      ax.grid()

  return (mae_score.mean(), map_score.mean())

def gen_model(n_features,bias=2):
    """Build and compile a small fully connected regression network.

    Architecture: ``n_features`` inputs -> one ReLU hidden layer with
    ``int(n_features/2) + bias`` units -> one output unit with linear
    activation. The model is compiled with an MAE loss, an Adam optimizer
    constructed with 0.01 as its first argument, and two reported metrics
    ('mae' and 'mean_absolute_percentage_error').

    Parameters
    ----------
    n_features : int
        Width of the input layer.
    bias : int
        Number of units added on top of half the input width in the hidden
        layer (it is a layer-size offset, not a bias term).

    Returns
    -------
    The compiled Keras model.
    """
    hidden_units = int(n_features/2) + bias

    inputs = Input(shape=(n_features,))
    hidden = Dense(hidden_units, activation='relu')(inputs)
    # Linear activation: the prediction is passed through unchanged.
    prediction = Dense(1, activation='linear')(hidden)

    network = keras.models.Model(inputs, prediction)
    optimizer = keras.optimizers.Adam(0.01)
    network.compile(
        optimizer=optimizer,
        loss='mae',
        metrics=['mae', 'mean_absolute_percentage_error'],
    )
    return network
  
    
def grid_search(df_X,df_Y,n_features):
    """Cross-validate one freshly built model per candidate feature count.

    For every ``n`` in ``n_features`` a model is created with
    ``gen_model(n, bias=16)`` and scored with ``CV_NN`` using ``n`` as the
    number of PLS components. Progress and the two scores are printed for
    each candidate.

    Parameters
    ----------
    df_X, df_Y
        Features and targets, forwarded unchanged to ``CV_NN``.
    n_features : sequence of int
        Candidate feature counts; must support ``len``.

    Returns
    -------
    tuple of list
        ``(mae_scores, mape_scores)``, each in the order of ``n_features``.
    """
    total = len(n_features)
    mae_score, map_score = [], []

    for step, n in enumerate(n_features, start=1):
        print('{}/{}'.format(step, total))
        candidate = gen_model(n, bias=16)
        cv_mae, cv_map = CV_NN(candidate, df_X, df_Y, n)
        print("{} features, {:.2f}% map/ {:.2f} mae".format(n, cv_map, cv_mae))
        mae_score.append(cv_mae)
        map_score.append(cv_map)

    return (mae_score, map_score)

Using TensorFlow backend.


In [None]:
# Candidate numbers of PLS components to evaluate (38 values, increasing
# step size towards the upper end).
n_features = [
    2, 3, 4, 5, 6, 7, 8, 9, 10,
    12, 14, 16, 18, 20,
    24, 28, 32, 35,
    40, 45, 50, 55, 60,
    70, 80, 90, 100,
    150, 200, 250, 300,
    400, 500, 600, 700, 800, 900, 1100,
]

# One cross-validated model per candidate; returns the MAE and MAPE lists.
mae_err, map_err = grid_search(df_X, df_Y, n_features=n_features)

1/38
Instructions for updating:
Colocations handled automatically by placer.
start pls
done
Instructions for updating:
Use tf.cast instead.
start pls
done
start pls
done
start pls
done
start pls
done
2 features, 30.79% map/ 5.61 mae
2/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
3 features, 34.63% map/ 6.01 mae
3/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
4 features, 26.34% map/ 4.52 mae
4/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
5 features, 28.39% map/ 4.78 mae
5/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
6 features, 27.15% map/ 4.42 mae
6/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
7 features, 30.02% map/ 4.87 mae
7/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
8 features, 25.26% map/ 4.31 mae
8/38
start pls
done
start pls
done
start pls
done
start pls
done
start pls
done
9 feature

In [None]:
# Store the grid-search MAE scores as a NumPy array on disk. Only the MAE
# list is written; the MAPE list (map_err) is not saved here.
err = np.asarray(mae_err)
np.save("pls_fit_results", err)