In [1]:
import keras
from keras.layers import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import multiprocessing as mp
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# Load the featurized dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load
# 'df_S300_featurized' when it comes from a trusted source.
df = pd.read_pickle('df_S300_featurized')
# Zero out non-finite entries so the scaling and the estimator below never
# see inf/NaN values.
df = df.replace([np.inf, -np.inf, np.nan], 0)

# Feature matrix: every column except the target.
df_X = df.drop('Input Data|S_300_atom', axis=1)

# Shorten the nested featurizer prefix: any column containing the long prefix
# is renamed to 'GaussianSymmFunc|' followed by whatever comes after the
# first len(prefix) characters (the original hard-coded this length as 39).
_gsf_prefix = 'ChemEnvSiteFingerprint|GaussianSymmFunc'
a = {
    x: 'GaussianSymmFunc|' + x[len(_gsf_prefix):]
    for x in df_X.columns
    if _gsf_prefix in x
}
df_X = df_X.rename(a, axis=1)

# Target kept as a one-column DataFrame; it is deliberately left unscaled.
df_Y = df[['Input Data|S_300_atom']]

# Min-max scale every feature column. Constant columns produce 0/0 = NaN,
# which the replace() below maps back to 0.
df_X = (df_X - df_X.min()) / (df_X.max() - df_X.min())
df_X = df_X.replace([np.inf, -np.inf, np.nan], 0)

# Mutual information between each feature and the target (despite the name,
# this is not an F-test). The estimator is randomized, so the seed is fixed
# to make the feature ranking reproducible across kernel restarts.
f_test = mutual_info_regression(
    df_X, df_Y['Input Data|S_300_atom'], random_state=0
)

def get_features(n):
    """Return the names of the ``n`` features with the highest score.

    Reads the module-level ``f_test`` score array and the module-level
    ``df_X`` frame; ``f_test[i]`` is taken to be the score of
    ``df_X.columns[i]``.

    Parameters
    ----------
    n : int
        Number of features to select. Values larger than the number of
        columns select every column.

    Returns
    -------
    pandas.Index
        Selected column names, ordered from lowest to highest score.
        Empty when ``n <= 0``.
    """
    # Guard the slice: ``arr[-0:]`` is the whole array, so n == 0 used to
    # return every feature, and a negative n sliced from the front.
    if n <= 0:
        return df_X.columns[:0]

    n_best = f_test.argsort()[-n:]

    return df_X.columns[n_best]

def CV_NN(model,df_X,df_Y,cv=5,epochs=200,batch_size=100,plot = False,random_state=None):
    """Cross-validate a compiled Keras-style model by rotating the hold-out fold.

    The rows are shuffled once. For each of the ``cv`` folds the model is
    reset to its initial weights and fitted with ``validation_split=1/cv``;
    afterwards the first ``size // cv`` rows are moved to the end so the next
    fit validates on a different slice (Keras' ``validation_split`` holds out
    the trailing fraction of the samples).

    Parameters
    ----------
    model : object
        Must provide ``get_weights()``, ``set_weights()`` and ``fit(...)``
        returning an object whose ``history`` dict contains
        'mean_absolute_error', 'val_mean_absolute_error' and
        'val_mean_absolute_percentage_error'.
    df_X, df_Y : pandas.DataFrame
        Features and targets with matching row order.
    cv : int
        Number of folds; must be at least 2.
    epochs : int
        Nominal epoch count. The fit actually runs ``epochs + 20`` epochs
        (kept from the original code; presumably a warm-up allowance --
        TODO confirm).
    batch_size : int
        Passed through to ``model.fit``.
    plot : bool
        When true, draw the training/validation MAE curves for each fold.
    random_state : int, RandomState instance or None
        Seed forwarded to ``sklearn.utils.shuffle``; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple of float
        ``(mean MAE, mean MAPE)`` over the folds, where each fold's score is
        the mean of the last 40 validation epochs.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 2.
    """
    if cv < 2:
        raise ValueError("cv must be at least 2, got {}".format(cv))

    df_X, df_Y = shuffle(df_X, df_Y, random_state=random_state)
    size = df_X.shape[0]
    fold = size // cv
    mae_score = np.zeros(cv)
    map_score = np.zeros(cv)

    # Snapshot of the untrained weights, restored before every fold.
    # NOTE(review): only the weights are restored; optimizer state may carry
    # over between folds -- confirm whether that matters here.
    weights = model.get_weights()

    for i in range(cv):
        model.set_weights(weights)
        history = model.fit(
            x=df_X.values,
            y=df_Y.values,
            epochs=epochs + 20,
            batch_size=batch_size,
            verbose=0,
            validation_split=1 / cv,
        )
        mae = history.history['mean_absolute_error']
        val_map = history.history['val_mean_absolute_percentage_error']
        val_mae = history.history['val_mean_absolute_error']

        # Score each fold on its last 40 validation epochs.
        mae_score[i] = np.array(val_mae[-40:]).mean()
        map_score[i] = np.array(val_map[-40:]).mean()

        # Rotate positionally: the leading block moves to the tail. This
        # replaces DataFrame.append (removed in pandas 2.0) and the
        # label-based drop/loc, which misbehaved on duplicate index labels.
        df_X = pd.concat([df_X.iloc[fold:], df_X.iloc[:fold]])
        df_Y = pd.concat([df_Y.iloc[fold:], df_Y.iloc[:fold]])

        if plot:
            fig, ax = plt.subplots(figsize=(6,6))
            ax.plot(mae,label='mae')
            ax.plot(val_mae,label='val_mae')
            ax.set_title('entropy @ 300K')
            ax.set_ylabel('MAE [J/Kmol]')
            # Was a second set_ylabel, which overwrote the MAE label.
            ax.set_xlabel('epochs')
            ax.legend()
            ax.grid()

    return (mae_score.mean(), map_score.mean())

def gen_model(n_features,bias=2):
    """Build and compile a single-hidden-layer regression network.

    The hidden layer has ``int(n_features/2) + bias`` ReLU units and feeds a
    single linear output unit. The model is compiled with MAE loss, reports
    MAE and mean absolute percentage error as metrics, and uses Adam with
    0.01 as its first positional argument.

    Parameters
    ----------
    n_features : int
        Width of the input layer.
    bias : int
        Extra hidden units added on top of half the input width.

    Returns
    -------
    keras.models.Model
        The compiled model.
    """
    inputs = Input(shape=(n_features,))
    hidden_units = int(n_features/2) + bias
    hidden = Dense(hidden_units, activation='relu')(inputs)
    # Linear activation: the prediction is not squashed or clipped.
    prediction = Dense(1, activation='linear')(hidden)

    net = keras.models.Model(inputs, prediction)
    net.compile(
        loss='mae',
        metrics=['mae', 'mean_absolute_percentage_error'],
        optimizer=keras.optimizers.Adam(0.01),
    )
    return net
  
    
def grid_search(df_X,df_Y,n_features):
    """Cross-validate one freshly built model per requested feature count.

    For every ``n`` in ``n_features`` the columns returned by
    ``get_features(n)`` are selected from ``df_X``, a model is built with
    ``gen_model(..., bias=16)`` and scored with ``CV_NN``. Progress and the
    scores are printed as the search runs.

    Note that ``get_features`` ranks columns using module-level state, not
    the ``df_X`` argument; the argument is only used to pick the columns.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Feature matrix containing every column ``get_features`` may return.
    df_Y : pandas.DataFrame
        Target values, passed unchanged to ``CV_NN``.
    n_features : sequence of int
        Feature counts to evaluate, in order.

    Returns
    -------
    tuple of list
        ``(mae_score, map_score)``, one entry per element of ``n_features``.
    """
    iterations = len(n_features)
    mae_score = []
    map_score = []
    for i, n in enumerate(n_features, start=1):
        print('{}/{}'.format(i,iterations))
        X = df_X[get_features(n)]
        # Size the network from the columns actually selected: when n exceeds
        # the number of available features the selection is shorter than n,
        # and a model built for n inputs would fail at fit time.
        width = X.shape[1]
        model = gen_model(width,bias=16)
        best_score, best_scorep = CV_NN(model,X,df_Y)
        print("{} features, {:.2f}% map/ {:.2f} mae".format(width,best_scorep,best_score))
        mae_score.append(best_score)
        map_score.append(best_scorep)
    return (mae_score,map_score)

Using TensorFlow backend.


In [2]:
# Feature counts to evaluate: dense at the low end, progressively coarser
# towards the full feature set.
n_features = [
    2, 3, 4, 5, 6, 7, 8, 9, 10,
    12, 14, 16, 18, 20,
    24, 28, 32,
    35, 40, 45, 50, 55, 60,
    70, 80, 90, 100,
    150, 200, 250, 300,
    400, 500, 600, 700, 800, 900,
    1100,
]
# Run the sweep; one MAE and one MAPE score come back per feature count.
mae_err, map_err = grid_search(df_X, df_Y, n_features=n_features)

1/38
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
2 features, 54.08% map/ 8.84 mae
2/38
3 features, 53.85% map/ 8.83 mae
3/38
4 features, 54.68% map/ 8.93 mae
4/38
5 features, 54.43% map/ 8.85 mae
5/38
6 features, 54.39% map/ 8.91 mae
6/38
7 features, 53.92% map/ 8.81 mae
7/38
8 features, 53.53% map/ 8.73 mae
8/38
9 features, 53.37% map/ 8.73 mae
9/38
10 features, 53.51% map/ 8.75 mae
10/38
12 features, 53.47% map/ 8.75 mae
11/38
14 features, 53.48% map/ 8.75 mae
12/38
16 features, 53.32% map/ 8.74 mae
13/38
18 features, 53.39% map/ 8.72 mae
14/38
20 features, 53.24% map/ 8.73 mae
15/38
24 features, 54.70% map/ 8.68 mae
16/38
28 features, 51.98% map/ 8.51 mae
17/38
32 features, 46.54% map/ 8.22 mae
18/38
35 features, 46.93% map/ 8.29 mae
19/38
40 features, 47.11% map/ 8.37 mae
20/38
45 features, 36.34% map/ 6.05 mae
21/38
50 features, 34.81% map/ 5.98 mae
22/38
55 features, 34.41% map/ 5.91 mae
23/38
60 features

In [4]:
# Persist the per-feature-count MAE scores from the sweep; only the MAE list
# is written, the MAPE list is not saved here.
err = np.array(mae_err)
np.save(file="mi_fit_results", arr=err)