In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import tensorflow as tf
import statistics as stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam
import tensorflow.compat.v1 as tf
from tensorflow.python.keras import backend as K
from math import sqrt
from numpy.random import seed

Using TensorFlow backend.


In [2]:
# ensure repeatability
np.random.seed(23)

In [3]:
data = pd.read_csv("ML_data.csv", sep="|")
pd.set_option("display.max_columns", None)

In [4]:
data.head()

Unnamed: 0,Type,AB,Site,Impurity,dH (A-rich),dH (B-rich),(+2/+1),(+1/0),(0/-1),(-1/-2),Z_A,Z_B,PBE_delta_H,PBE_latt_const,PBE_gap,Eps_elec,Eps_ion,CM1,CM2,CM3,CM4,CM5,CM6,CM7,CM8,Ion_rad,BP,MP,Density,At_wt,ICSD_vol,Cov_rad,Ion_Energy,At_rad_1,Elec_Aff,At_rad_2,At_vol,Mend_num,Ion_pot_1,Ion_pot_2,Thermal_expn,Sp_heat_cap,Therm_cond,Elec_cond,Heat_fusion,Heat_vap,Electronegativity,At_num,Period,Group,Valence,Ox_state
0,IV-IV,SiC,M_i_A,In,15.963,15.963,3.891,4.66,5.109,5.629,14,6,-0.42,4.25,1.27,18.06,2.27,373.01,373.01,373.01,373.01,138.44,138.44,138.44,79.93,0.8,2350.0,429.78,7.31,114.82,26.1,1.44,558.3,1.66,29.0,2.0,15.7,75,5.79,18.87,32.1,0.23,81.6,3.4,3.26,226.34,1.78,49,5,13,3,3
1,IV-IV,SiC,M_i_B,La,19.972,19.554,3.863,4.539,4.991,5.556,14,6,-0.42,4.25,1.27,18.06,2.27,375.78,375.78,375.78,216.96,185.96,185.96,185.96,185.96,1.16,3737.0,1191.0,6.15,138.91,37.4,1.25,538.1,1.88,50.0,2.74,20.73,13,5.58,11.06,5.2,0.19,13.5,1.9,11.3,399.57,1.1,57,6,3,3,3
2,IV-IV,SiC,M_i_neut,Zn,7.157,7.157,3.206,4.244,4.734,5.363,14,6,-0.42,4.25,1.27,18.06,2.27,152.25,238.53,238.53,238.53,102.23,102.23,102.23,65.25,0.74,1180.0,692.73,7.13,65.39,15.1,1.25,906.4,1.33,0.0,1.53,9.2,69,9.39,17.96,30.2,0.39,116.0,16.9,7.38,115.3,1.65,30,4,12,2,2
3,IV-IV,SiC,M_i_B,Al,6.951,7.11,3.503,4.242,4.65,5.319,14,6,-0.42,4.25,1.27,18.06,2.27,85.7,85.7,85.7,49.48,42.41,42.41,42.41,42.41,0.54,2740.0,933.5,2.7,26.98,16.6,1.18,577.6,1.43,45.0,1.62,10.0,73,5.99,18.83,23.1,0.9,237.0,37.7,10.7,290.8,1.61,13,3,13,3,3
4,IV-IV,SiC,M_B,I,8.783,9.201,3.614,4.221,5.058,5.608,14,6,-0.42,4.25,1.27,18.06,2.27,403.46,403.46,403.46,403.46,6874.36,105.89,105.89,105.89,2.2,457.5,386.7,4.93,126.9,42.5,1.33,1008.4,1.33,295.3,1.32,25.74,96,10.45,19.13,87.0,0.14,0.45,0.0,7.76,20.9,2.66,53,5,17,7,1


In [5]:
############ DATA PRE-PROCESSING ############ 

x_data = data.iloc[:, 10:53]
y_data = data.iloc[:, 6:10]

# separate categorical and continuous data 
categorical=pd.DataFrame()
continuous=pd.DataFrame()

for index in x_data.columns:
    if(x_data[index].dtypes == "int"):
        categorical[index]=x_data[index]
    elif(x_data[index].dtypes == "float"):
        continuous[index]=x_data[index]
    else:
        pass

# one hot encode categorical data
onehotencoder = OneHotEncoder()
categorical = onehotencoder.fit_transform(categorical).toarray()

# standardize continuous data
scaler = StandardScaler()
continuous = scaler.fit_transform(continuous)

# re-combine categorical and continuous data
x = np.concatenate((continuous, categorical), axis=1)

# extract y data and standardize (DFT predicted / output)
y = scaler.fit_transform(y_data)

# split training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=23)

# input and output dimension
in_dim = x.shape[1]
out_dim = y.shape[1]

In [6]:
# construct model

def construct_model(hidden_layers, activation, learning_rate, adam_decay):
    
    # Create and add layers to model
    model = Sequential()
    model.add(Dense(out_dim*(2**(hidden_layers+1)), input_dim=in_dim, 
                    activation=activation))
    for layer in range(hidden_layers, 0, -1):
        model.add(Dense(out_dim*(2**layer), activation=activation))
    model.add(Dense(out_dim))

    # configure optimizer & compile model
    opt = Adam(lr=learning_rate, decay=adam_decay)
    model.compile(loss="mse", optimizer=opt)

    #### summarize model
    # print(model.summary())
    
    return model

In [7]:
# Set parameter bounds to be optimized
hidden_layers = Integer(low=1, high=8, name="hidden_layers")
epochs = Integer(low=30, high=50, name="epochs")
batch_size = Integer(low=10, high=70, name="batch_size")
activation = Categorical(categories=["relu", "sigmoid"], name="activation")
learning_rate = Real(low=1e-5, high=1e-2, prior="log-uniform", 
                     name="learning_rate")
adam_decay = Real(low=0,high=0.1,name="adam_decay")

dimensions = [hidden_layers,
              epochs,
              batch_size,
              activation,
              learning_rate,
              adam_decay,
             ]

default_parameters = [6, 30, 35, 'relu', 1e-3, 0.01]

In [8]:
@use_named_args(dimensions)
def train_model(hidden_layers, epochs, batch_size, activation, learning_rate, 
                adam_decay, verbose=0):

    model = construct_model(hidden_layers=hidden_layers, 
                            activation=activation, 
                            learning_rate=learning_rate,
                            adam_decay=adam_decay)
    
    # train model
    prediction = model.fit(x_train, y_train,
                          validation_data=(x_test, y_test),
                          epochs=epochs,
                          batch_size=batch_size,
                          verbose=verbose)
    
    # Finding mean RMSE of testing data
    pred_test = model.predict(x_test)
    RMSE_test = (stats.mean([sqrt(mean_squared_error(y_test[:,0], 
                                                     pred_test[:,0])), 
                            sqrt(mean_squared_error(y_test[:,1], 
                                                    pred_test[:,1])),  
                            sqrt(mean_squared_error(y_test[:,2], 
                                                    pred_test[:,2])),
                            sqrt(mean_squared_error(y_test[:,3], 
                                                    pred_test[:,3]))
                            ])
                )
    
    # clear session & reset model graphs
    K.clear_session()
    tf.reset_default_graph()

    return RMSE_test

In [9]:
# Run Bayesian Optimization and return its results
bayopt_results = gp_minimize(func=train_model, 
                             dimensions=dimensions, 
                             n_random_starts=5,
                             n_calls=10, 
                             n_jobs=-1,
                             kappa=5,
                             x0=default_parameters,
                             random_state=23,
                             verbose=True
                            )

Iteration No: 1 started. Evaluating function at provided point.
Iteration No: 1 ended. Evaluation done at provided point.
Time taken: 7.3245
Function value obtained: 0.4332
Current minimum: 0.4332
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 5.4595
Function value obtained: 1.0997
Current minimum: 0.4332
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.8769
Function value obtained: 0.9461
Current minimum: 0.4332
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.6904
Function value obtained: 0.9614
Current minimum: 0.4332
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 3.5455
Function value obtained: 0.9859
Current minimum: 0.4332
Iteration No: 6 started. Evalua

In [10]:
column_names = ["hidden_layers",
              "epochs",
              "batch_size",
              "activation",
              "learning_rate",
              "adam_decay",
             ]

In [11]:
iterations = pd.DataFrame(data=bayopt_results.x_iters, 
                          index=bayopt_results.func_vals, columns=column_names)
print(bayopt_results.x)
iterations

[6, 30, 35, 'relu', 0.001, 0.01]


Unnamed: 0,hidden_layers,epochs,batch_size,activation,learning_rate,adam_decay
0.433236,6,30,35,relu,0.001,0.01
1.099691,6,34,57,sigmoid,0.000212,0.0184
0.94611,2,34,60,relu,0.001835,0.099687
0.961448,1,50,35,sigmoid,0.001811,0.057886
0.985864,4,40,33,sigmoid,0.001855,0.083368
0.520045,2,34,62,relu,0.004248,0.047061
0.907479,1,50,21,relu,0.000415,0.012823
0.987143,4,50,27,relu,1e-05,0.1
0.450508,5,30,48,relu,0.0023,0.020085
0.778052,8,38,70,relu,1e-05,0.0


In [12]:
from datetime import datetime
import pytz

tz_NY = pytz.timezone('America/Los_Angeles') 
datetime_NY = datetime.now(tz_NY)
print("Finished Time:", datetime_NY.strftime("%H:%M:%S"))

Finished Time: 15:16:41


In [13]:
iterations.to_excel(r'BayOpt3.xlsx')