### Python packages used in this code

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import random
import os
import pickle
import time
import sklearn
import platform
import sys
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import itertools
from IPython.display import clear_output

## Keras
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model, model_from_json
from tensorflow.keras.layers import Dense, Input, Add, Lambda, Dropout, Subtract, Multiply, Concatenate, Dot, BatchNormalization, Activation, LeakyReLU, ReLU
from tensorflow.keras.losses import mse
import keras.backend as K
from tensorflow.keras import regularizers
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
tensorflow.get_logger().setLevel("ERROR")

%matplotlib inline

In [2]:
"""
Environments

--Platform--
OS : Windows-10-10.0.19044-SP0
--Version--
python :  3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]
numpy : 1.21.5
pandas : 1.4.1
sklearn : 1.0.2
tensorflow : 2.3.1
keras : 2.4.0
"""

# Report the platform and library versions of the kernel that is actually
# running. The string literal above is static text and is not checked
# against these live values.
print('--Platform--')
print('OS :', platform.platform())
print('--Version--')
# (label, value) pairs; labels are kept verbatim so the printed report is unchanged.
version_report = (
    ('python : ', sys.version),
    ('numpy :', np.__version__),
    ('pandas :', pd.__version__),
    ('sklearn :', sklearn.__version__),
    ('tensorflow :', tensorflow.__version__),
    ('keras :', keras.__version__),
)
for report_label, report_value in version_report:
    print(report_label, report_value)

--Platform--
OS : Windows-10-10.0.19044-SP0
--Version--
python :  3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
numpy : 1.23.1
pandas : 1.4.3
sklearn : 1.1.1
tensorflow : 2.9.1
keras : 2.9.0


# Preparation

## fix_seed function

In [3]:
def fix_seed(seed):
    """Seed every random number source used in this notebook with ``seed``.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    TensorFlow's global generator, and stores ``seed`` in the
    ``PYTHONHASHSEED`` environment variable.

    Note: CPython reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    assignment here only affects processes launched afterwards, not string
    hashing in the kernel that is already running.
    """
    # Hash seed (environment variable, see note in the docstring)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Built-in random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Tensorflow
    tensorflow.random.set_seed(seed)

## PrintDot function

In [4]:
class PrintDot(keras.callbacks.Callback):
    """Minimal text progress indicator for ``model.fit(..., verbose=0)``.

    Prints one ``.`` per finished epoch. An empty line is printed first when
    ``epoch % 100 == 0``, and the epoch index is printed first when
    ``epoch % 100 == 1``.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print the progress marker for ``epoch``.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished.
        logs : dict, optional
            Metrics passed in by Keras; unused here. Defaults to ``None`` to
            match the signature of ``keras.callbacks.Callback.on_epoch_end``.
        """
        if epoch%100==0: print('')
        if epoch%100==1: print(epoch)
        print('.', end='')

## Save directory

In [5]:
# Create every output directory used by this notebook.
# exist_ok=True makes the call a no-op when the directory is already there,
# which replaces the separate isdir check (and its check-then-create race).
for output_dir in (
    '../30_Output/10_Model/100_MakeSourceModel',
    '../30_Output/20_Plot/100_MakeSourceModel',
    '../30_Output/30_csv/100_MakeSourceModel',
    '../30_Output/40_pkl/100_MakeSourceModel',
):
    os.makedirs(output_dir, exist_ok=True)

## Settings

In [6]:
# Dataset
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
data_all = joblib.load('../10_Data/SPSTC_290.pkl')
# Descriptor columns removed from the input matrix before modelling.
drop_list = [
    'min:gs_mag_moment',
    'min:num_d_unfilled',
    'min:num_f_unfilled',
    'min:num_f_valence',
    'ave:num_f_unfilled',
    'ave:num_f_valence',
    'sum:num_f_unfilled',
    'sum:num_f_valence',
    'var:num_f_unfilled',
    'var:num_f_valence',
    'max:num_f_unfilled',
    'max:num_f_valence',
]

x_all = data_all['desc']
x_all = x_all.drop(drop_list,axis=1)
y_all = data_all['data']['SPS (cm)']

## Scaling parameters
# Inputs are standardised column-wise; the target is standardised in log space.
x_mean = x_all.mean()
x_std = x_all.std()
y_log = np.log(y_all)
y_logmean = y_log.mean()
y_logstd = y_log.std()

# Network input width: taken from the data actually loaded instead of the
# hard-coded "290 - len(drop_list)", so it cannot drift from x_all.
dim_x = x_all.shape[1]
# Stop a fit when the validation MSE has not improved for 50 epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_mse', patience=50)
# Iteration indices; each one is also used as the random seed of its model.
i_list = np.arange(0,100,1)

# Main code

In [7]:
# Result table: one row per trained model holding its hyper-parameters and
# test metrics, followed by one column per sample (set to 1 for the samples
# that were in that model's training split, NaN otherwise after concat).
summary_columns = ['Itr','layers','lambda', 'dropout rate','learning rate', 'MSE', 'Corr', 'MAE']
sample_columns = x_all.index.values.tolist()
df_result = pd.DataFrame(np.zeros([0, len(summary_columns)+len(sample_columns)]), columns=summary_columns+sample_columns)

# Hyper-parameters shared by every model (defined once so that the values
# used for training and the values written to the result table cannot diverge).
n_train = 256          # number of samples in each training split
num_layers = 3         # number of hidden layers
lmbd = 1e-2            # L2 regularisation strength
dr_rate = 0            # recorded in the result table only; no Dropout layer is built
learning_rate = 1e-4   # Adam learning rate

# Output locations
model_dir = '../30_Output/10_Model/100_MakeSourceModel/'
plot_dir = '../30_Output/20_Plot/100_MakeSourceModel/'
csv_path = '../30_Output/30_csv/100_MakeSourceModel/100_Result.csv'
pkl_dir = '../30_Output/40_pkl/100_MakeSourceModel/'

# Train one model per entry of i_list
for i in i_list:
    rnd1 = i
    rnd2 = i*2
    print('try : '+str(i)+' ('+str(rnd1)+', '+str(rnd2)+')')
    fix_seed(373)
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, train_size=n_train, random_state=int(rnd1))

    ## Scaling (global statistics computed in the settings cell)
    x_train_scal = (x_train - x_mean)/x_std
    x_test_scal = (x_test - x_mean)/x_std
    y_train_scal = (np.log(y_train)-y_logmean)/y_logstd

    ## Architecture: all hidden layers share one randomly drawn width in [50, 100)
    fix_seed(int(rnd1))
    width_layers = np.zeros(num_layers)+np.random.randint(50,100)

    ## Model definition
    def make_model():
        """Build and compile the fully connected regression network."""
        model_input = Input(shape=(dim_x,))
        h = model_input
        for i_model in range(num_layers):
            # width_layers is a float array; Dense expects an integer unit count
            h = Dense(units=int(width_layers[i_model]), kernel_regularizer=regularizers.L2(lmbd))(h)
            h = LeakyReLU(alpha=0.01)(h)

        out = Dense(units=1, kernel_regularizer=regularizers.L2(lmbd))(h)
        model = Model(model_input, out)
        optimizer = keras.optimizers.Adam(learning_rate)
        model.compile(loss='mse',
                     optimizer=optimizer,
                     metrics=['mae', 'mse'])
        return(model)
    model = make_model()

    ## Model training
    fix_seed(int(rnd1))
    history = model.fit(
        x_train_scal,
        y_train_scal,
        batch_size=8,
        epochs=1000,
        validation_split = 0.2,
        verbose=0,
        callbacks=[early_stop, PrintDot()])

    # e.g. "<dim_x>-<w>-<w>-<w>-1"
    text_layers = '-'.join(map(str, map(int,np.append(np.append(np.array([dim_x]),width_layers),np.array([1]) ))))

    # Predictions are made in the scaled log space and mapped back to the original scale
    y_fits_scal = model.predict(x_train_scal)
    y_fits = np.exp(y_fits_scal * y_logstd + y_logmean)
    y_pred_scal = model.predict(x_test_scal)
    y_pred = np.exp(y_pred_scal * y_logstd + y_logmean)

    ## Test metrics (original scale), computed once and reused for plot and table
    y_obs1 = y_train
    y_prd1 = y_fits.reshape(-1)
    y_obs2 = y_test
    y_prd2 = y_pred.reshape(-1)
    result_mse = np.mean((y_obs2-y_prd2)**2)
    result_corr = np.corrcoef(y_prd2.reshape(y_prd2.shape[0]), y_obs2.values)[0,1]
    result_mae = np.mean(np.abs(y_obs2-y_prd2))

    ## Save
    ### Plot
    fig = plt.figure(figsize=(5,5))
    plt.scatter(y_obs1, y_prd1, alpha=0.7, zorder=2, s=50, label='Train')
    plt.scatter(y_obs2, y_prd2, alpha=0.7, zorder=3, s=50, color='darkorange', label='Test') #darkorange
    plt.title('Model '+str(i)+' : '+text_layers, size=15)
    plt.legend(loc='lower right')
    plt.xlabel('Observation', size=10)
    plt.ylabel('Prediction', size=10)
    plt.axis('equal')
    plt.axis('square')
    plt.grid(color='gray', linestyle='dotted', linewidth=1, alpha=0.5)
    fig.text(0.15, 0.83, 'Corr : '+str(round(result_corr, 4)), size=15)
    fig.text(0.15, 0.78, 'MSE : '+str(round(result_mse, 4)), size=15)
    fig.text(0.15, 0.73, 'MAE : '+str(round(result_mae, 4)), size=15)
    # Use the same range on both axes so the diagonal is the line of perfect prediction
    plt.xlim([min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])])
    plt.ylim([min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])])
    _ = plt.plot([-300, 300], [-300, 300], color='gray', linewidth=0.5, zorder=1)
    fig.savefig(plot_dir+'100_Model_'+str(i)+'.png')
    plt.close(fig)

    ### Dataframe
    # Marks the samples used for training this model with 1
    tmp_df = pd.DataFrame(np.ones([1,len(x_train_scal)]), columns=x_train_scal.index.values.tolist(), index=['Model '+str(i)])
    # NOTE: np.array on mixed str/number input yields a string array, so the
    # summary values are stored as text in this row.
    tmp_df2 = pd.DataFrame(np.array([i, text_layers, lmbd, dr_rate, learning_rate, result_mse, result_corr, result_mae]).reshape(1, -1), columns=summary_columns, index=['Model '+str(i)])
    df_result = pd.concat([df_result, pd.concat([tmp_df2, tmp_df], axis=1)], axis=0)
    # Rewritten every iteration so partial results survive an interrupted run
    df_result.to_csv(csv_path)

    ### Pickle
    result_list = {'x_train': x_train,
                  'x_test' : x_test,
                  'y_train' : y_train,
                  'y_test' : y_test,
                  'x_mean' : x_mean,
                  'x_std' : x_std,
                  'text_layers' : text_layers,
                  'width_layers' : width_layers,
                  'y_logmean' : y_logmean,
                  'y_logstd': y_logstd}
    json_string = model.to_json()
    # Context managers guarantee both files are flushed and closed
    # (the previous "f.close" without parentheses never closed the pickle file).
    with open(model_dir+'100_Model_'+str(i)+'.json', 'w') as f_json:
        f_json.write(json_string)
    model.save_weights(model_dir+'100_Model_'+str(i)+'.hdf5')
    with open(pkl_dir+'100_Model_'+str(i)+'.pkl','wb') as f:
        pickle.dump(result_list,f)

    # Release the Keras graph/session state before building the next model
    K.clear_session()
    del model
    clear_output(True)

clear_output(True)
print('*** Succeeded ***')

*** Succeeded ***
