In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.model_selection import cross_val_score,cross_val_predict,cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import explained_variance_score

import keras
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.models import save_model, load_model
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

In [None]:
# Matplotlib defaults shared by every figure in this notebook.
params = {
    'font.family': 'serif',            # serif font family
    'font.serif': 'Times New Roman',
    'figure.dpi': 100,                 # on-screen resolution
    'savefig.dpi': 600,                # resolution of files written by savefig
}
# Legend, axis labels, titles and tick labels all share one font size.
params.update(dict.fromkeys(
    ('legend.fontsize',
     'axes.labelsize',
     'axes.titlesize',
     'xtick.labelsize',
     'ytick.labelsize'),
    20,
))
plt.rcParams.update(params)

In [None]:
# Load the table: the first spreadsheet column becomes the row index, the last
# column is the regression target and every column before it is a feature.
data = pd.read_excel("FE_trainData_AB_AB.xlsx", index_col=0, header=0)

x = data.iloc[:, :-1]
y = np.array(data.iloc[:, -1])
x_train_dim = x.shape[1]

# Hold out 10% of the rows as an independent test set. The split is seeded so
# that the hold-out set is the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=7)

# Fit the scaler on the training rows only. Fitting it on the full table would
# leak the test rows' min/max into the training features. As a consequence,
# scaled test values may fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# `x` stays available as the scaled array of all rows (training-set scaling).
x = scaler.transform(x)

In [None]:
def rmse(y_true, y_pred):
    """Custom RMSE loss/metric: square root of the mean squared difference.

    Built from Keras backend ops so it can be passed to ``model.compile``.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

def r_square(y_true, y_pred):
    """Custom R^2 metric: 1 - RSS / TSS, built from Keras backend ops.

    RSS is the sum of squared residuals and TSS the sum of squared deviations
    of ``y_true`` from its mean, both taken over the tensors passed in.
    ``K.epsilon()`` is added to TSS so the ratio stays finite when every target
    in the batch has the same value (TSS == 0); without it the metric becomes
    NaN/inf for such a batch.
    """
    RSS = K.sum(K.square(y_true - y_pred))
    TSS = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - RSS / (TSS + K.epsilon())

In [None]:
# Fully connected regression network: 512 -> 256 -> 128 -> 64 -> 32 -> 1.
# Only the first layer carries an L1 kernel penalty; the output is linear.
hidden_stack = (
    (256, "Hidden_Layer_9"),
    (128, "Hidden_Layer_12"),
    (64, "Hidden_Layer_14"),
    (32, "Hidden_Layer_15"),
)

feature_layers = [
    Dense(512,
          input_shape=(x_train_dim,),
          kernel_regularizer=regularizers.l1(0.001),
          activation='relu',
          name="Input_Layer"),
    *[Dense(width, activation='relu', name=layer_name)
      for width, layer_name in hidden_stack],
    Dense(1, activation='linear', name="Output_Layer"),
]

# Assemble and compile: MAE is the training loss; R^2, RMSE, MAE and MSE are
# tracked as metrics.
opt = keras.optimizers.Adam(learning_rate=0.001)
model = Sequential(feature_layers)
model.compile(optimizer=opt, loss='mae', metrics=[r_square, rmse, 'mae', 'mse'])

In [None]:
## 10-fold cross-validation harness
# Seed NumPy's global RNG and the fold shuffling with the same value.
seed = 7
np.random.seed(seed)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Cross-validation scores: one (initially empty) row per fold.
cross_validation = pd.DataFrame(
    index=list(range(kfold.n_splits)),
    columns=['LOSS', 'R2', 'RMSE', 'MAE', 'MSE'],
)

# Scores of each fold's model on the training data and on the independent test
# set: one (initially empty) row per fold.
holdout_test = pd.DataFrame(
    index=list(range(kfold.n_splits)),
    columns=['{}_{}'.format(split, metric)
             for split in ('train', 'test')
             for metric in ('R2', 'RMSE', 'MAE')],
)

# Train, score and plot one freshly initialised network per fold.
for fold_index, (cv_train_index, cv_test_index) in enumerate(kfold.split(x_train, y_train)):
    # Start every fold from a re-initialised network. Re-using one fitted model
    # would carry weights and optimizer state from earlier folds into later
    # ones, so a fold's validation samples would already have been trained on.
    # clone_model() rebuilds the layers with newly created weights; the compile
    # settings mirror the model-definition cell and must be kept in sync with it.
    model = keras.models.clone_model(model)
    model.compile(loss='mae',
                  optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  metrics=[r_square, rmse, 'mae', 'mse'])

    fold_x_train, fold_y_train = x_train[cv_train_index], y_train[cv_train_index]
    fold_x_val, fold_y_val = x_train[cv_test_index], y_train[cv_test_index]

    # Write the weights with the lowest validation loss to '<fold>_best_model.h5'.
    # (A ReduceLROnPlateau callback used to be constructed here but was never
    # passed to fit(); it has been removed as dead code.)
    checkpointer = ModelCheckpoint(filepath='{}_best_model.h5'.format(fold_index),
                                   monitor='val_loss',
                                   mode='auto',
                                   verbose=0,
                                   save_best_only=True)

    # Fit on the fold's training part, validating on its held-out part.
    history = model.fit(fold_x_train, fold_y_train,
                        validation_data=(fold_x_val, fold_y_val),
                        callbacks=[checkpointer],
                        epochs=300, batch_size=16, verbose=0)

    # NOTE(review): everything below uses the final-epoch weights, not the
    # best-val_loss checkpoint saved above -- confirm this is intended.
    # evaluate() returns [loss, r_square, rmse, mae, mse], matching the columns
    # of `cross_validation`.
    scores = model.evaluate(fold_x_val, fold_y_val, batch_size=32, verbose=0)
    cross_validation.loc[fold_index] = scores

    # Predict on the full development set (this fold's training AND validation
    # rows) and on the independent test set.
    y_train_pred = model.predict(x_train).reshape(-1)
    y_test_pred = model.predict(x_test).reshape(-1)

    # Save this fold's predictions next to the reference values.
    train_data = pd.DataFrame({"y_train": y_train, "y_train_pred": y_train_pred})
    test_data = pd.DataFrame({"y_test": y_test, "y_test_pred": y_test_pred})
    train_data.to_excel("{}_train_set.xlsx".format(fold_index))
    test_data.to_excel("{}_test_set.xlsx".format(fold_index))

    # Prediction scores.
    train_R2 = r2_score(y_train, y_train_pred)
    train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_MAE = mean_absolute_error(y_train, y_train_pred)

    test_R2 = r2_score(y_test, y_test_pred)
    test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_MAE = mean_absolute_error(y_test, y_test_pred)

    # Record this fold's scores on the development and independent test sets.
    holdout_test.loc[fold_index] = [train_R2, train_RMSE, train_MAE,
                                    test_R2, test_RMSE, test_MAE]

    fig = plt.figure(figsize=(15, 5))
    fig.patch.set_alpha(1)  # opaque figure background
    spec = fig.add_gridspec(nrows=1, ncols=2, width_ratios=[2, 3])

    # Left panel: predicted vs. reference scatter with the y = x diagonal.
    ax1 = fig.add_subplot(spec[0, 0])
    ax1.scatter(y_train, y_train_pred, c='y', s=50, alpha=0.5, label='train set')
    ax1.scatter(y_test, y_test_pred, c='r', s=50, alpha=0.5, label='test set')
    ax1.plot([-4, 0.2], [-4, 0.2], c='k', linewidth=1)
    ax1.axis([-4, 0.2, -4, 0.2])
    ax1.set_xlabel('DFT')
    ax1.set_ylabel('ANN')
    ax1.legend(loc='upper left', framealpha=0, markerscale=1.5)

    # Score table inside the scatter panel. The formatted strings are kept
    # separate from the numeric scores so the latter are not overwritten.
    row_labels = [r'$R^2$', 'RMSE', 'MAE']
    col_labels = ['Train', 'Test']
    table_vals = [
        ["{:.2f}".format(train_R2), "{:.2f}".format(test_R2)],
        ["{:.4f}".format(train_RMSE), "{:.4f}".format(test_RMSE)],
        ["{:.4f}".format(train_MAE), "{:.4f}".format(test_MAE)],
    ]
    table = ax1.table(cellText=table_vals,
                      rowLabels=row_labels,
                      colLabels=col_labels,
                      colWidths=[0.2] * len(col_labels),  # one width per data column
                      cellLoc='center',
                      rowLoc='center',
                      colLoc='center',
                      loc='lower right')
    table.set_fontsize(16)  # table font size
    table.scale(1, 1.8)     # stretch the rows vertically

    # Right panel: per-epoch training/validation metric curves.
    ax2 = fig.add_subplot(spec[0, 1])
    ax2.plot(history.history['rmse'], 'b-', label='train_rmse')
    ax2.plot(history.history['val_rmse'], 'r-', label='test_rmse')
    ax2.plot(history.history['mae'], 'g-', label='train_mae')
    ax2.plot(history.history['val_mae'], 'y-', label='test_mae')
    ax2.set_ylim([0, 1])
    ax2.legend(loc='center', framealpha=0, ncol=1)
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('RMSE(eV)')

    # R^2 curves on a secondary y-axis sharing the epoch axis.
    ax3 = ax2.twinx()
    ax3.plot(history.history['r_square'], 'b-', label=r'train_$R^2$')
    ax3.plot(history.history['val_r_square'], 'r-', label=r'test_$R^2$')
    ax3.set_ylim([0, 1])
    ax3.legend(loc='center right', framealpha=0, ncol=1)
    ax3.set_ylabel(r'$R^2$')
    fig.savefig('{}_metrics.png'.format(fold_index))

def _with_mean_std(fold_table):
    """Return ``fold_table`` as float64 with 'mean' and 'std' rows appended.

    Both statistics are computed column-wise from the per-fold rows only:
    * ``axis=0`` is explicit because ``np.mean(DataFrame)`` forwards
      ``axis=None``, which newer pandas reduces to one scalar over the whole
      table instead of one value per column;
    * the 'std' row is computed before the 'mean' row is appended, so the mean
      is not counted as an extra observation;
    * existing 'mean'/'std' rows are dropped first, so re-running the cell does
      not stack summaries on top of summaries.
    ``ddof=0`` keeps the population standard deviation that ``np.std`` used.
    The result is rounded to 4 decimals.
    """
    per_fold = fold_table.drop(index=['mean', 'std'], errors='ignore').astype('float64')
    summary = per_fold.copy()
    summary.loc['mean'] = per_fold.mean(axis=0)
    summary.loc['std'] = per_fold.std(axis=0, ddof=0)
    return summary.round(4)


cross_validation = _with_mean_std(cross_validation)
holdout_test = _with_mean_std(holdout_test)

In [None]:
# Write the cross-validation score table to 'cross_validation.xlsx', then
# display it as the cell output.
cross_validation.to_excel('cross_validation.xlsx')
cross_validation

In [None]:
# Write the train/test score table to 'holdout_test.xlsx', then display it as
# the cell output.
holdout_test.to_excel('holdout_test.xlsx')
holdout_test