In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error, median_absolute_error
from math import sqrt
# Render every float shown by pandas with five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

In [2]:
# sns.color_palette('coolwarm',20)
# sns.color_palette('RdBu_r',20)
# sns.color_palette('turbo',20)
# sns.color_palette('seismic',30)

In [4]:
# Global control variables for the whole notebook.
freq = 'D'            # temporal resolution key: 'H', '3H' or 'D'
# Lower-case form of `freq`, used as the suffix in column names ('NO2+d', ...).
# Derived from `freq` so the two can never drift apart.
freq_ = freq.lower()
region = 'NA'

# Per-resolution settings: freq -> (data_num, list_col).
# list_col[0] is the target column the predictions are scored against.
# NOTE(review): data_num presumably is the number of time steps per station
# (730 days, 5840 three-hour steps, 17520 hours) - confirm; it is not used in
# the visible cells.
_FREQ_SETTINGS = {
    'H': (17520, ['NO2+h', 'NO2-h', 'NO2-2h', 'NO2-3h']),
    '3H': (5840, ['NO2+3h', 'NO2-3h', 'NO2-6h', 'NO2-9h']),
    'D': (730, ['NO2+d', 'NO2-d', 'NO2-2d', 'NO2-3d']),
}

# Fail here, with a clear message, instead of with a NameError in a later cell.
if freq not in _FREQ_SETTINGS:
    raise ValueError(
        'Unsupported freq %r; expected one of %s' % (freq, sorted(_FREQ_SETTINGS))
    )

data_num, list_col = _FREQ_SETTINGS[freq]
file_name = 'pol_mete_' + region + '_' + freq + '.csv'
file_name_pre = 'pol_mete_' + region + '_' + freq + '_predict.csv'

file_path = 'data/'
file_path_model = ''

## Load predictions

In [7]:
# Read the table of base-model predictions from the data directory.
predictions_csv = file_path + 'demo_predictions.csv'
data = pd.read_csv(predictions_csv)

In [7]:
# Disabled alternatives kept for reference:
#   column selection
# data = data[['index','station','lon','lat',list_col[0],list_col[0]+'_xgb',list_col[0]+'_gru',list_col[0]+'_transformer']]
#   drop rows with missing neural-network predictions
# data = data.dropna(axis=0,how='any')

# Parse the 'index' column into datetimes so that `.dt` accessors work below.
data = data.assign(index=pd.to_datetime(data['index']))

In [8]:
# Split by calendar year: everything before 2020 is training data,
# the year 2020 itself is the test set.
obs_year = data['index'].dt.year
train = data[obs_year < 2020]
test = data[obs_year == 2020]

## Ensemble model

In [9]:
# Weighted-average ensemble of the transformer, XGBoost and GRU predictions.
def ensemble_model(train, data):
    """Blend the three base models with per-station, R2-proportional weights.

    For every station in ``train`` the R2 of each base model's prediction
    column against the observed target ``list_col[0]`` is computed. A model's
    weight is its R2 divided by the sum of the three R2 values. The weights
    are joined onto ``data`` by ``station`` and the weighted sum of the three
    prediction columns, taken in absolute value, is stored as the ensemble
    prediction.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame the weights are fitted on. Must contain ``'station'``, the
        target column and the ``'NO2+<freq_>_transformer' / '_xgb' / '_gru'``
        prediction columns.
    data : pandas.DataFrame
        Frame to compute the ensemble for. Must contain ``'index'``,
        ``'station'``, the target column and the same prediction columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``index, station, <target>, <target>_transformer,
        <target>_xgb, <target>_gru, NO2+<freq_>_ensemble``. Rows whose station
        does not occur in ``train`` are dropped (inner join).

    Notes
    -----
    The weights are not clipped: a negative R2 gives a negative weight and a
    zero R2 sum makes the division fail, exactly as before.
    """
    target = list_col[0]
    prefix = 'NO2+' + freq_
    col_transformer = prefix + '_transformer'
    col_xgb = prefix + '_xgb'
    col_gru = prefix + '_gru'
    col_ensemble = prefix + '_ensemble'

    # One record per station; building the frame from dicts keeps the native
    # dtypes (no string round-trip through a mixed-type numpy array).
    records = []
    for sta, group in train.groupby('station'):
        r2_transformer = r2_score(group[target], group[col_transformer])
        r2_xgb = r2_score(group[target], group[col_xgb])
        r2_gru = r2_score(group[target], group[col_gru])
        r2_total = r2_transformer + r2_xgb + r2_gru
        records.append({
            'station': sta,
            'r2_transformer': r2_transformer,
            'r2_xgb': r2_xgb,
            'r2_gru': r2_gru,
            'coe_transformer': r2_transformer / r2_total,
            'coe_xgb': r2_xgb / r2_total,
            'coe_gru': r2_gru / r2_total,
        })

    r2_list = pd.DataFrame(
        records,
        columns=['station', 'r2_transformer', 'r2_xgb', 'r2_gru',
                 'coe_transformer', 'coe_xgb', 'coe_gru'],
    )

    # Attach each station's weights to its rows; only the weights are needed.
    weights = r2_list[['station', 'coe_transformer', 'coe_xgb', 'coe_gru']]
    data = pd.merge(data, weights, on='station', how='inner')

    blended = (
        data[col_transformer] * data['coe_transformer']
        + data[col_xgb] * data['coe_xgb']
        + data[col_gru] * data['coe_gru']
    )
    # The ensemble prediction is stored as a non-negative value.
    data[col_ensemble] = blended.abs()

    return data[['index', 'station', target,
                 col_transformer, col_xgb, col_gru, col_ensemble]]

In [10]:
# Fit the per-station weights on the training rows and add the ensemble
# prediction to the full table (replaces `data` with the reduced column set).
data = ensemble_model(train=train, data=data)

## Evaluation

In [11]:
# Score every prediction column of the given frame against the target
# column list_col[0]: R2, RMSE, MAE and median absolute error.
def model_evaluation(df):
    """Compute and print performance metrics for each prediction column.

    The 'index' and 'station' columns are dropped; every remaining column
    except the target ``list_col[0]`` is compared with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'index', 'station', the target column and one or more
        prediction columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(R2, RMSE, MAE, MedAE)``. Each is a one-row frame with one column
        per non-id column of ``df``; the target's own column is left as NaN.
    """
    values = df.drop(columns=['index', 'station'])
    col_name = values.columns

    # One empty single-row table per metric, sharing the same columns.
    R2, RMSE, MAE, MedAE = [
        pd.DataFrame(index=range(1), columns=col_name) for _ in range(4)
    ]

    for pred_col in col_name:
        if pred_col == list_col[0]:
            # The target is the reference, not a prediction.
            continue

        observed = values[list_col[0]]
        predicted = values[pred_col]

        mae = mean_absolute_error(observed, predicted)
        rmse = sqrt(mean_squared_error(observed, predicted))
        r2 = r2_score(observed, predicted)
        medae = median_absolute_error(observed, predicted)

        MAE[pred_col] = mae
        RMSE[pred_col] = rmse
        R2[pred_col] = r2
        MedAE[pred_col] = medae
        print(pred_col, 'R2:', r2, 'RMSE:', rmse, 'mae:', mae, 'MedAE', medae)

    return R2, RMSE, MAE, MedAE

In [12]:
# Redo the year-based split on the table that now holds the ensemble column.
ensemble_year = data['index'].dt.year
train = data[ensemble_year < 2020]
test = data[ensemble_year == 2020]

In [13]:
# Model performance on the training set (rows before 2020).
res_train = model_evaluation(df=train)

NO2+d_transformer R2: 0.7707313150256979 RMSE: 9.240477809376552 mae: 6.550211353638694 MedAE 4.659218000000003
NO2+d_xgb R2: 0.8140565522076636 RMSE: 8.321708486275881 mae: 5.975943813700956 MedAE 4.333333333333332
NO2+d_gru R2: 0.787673863627838 RMSE: 8.892498221401702 mae: 6.326745925800356 MedAE 4.5387270000000015
NO2+d_ensemble R2: 0.8025232564687165 RMSE: 8.575906489711842 mae: 6.138748237165968 MedAE 4.414118367145484


In [14]:
# Model performance on the test set (rows from 2020).
res_test = model_evaluation(df=test)

NO2+d_transformer R2: 0.7632748337999657 RMSE: 7.509512010397549 mae: 5.386461153965289 MedAE 3.8487693333333333
NO2+d_xgb R2: 0.750604086054625 RMSE: 7.7078662155785365 mae: 5.437810050804811 MedAE 3.833333333333332
NO2+d_gru R2: 0.7649825600298719 RMSE: 7.482376317674469 mae: 5.344065513876895 MedAE 3.7989630434782597
NO2+d_ensemble R2: 0.7714833988355226 RMSE: 7.378165269937355 mae: 5.270927120073616 MedAE 3.7445919238427967


In [15]:
# Write the table with the ensemble predictions to the data directory.
predictions_out = file_path + file_name_pre
data.to_csv(predictions_out, index=False)

## Cross validation

In [15]:
# Load the cross-validation prediction archives, one .npz file per base model.
# NOTE(review): the region part of the file name is the literal 'NA', not the
# `region` variable - confirm whether it should follow the config cell.
# NOTE(review): allow_pickle=True lets the archive run arbitrary code when its
# arrays are read; only load files from a trusted source.
model_sty = ['xgb', 'gru', 'transformer']
df_cv_xgb, df_cv_gru, df_cv_trans = [
    np.load(
        file_path + 'cross_validation/NA_' + freq + '_predictions_' + sty + '.npz',
        allow_pickle=True,
    )
    for sty in model_sty
]

In [16]:
# Load the normalization parameters and take the 'std' and 'mean' values of
# the row labelled 0.
# NOTE(review): the file name hard-codes 'D' rather than using `freq` - confirm.
norm_params = pd.read_csv(file_path + 'normalization_params/normalization_params_D_no2-idx-0')
norm_std = norm_params.loc[0, 'std']
norm_mean = norm_params.loc[0, 'mean']

In [17]:
# Build the ensemble prediction for every cross-validation fold.
df_cv_list = []
cv_archives = [(df_cv_xgb, 'xgb'), (df_cv_gru, 'gru'), (df_cv_trans, 'transformer')]

# NOTE(review): the keys 'cv01' and 'cv02' do not follow the 'cv0'/'cv3'/'cv4'
# pattern - confirm they match the keys actually stored in the .npz archives.
for fold_key in ['cv0', 'cv01', 'cv02', 'cv3', 'cv4']:
    df_cv = None
    for archive, model_name in cv_archives:
        # The first column of the stored array is skipped; the remaining four
        # are named: prediction, target, 'index', 'station'.
        fold_columns = [list_col[0] + '_' + model_name, list_col[0], 'index', 'station']
        fold = pd.DataFrame(archive[fold_key][:, 1:], columns=fold_columns)

        # Undo the normalization of the prediction and target columns.
        # NOTE(review): the scale is sqrt(norm_std); that is only correct if
        # the 'std' column actually holds a variance - confirm.
        fold.iloc[:, :2] = fold.iloc[:, :2] * sqrt(norm_std) + norm_mean

        if df_cv is None:
            # First model (xgb) provides the base table including the target.
            df_cv = fold
        else:
            # Later models only contribute their prediction column.
            df_cv = pd.merge(
                df_cv,
                fold.drop(columns=list_col[0]),
                on=['index', 'station'],
                how='left',
            )

    # Weights come from the global `train` frame, not from the fold itself.
    df_cv = ensemble_model(train, df_cv)
    df_cv_list.append(df_cv)

In [18]:
# Evaluate every cross-validation fold. Iterating over df_cv_list itself
# (instead of a hard-coded range(5)) keeps this cell in sync with the number
# of folds that were actually built.
metrics = []
for cv, fold_frame in enumerate(df_cv_list):
    print('CV : ', cv)
    # Each entry is the (R2, RMSE, MAE, MedAE) tuple returned for that fold.
    metrics.append(model_evaluation(fold_frame))

CV :  0
NO2+d_transformer R2: 0.8257986534892452 RMSE: 8.048376821882368 mae: 5.7909940540623115 MedAE 4.22818510220206
NO2+d_xgb R2: 0.8171237870165248 RMSE: 8.246338495618588 mae: 5.8076788429201525 MedAE 4.233070688723528
NO2+d_gru R2: 0.845007999370183 RMSE: 7.591667698571052 mae: 5.462334173260056 MedAE 4.03070389841389
NO2+d_ensemble R2: 0.8446867461497213 RMSE: 7.599531282626735 mae: 5.45857150433591 MedAE 4.013352755439952
CV :  1
NO2+d_transformer R2: 0.8364323467957723 RMSE: 7.790036587603887 mae: 5.654201315112127 MedAE 4.160396467969832
NO2+d_xgb R2: 0.8190995344852687 RMSE: 8.192390180054508 mae: 5.834110095998496 MedAE 4.244165919582553
NO2+d_gru R2: 0.8448224264715626 RMSE: 7.587614615666091 mae: 5.473712033091583 MedAE 4.041025573099944
NO2+d_ensemble R2: 0.847740178009078 RMSE: 7.515942431667686 mae: 5.432268963784472 MedAE 4.001536698932924
CV :  2
NO2+d_transformer R2: 0.8361295620552626 RMSE: 7.8365683731066405 mae: 5.656889852932087 MedAE 4.1712851508463
NO2+d_xgb 

In [19]:
# Stack the per-fold metric tables into one frame per metric (one row per fold).
# Every entry of `metrics` is a tuple ordered (R2, RMSE, MAE, MedAE), so
# transposing with zip groups the folds by metric. This works for any number
# of folds and avoids positional indices that are easy to mix up.
R2, RMSE, MAE, MedAE = (
    pd.concat(list(per_fold)).reset_index(drop=True)
    for per_fold in zip(*metrics)
)

In [20]:
# Print the mean of every metric over the cross-validation folds.
for label, per_fold_table in (('R2:', R2), ('RMSE:', RMSE), ('MAE:', MAE), ('MedAE:', MedAE)):
    print(label, per_fold_table.mean())

R2: NO2+d                   NaN
NO2+d_transformer   0.83444
NO2+d_xgb           0.81875
NO2+d_gru           0.84466
NO2+d_ensemble      0.84700
dtype: float64
RMSE: NO2+d                   NaN
NO2+d_transformer   7.85165
NO2+d_xgb           8.21588
NO2+d_gru           7.60599
NO2+d_ensemble      7.54858
dtype: float64
MAE: NO2+d                   NaN
NO2+d_transformer   5.68646
NO2+d_xgb           5.81514
NO2+d_gru           5.46759
NO2+d_ensemble      5.43150
dtype: float64
MedAE: NO2+d                   NaN
NO2+d_transformer   4.18187
NO2+d_xgb           4.23287
NO2+d_gru           4.03281
NO2+d_ensemble      4.00319
dtype: float64
