In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def NSE(y_pred, y_true):
    y_mean = np.mean(y_true)
    a = np.sum(np.square(y_true[:16] - y_pred[:16]))
    b = np.sum(np.square(y_true[:16] - y_mean))
    c = np.sum(np.square(y_true[16:] - y_pred[16:]))
    d = np.sum(np.square(y_true[16:] - y_mean))
    return 100 * (1 - 0.65 * a / b - 0.35 * c / d)

In [3]:
water = pd.read_excel('../input/入库流量数据.xlsx')

y_test1 = water.loc[(water['TimeStample'] >= '2017-02-01 02:00:00') & (water['TimeStample'] <= '2017-02-07 23:00:00')]
y_test2 = water.loc[(water['TimeStample'] >= '2017-08-01 02:00:00') & (water['TimeStample'] <= '2017-08-07 23:00:00')]
y_test3 = water.loc[(water['TimeStample'] >= '2017-11-01 02:00:00') & (water['TimeStample'] <= '2017-11-07 23:00:00')]
print(y_test1.shape, y_test2.shape, y_test3.shape)

y_test1 = y_test1['Qi'].values
y_test2 = y_test2['Qi'].values
y_test3 = y_test3['Qi'].values

(56, 2) (56, 2) (56, 2)


In [4]:
train_test1 = pd.read_csv('../input/train_test1_2017.csv')
train_test2 = pd.read_csv('../input/train_test2_2017.csv')
train_test3 = pd.read_csv('../input/train_test3_2017.csv')

In [5]:
train_test1.head()

Unnamed: 0,TimeStample,Qi
0,2017-01-01 02:00:00,0.083045
1,2017-01-01 05:00:00,0.087865
2,2017-01-01 08:00:00,0.074158
3,2017-01-01 11:00:00,0.062911
4,2017-01-01 14:00:00,0.040116


In [6]:
train_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   TimeStample  304 non-null    object 
 1   Qi           248 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.9+ KB


In [7]:
data = [train_test1, train_test2, train_test3]
for i in data:
    i.sort_values(by='TimeStample', inplace=True)

In [8]:
stat_feat = []

In [9]:
for i in data:
    # 5天
    for j in range(1, 28):
        i['Qi_shift{}'.format(j)] = i['Qi'].shift(j)
        if j <= 16:
            stat_feat.append('Qi_shift{}'.format(j))

In [10]:
train_test1.loc[:, 'Qi_shift1': 'Qi_shift4'].head()

Unnamed: 0,Qi_shift1,Qi_shift2,Qi_shift3,Qi_shift4
0,,,,
1,0.083045,,,
2,0.087865,0.083045,,
3,0.074158,0.087865,0.083045,
4,0.062911,0.074158,0.087865,0.083045


In [11]:
for i in data:
    # 9小时
    i['1_3_sum'] = i.loc[:, 'Qi_shift1': 'Qi_shift3'].sum(1)
    i['1_3_mea'] = i.loc[:, 'Qi_shift1': 'Qi_shift3'].mean(1)
    i['1_3_max'] = i.loc[:, 'Qi_shift1': 'Qi_shift3'].max(1)
    i['1_3_min'] = i.loc[:, 'Qi_shift1': 'Qi_shift3'].min(1)
    # 12小时
    i['1_4_sum'] = i.loc[:, 'Qi_shift1': 'Qi_shift4'].sum(1)
    i['1_4_mea'] = i.loc[:, 'Qi_shift1': 'Qi_shift4'].mean(1)
    i['1_4_max'] = i.loc[:, 'Qi_shift1': 'Qi_shift4'].max(1)
    i['1_4_min'] = i.loc[:, 'Qi_shift1': 'Qi_shift4'].min(1)
    # 24个小时
    i['1_8_sum'] = i.loc[:, 'Qi_shift1': 'Qi_shift8'].sum(1)
    i['1_8_mea'] = i.loc[:, 'Qi_shift1': 'Qi_shift8'].mean(1)
    i['1_8_max'] = i.loc[:, 'Qi_shift1': 'Qi_shift8'].max(1)
    i['1_8_min'] = i.loc[:, 'Qi_shift1': 'Qi_shift8'].min(1)
    # 36个小时
    i['1_12_sum'] = i.loc[:, 'Qi_shift1': 'Qi_shift12'].sum(1)
    i['1_12_mea'] = i.loc[:, 'Qi_shift1': 'Qi_shift12'].mean(1)
    i['1_12_max'] = i.loc[:, 'Qi_shift1': 'Qi_shift12'].max(1)
    i['1_12_min'] = i.loc[:, 'Qi_shift1': 'Qi_shift12'].min(1)
    # 48个小时
    i['1_16_sum'] = i.loc[:, 'Qi_shift1': 'Qi_shift16'].sum(1)
    i['1_16_mea'] = i.loc[:, 'Qi_shift1': 'Qi_shift16'].mean(1)
    i['1_16_max'] = i.loc[:, 'Qi_shift1': 'Qi_shift16'].max(1)
    i['1_16_min'] = i.loc[:, 'Qi_shift1': 'Qi_shift16'].min(1)
    
    Qi_stat_feat = ['1_3_sum', '1_3_mea', '1_3_max', '1_3_min',
                    '1_4_sum', '1_4_mea', '1_4_max', '1_4_min',
                    '1_8_sum', '1_8_mea', '1_8_max', '1_8_min',
                    '1_12_sum', '1_12_mea', '1_12_max', '1_12_min',
                    '1_16_sum', '1_16_mea', '1_16_max', '1_16_min',]
    stat_feat += Qi_stat_feat
    
    
    # 趋势 diff
    i['1_2_diff'] = i['Qi_shift1'] - i['Qi_shift2']
    i['1_3_diff'] = i['Qi_shift1'] - i['Qi_shift3']
    i['1_4_diff'] = i['Qi_shift1'] - i['Qi_shift4']
    i['1_8_diff'] = i['Qi_shift1'] - i['Qi_shift8']
    i['1_16_diff'] = i['Qi_shift1'] - i['Qi_shift16']
    
    i['2_3_diff'] = i['Qi_shift2'] - i['Qi_shift3']
    i['2_4_diff'] = i['Qi_shift2'] - i['Qi_shift4']
    i['2_5_diff'] = i['Qi_shift2'] - i['Qi_shift5']
    i['2_9_diff'] = i['Qi_shift2'] - i['Qi_shift9']
    i['2_17_diff'] = i['Qi_shift2'] - i['Qi_shift17']
    
    i['3_4_diff'] = i['Qi_shift3'] - i['Qi_shift4']
    i['3_5_diff'] = i['Qi_shift3'] - i['Qi_shift5']
    i['3_6_diff'] = i['Qi_shift3'] - i['Qi_shift6']
    i['3_10_diff'] = i['Qi_shift3'] - i['Qi_shift10']
    i['3_18_diff'] = i['Qi_shift3'] - i['Qi_shift18']
    
    trend_stat_feat = ['1_2_diff', '1_3_diff', '1_4_diff', '1_8_diff', '1_16_diff',
                       '2_3_diff', '2_4_diff', '2_5_diff', '2_9_diff', '2_17_diff',
                       '3_4_diff', '3_5_diff', '3_6_diff', '3_10_diff', '3_18_diff']
    stat_feat += trend_stat_feat
    
    
    # 环比
    i['1_2_chain_rate'] = i['Qi_shift1'] / i['Qi_shift2']
    i['2_3_chain_rate'] = i['Qi_shift2'] / i['Qi_shift3']
    i['3_4_chain_rate'] = i['Qi_shift3'] / i['Qi_shift4']
    i['4_5_chain_rate'] = i['Qi_shift4'] / i['Qi_shift5']
    i['5_6_chain_rate'] = i['Qi_shift5'] / i['Qi_shift6']
    chain_rate_stat_feat = ['1_2_chain_rate', '2_3_chain_rate', '3_4_chain_rate', '4_5_chain_rate', '5_6_chain_rate']
    stat_feat += chain_rate_stat_feat
    
    # 环比比
    i['1_2_chain_rate_2_3_chain_rate'] = i['1_2_chain_rate'] / i['2_3_chain_rate']
    i['2_3_chain_rate_3_4_chain_rate'] = i['2_3_chain_rate'] / i['3_4_chain_rate']
    i['3_4_chain_rate_4_5_chain_rate'] = i['3_4_chain_rate'] / i['4_5_chain_rate']
    i['4_5_chain_rate_5_6_chain_rate'] = i['4_5_chain_rate'] / i['5_6_chain_rate']
    chain_rate_rate_stat_feat = ['1_2_chain_rate_2_3_chain_rate', '2_3_chain_rate_3_4_chain_rate',
                                 '3_4_chain_rate_4_5_chain_rate', '4_5_chain_rate_5_6_chain_rate']
    stat_feat += chain_rate_rate_stat_feat
    
    # 同比
    i['1_3_yoy'] = (i['Qi_shift1'] - i['Qi_shift3']) / i['Qi_shift3']
    i['1_4_yoy'] = (i['Qi_shift1'] - i['Qi_shift4']) / i['Qi_shift4']
    i['1_5_yoy'] = (i['Qi_shift1'] - i['Qi_shift5']) / i['Qi_shift5']
    i['1_8_yoy'] = (i['Qi_shift1'] - i['Qi_shift8']) / i['Qi_shift8']
    i['1_16_yoy'] = (i['Qi_shift1'] - i['Qi_shift16']) / i['Qi_shift16']
    i['1_24_yoy'] = (i['Qi_shift1'] - i['Qi_shift24']) / i['Qi_shift24']
    
    i['2_4_yoy'] = (i['Qi_shift2'] - i['Qi_shift4']) / i['Qi_shift4']
    i['2_5_yoy'] = (i['Qi_shift2'] - i['Qi_shift5']) / i['Qi_shift5']
    i['2_6_yoy'] = (i['Qi_shift2'] - i['Qi_shift6']) / i['Qi_shift6']
    i['2_9_yoy'] = (i['Qi_shift2'] - i['Qi_shift9']) / i['Qi_shift9']
    i['2_17_yoy'] = (i['Qi_shift2'] - i['Qi_shift17']) / i['Qi_shift17']
    i['2_25_yoy'] = (i['Qi_shift2'] - i['Qi_shift25']) / i['Qi_shift25']
    yoy_stat_feat = ['1_3_yoy', '1_4_yoy', '1_5_yoy', '1_8_yoy', '1_16_yoy', '1_24_yoy',
                     '2_4_yoy', '2_5_yoy', '2_6_yoy', '2_9_yoy', '2_17_yoy', '2_25_yoy']
    stat_feat += yoy_stat_feat

In [12]:
train1 = train_test1.loc[train_test1['Qi'].notnull(), :]
test1 = train_test1.loc[train_test1['Qi'].isnull(), :]
# sub1 = test1[['TimeStamp']]

train2 = train_test2.loc[train_test2['Qi'].notnull(), :]
test2 = train_test2.loc[train_test2['Qi'].isnull(), :]
# sub2 = test2[['TimeStamp']]

train3 = train_test3.loc[train_test3['Qi'].notnull(), :]
test3 = train_test3.loc[train_test3['Qi'].isnull(), :]
# sub3 = test3[['TimeStamp']]


X_train1 = train1[stat_feat]
y_train1 = train1['Qi']
X_test1 = test1[stat_feat]

X_train2 = train2[stat_feat]
y_train2 = train2['Qi']
X_test2 = test2[stat_feat]

X_train3 = train3[stat_feat]
y_train3 = train3['Qi']
X_test3 = test3[stat_feat]

lgb_train1 = lgb.Dataset(X_train1, y_train1)
lgb_train2 = lgb.Dataset(X_train2, y_train2)
lgb_train3 = lgb.Dataset(X_train3, y_train3)

In [13]:
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'mae',
          'learning_rate': 0.05,
          'seed': 2020}

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'num_leaves': 2 ** 5 - 1,
    'reg_alpha': 0.25,
    'reg_lambda': 0.25,
    'metric': 'mse',
    'n_estimators': 300,
    'learning_rate': 0.05,
    'min_child_samples': 5,
    'colsample_bytree': 0.7,
    'random_state': 2020
}

model1 = lgb.train(params, lgb_train1)
model2 = lgb.train(params, lgb_train2)
model3 = lgb.train(params, lgb_train3)

In [14]:
y_train1_pred = model1.predict(X_train1)
y_test1_pred = model1.predict(X_test1)

y_train2_pred = model2.predict(X_train2)
y_test2_pred = model2.predict(X_test2)

y_train3_pred = model3.predict(X_train3)
y_test3_pred = model3.predict(X_test3)

In [15]:
print('train 1: ', NSE(y_train1, y_train1_pred))
print('train 2: ', NSE(y_train2, y_train2_pred))
print('train 3: ', NSE(y_train3, y_train3_pred))

train 1:  15.884665457435847
train 2:  98.53221855865976
train 3:  98.39642080191159


In [16]:
print('test 1: ', NSE(y_test1, y_test1_pred))
print('test 2: ', NSE(y_test2, y_test2_pred))
print('test 3: ', NSE(y_test3, y_test3_pred))

test 1:  -13755731.53937047
test 2:  -1028.9309751097496
test 3:  -2323.5534003156745


In [17]:
cols = ['Prediction{}'.format(i) for i in range(1, 57)]

sub1 = pd.DataFrame(y_test1_pred).T
sub1.columns = cols
sub1['SeqNum'] = 1
sub1 = sub1[['SeqNum'] + cols]

sub2 = pd.DataFrame(y_test2_pred).T
sub2.columns = cols
sub2['SeqNum'] = 2
sub2 = sub1[['SeqNum'] + cols]

sub3 = pd.DataFrame(y_test3_pred).T
sub3.columns = cols
sub3['SeqNum'] = 3
sub3 = sub1[['SeqNum'] + cols]

sub = pd.concat([sub1, sub2, sub3])
sub.to_csv('../sub/sub.csv', index=False)