In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def NSE(y_pred, y_true):
    y_mean = np.mean(y_true)
    a = np.sum(np.square(y_true[:16] - y_pred[:16]))
    b = np.sum(np.square(y_true[:16] - y_mean))
    c = np.sum(np.square(y_true[16:] - y_pred[16:]))
    d = np.sum(np.square(y_true[16:] - y_mean))
    return 100 * (1 - 0.65 * a / b - 0.35 * c / d)

In [3]:
station_rain = pd.read_excel('../遥测站降雨数据.xlsx')
station_rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46056 entries, 0 to 46055
Data columns (total 40 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   TimeStample  46056 non-null  datetime64[ns]
 1   R1           46056 non-null  float64       
 2   R2           46056 non-null  float64       
 3   R3           46056 non-null  float64       
 4   R4           46056 non-null  float64       
 5   R5           46056 non-null  float64       
 6   R6           46056 non-null  float64       
 7   R7           46056 non-null  float64       
 8   R8           46056 non-null  float64       
 9   R9           46056 non-null  float64       
 10  R10          46056 non-null  float64       
 11  R11          46056 non-null  float64       
 12  R12          46056 non-null  float64       
 13  R13          46056 non-null  float64       
 14  R14          46056 non-null  float64       
 15  R15          46056 non-null  float64       
 16  R16 

In [4]:
rain = pd.read_excel('../降雨预报数据.xlsx')
rain['dt'] = rain['TimeStample'].dt.strftime('%Y-%m-%d')
rain.drop('TimeStample', axis=1, inplace=True)
rain.head()

Unnamed: 0,D1,D2,D3,D4,D5,dt
0,0.0625,0.018182,0.014286,0.1,0.1,2013-03-11
1,0.125,0.0,0.014286,0.2,0.1,2013-03-12
2,0.025,0.090909,0.142857,0.1,0.0,2013-03-13
3,0.0375,0.181818,0.071429,0.0,0.1,2013-03-14
4,0.1,0.036364,0.0,0.16,0.1,2013-03-15


In [5]:
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1836 entries, 0 to 1835
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   D1      1836 non-null   float64
 1   D2      1836 non-null   float64
 2   D3      1836 non-null   float64
 3   D4      1836 non-null   float64
 4   D5      1836 non-null   float64
 5   dt      1836 non-null   object 
dtypes: float64(5), object(1)
memory usage: 86.2+ KB


In [6]:
environment = pd.read_excel('../环境表.xlsx')
environment.rename(columns={'TimeStample': 'dt'}, inplace=True)
environment.head()

Unnamed: 0,dt,T,w,wd
0,2013-01-01,0.168571,0.173913,999012
1,2013-01-02,0.157143,0.478261,999004
2,2013-01-03,0.128571,0.717391,999004
3,2013-01-04,0.037143,0.304348,999003
4,2013-01-05,0.071429,0.23913,999003


In [7]:
environment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1919 entries, 0 to 1918
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   dt      1919 non-null   object 
 1   T       1914 non-null   float64
 2   w       1914 non-null   float64
 3   wd      1919 non-null   int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 60.1+ KB


In [8]:
water = pd.read_excel('../入库流量数据.xlsx')
water['dt'] = water['TimeStample'].dt.strftime('%Y-%m-%d')
water.head()

Unnamed: 0,TimeStample,Qi,dt
0,2013-01-01 02:00:00,0.018201,2013-01-01
1,2013-01-01 05:00:00,0.018196,2013-01-01
2,2013-01-01 08:00:00,0.030095,2013-01-01
3,2013-01-01 11:00:00,0.123196,2013-01-01
4,2013-01-01 14:00:00,0.133178,2013-01-01


In [9]:
water.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15019 entries, 0 to 15018
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   TimeStample  15019 non-null  datetime64[ns]
 1   Qi           15019 non-null  float64       
 2   dt           15019 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 352.1+ KB


In [10]:
train1 = water.loc[(water['TimeStample'] >= '2017-01-01 02:00:00') & (water['TimeStample'] < '2017-02-01 00:00:00')]
train2 = water.loc[(water['TimeStample'] >= '2017-07-01 02:00:00') & (water['TimeStample'] < '2017-08-01 00:00:00')]
train3 = water.loc[(water['TimeStample'] >= '2017-10-01 02:00:00') & (water['TimeStample'] < '2017-11-01 00:00:00')]
print(train1.shape, train2.shape, train3.shape)


test1 = water.loc[(water['TimeStample'] >= '2017-02-01 02:00:00') & (water['TimeStample'] <= '2017-02-07 23:00:00')]
test2 = water.loc[(water['TimeStample'] >= '2017-08-01 02:00:00') & (water['TimeStample'] <= '2017-08-07 23:00:00')]
test3 = water.loc[(water['TimeStample'] >= '2017-11-01 02:00:00') & (water['TimeStample'] <= '2017-11-07 23:00:00')]
print(test1.shape, test2.shape, test3.shape)

(248, 3) (248, 3) (248, 3)
(56, 3) (56, 3) (56, 3)


In [11]:
df_list = [train1, train2, train3, test1, test2, test3]
data = pd.concat(df_list)
data.shape

(912, 3)

In [12]:
data = data.merge(rain, how='left', on='dt')
data = data.merge(environment, how='left', on='dt')
data = data.merge(station_rain, how='left', on='TimeStample')
data.head()

Unnamed: 0,TimeStample,Qi,dt,D1,D2,D3,D4,D5,T,w,...,R30,R31,R32,R33,R34,R35,R36,R37,R38,R39
0,2017-01-01 02:00:00,0.083045,2017-01-01,0.025,0.036364,0.114286,0.24,0.24,0.342857,0.123913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2017-01-01 05:00:00,0.087865,2017-01-01,0.025,0.036364,0.114286,0.24,0.24,0.342857,0.123913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-01-01 08:00:00,0.074158,2017-01-01,0.025,0.036364,0.114286,0.24,0.24,0.342857,0.123913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-01-01 11:00:00,0.062911,2017-01-01,0.025,0.036364,0.114286,0.24,0.24,0.342857,0.123913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-01-01 14:00:00,0.040116,2017-01-01,0.025,0.036364,0.114286,0.24,0.24,0.342857,0.123913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
train1 = data.loc[(data['TimeStample'] >= '2017-01-01 02:00:00') & (data['TimeStample'] < '2017-02-01 00:00:00')]
train2 = data.loc[(data['TimeStample'] >= '2017-07-01 02:00:00') & (data['TimeStample'] < '2017-08-01 00:00:00')]
train3 = data.loc[(data['TimeStample'] >= '2017-10-01 02:00:00') & (data['TimeStample'] < '2017-11-01 00:00:00')]
print(train1.shape, train2.shape, train3.shape)


test1 = data.loc[(data['TimeStample'] >= '2017-02-01 02:00:00') & (data['TimeStample'] <= '2017-02-07 23:00:00')]
test2 = data.loc[(data['TimeStample'] >= '2017-08-01 02:00:00') & (data['TimeStample'] <= '2017-08-07 23:00:00')]
test3 = data.loc[(data['TimeStample'] >= '2017-11-01 02:00:00') & (data['TimeStample'] <= '2017-11-07 23:00:00')]
print(test1.shape, test2.shape, test3.shape)

(248, 50) (248, 50) (248, 50)
(56, 50) (56, 50) (56, 50)


In [14]:
train_y1 = train1['Qi']
train1.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
train_x1 = train1

train_y2 = train2['Qi']
train2.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
train_x2 = train2

train_y3 = train3['Qi']
train3.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
train_x3 = train3

test_y1 = test1['Qi']
test1.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
test_x1 = test1

test_y2 = test2['Qi']
test2.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
test_x2 = test2

test_y3 = test3['Qi']
test3.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
test_x3 = test3

In [15]:
lgb_train1 = lgb.Dataset(train_x1, label=train_y1)
lgb_test1 = lgb.Dataset(test_x1, label=test_y1, reference=lgb_train1)

lgb_train2 = lgb.Dataset(train_x2, label=train_y2)
lgb_test2 = lgb.Dataset(test_x2, label=test_y2, reference=lgb_train2)

lgb_train3 = lgb.Dataset(train_x3, label=train_y3)
lgb_test3 = lgb.Dataset(test_x3, label=test_y3, reference=lgb_train3)

In [16]:
# regression
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': None,
          'learning_rate': 0.05,
          'seed': 2020,
          'first_metric_only': True}

In [17]:
# 自定义评估函数
def self_metric(preds, train_data):
    labels = train_data.get_label()
    y_mean = np.mean(labels)
    a = np.sum(np.square(labels[:16] - preds[:16]))
    b = np.sum(np.square(labels[:16] - y_mean))
    c = np.sum(np.square(labels[16:] - preds[16:]))
    d = np.sum(np.square(labels[16:] - y_mean))
    nse = 100* (1 - 0.65 * a / b - 0.35 * c / d)

    return 'self_metric', nse, True

In [18]:
lgb_model1 = lgb.train(params,
                       lgb_train1,
                       valid_sets=[lgb_test1, lgb_train1],
                       num_boost_round=1000,
                       early_stopping_rounds=200,
                       verbose_eval=50,
                       feval=self_metric)

pred1 = lgb_model1.predict(test_x1)

nse1 = NSE(pred1, test_y1)
nse1

Training until validation scores don't improve for 200 rounds
[50]	training's l2: 0.0010402	training's self_metric: 11.6925	valid_0's l2: 0.00148599	valid_0's self_metric: -13.113
[100]	training's l2: 0.0010275	training's self_metric: 12.4957	valid_0's l2: 0.00149079	valid_0's self_metric: -13.5384
[150]	training's l2: 0.00102584	training's self_metric: 12.6019	valid_0's l2: 0.00149649	valid_0's self_metric: -13.9529
[200]	training's l2: 0.00102552	training's self_metric: 12.6163	valid_0's l2: 0.00149706	valid_0's self_metric: -13.999
[250]	training's l2: 0.00102543	training's self_metric: 12.6201	valid_0's l2: 0.00149803	valid_0's self_metric: -14.0715
Early stopping, best iteration is:
[67]	training's l2: 0.00103256	training's self_metric: 12.1561	valid_0's l2: 0.00148321	valid_0's self_metric: -12.9704
Evaluated only: l2


-12.970397897605391

In [19]:
lgb_model2 = lgb.train(params,
                       lgb_train2,
                       valid_sets=[lgb_test2, lgb_train2],
                       num_boost_round=1000,
                       early_stopping_rounds=200,
                       verbose_eval=50,
                       feval=self_metric)

pred2 = lgb_model2.predict(test_x2)

nse2 = NSE(pred2, test_y2)
nse2

Training until validation scores don't improve for 200 rounds
[50]	training's l2: 0.0104859	training's self_metric: 69.9829	valid_0's l2: 0.014017	valid_0's self_metric: -722.448
[100]	training's l2: 0.00868914	training's self_metric: 77.5304	valid_0's l2: 0.0151432	valid_0's self_metric: -783.858
[150]	training's l2: 0.0079168	training's self_metric: 80.6511	valid_0's l2: 0.0149376	valid_0's self_metric: -785.491
[200]	training's l2: 0.00738055	training's self_metric: 82.5766	valid_0's l2: 0.014382	valid_0's self_metric: -773.428
Early stopping, best iteration is:
[18]	training's l2: 0.015525	training's self_metric: 49.1743	valid_0's l2: 0.0127749	valid_0's self_metric: -852.421
Evaluated only: l2


-852.4210662516131

In [20]:
lgb_model3 = lgb.train(params,
                       lgb_train3,
                       valid_sets=[lgb_test3, lgb_train3],
                       num_boost_round=1000,
                       early_stopping_rounds=200,
                       verbose_eval=50,
                       feval=self_metric)

pred3 = lgb_model3.predict(test_x3)

nse3 = NSE(pred3, test_y3)
nse3

Training until validation scores don't improve for 200 rounds
[50]	training's l2: 0.00550017	training's self_metric: 87.525	valid_0's l2: 0.0120427	valid_0's self_metric: -9519.14
[100]	training's l2: 0.00465307	training's self_metric: 89.2428	valid_0's l2: 0.00740787	valid_0's self_metric: -6431.96
[150]	training's l2: 0.00423097	training's self_metric: 90.0783	valid_0's l2: 0.00712693	valid_0's self_metric: -6315.57
[200]	training's l2: 0.00397619	training's self_metric: 90.5895	valid_0's l2: 0.00681963	valid_0's self_metric: -6240.12
[250]	training's l2: 0.00380478	training's self_metric: 91.0539	valid_0's l2: 0.0065177	valid_0's self_metric: -6065.27
[300]	training's l2: 0.00366681	training's self_metric: 91.4292	valid_0's l2: 0.00645735	valid_0's self_metric: -5937.39
[350]	training's l2: 0.00356409	training's self_metric: 91.7455	valid_0's l2: 0.00694006	valid_0's self_metric: -6346.01
[400]	training's l2: 0.00347665	training's self_metric: 91.9196	valid_0's l2: 0.00721676	valid_

-5911.613453313634

In [21]:
train1 = water.loc[(water['TimeStample'] >= '2018-01-01 02:00:00') & (water['TimeStample'] < '2018-02-01 00:00:00')]
train2 = water.loc[(water['TimeStample'] >= '2018-07-01 02:00:00') & (water['TimeStample'] < '2018-08-01 00:00:00')]
train3 = water.loc[(water['TimeStample'] >= '2018-10-01 02:00:00') & (water['TimeStample'] < '2018-11-01 00:00:00')]
print(train1.shape, train2.shape, train3.shape)


test1 = pd.DataFrame({'TimeStample': pd.date_range(start='2018-02-01 02:00:00', periods=56, freq='3h')})
test2 = pd.DataFrame({'TimeStample': pd.date_range(start='2018-08-01 02:00:00', periods=56, freq='3h')})
test3 = pd.DataFrame({'TimeStample': pd.date_range(start='2018-11-01 02:00:00', periods=56, freq='3h')})
print(test1.shape, test2.shape, test3.shape)

(248, 3) (248, 3) (248, 3)
(56, 1) (56, 1) (56, 1)


In [22]:
df_list = [train1, train2, train3, test1, test2, test3]
data = pd.concat(df_list)
data.shape

(912, 3)

In [23]:
data = data.merge(rain, how='left', on='dt')
data = data.merge(environment, how='left', on='dt')
data = data.merge(station_rain, how='left', on='TimeStample')
data.head()

Unnamed: 0,TimeStample,Qi,dt,D1,D2,D3,D4,D5,T,w,...,R30,R31,R32,R33,R34,R35,R36,R37,R38,R39
0,2018-01-01 02:00:00,0.020184,2018-01-01,0.0625,0.272727,0.214286,0.1,0.1,0.308571,0.065217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-01-01 05:00:00,0.025757,2018-01-01,0.0625,0.272727,0.214286,0.1,0.1,0.308571,0.065217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-01-01 08:00:00,0.022995,2018-01-01,0.0625,0.272727,0.214286,0.1,0.1,0.308571,0.065217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-01-01 11:00:00,0.015113,2018-01-01,0.0625,0.272727,0.214286,0.1,0.1,0.308571,0.065217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-01-01 14:00:00,0.017975,2018-01-01,0.0625,0.272727,0.214286,0.1,0.1,0.308571,0.065217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
train1 = data.loc[(data['TimeStample'] >= '2018-01-01 02:00:00') & (data['TimeStample'] < '2018-02-01 00:00:00')]
train2 = data.loc[(data['TimeStample'] >= '2018-07-01 02:00:00') & (data['TimeStample'] < '2018-08-01 00:00:00')]
train3 = data.loc[(data['TimeStample'] >= '2018-10-01 02:00:00') & (data['TimeStample'] < '2018-11-01 00:00:00')]
print(train1.shape, train2.shape, train3.shape)

test1 = data.loc[(data['TimeStample'] >= '2018-02-01 02:00:00') & (data['TimeStample'] <= '2018-02-07 23:00:00')]
test2 = data.loc[(data['TimeStample'] >= '2018-08-01 02:00:00') & (data['TimeStample'] <= '2018-08-07 23:00:00')]
test3 = data.loc[(data['TimeStample'] >= '2018-11-01 02:00:00') & (data['TimeStample'] <= '2018-11-07 23:00:00')]
print(test1.shape, test2.shape, test3.shape)

(248, 50) (248, 50) (248, 50)
(56, 50) (56, 50) (56, 50)


In [25]:
train_y1 = train1['Qi']
train1.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
train_x1 = train1

train_y2 = train2['Qi']
train2.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
train_x2 = train2

train_y3 = train3['Qi']
train3.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
train_x3 = train3

# test_y1 = test1['Qi']
sub1 = test1[['TimeStample']]
test1.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
test_x1 = test1

# test_y2 = test2['Qi']
sub2 = test2[['TimeStample']]
test2.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
test_x2 = test2

# test_y3 = test3['Qi']
sub3 = test3[['TimeStample']]
test3.drop(['TimeStample', 'dt', 'Qi'], axis=1, inplace=True)
test_x3 = test3

In [26]:
lgb_train1 = lgb.Dataset(train_x1, label=train_y1)
# lgb_test1 = lgb.Dataset(test_x1, label=test_y1, reference=lgb_train1)

lgb_train2 = lgb.Dataset(train_x2, label=train_y2)
# lgb_test2 = lgb.Dataset(test_x2, label=test_y2, reference=lgb_train2)

lgb_train3 = lgb.Dataset(train_x3, label=train_y3)
# lgb_test3 = lgb.Dataset(test_x3, label=test_y3, reference=lgb_train1)

In [27]:
lgb_model1 = lgb.train(params,
                       lgb_train1,
#                        valid_sets=[lgb_test1, lgb_train1],
                       num_boost_round=lgb_model1.best_iteration,
                       verbose_eval=10)

pred1 = lgb_model1.predict(test_x1)
sub1 = pd.DataFrame(pred1).T
sub1.columns = ['Prediction{}'.format(i) for i in range(1, 57)]
sub1['SeqNum'] = 1
l = ['SeqNum'] + ['Prediction{}'.format(i) for i in range(1, 57)]
sub1 = sub1[l]

In [28]:
lgb_model2 = lgb.train(params,
                       lgb_train2,
#                        valid_sets=[lgb_test1, lgb_train1],
                       num_boost_round=lgb_model2.best_iteration,
                       verbose_eval=10)

pred2 = lgb_model2.predict(test_x2)
sub2 = pd.DataFrame(pred2).T
sub2.columns = ['Prediction{}'.format(i) for i in range(1, 57)]
sub2['SeqNum'] = 2
l = ['SeqNum'] + ['Prediction{}'.format(i) for i in range(1, 57)]
sub2 = sub2[l]

In [29]:
lgb_model3 = lgb.train(params,
                       lgb_train3,
#                        valid_sets=[lgb_test3, lgb_train3],
                       num_boost_round=lgb_model3.best_iteration,
                       verbose_eval=10)

pred3 = lgb_model1.predict(test_x3)
sub3 = pd.DataFrame(pred3).T
sub3.columns = ['Prediction{}'.format(i) for i in range(1, 57)]
sub3['SeqNum'] = 3
l = ['SeqNum'] + ['Prediction{}'.format(i) for i in range(1, 57)]
sub3 = sub3[l]

In [30]:
sub = pd.concat([sub1, sub2, sub3])
sub

Unnamed: 0,SeqNum,Prediction1,Prediction2,Prediction3,Prediction4,Prediction5,Prediction6,Prediction7,Prediction8,Prediction9,...,Prediction47,Prediction48,Prediction49,Prediction50,Prediction51,Prediction52,Prediction53,Prediction54,Prediction55,Prediction56
0,1,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,...,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201
0,2,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,...,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017,0.421017
0,3,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,...,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201,0.105201


In [31]:
sub.to_csv('sub.csv', index=False)