In [2]:
# data
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
color = sns.color_palette()

# machine learning
import tensorflow as tf
from sklearn.metrics import r2_score, mean_squared_error

# keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.utils.layer_utils import print_summary

# tqdm
from tqdm import tqdm_notebook as tqdm

np.random.seed(7)
tf.set_random_seed(777)  # reproducibility

#miscellaneous
import random
import os, gc
import warnings
warnings.filterwarnings('ignore')
import operator
import json
from pprint import pprint
from xml.etree.ElementTree import parse
import datetime
import re
import math
import requests
from dateutil.relativedelta import relativedelta
import traceback

Using TensorFlow backend.


In [3]:
URL = 'http://api.hrfco.go.kr/220C13DE-9809-4646-9592-A60DC6C1ACB1/dam/list/1D/'

In [4]:
# utility functions
def getTimeStamp(fileName):
    startDate = fileName.split('.')[0][:4]
    endDate = fileName.split('.')[0][4:8]
    
    return datetime.datetime.strptime(startDate, timeformat)

In [5]:
dam_df = pd.read_json('data/damInfo.json', orient='records')
dam_df = dam_df.replace('', np.nan)
cols = ['dmobscd', 'obsnm', 'agcnm', 'addr', 'etcaddr', 'lon', 'lat', 'pfh', 'fldlmtwl']
dam_df = dam_df[cols]
dam_df = dam_df.dropna(axis=0, how='any')
dam_df

Unnamed: 0,dmobscd,obsnm,agcnm,addr,etcaddr,lon,lat,pfh,fldlmtwl
0,1003110,충주댐,한국수자원공사,충청북도 충주시,종민동,127-59-44,37-00-13,145.0,138.0
2,1004310,괴산댐,한국수력원자력,충청북도 괴산군,칠성면 산막이옛길 43,127-50-42,36-45-45,136.93,134.0
3,1006110,횡성댐,한국수자원공사,강원도 횡성군,갑천면 대관대리,128-01-57,37-32-42,180.0,178.2
5,1010310,화천댐,한국수력원자력,강원도 화천군,간동면 어룡동길 42,127-47-00,38-07-00,183.0,175.0
6,1010320,춘천댐,한국수력원자력,강원도 춘천시,신북읍 영서로 3741,127-40-15,37-58-00,104.9,102.0
7,1012110,소양강댐,한국수자원공사,강원도 춘천시,동면 월곡리,127-48-44,37-56-46,198.0,190.3
8,1013310,의암댐,한국수력원자력,강원도 춘천시,신동면 옛경춘로 62-15,127-41-22,37-50-08,73.36,70.5
9,1015310,청평댐,한국수력원자력,경기도 가평군,설악면 유명로 2630,127-25-28,37-43-15,52.0,50.0
12,2001110,안동댐,한국수자원공사,경상북도 안동시,성곡동,128-46-26,36-35-06,161.7,160.0
14,2002110,임하댐,한국수자원공사,경상북도 안동시,임하면 임하리,128-52-60,36-32-16,164.7,161.7


In [6]:
timeformat = '%y%m'
outformat = '%Y%m%d%H%M'
testYM = datetime.datetime.strptime('1607', timeformat)

targetYM = datetime.datetime.strptime('2001', timeformat)

testYM = testYM + relativedelta(months=12)

print(targetYM.strftime(outformat))

i=0
while(True):
    if (testYM + relativedelta(months=12) < targetYM):
        resStr = testYM.strftime(outformat) + '/' + (testYM + relativedelta(months=12)).strftime(outformat)
        print(resStr)
        testYM += relativedelta(months=12)
    else:
        resStr = testYM.strftime(outformat) + '/' + targetYM.strftime(outformat)
        print(resStr)
        break

202001010000
201707010000/201807010000
201807010000/201907010000
201907010000/202001010000


In [23]:
def dam_data_load(damInfo, damName, startYM, endYM, columns=['dmobscd', 'ymdhm', 'swl','inf', 'sfw', 'ecpc', 'tototf', 'links']):
    timeformat = '%y%m'
    outformat = '%Y%m%d%H%M'
    res_df = pd.DataFrame(columns=columns)
    strURL = URL + damInfo[damInfo['obsnm'] == damName]['dmobscd'].values.astype(str)[0]
    
    startYM = datetime.datetime.strptime(startYM, timeformat)
    endYM = datetime.datetime.strptime(endYM, timeformat)
    
    while(True):
        if(startYM + relativedelta(months=12) < endYM):
            resURL = strURL + '/' + startYM.strftime(outformat) + '/' + (startYM + relativedelta(months=12)).strftime(outformat) + '.json'
            res = requests.get(resURL)
            file_df = pd.read_json(json.dumps(json.loads(res.text)['content']), orient='records')
            file_df = file_df.reindex(index=file_df.index[::-1])
            res_df = pd.concat([res_df, file_df]).reset_index(drop=True)
            startYM += relativedelta(months=12)
        else:
            resURL = strURL + '/' + startYM.strftime(outformat) + '/' + endYM.strftime(outformat) + '.json'
            res = requests.get(resURL)
            file_df = pd.read_json(json.dumps(json.loads(res.text)['content']), orient='records')
            file_df = file_df.reindex(index=file_df.index[::-1])
            res_df = pd.concat([res_df, file_df]).reset_index(drop=True)
            break
        
    res_df.drop_duplicates(subset=['ymdhm'], keep=False)
    
    return res_df

def dam_data_make(damInfo, damData, mergeCols=['dmobscd', 'pfh', 'fldlmtwl']):
    merge_df = pd.merge(damData, damInfo[mergeCols], on='dmobscd')
    
    # drop unnecessary columns
    merge_df = merge_df.drop(['dmobscd', 'sfw'], axis=1)
    
    # make new columns
    #merge_df['fldlmtwl'] = merge_df['fldlmtwl'].astype(float)
    #merge_df['isOverFLDLMTWL'] = merge_df.apply(lambda row : 1 if row.swl > row.fldlmtwl else 0, axis=1)

    # drop columns that already used
    merge_df = merge_df.drop(['pfh', 'fldlmtwl'], axis=1)
    
    # drop outliers
    merge_df = merge_df.drop(merge_df[merge_df['swl'] < 10].index, axis=0)

    #cols = ['ymdhm', 'swl', 'inf', 'ecpc', 'tototf', 'isOverFLDLMTWL']
    cols = ['swl', 'inf', 'ecpc', 'tototf']
    
    
    return merge_df[cols]

def build_dataset(damName, startYM, endYM):
    return dam_data_make(dam_df, dam_data_load(dam_df, damName, startYM, endYM))

def create_dataset(dataset, colNum, lookBack=1):
    dataX, dataY = [], []
    
    for i in range(len(dataset) - lookBack - 1):
        a = dataset[i:(i + 1)].values
        dataX.append(a)
        b = dataset[i + lookBack: i + lookBack + 1].values
        dataY.append(b)
        
    return np.array(dataX).reshape((-1, colNum)), np.array(dataY).reshape((-1, colNum))

def make_train_test(dataset, size, lookBack=1):
    train_size = int(len(dataset) * size)
    test_size = len(dataset) - train_size
    
    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
    
    trainX, trainY = create_dataset(train, 4, lookBack)
    testX, testY = create_dataset(test, 4, lookBack)
    
    return trainX, trainY, testX, testY

def data_scaling_and_reshaping(trainX, trainY, testX, testY, scaler, lookBack):
    trainX = scaler.fit_transform(trainX)
    trainY, testX, testY = scaler.transform(trainY), scaler.transform(testX), scaler.transform(testY)
    
    return np.reshape(trainX, (-1, lookBack, trainX.shape[1])), trainY, np.reshape(testX, (-1, lookBack, testX.shape[1])), testY, scaler

def final_data_scaling_and_reshaping(Xdata, scaler, lookBack):
    Xdata = scaler.transform(Xdata)
    
    return np.reshape(Xdata, (-1, lookBack, Xdata.shape[1]))

In [12]:
def build_model(lookBack):
    model = Sequential()
    model.add(LSTM(16, input_shape=(lookBack, 4)))
    model.add(Dense(4))
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    return model

def predict(model, trainX, testX, scaler):
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    
    trainPredict = scaler.inverse_transform(trainPredict)
    testPredict = scaler.inverse_transform(testPredict)
    
    return trainPredict, testPredict

def final_predict(model, Xdata, scaler):
    predict = model.predict(Xdata)
    predict = scaler.inverse_transform(predict)
    
    return predict[-2:]

def inverse_Ydata(trainY, testY, scaler):
    trainY = scaler.inverse_transform(trainY)
    testY = scaler.inverse_transform(testY)
    
    return trainY, testY

def print_score(trainPredict, trainY, testPredict, testY):
    trainScore = math.sqrt(mean_squared_error(trainY[:, 0], trainPredict[:, 0]))
    testScore = math.sqrt(mean_squared_error(testY[:, 0], testPredict[:, 0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    print('Test Score: %.2f RMSE' % (testScore))

In [13]:
def make_final_plot(dataset, trainPredict, testPredict, lookBack, target='swl', targetIndex=0):
    trainPredictPlot = np.empty_like(dataset[target])
    trainPredictPlot[:] = np.nan
    trainPredictPlot[lookBack:len(trainPredict[:, targetIndex]) + lookBack] = trainPredict[:, targetIndex]
    
    testPredictPlot = np.empty_like(dataset[target])
    testPredictPlot[:] = np.nan
    testPredictPlot[len(trainPredict[:, targetIndex]) + (lookBack*2) + 1:len(dataset[target]) - 1] = testPredict[:, targetIndex]
    
    plt.subplots(1, 1, figsize=(16, 12))
    plt.plot(dataset[target])
    plt.plot(trainPredictPlot)
    plt.plot(testPredictPlot)
    
    plt.show()
    
def make_various_plot(trainPredict, trainY, testPredict, testY, targetIndex=0):
    trP = pd.Series(trainPredict[:, targetIndex])
    trY = pd.Series(trainY[:, targetIndex])
    teP = pd.Series(testPredict[:, targetIndex])
    teY = pd.Series(testY[:, targetIndex])
    
    f, axes = plt.subplots(2, 2, figsize=(16, 12))
    trP.plot(ax=axes[0, 0], color='red')
    trY.plot(ax=axes[0, 1], color='red')
    teP.plot(ax=axes[1, 0], color='green')
    teY.plot(ax=axes[1, 1], color='green')
    
    plt.show()

In [28]:
# params
lookBack = 1
startYM = '1601'
endYM = '1709'
resDict = {}
for code in dam_df['obsnm'].values:
    try:
        scaler = MinMaxScaler(feature_range=(0, 1))
        timeformat='%y%m'

        dataset = build_dataset(code, startYM, endYM)
        trainX, trainY, testX, testY = make_train_test(dataset, 0.98, lookBack=7)
        trainX, trainY, testX, testY, scaler = data_scaling_and_reshaping(trainX, trainY, testX, testY, scaler, 1)

        model = build_model(1)
        print('----------', code ,'training start----------')
        #model.summary()
        model.fit(trainX, trainY, nb_epoch=5, batch_size=1, verbose=1)

        resDict[code] = {}
        resDict[code]['model'] = model
        

        trainPredict, testPredict = predict(model, trainX, testX, scaler)
        trainY, testY = inverse_Ydata(trainY, testY, scaler)
        print_score(trainPredict, trainY, testPredict, testY)
        
        finalData = final_data_scaling_and_reshaping(dataset, scaler, 1)
        
        resDict[code]['final'] = final_predict(model, finalData, scaler)
    
        print(resDict[code]['final'])
        #make_final_plot(dataset, trainPredict, testPredict, 1)
        #make_various_plot(trainPredict, trainY, testPredict, testY)

        print('----------', code, 'training end----------')
    except Exception as error:
        print(traceback.format_exc())
        continue

---------- 충주댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 1.71 RMSE
Test Score: 2.65 RMSE
[[ 135.9463501   165.59507751  757.06219482  294.76626587]
 [ 136.70320129  248.58731079  708.16101074  375.30804443]]
---------- 충주댐 training end----------
---------- 괴산댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 0.58 RMSE
Test Score: 0.40 RMSE
[[ 132.58175659   17.32330132    9.93825245    9.19543457]
 [ 132.68531799   15.05734444    9.76253414    9.36896133]]
---------- 괴산댐 training end----------
---------- 횡성댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 0.89 RMSE
Test Score: 0.24 RMSE
[[ 175.43838501   -0.52996296   25.76965904    5.0847435 ]
 [ 175.37080383   -0.70947051   26.04388046    4.97897482]]
---------- 횡성댐 training end----------
---------- 화천댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 1.74 RMSE
Test Score:

---------- 합천댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 0.82 RMSE
Test Score: 0.47 RMSE
[[ 152.45243835   16.61965752  515.73797607   12.41034985]
 [ 152.45739746   16.50614166  515.49957275   12.38855076]]
---------- 합천댐 training end----------
---------- 남강댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 0.69 RMSE
Test Score: 1.63 RMSE
[[  39.07603836   50.94016647  213.15864563   58.70307159]
 [  39.29640198   37.29830551  209.13006592   44.32718658]]
---------- 남강댐 training end----------
---------- 밀양댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 1.46 RMSE
Test Score: 1.59 RMSE
[[ 184.17550659    2.35566354   45.21578217    1.13345838]
 [ 183.94650269    2.16870332   45.59900665    1.2809366 ]]
---------- 밀양댐 training end----------
---------- 용담댐 training start----------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Score: 0.76 RMSE
Test Score:

In [99]:
resDict

{'괴산댐': {'final': array([[ 135.29592896,   73.58440399,    6.21298981,   61.12730408],
         [ 135.32566833,   98.6449585 ,    6.07532263,   79.44819641]], dtype=float32),
  'model': <keras.models.Sequential at 0x7f5e94676e80>},
 '군위댐': {'final': array([[ 196.22035217,    3.00721145,   21.70759201,    0.81013978],
         [ 196.532547  ,    3.98300171,   21.08267212,    1.01165318]], dtype=float32),
  'model': <keras.models.Sequential at 0x7f5e8b529908>},
 '김천부항댐': {'final': array([[ 191.39155579,    1.37715971,   17.7601738 ,    3.78818345],
         [ 191.91737366,    1.74423611,   17.28165817,    4.77132463]], dtype=float32),
  'model': <keras.models.Sequential at 0x7f5e8a507a58>},
 '남강댐': {'final': array([[  41.98963165,  163.75434875,  145.25598145,  217.99345398],
         [  42.20518112,  225.99360657,  141.58253479,  288.39874268]], dtype=float32),
  'model': <keras.models.Sequential at 0x7f5e880c6320>},
 '대청댐': {'final': array([[  75.3052063 ,  162.15396118,  324.71783447,

In [30]:
resD = {}
for key in resDict:
    resD[key] = str(resDict[key]['final'][:, 0])

In [31]:
resD

{'괴산댐': '[ 132.58175659  132.68531799]',
 '군위댐': '[ 190.99235535  190.94377136]',
 '김천부항댐': '[ 181.64414978  181.53309631]',
 '남강댐': '[ 39.07603836  39.29640198]',
 '대청댐': '[ 73.40707397  73.6125412 ]',
 '대청댐조정지': '[ 28.74837875  28.7183094 ]',
 '밀양댐': '[ 184.17550659  183.94650269]',
 '보령댐': '[ 57.21138763  57.65487289]',
 '섬진강댐': '[ 182.30703735  182.19245911]',
 '소양강댐': '[ 187.73268127  187.30599976]',
 '안동댐': '[ 147.56422424  147.62705994]',
 '용담댐': '[ 250.80595398  250.78396606]',
 '의암댐': '[ 70.1151123   70.17626953]',
 '임하댐': '[ 152.20788574  152.25366211]',
 '장흥댐': '[ 66.10131836  66.14823151]',
 '주암댐': '[ 95.69728088  95.70973206]',
 '청평댐': '[ 49.63264465  49.72107697]',
 '춘천댐': '[ 101.63954926  101.65202332]',
 '충주댐': '[ 135.9463501   136.70320129]',
 '합천댐': '[ 152.45243835  152.45739746]',
 '화천댐': '[ 172.38208008  172.21357727]',
 '횡성댐': '[ 175.43838501  175.37080383]'}

In [76]:
resJson = json.dumps(resD)
resJson

'{"\\ub300\\uccad\\ub310": "75.675", "\\uc8fc\\uc554\\ub310": "104.121", "\\uc784\\ud558\\ub310": "156.088", "\\uc6a9\\ub2f4\\ub310": "251.218", "\\uc18c\\uc591\\uac15\\ub310": "185.362", "\\uc601\\uc8fc\\ub310": "136.371", "\\uad34\\uc0b0\\ub310": "135.246", "\\ubc00\\uc591\\ub310": "206.967", "\\uad70\\uc704\\ub310": "196.269", "\\ud654\\ucc9c\\ub310": "174.616", "\\ud6a1\\uc131\\ub310": "178.306", "\\ucda9\\uc8fc\\ub310": "136.887", "\\ub0a8\\uac15\\ub310": "41.6753", "\\ud569\\ucc9c\\ub310": "164.099", "\\ubcf4\\ub839\\ub310": "66.7814", "\\uae40\\ucc9c\\ubd80\\ud56d\\ub310": "191.649", "\\ucd98\\ucc9c\\ub310": "102.553", "\\ub300\\uccad\\ub310\\uc870\\uc815\\uc9c0": "29.3005", "\\uc7a5\\ud765\\ub310": "75.8029", "\\uc12c\\uc9c4\\uac15\\ub310": "189.32", "\\uc758\\uc554\\ub310": "71.1359", "\\uccad\\ud3c9\\ub310": "50.8081", "\\uc548\\ub3d9\\ub310": "151.598"}'

In [106]:
with open('damResult.txt', 'w') as outfile:
    outfile.write(str(resD))
    #json.dump(resD, outfile)

In [82]:
import sys
sys.stdout.encoding

'UTF-8'

In [86]:
resD

{'괴산댐': '135.246',
 '군위댐': '196.269',
 '김천부항댐': '191.649',
 '남강댐': '41.6753',
 '대청댐': '75.675',
 '대청댐조정지': '29.3005',
 '밀양댐': '206.967',
 '보령댐': '66.7814',
 '섬진강댐': '189.32',
 '소양강댐': '185.362',
 '안동댐': '151.598',
 '영주댐': '136.371',
 '용담댐': '251.218',
 '의암댐': '71.1359',
 '임하댐': '156.088',
 '장흥댐': '75.8029',
 '주암댐': '104.121',
 '청평댐': '50.8081',
 '춘천댐': '102.553',
 '충주댐': '136.887',
 '합천댐': '164.099',
 '화천댐': '174.616',
 '횡성댐': '178.306'}

In [96]:
x = [1, 2, 3, 4, 5]
x[]

[4, 5]

In [22]:
data1 = build_dataset(code, startYM, endYM)
final_data_scaling_and_reshaping(data1, scaler=scaler, lookBack=3)
#np.reshape(data1, (-1, 1, 4))

array([[[  3.44876325e-01,   6.42260758e-03,   6.99634282e-01,
           7.96045786e-02],
        [  3.44876325e-01,   5.78034682e-03,   7.00524726e-01,
           4.94276795e-02],
        [  3.44876325e-01,   5.65189467e-03,   7.01574177e-01,
           4.83870968e-02]],

       [[  3.44169611e-01,   4.30314708e-03,   7.01733185e-01,
           4.99479709e-02],
        [  3.44169611e-01,   5.71612075e-03,   7.02114804e-01,
           4.89073881e-02],
        [  3.43462898e-01,   4.17469493e-03,   7.02496422e-01,
           4.89073881e-02]],

       [[  3.41342756e-01,   4.17469493e-03,   7.03768485e-01,
           7.38813736e-02],
        [  3.39929329e-01,   5.78034682e-03,   7.05485769e-01,
           7.38813736e-02],
        [  3.38515901e-01,   5.71612075e-03,   7.05517570e-01,
           7.38813736e-02]],

       ..., 
       [[  1.20989399e+00,   7.74116891e-01,  -1.30672603e-01,
           2.93236212e+00],
        [  1.15759717e+00,   5.68850353e-01,  -2.25632056e-01,
        