In [1]:
import copy
import random
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)    
matplotlib.rcParams['axes.unicode_minus'] = False      
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 150)

from xgboost import XGBRegressor

import torch 
import torch.nn as nn 
import torch.optim as optim 
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
torch.set_printoptions(sci_mode=False)
random_seed = 616

### 기본 데이터

In [2]:
y_col = '유입량'
PK_col = ['홍수사상번호','연','월','일','시간']

In [3]:
data_raw = pd.read_excel('../data/01_제공데이터/2021 빅콘테스트_데이터분석분야_퓨처스리그_홍수ZERO_댐유입량,강우,수위데이터_210803.xlsx', header=[0,1])
data_raw = data_raw.dropna()

data_raw.columns = ['_'.join(col).strip() if "Unnamed" not in col[1] else col[0] for col in data_raw.columns.values]

In [4]:
models = {
    "LR" : LinearRegression(),
    "DT" : DecisionTreeRegressor(random_state = random_seed),
    "SGDRegressor" : SGDRegressor(),
    "KernelRidge" : KernelRidge(),
    "XGB" : XGBRegressor()
}

standard_scaler = StandardScaler()

In [5]:
# 결과 값 저장할 dataframe 생성

predict_df = pd.DataFrame()
predict_df[PK_col] = data_raw[PK_col]
predict_df['true'] = data_raw.유입량

model_names = []
for model_name in models.keys():
    for i in range(4):
        model_names.append(model_name)

score_df = pd.DataFrame(columns=[model_names, ['RMSE','RMSLE','R2_score','MAPE']*len(models.keys())])

## source code

In [6]:
first_values = data_raw.groupby('홍수사상번호').first()['유입량']

def model_predict(name, models, data, scaler, option = 'None'):
    predict_df = pd.DataFrame()
    predict_df[PK_col] = data[['홍수사상번호','연','월','일','시간']]
    predict_df['true'] = data.유입량
    for model_name, model, in models.items():
        print(f"{model_name} Start!")
        pred_values = []
        for num in data.홍수사상번호.unique():
            train = data[data['홍수사상번호'] != num].dropna().copy()
            valid = data[data['홍수사상번호'] == num].copy()

            X_train = scaler.fit_transform(train.drop(columns=[y_col]+PK_col))
            y_train = train[y_col]

            X_valid = valid.drop(columns=[y_col]+PK_col)
            y_valid = valid[y_col]
        
            model.fit(X_train, y_train)

            if option == 'change':
                pred_valid = model.predict((scaler.transform(X_valid))[1:])
                pred_values.append(first_values[num])
                for value in pred_valid:
                    pred_values.append(pred_values[-1] + value)
                    
            elif option == 'shift_y':
                pred_values.append(first_values[num])
                X_valid.iloc[1, -1] = first_values[num]

                for i in range(1, len(X_valid)):
                    pred_valid = model.predict(scaler.transform(X_valid.iloc[[i]]))
                    pred_values += list(pred_valid)
                    try:
                        X_valid.iloc[i+1, -1] = pred_valid
                    except:
                        pass
                    
            elif option == 'shift':
                pred_values.append(first_values[num])
                pred_valid = model.predict((scaler.transform(X_valid))[1:])
                pred_values += list(pred_valid)
                    
            else:
                pred_valid = model.predict(scaler.transform(X_valid))
                pred_values += list(pred_valid)

        predict_df[f'pred_{model_name}_{name}'] = np.clip(pred_values, 1, 25000)
        
    return predict_df

In [7]:
def fill_nan(predict_df):
    for num in predict_df.홍수사상번호:
        predict_num = predict_df[predict_df.홍수사상번호 == num]
        predict_num = predict_num.fillna(method='ffill').fillna(method='bfill')
        predict_df.loc[predict_df.홍수사상번호 == num] = predict_num
        
def rmse(predict_df, name):
    pred = predict_df[predict_df[f'pred_{name}'].notnull()][f'pred_{name}']
    true = predict_df.loc[pred.index]['true']
    return np.sqrt(mean_squared_error(true, pred))

def rmsle(predict_df, name):
    pred = predict_df[predict_df[f'pred_{name}'].notnull()][f'pred_{name}']
    true = predict_df.loc[pred.index]['true']
    return np.sqrt(mean_squared_log_error(true, pred))
    
def r2(predict_df, name):
    pred = predict_df[predict_df[f'pred_{name}'].notnull()][f'pred_{name}']
    true = predict_df.loc[pred.index]['true']
    return r2_score(true, pred)

def mape(predict_df, name):
    pred = predict_df[predict_df[f'pred_{name}'].notnull()][f'pred_{name}']
    true = predict_df.loc[pred.index]['true']
    return np.mean(np.abs((true - pred) / true))*100

# Predict

In [8]:
name = 'base'
data_base = data_raw.copy()

In [9]:
predict_temp = model_predict(name, models, data_base, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [10]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29


# Predict_ 21 23 제외

In [11]:
name = 'base_2123'

data_2123 = data_raw[~data_raw.홍수사상번호.isin([21,23])].copy()

In [12]:
predict_temp = model_predict(name, models, data_2123, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [13]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9


## 모든 데이터 t-1 t+1

In [14]:
name = 't-1 t+1'

data_t = data_raw.copy()

for col in data_t.columns.difference([y_col] + PK_col):
    data_t[f'{col}_shift 1'] = data_t[col].shift()
    data_t[f'{col}_shift1 -1'] = data_t[col].shift(-1)

shift_col = data_t.filter(regex='shift').columns
data_t['홍수사상번호_shift 1'] = data_t['홍수사상번호'].shift()
data_t['홍수사상번호_shift -1'] = data_t['홍수사상번호'].shift(-1)

data_t.loc[(data_t['홍수사상번호'] != data_t['홍수사상번호_shift 1']), shift_col] = np.nan
data_t.loc[(data_t['홍수사상번호'] != data_t['홍수사상번호_shift -1']), shift_col] = np.nan

data_t = data_t.dropna()

In [15]:
predict_temp = model_predict(name, models, data_t, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [16]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27


## 수위 E 지역만 t-1 t+1

In [17]:
name = 'waterlevel t-1 t+1'

data_waterlevel_t = data_raw.copy()

waterlevel_col = data_waterlevel_t.filter(regex='수위\(E지역\)')
for col in waterlevel_col:
    data_waterlevel_t[f'{col}_shift 1'] = data_waterlevel_t[col].shift()
    data_waterlevel_t[f'{col}_shift1 -1'] = data_waterlevel_t[col].shift(-1)

shift_col = data_waterlevel_t.filter(regex='shift').columns
data_waterlevel_t['홍수사상번호_shift 1'] = data_waterlevel_t['홍수사상번호'].shift()
data_waterlevel_t['홍수사상번호_shift -1'] = data_waterlevel_t['홍수사상번호'].shift(-1)

data_waterlevel_t.loc[(data_waterlevel_t['홍수사상번호'] != data_waterlevel_t['홍수사상번호_shift 1']), shift_col] = np.nan
data_waterlevel_t.loc[(data_waterlevel_t['홍수사상번호'] != data_waterlevel_t['홍수사상번호_shift -1']), shift_col] = np.nan

data_waterlevel_t = data_waterlevel_t.dropna()

In [18]:
predict_temp = model_predict(name, models, data_waterlevel_t, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [19]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38


## 클러스터링 넣기

In [20]:
for n in range(3,7):
    print('------------------')
    print(f"Clustering K : {n} Start")
    print('------------------')
    
    name = f'{n} cluster'

    data_cluster = data_raw.copy()

    clustering_df = pd.read_csv(f'../data/clustering/{n}_clustering_result.csv',index_col=0)
    cluster_list = [f'k{i}' for i in range(n)]
    
    data_cluster[cluster_list] = clustering_df
    
    predict_temp = model_predict(name, models, data_cluster, standard_scaler)

    if predict_temp.columns.isin(predict_df.columns).all():
        columns = predict_temp.columns.difference(PK_col+['true'])
        predict_df[columns] = predict_temp[columns]
    else:
        predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
    if not predict_temp.index.equals(predict_df.index):
        fill_nan(predict_df)
    
    for model in models.keys():
        score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
        score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
        score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
        score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)

------------------
Clustering K : 3 Start
------------------
LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!
------------------
Clustering K : 4 Start
------------------
LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!
------------------
Clustering K : 5 Start
------------------
LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!
------------------
Clustering K : 6 Start
------------------
LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [21]:
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84


### log 데이터

In [22]:
name = 'log'    
data_log = data_raw.copy()

log_col = data_log.columns.difference(list(data_log.filter(regex="수위\(D지역\)").columns) + [y_col] + PK_col)
for col in log_col:
    data_log[col] = data_log[col].apply(lambda x : np.log(x - data_log[col].min() + 0.01))

In [23]:
predict_temp = model_predict(name, models, data_log, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [24]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## origin + log 변환

In [25]:
name = 'origin + log'
data_originlog = data_raw.copy()

log_col = data_originlog.columns.difference(list(data_originlog.filter(regex="수위\(D지역\)").columns) + [y_col] + PK_col)
for col in log_col:
    data_originlog[col + '_zero'] = (data_originlog[col] == 0).astype(int)
    data_originlog[col + '_log'] = data_originlog[col].apply(lambda x : np.log(x - data_originlog[col].min() + 0.01))

In [26]:
predict_temp = model_predict(name, models, data_log, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [27]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41
origin + log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1661.74,1.51,0.42,113.34,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## x^2 변환

In [28]:
name = 'x2'
data_x2 = data_raw.copy()

x2_col = data_x2.filter(regex="수위\(E지역\)").columns
for col in x2_col:
    data_x2[col + '_x2'] = data_x2[col] ** 2

In [29]:
predict_temp = model_predict(name, models, data_x2, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [30]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41
origin + log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1661.74,1.51,0.42,113.34,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## clustering  + x2

In [31]:
n = 5
name = f'{n}cluster + x2'

data_cluster_x2 = data_raw.copy()

x2_col = data_cluster_x2.filter(regex="수위\(E지역\)").columns
for col in x2_col:
    data_cluster_x2[col + '_x2'] = data_cluster_x2[col] ** 2

clustering_df = pd.read_csv(f'../data/clustering/{n}_clustering_result.csv',index_col=0)
cluster_list = [f'k{i}' for i in range(n)]

data_cluster_x2[cluster_list] = clustering_df

data_cluster_x2 = data_cluster_x2.dropna()

In [32]:
predict_temp = model_predict(name, models, data_cluster_x2, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [33]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41
origin + log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1661.74,1.51,0.42,113.34,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## clustering + t-1 t+1

In [34]:
n = 5
name = f'{n}cluster + t'

data_cluster_t = data_raw.copy()

x2_col = data_cluster_t.filter(regex="수위\(E지역\)").columns
for col in x2_col:
    data_cluster_t[col + '_x2'] = data_cluster_t[col] ** 2

clustering_df = pd.read_csv(f'../data/clustering/{n}_clustering_result.csv',index_col=0)
cluster_list = [f'k{i}' for i in range(n)]

data_cluster_t[cluster_list] = clustering_df

for col in data_cluster_t.columns.difference([y_col] + PK_col + cluster_list):
    data_cluster_t[f'{col}_shift 1'] = data_cluster_t[col].shift()
    data_cluster_t[f'{col}_shift1 -1'] = data_cluster_t[col].shift(-1)

shift_col = data_cluster_t.filter(regex='shift').columns
data_cluster_t['홍수사상번호_shift 1'] = data_cluster_t['홍수사상번호'].shift()
data_cluster_t['홍수사상번호_shift -1'] = data_cluster_t['홍수사상번호'].shift(-1)

data_cluster_t.loc[(data_cluster_t['홍수사상번호'] != data_cluster_t['홍수사상번호_shift 1']), shift_col] = np.nan
data_cluster_t.loc[(data_cluster_t['홍수사상번호'] != data_cluster_t['홍수사상번호_shift -1']), shift_col] = np.nan


data_cluster_t = data_cluster_t.dropna()

In [35]:
predict_temp = model_predict(name, models, data_cluster_t, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [36]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41
origin + log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1661.74,1.51,0.42,113.34,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## t-1 t+1 + x2

In [37]:
n = 5
name = f't + x2'

data_t_x2 = data_raw.copy()

x2_col = data_t_x2.filter(regex="수위\(E지역\)").columns
for col in x2_col:
    data_t_x2[col + '_x2'] = data_t_x2[col] ** 2

for col in data_t_x2.columns.difference([y_col] + PK_col + cluster_list):
    data_t_x2[f'{col}_shift 1'] = data_t_x2[col].shift()
    data_t_x2[f'{col}_shift1 -1'] = data_t_x2[col].shift(-1)

shift_col = data_t_x2.filter(regex='shift').columns
data_t_x2['홍수사상번호_shift 1'] = data_t_x2['홍수사상번호'].shift()
data_t_x2['홍수사상번호_shift -1'] = data_t_x2['홍수사상번호'].shift(-1)

data_t_x2.loc[(data_t_x2['홍수사상번호'] != data_t_x2['홍수사상번호_shift 1']), shift_col] = np.nan
data_t_x2.loc[(data_t_x2['홍수사상번호'] != data_t_x2['홍수사상번호_shift -1']), shift_col] = np.nan


data_t_x2 = data_t_x2.dropna()

In [38]:
predict_temp = model_predict(name, models, data_t_x2, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [39]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41
origin + log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1661.74,1.51,0.42,113.34,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## clustering + t-1 t+1 + x2

In [40]:
n = 5
name = f'{n}cluster + t + x2'

data_cluster_t_x2 = data_raw.copy()

x2_col = data_cluster_t_x2.filter(regex="수위\(E지역\)").columns
for col in x2_col:
    data_cluster_t_x2[col + '_x2'] = data_cluster_t_x2[col] ** 2

clustering_df = pd.read_csv(f'../data/clustering/{n}_clustering_result.csv',index_col=0)
cluster_list = [f'k{i}' for i in range(n)]

data_cluster_t_x2[cluster_list] = clustering_df

for col in data_cluster_t_x2.columns.difference([y_col] + PK_col + cluster_list):
    data_cluster_t_x2[f'{col}_shift 1'] = data_cluster_t_x2[col].shift()
    data_cluster_t_x2[f'{col}_shift1 -1'] = data_cluster_t_x2[col].shift(-1)

shift_col = data_cluster_t_x2.filter(regex='shift').columns
data_cluster_t_x2['홍수사상번호_shift 1'] = data_cluster_t_x2['홍수사상번호'].shift()
data_cluster_t_x2['홍수사상번호_shift -1'] = data_cluster_t_x2['홍수사상번호'].shift(-1)

data_cluster_t_x2.loc[(data_cluster_t_x2['홍수사상번호'] != data_cluster_t_x2['홍수사상번호_shift 1']), shift_col] = np.nan
data_cluster_t_x2.loc[(data_cluster_t_x2['홍수사상번호'] != data_cluster_t_x2['홍수사상번호_shift -1']), shift_col] = np.nan


data_cluster_t_x2 = data_cluster_t_x2.dropna()

In [41]:
predict_temp = model_predict(name, models, data_cluster_t_x2, standard_scaler)

if predict_temp.columns.isin(predict_df.columns).all():
    columns = predict_temp.columns.difference(PK_col+['true'])
    predict_df[columns] = predict_temp[columns]
else:
    predict_df = pd.merge(predict_df, predict_temp, on=PK_col+['true'], how='outer')
    
if not predict_temp.index.equals(predict_df.index):
    fill_nan(predict_df)

LR Start!
DT Start!
SGDRegressor Start!
KernelRidge Start!
XGB Start!


In [42]:
for model in models.keys():
    score_df.loc[name, (model, "RMSE")] = rmse(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "RMSLE")] = rmsle(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "R2_score")] = r2(predict_df,model+ '_' + name)
    score_df.loc[name, (model, "MAPE")] = mape(predict_df,model+ '_' + name)
score_df

Unnamed: 0_level_0,LR,LR,LR,LR,DT,DT,DT,DT,SGDRegressor,SGDRegressor,SGDRegressor,SGDRegressor,KernelRidge,KernelRidge,KernelRidge,KernelRidge,XGB,XGB,XGB,XGB
Unnamed: 0_level_1,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE,RMSE,RMSLE,R2_score,MAPE
base,884.28,1.83,0.84,45.92,1128.63,0.53,0.73,57.2,908.58,1.94,0.83,48.34,1445.3,4.31,0.56,80.92,984.81,0.46,0.8,43.29
base_2123,906.61,1.91,0.84,47.59,1182.44,0.55,0.72,58.89,911.34,2.03,0.84,49.6,1466.98,4.26,0.57,80.61,1028.46,0.54,0.79,44.9
t-1 t+1,842.03,1.8,0.85,45.41,1096.56,0.5,0.75,49.46,890.92,1.98,0.83,49.38,1430.58,4.32,0.57,81.4,995.59,0.46,0.79,41.27
waterlevel t-1 t+1,885.96,1.79,0.84,45.99,1155.14,0.55,0.72,53.8,940.83,2.02,0.81,49.95,1453.28,4.32,0.56,81.18,1047.31,0.47,0.77,45.38
3 cluster,884.93,1.7,0.84,45.79,1129.31,0.55,0.73,60.4,914.61,1.75,0.82,48.82,1429.99,4.28,0.57,80.49,1001.29,0.49,0.79,47.6
4 cluster,888.91,1.67,0.83,45.44,1147.74,0.54,0.72,58.3,878.98,1.79,0.84,47.84,1435.74,4.3,0.57,80.75,1000.71,0.46,0.79,47.62
5 cluster,889.01,1.7,0.83,46.46,1113.78,0.55,0.74,61.74,907.92,1.82,0.83,48.98,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
6 cluster,889.01,1.7,0.83,46.46,1137.28,0.56,0.73,61.65,866.48,1.86,0.84,49.72,1438.71,4.3,0.57,80.87,1003.92,0.52,0.79,47.84
log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1634.47,1.57,0.44,109.69,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41
origin + log,1442.59,1.63,0.56,91.82,1126.59,0.53,0.73,56.86,1661.74,1.51,0.42,113.34,2021.06,3.84,0.14,79.41,985.59,0.46,0.8,43.41


## 결과 저장

In [43]:
predict_df.to_csv('predict/predict_df_ML.csv')
score_df.to_csv('predict/score_df_ML.csv')