# 모듈 import

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob
from scipy import interpolate
import warnings
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose 
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from math import sqrt
from statsmodels.tsa.seasonal import seasonal_decompose 
from sklearn.metrics import mean_squared_error 
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")
sys.path.append(str(Path(os.getcwd())))

# Path 설정

In [2]:
base_path = Path(os.getcwd()).parent.parent
sys.path.append(base_path)

In [3]:
water_lst = glob(f'{base_path}/datasource/competition_data/water_data/*.csv')
rain_lst = glob(f'{base_path}/datasource/competition_data/rf_data/*.csv')

water_df = pd.DataFrame()
rain_df = pd.DataFrame()
for w in water_lst:
    water_df = water_df.append(pd.read_csv(w))
for r in rain_lst:
    rain_df = rain_df.append(pd.read_csv(r))

# 폰트설정

In [4]:
plt.rc('font', family = 'Malgun Gothic')

# 타입 수정 및 데이터 추가 병합(water+rain)

In [5]:
# datetime
water_df['ymdhm'] = pd.to_datetime(water_df['ymdhm'], format = '%Y-%m-%d %H:%M:%S' )
rain_df['ymdhm'] = pd.to_datetime(water_df['ymdhm'], format = '%Y-%m-%d %H:%M:%S' )

# merge to dataframe
merge_df = pd.merge(water_df, rain_df, how = 'left', on = 'ymdhm')

# 컬럼 수정

In [6]:
new_cols ={'ymdhm':"date",
           'swl':'팔당댐 현재수위',
           'inf':'팔당댐 유입량',
           'sfw':'팔당댐 저수량',
           'ecpc':'팔당댐 공용량',
           'tototf':'총 방류량',
           'tide_level':"강화대교 조위",
           'fw_1018662':'청담대교 유량',
           'fw_1018680':'잠수교 유량',
           'fw_1018683':'한강대교 유량',
           'fw_1019630':'행주대교 유량',
           'wl_1018662':'청담대교 수위',    
           'wl_1018680':'잠수교 수위',
           'wl_1018683':"한강대교 수위",
           'wl_1019630':"행주대교 수위",
           'rf_10184100':'대곡교 강수량',
           'rf_10184110':'진관교 강수량',
           'rf_10184140':'송정동 강수량'}
merge_df =merge_df.rename(columns = new_cols)

# 데이터 분리

In [7]:
# test set: 2022 06 01 ~
train = merge_df[-((merge_df['date'].dt.year==2022) & (merge_df['date'].dt.month>=6))].reset_index(drop =True)
test = merge_df[((merge_df['date'].dt.year==2022) & (merge_df['date'].dt.month>=6))].reset_index(drop =True)

# 기본 전처리

In [8]:
# 0이하의 값 일괄 nan값으로 변경

# 학습 및 검증셋
for i in list(train.columns[1:6]):
    error_lst = list(train[train[i]<=0][i].index)
    train.loc[error_lst, i] = np.nan
    
# 최종 예측을 위한 테스트셋    
for i in list(test.columns[1:6]):
    error_lst2 = list(test[test[i]<=0][i].index)
    test.loc[error_lst2, i] = np.nan

# Moving Average + Regression

In [9]:
# 2013년 데이터만 사용 
df_2013 = train[(train['date'].dt.year==2013)].reset_index(drop = True)
df_2013 = df_2013.fillna(df_2013.rolling(3, min_periods = 1, axis = 0,center = True).mean())
# df_2013 = df_2013.fillna(df_2013.rolling(6, min_periods = 1, axis = 0,center = True).mean())

# 잠수교 유량데이터는 측정 센서가 없기때문에 전체 Null & 강화대교 조위 역시 필요없다고 판단
train_df =df_2013.drop(columns = ['잠수교 유량','강화대교 조위'])
train_df.isna().sum()

date          0
팔당댐 현재수위      0
팔당댐 유입량     214
팔당댐 저수량       0
팔당댐 공용량       0
총 방류량         0
청담대교 수위       0
청담대교 유량       0
잠수교 수위        0
한강대교 수위       0
한강대교 유량       0
행주대교 수위       0
행주대교 유량       0
대곡교 강수량       0
진관교 강수량       0
송정동 강수량       0
dtype: int64

In [10]:
# scaler = StandardScaler()
# concat_df = train_df['팔당댐 유입량']
# train_df = pd.DataFrame(scaler.fit_transform(train_df[['팔당댐 현재수위','팔당댐 저수량','총 방류량']]),columns = ['팔당댐 현재수위','팔당댐 저수량','총 방류량'])
# train_df = pd.concat([train_df,concat_df],axis = 1)
# train_df

In [11]:
scaler = StandardScaler()
train_df = pd.DataFrame(scaler.fit_transform(train_df[['팔당댐 현재수위','팔당댐 유입량','팔당댐 저수량','총 방류량']]),columns = ['팔당댐 현재수위','팔당댐 유입량','팔당댐 저수량','총 방류량'])
train_df

Unnamed: 0,팔당댐 현재수위,팔당댐 유입량,팔당댐 저수량,총 방류량
0,1.249199,-0.520569,1.264104,-0.513550
1,1.241197,-0.505047,1.257363,-0.498022
2,1.241197,-0.505047,1.257363,-0.498022
3,1.233194,-0.505047,1.248375,-0.498022
4,1.153172,-0.573060,1.165236,-0.497954
...,...,...,...,...
26491,0.777068,-0.506340,0.796731,-0.499316
26492,0.721052,-0.575034,0.729322,-0.499929
26493,0.681041,-0.575306,0.686629,-0.500202
26494,0.681041,-0.575579,0.686629,-0.500474


In [12]:

reg_df = train_df
# train
reg_x_data = reg_df[-reg_df['팔당댐 유입량'].isna()][['팔당댐 현재수위','팔당댐 저수량', '총 방류량']].reset_index(drop=True)
reg_y_data = reg_df[-reg_df['팔당댐 유입량'].isna()]['팔당댐 유입량'].reset_index(drop=True)
# train 데이터를 train/val분리
reg_x_train = reg_x_data.iloc[:-5000]
reg_y_train = reg_y_data.iloc[:-5000]
reg_x_val = reg_x_data.iloc[-5000:]
reg_y_val = reg_y_data.iloc[-5000:]
# test
reg_x_test = reg_df[reg_df['팔당댐 유입량'].isna()][['팔당댐 현재수위','팔당댐 저수량', '총 방류량']].reset_index(drop=True)
reg_y_test = reg_df[reg_df['팔당댐 유입량'].isna()]['팔당댐 유입량'].reset_index(drop=True)

In [13]:
reg_df.shape, reg_x_train.shape,reg_x_val.shape,reg_x_test.shape, reg_y_train.shape, reg_y_val.shape, reg_y_test.shape

((26496, 4), (21282, 3), (5000, 3), (214, 3), (21282,), (5000,), (214,))

# 모델 테스트

## 랜덤포레스트

In [14]:
# RandongForest
kfold = KFold(n_splits=2, shuffle=True)

rf = RandomForestRegressor(n_jobs=-1)

params = {
    "n_estimators" : (100, 150, 200)
}

grid_cv = GridSearchCV(rf,
                       param_grid=params,
                       cv = kfold,
                       n_jobs=-1)
grid_cv.fit(reg_x_train, reg_y_train)

# rmse 
model = grid_cv.best_estimator_
reg_y_val_pred = model.predict(reg_x_val)
rmse = np.sqrt(mean_squared_error(reg_y_val, reg_y_val_pred))
print(f'RMSE : {rmse}')

RMSE : 0.10137954261638009


## 릿지

In [15]:
# Ridge
ridge = Ridge(alpha = 0.05, max_iter = 10000)
model2 = ridge.fit(reg_x_train, reg_y_train)
reg_y_val_pred = model2.predict(reg_x_val)
rmse = np.sqrt(mean_squared_error(reg_y_val, reg_y_val_pred))
print(f'RMSE : {rmse}')

RMSE : 0.08038879341492373


In [16]:
label =model2.predict(reg_x_test)
reg_y_test[:] = list(label)

In [17]:
reg_y_test

0     -0.369663
1     -0.369066
2     -0.434838
3     -0.236519
4     -0.234183
         ...   
209   -0.500351
210   -0.498726
211   -0.496052
212   -0.526891
213   -0.522828
Name: 팔당댐 유입량, Length: 214, dtype: float64

In [18]:
new_train_df = pd.concat([reg_x_train,reg_y_train],axis =1)
new_val_df = pd.concat([reg_x_val,reg_y_val],axis = 1)
new_test_df = pd.concat([reg_x_test,reg_y_test],axis = 1)

In [19]:
new_train_df.shape, new_val_df.shape, new_test_df.shape

((21282, 4), (5000, 4), (214, 4))

In [20]:
new_data = pd.concat([new_train_df, new_val_df])
new = pd.concat([new_data,new_test_df])

In [21]:
new_df = pd.DataFrame(scaler.inverse_transform(new),columns =new.columns )
new_df

Unnamed: 0,팔당댐 현재수위,팔당댐 저수량,총 방류량,팔당댐 유입량
0,25.290,2772.396441,228.568710,140.692910
1,25.289,2762.495074,228.637813,163.485642
2,25.289,2762.495074,228.637813,163.485642
3,25.288,2749.293252,228.637813,163.485642
4,25.278,2627.176397,228.638117,63.617487
...,...,...,...,...
26491,25.144,1029.755916,228.622053,170.380338
26492,25.140,976.948627,228.622659,172.766320
26493,25.145,1042.957738,228.642966,176.693008
26494,25.140,976.948627,228.496576,131.410738


이렇게되면 데이터가 다섞인다...test데이터를 마지막에CONCAT했으니 뒤에서 N번째 것으로 자른다??

In [22]:
new

Unnamed: 0,팔당댐 현재수위,팔당댐 저수량,총 방류량,팔당댐 유입량
0,1.249199,1.264104,-0.513550,-0.520569
1,1.241197,1.257363,-0.498022,-0.505047
2,1.241197,1.257363,-0.498022,-0.505047
3,1.233194,1.248375,-0.498022,-0.505047
4,1.153172,1.165236,-0.497954,-0.573060
...,...,...,...,...
209,0.080874,0.077697,-0.501564,-0.500351
210,0.048865,0.041745,-0.501428,-0.498726
211,0.088876,0.086685,-0.496865,-0.496052
212,0.048865,0.041745,-0.529758,-0.526891
