# 모듈 import

In [19]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob
from scipy import interpolate
import warnings
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose 
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from math import sqrt
from statsmodels.tsa.seasonal import seasonal_decompose 
# Suppress every warning for the rest of the session, and make modules that
# live in the current working directory importable.
warnings.simplefilter("ignore")
sys.path.append(str(Path.cwd()))

# Path 설정

In [20]:
# Project root: two directory levels above the current working directory.
base_path = Path(os.getcwd()).parent.parent
# sys.path entries must be plain strings; a Path object placed there is
# ignored by the import system, so convert before appending.
sys.path.append(str(base_path))

In [21]:
def _load_csvs(paths):
    """Read every CSV in ``paths`` and stack the rows into one DataFrame.

    Args:
        paths: Iterable of CSV file paths, read in the given order.

    Returns:
        One DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        ``paths`` is empty.
    """
    frames = [pd.read_csv(path) for path in paths]
    # pd.concat raises on an empty list; fall back to an empty frame, which is
    # what the previous append-in-a-loop code produced when no file matched.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# glob returns paths in a filesystem-dependent order; sort them so both
# datasets are always loaded in the same, reproducible file order.
water_lst = sorted(glob(f'{base_path}/datasource/competition_data/water_data/*.csv'))
rain_lst = sorted(glob(f'{base_path}/datasource/competition_data/rf_data/*.csv'))

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every iteration); a single concat over all files replaces it.
water_df = _load_csvs(water_lst)
rain_df = _load_csvs(rain_lst)

# 타입 수정 및 데이터 추가 병합(water + rain)

In [22]:
# Parse each frame's timestamp column from its own raw strings.
water_df['ymdhm'] = pd.to_datetime(water_df['ymdhm'], format='%Y-%m-%d %H:%M:%S')
# Fixed: this line previously parsed water_df['ymdhm'], which overwrote the
# rainfall timestamps with the water timestamps (aligned by row index) instead
# of converting the rainfall frame's own column.
rain_df['ymdhm'] = pd.to_datetime(rain_df['ymdhm'], format='%Y-%m-%d %H:%M:%S')

# Left join on the timestamp: every water row is kept and the rainfall columns
# are attached where a matching timestamp exists.
merge_df = pd.merge(water_df, rain_df, how='left', on='ymdhm')

# 컬럼 수정

In [23]:
# Raw column codes -> readable labels, grouped by the kind of measurement.
dam_cols = {
    'swl': '팔당댐 현재수위',
    'inf': '팔당댐 유입량',
    'sfw': '팔당댐 저수량',
    'ecpc': '팔당댐 공용량',
    'tototf': '총 방류량',
}
flow_cols = {
    'fw_1018662': '청담대교 유량',
    'fw_1018680': '잠수교 유량',
    'fw_1018683': '한강대교 유량',
    'fw_1019630': '행주대교 유량',
}
level_cols = {
    'wl_1018662': '청담대교 수위',
    'wl_1018680': '잠수교 수위',
    'wl_1018683': '한강대교 수위',
    'wl_1019630': '행주대교 수위',
}
rain_cols = {
    'rf_10184100': '대곡교 강수량',
    'rf_10184110': '진관교 강수량',
    'rf_10184140': '송정동 강수량',
}

# Full rename table: timestamp, dam readings, tide level, then the per-station
# flow, water-level and rainfall columns.
new_cols = {
    'ymdhm': 'date',
    **dam_cols,
    'tide_level': '강화대교 조위',
    **flow_cols,
    **level_cols,
    **rain_cols,
}
merge_df = merge_df.rename(columns=new_cols)

# 데이터분리

In [24]:
# Hold-out test period: rows dated June 2022 or later; everything else is
# used for training/validation.
is_test_period = (merge_df['date'].dt.year == 2022) & (merge_df['date'].dt.month >= 6)

# `~` is the boolean-mask negation operator; the previous unary `-` only
# worked through a pandas special case for boolean data.
train = merge_df[~is_test_period].reset_index(drop=True)
test = merge_df[is_test_period].reset_index(drop=True)

# 기본전처리

In [25]:
# Replace every value <= 0 with NaN in the columns at positions 1-5.
# The same rule is applied to the train/validation frame and to the frame
# reserved for the final prediction.
for frame in (train, test):
    for col in frame.columns[1:6]:
        bad_rows = frame.index[frame[col] <= 0].tolist()
        frame.loc[bad_rows, col] = np.nan

# Method 3. Moving Average + interpolate(2012~2022)

In [26]:
def _fill_gaps(frame, windows, time_interpolate=True):
    """Fill NaNs with centred moving averages, then optionally interpolate by time.

    Args:
        frame: DataFrame with a datetime ``date`` column plus value columns.
        windows: Rolling window sizes in rows, applied in order. Each pass
            only fills cells that are still NaN after the previous pass.
        time_interpolate: When true, cells still missing after the moving
            averages are filled with ``interpolate(method='time')`` using
            ``date`` as the index.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``frame`` is not modified.
    """
    filled = frame.reset_index(drop=True)
    for window in windows:
        # Average the numeric columns only: rolling over the datetime `date`
        # column raises on recent pandas (older versions silently skipped it).
        # The deprecated `axis=0` keyword is omitted; it is the default.
        numeric = filled.select_dtypes(include='number')
        filled = filled.fillna(numeric.rolling(window, min_periods=1, center=True).mean())
    if time_interpolate:
        filled = filled.set_index('date').interpolate(method='time').reset_index()
    return filled


# Per training year: (moving-average window sizes, finish with time interpolation?).
# NOTE(review): 2013 and 2014 had no time-interpolation step in the original
# code; that is preserved here, so NaNs may remain for those years - confirm
# this is intentional.
_YEAR_PLAN = {
    # Original note: the time interpolation only concerns the Ganghwa Bridge
    # tide level column for this year.
    2012: ((3,), True),
    2013: ((3, 6), False),
    2014: ((3, 6), False),
    2015: ((3, 6, 8, 10), True),
    2016: ((3, 6, 8), True),
    2017: ((3, 6, 8), True),
    2018: ((3, 6, 8), True),
    2019: ((3, 6, 8), True),
    2020: ((3, 6, 8), True),
    2021: ((3, 6, 8), True),
    2022: ((3, 6, 8), True),
}

_train_year = train['date'].dt.year
_filled_by_year = {
    year: _fill_gaps(train[_train_year == year], windows, interpolate_by_time)
    for year, (windows, interpolate_by_time) in _YEAR_PLAN.items()
}

# Keep the individual per-year names: the following cell concatenates them.
df_2012 = _filled_by_year[2012]
df_2013 = _filled_by_year[2013]
df_2014 = _filled_by_year[2014]
df_2015 = _filled_by_year[2015]
df_2016 = _filled_by_year[2016]
df_2017 = _filled_by_year[2017]
df_2018 = _filled_by_year[2018]
df_2019 = _filled_by_year[2019]
df_2020 = _filled_by_year[2020]
df_2021 = _filled_by_year[2021]
df_2022 = _filled_by_year[2022]

# Test set: drop the two unused columns, then fill gaps with windows 3, 6, 9
# followed by time interpolation.
test = test.drop(columns=['잠수교 유량', '강화대교 조위'])
test = _fill_gaps(test, (3, 6, 9))

In [28]:
# Stack the per-year frames, oldest first, into a single training frame with
# a fresh 0..n-1 index.
yearly_frames = [
    df_2012, df_2013, df_2014, df_2015, df_2016, df_2017,
    df_2018, df_2019, df_2020, df_2021, df_2022,
]
train_df = pd.concat(yearly_frames, axis=0, ignore_index=True)

In [30]:
# Remove the two columns that are not used downstream.
train_df = train_df.drop(['잠수교 유량', '강화대교 조위'], axis=1)

# 데이터 split

In [32]:
# Input features and prediction targets, named once and reused for every split.
feature_cols = ['팔당댐 현재수위', '팔당댐 유입량', '팔당댐 저수량', '팔당댐 공용량', '총 방류량',
                '청담대교 유량', '한강대교 유량', '행주대교 유량',
                '대곡교 강수량', '진관교 강수량', '송정동 강수량']
target_cols = ['청담대교 수위', '잠수교 수위', '한강대교 수위', '행주대교 수위']

# The last `valid_rows` rows of the training frame form the validation split.
valid_rows = 50000

x_data = train_df[feature_cols]
y_data = train_df[target_cols]
x_train, x_valid = x_data.iloc[:-valid_rows], x_data.iloc[-valid_rows:]
y_train, y_valid = y_data.iloc[:-valid_rows], y_data.iloc[-valid_rows:]

# Test set used for the final prediction.
x_test = test[feature_cols]
y_test = test[target_cols]

In [33]:
# Notebook display cell: show the shapes of the train and validation splits.
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((219424, 11), (50000, 11), (219424, 4), (50000, 4))

In [34]:
# Notebook display cell: show the shapes of the test features and targets.
x_test.shape, y_test.shape

((6912, 11), (6912, 4))

# Scaling

In [35]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only; the same
# fitted scaler is then applied to the validation and test features.
scaler = StandardScaler()
scaler.fit(x_train)


def _scaled(frame):
    """Return ``frame`` transformed by the fitted scaler, keeping its column names.

    The result is built from the transformed array, so it carries a fresh
    0..n-1 index rather than the index of ``frame``.
    """
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns)


x_train_scaler = _scaled(x_train)
x_valid_scaler = _scaled(x_valid)
x_test_scaler = _scaled(x_test)

이후 간단한 설명
1. 바로 위에서 스케일링한 x_train_scaler와 y_train으로 모델을 학습한 뒤, x_valid_scaler를 넣어 y 예측값을 뽑습니다.
2. y의 예측값과 y_valid 값의 차이를 RMSE로 확인합니다. --> 성능 확인(검증셋 y_valid를 통해 성능 검증)
3. 성능이 잘 나왔다면 해당 모델에 x_test_scaler 데이터를 넣어서 예측값 y_pred를 뽑은 후 제출합니다.