In [48]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize']=(10,10)
plt.rcParams['font.family']='AppleGothic'

import warnings
warnings.filterwarnings(action='ignore')

In [49]:
def read_csv_by_dir(path, index_col=None):
    df_raw = pd.DataFrame()
    for files in os.listdir(path):
        if files.endswith('.csv'):
            df = pd.read_csv('/'.join([path,files]),
                            index_col=index_col)
        df_raw = pd.concat((df_raw,df),axis=0)
    return df_raw

In [50]:
path = 'data'
_df_rf_raw = read_csv_by_dir('/'.join([path,'rf_data']),
                            index_col=0)

_df_water_raw = read_csv_by_dir('/'.join([path,'water_data']),
                               index_col=0)

_submission_raw = pd.read_csv('/'.join([path,'sample_submission.csv']),
                             index_col=0)

In [51]:
# raw_data 보존하기
df_rf=_df_rf_raw.copy()
df_rf.name = "rain_data"

df_water=_df_water_raw.copy()
df_water.name = "water_data"

submission=_submission_raw.copy()
submission.name = "submission"

In [52]:
def index_to_datetime(df,format):
    df.index = pd.to_datetime(df.index,
                              format=format)
    return df

In [53]:
df_rf=index_to_datetime(df=df_rf,format='%Y-%m-%d %H:%M')
df_water=index_to_datetime(df=df_water,format='%Y-%m-%d %H:%M')
submission=index_to_datetime(df=submission,format='%Y-%m-%d %H:%M')

In [54]:
df_rf.sort_index(inplace=True)
df_water.sort_index(inplace=True)
submission.sort_index(inplace=True)

In [55]:
# 데이터 시간대 확인하기
def check_datetime(df):
    print(df.name)
    print(df.select_dtypes('datetime64[ns]').head(1).index[0])
    print(df.select_dtypes('datetime64[ns]').tail(1).index[0])
    return None

check_datetime(df_rf)
check_datetime(df_water)
check_datetime(submission)

rain_data
2012-05-01 00:00:00
2022-07-18 23:50:00
water_data
2012-05-01 00:00:00
2022-07-18 23:50:00
submission
2022-06-01 00:00:00
2022-07-18 23:50:00


In [56]:
# data target 분리하기
target = df_water.loc[:,submission.columns]
data = pd.concat((df_rf,df_water.drop(submission.columns,axis=1)),axis=1)

In [57]:
# data와 target 하나 밀어주기 (과거데이터를 사용해야 함으로)
_target = target.reset_index(drop=True)
_data = data.reset_index(drop=True)

_data.index += 1

tot=pd.concat((_data,_target),axis=1)
tot=tot.sort_index()

tot=tot.iloc[1:-1]

target = tot.loc[:,submission.columns]
data = tot.drop(submission.columns,axis=1)

In [58]:
train_target=target.iloc[:-len(submission),:]
test_target=target.iloc[-len(submission):,:]

train_data=data.iloc[:-len(submission),:]
test_data=data.iloc[-len(submission):,:]

In [59]:
train_target.fillna(train_target.mean(),inplace=True)
test_target.fillna(train_target.mean(),inplace=True)
train_data.fillna(train_data.mean(),inplace=True)
test_data.fillna(train_data.mean(),inplace=True)

In [60]:
# print(train_target)
# print(train_data)

In [61]:
import seaborn as sns
from icecream import ic
df = pd.concat([train_data, train_target],axis=1)
# print(df)
df_corr = df.corr()
# print(df_corr)
# df_corr.fillna(0,inplace=True)
# sns.clustermap(df_corr, 
#                annot = True,      # 실제 값 화면에 나타내기
#                cmap = 'RdYlBu_r',  # Red, Yellow, Blue 색상으로 표시
#                vmin = -1, vmax = 1, #컬러차트 -1 ~ 1 범위로 표시
#               )

In [63]:
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# x_train = scaler.fit_transform(train_data)   # 훈련할 데이터 변환  
# # x_test = scaler.transform(x_test)

In [64]:
print(train_data)

        rf_10184100  rf_10184110  rf_10184140     swl     inf     sfw   ecpc  \
1               0.0          0.0          0.0  24.800  555.00  219.07  24.93   
2               0.0          0.0          0.0  24.794  464.60  218.86  25.15   
3               0.0          0.0          0.0  24.789  478.10  218.69  25.31   
4               0.0          0.0          0.0  24.789  464.80  218.69  25.31   
5               0.0          0.0          0.0  24.789  478.10  218.69  25.31   
...             ...          ...          ...     ...     ...     ...    ...   
269419          0.0          0.0          0.0  25.180  140.89  217.99  26.01   
269420          0.0          0.0          0.0  25.180  140.94  217.99  26.01   
269421          0.0          0.0          0.0  25.180  141.07  217.99  26.01   
269422          0.0          0.0          0.0  25.180  141.01  217.99  26.01   
269423          0.0          0.0          0.0  25.190  755.75  218.36  25.64   

        tototf  tide_level  fw_1018662 

In [65]:
print('--data--')
print(train_data.shape)
print(test_data.shape)
print('--target--')
print(train_target.shape)
print(test_target.shape)

--data--
(269423, 13)
(6912, 13)
--target--
(269423, 4)
(6912, 4)


In [66]:
# from sklearn.model_selection import KFold
# kfold = KFold(n_splits=2, shuffle=True)

# from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_jobs=-1)

# params = {
#     "n_estimators" : (100, 150, 200, 400)
# }

# from sklearn.model_selection import train_test_split
# x_train,x_val,y_train,y_val = train_test_split(train_data, train_target, train_size=0.7, random_state=42)

In [67]:
# from sklearn.model_selection import GridSearchCV
# grid_cv = GridSearchCV(rf,
#                        param_grid=params,
#                        cv = kfold,
#                        n_jobs=-1)

In [68]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

model = RandomForestRegressor(n_estimators = 400, n_jobs=-1)
x_train,x_val,y_train,y_val = train_test_split(train_data, train_target, train_size=0.7, random_state=42)

In [69]:
# grid_cv.fit(x_train,y_train)

In [70]:
# grid_cv.best_estimator_

In [71]:
from sklearn.metrics import r2_score
# model = grid_cv.best_estimator_
model.fit(x_train, y_train)
y_pred = model.predict(x_val)
# print(y_val)
# print(y_pred)
print(r2_score(y_val, y_pred))
from sklearn.metrics import mean_squared_error

RMSE = mean_squared_error(y_val, y_pred)**0.5
print(RMSE)

0.9979091642008775
2.969959346149889


In [72]:
# model = grid_cv.best_estimator_
# model.fit(train_data,train_target)
# y_pred = model.predict(test_data)
# _submission_raw.iloc[:,:] = y_pred
# _submission_raw.to_csv('data/submission.csv')