In [193]:
import pandas as pd
import numpy as np
from datetime import datetime

In [194]:
train = pd.read_csv("data/train.csv")

In [195]:
# 이상치 판단
def outlier(data, column):
    q25 = np.quantile(data[column].dropna(), 0.25)
    q75 = np.quantile(data[column].dropna(), 0.75)
    iqr = q75 - q25
    iqr_cut = iqr * 3
    result = data[(data[column] > q75 + iqr_cut) | (data[column] < q25 - iqr_cut)].index
    return result

# 보간
def time_interpolate(data, column):
    tem = data[["일시", column]].copy()
    tem.index = pd.to_datetime(tem['일시'])
    tem = tem.drop(["일시"], axis = 1)
    tem = tem.interpolate(method="time")
    return tem[column].values

# 강수량은 기상청에서 정한 강수표현에 따라 구간을 나누는 것으로 수치 변경
# 비가 내리지 않음 : 0, 매우 약한 비 : 0~1, 약한 비 : 1~3, 보통 비 : 3~15, 강한 비 : 15~30, 매우 강한 비 : 30 이상
train.강수량 = pd.cut(train.강수량, bins = [0, 0.9, 2.9, 14.9, 29.9, max(train.강수량)], labels = [1, 2, 3, 4, 5])
train.강수량 = train.강수량.astype('float')
train.강수량 = train.강수량.fillna(0)

train.최고기온 = time_interpolate(train, "최고기온")
train.최저기온 = time_interpolate(train, "최저기온")

train.일교차 = train.최고기온 - train.최저기온

train.평균풍속 = time_interpolate(train, "평균풍속")

train.일조합 = time_interpolate(train, "일조합")

train.loc[0:4749, "일사합"] = 0
train.loc[4780:4854, "일사합"] = 0
train.일사합 = time_interpolate(train, "일사합")


# 삭제
train = train.iloc[train.일조율.dropna().index]

# 가조합
train["가조합"] = train.일조합/(train.일조율/100)
train.가조합 = [np.nan if i == float("inf") else i for i in train.가조합]
train.가조합 = time_interpolate(train, "가조합")

# 일사합/일조합
train["일사_일조"] = train.일사합/train.일조합
train.일사_일조 = [np.nan if i == float("inf") else i for i in train.일사_일조] # 분모가 0인 경우 임의로 값을 설정할 수 없어 보간으로 처리
train.일사_일조 = time_interpolate(train, "일사_일조")

# sin + cos
train["sin_cos"] = [-np.sin(2 * np.pi * int(datetime.strptime(i,"%Y-%m-%d").strftime("%j"))/365) - np.cos(2 * np.pi * int(datetime.strptime(i,"%Y-%m-%d").strftime("%j"))/365) for i in train.일시]

train["월"] = train.일시.str.split("-", expand = True)[1].astype("int")

# 사계절: 0 겨울, 1 봄, 2 여름, 3 가을
train["계절"] = pd.cut(train.월, bins = [0, 2, 5, 8, 11, 12], labels = ["겨울", "봄", "여름", "가을", "겨울2"])
train.계절 = ["겨울" if i == "겨울2" else i for i in train.계절]
train = pd.concat([train, pd.get_dummies(train.계절).astype("int")], axis=1)
train = train.drop(["계절"], axis = 1)

train = train.drop(["일시", "월"], axis = 1)

In [196]:
# 결측치 없음
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22645 entries, 366 to 23010
Data columns (total 17 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   최고기온     22645 non-null  float64
 1   최저기온     22645 non-null  float64
 2   일교차      22645 non-null  float64
 3   강수량      22645 non-null  float64
 4   평균습도     22645 non-null  float64
 5   평균풍속     22645 non-null  float64
 6   일조합      22645 non-null  float64
 7   일사합      22645 non-null  float64
 8   일조율      22645 non-null  float64
 9   평균기온     22645 non-null  float64
 10  가조합      22645 non-null  float64
 11  일사_일조    22645 non-null  float64
 12  sin_cos  22645 non-null  float64
 13  가을       22645 non-null  int32  
 14  겨울       22645 non-null  int32  
 15  봄        22645 non-null  int32  
 16  여름       22645 non-null  int32  
dtypes: float64(13), int32(4)
memory usage: 2.8 MB


In [197]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
import time

In [198]:
class Model():
    def __init__(self, data, target, input_size, output_size, test_size):
        self.data = data
        self.target = target        
        
        self.input_size = input_size
        self.output_size = output_size
        self.test_size = test_size
        
        self.col_len = len(self.data.columns)
        

In [199]:
def Scale(self, data):
    scaler_in = MinMaxScaler()
    scaler_out = MinMaxScaler()
    
    inputs = data.drop(self.target, axis = 1)
    inputs_col = inputs.columns
    outputs = data[self.target]
    
    scaler_in.fit(inputs)
    inputs = pd.DataFrame(scaler_in.transform(inputs), columns = inputs_col)
        
    scaler_out.fit(outputs)
    outputs = pd.DataFrame(scaler_out.fit_transform(outputs), columns = self.target)

    data = pd.concat([inputs, outputs], axis = 1)
    
    return data,  scaler_out
Model.Scale = Scale

In [200]:
def Split(self, data):
    data, _ = self.Scale(data)
    data = tf.keras.utils.timeseries_dataset_from_array(data = data,
                                                             targets = None,
                                                             sequence_length = self.input_size + self.output_size)
    inputs = np.concatenate([x[:, slice(0, self.input_size), :] for x in data], axis=0)
    outputs = np.concatenate([x[:, slice(self.input_size, self.input_size + self.output_size), :] for x in data], axis=0)

    outputs = outputs[:,:,self.col_len-1]
    outputs = outputs.reshape(-1, self.output_size, 1)
    
    train_in = inputs[:int(len(inputs)*0.8), :, :]
    train_out = outputs[:int(len(outputs)*0.8), :, :]
    
    test_in = inputs[int(len(inputs)*0.8):, :, :]
    test_out = outputs[int(len(outputs)*0.8):, :, :]
    
    return train_in, train_out, test_in, test_out
Model.Split = Split

In [201]:
@property
def Data(self):
    return self.Split(self.data)
Model.Data = Data

In [202]:
output_size = 358
space = {
    'input_size' : hp.choice("input_size", [output_size, output_size*2, output_size*3]),
    'lstm1_nodes' : hp.choice("lstm1_nodes", [32, 64, 128, 256]),
    #'lstm1_dropout' : hp.choice("lstm1_dropout", [0, 0.3, 0.5]),
    'lstm2_nodes' : hp.choice('lstm2_nodes', [32, 64, 128, 256]),
    #'lstm2_dropout' : hp.choice("lstm2_dropout", [0, 0.3, 0.5]),
    'num_layers' : hp.choice('num_layers',[
       # { 
       #     'layers' : 'two',
       # },
        {
            'layers' : 'three',
            'lstm3_nodes' : hp.choice('lstm3_nodes', [32, 64, 128, 256]),
     #       'lstm3_dropout' : hp.choice("lstm3_dropout", [0, 0.3, 0.5])
        }
    ]),
    'lr' : hp.choice('lr', [0, 0.001, 0.002, 0.003])
}

In [203]:
Parameter_loss = pd.DataFrame([], columns = ["Parameters", "Loss"])
def hyperopt_model(params):
    global Parameter_loss
    print("--------------------------------------------------------------------------------------------------------------")
    print("---------------------------------------[ START {}]-------------------------------------------------------------".format(len(Parameter_loss)))
    print("Parameter : {}".format(params))
    input_size = params['input_size']
    
    output_size = 358
    model = Model(data = train,
         target = ["평균기온"],
         input_size = params['input_size'],
         output_size = output_size,
         test_size = 0.3)
    
    train_in, train_out, test_in, test_out = model.Data
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                      mode = 'min',
                                                      patience = 3,
                                                      min_delta = 0.005)
    tf.random.set_seed = 1234
    initializer = tf.keras.initializers.GlorotUniform(seed=1234)
    
    lstm_model = tf.keras.Sequential()
    lstm_model.add(tf.keras.layers.GRU(params["lstm1_nodes"], 
                                       # dropout = params["lstm1_dropout"],
                                        return_sequences = True, 
                                        kernel_initializer=initializer))
    if params["num_layers"]["layers"] == "two":
        lstm_model.add(tf.keras.layers.GRU(params["lstm2_nodes"], 
                                          #  dropout = params["lstm2_dropout"],
                                            return_sequences = False, 
                                        kernel_initializer=initializer))
    else:
        lstm_model.add(tf.keras.layers.GRU(params["lstm2_nodes"], 
                                        #    dropout = params["lstm2_dropout"],
                                            return_sequences = True, 
                                        kernel_initializer=initializer))
        lstm_model.add(tf.keras.layers.GRU(params["num_layers"]["lstm3_nodes"], 
                                         #   dropout = params["num_layers"]["lstm3_dropout"],
                                            return_sequences = False, 
                                        kernel_initializer=initializer))
    lstm_model.add(tf.keras.layers.Dense(output_size, 
                                        kernel_initializer=initializer))
   
    lstm_model.compile(loss = tf.keras.losses.MeanAbsoluteError(),
                      optimizer = tf.keras.optimizers.Adam(learning_rate = params["lr"]),
                      metrics = [tf.keras.metrics.mean_absolute_error])
    
    history = lstm_model.fit(train_in, train_out,
                             epochs = 100,
                             validation_data = [test_in, test_out],
                             callbacks = [early_stopping],
                            verbose = 2)
    val_error = np.amin(history.history["val_loss"])
    Parameter_loss.loc[len(Parameter_loss)] = [params,val_error]
    
    print("val_error : {}".format(val_error))
    print("--------------------------------------------------------------------------------------------------------------")
    
    return {"loss" : val_error, "model":lstm_model, 'status': STATUS_OK}

In [186]:
start = time.time()
trials = Trials()
best = fmin(hyperopt_model,
            space = space,
            algo = tpe.suggest,
            max_evals = 10,
            trials = trials)
end = time.time()

--------------------------------------------------------------------------------------------------------------
---------------------------------------[ START 0]-------------------------------------------------------------
Parameter : {'input_size': 358, 'lr': 0.003, 'lstm1_nodes': 256, 'lstm2_nodes': 64, 'num_layers': {'layers': 'three', 'lstm3_nodes': 32}}
Epoch 1/100                                           

  0%|          | 0/10 [00:46<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [85]:
best

{'input_size': 0, 'lr': 0, 'lstm1_nodes': 3, 'lstm2_nodes': 2, 'num_layers': 0}

In [54]:
(end- start)/60

8.906722295284272

In [208]:
train

Unnamed: 0,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온,가조합,일사_일조,sin_cos,가을,겨울,봄,여름
366,-4.9,-15.0,10.1,0.0,73.8,0.8,2.7,0.00,28.1,-9.8,9.608541,0.000000,-1.017065,0,1,0,0
367,0.2,-12.3,12.5,0.0,72.3,1.4,4.9,0.00,51.0,-6.2,9.607843,0.000000,-1.033829,0,1,0,0
368,0.3,-4.7,5.0,3.0,77.0,2.6,0.0,0.00,0.0,-1.1,9.653806,0.000000,-1.050286,0,1,0,0
369,-4.7,-13.7,9.0,0.0,46.0,2.9,8.4,0.00,86.6,-9.5,9.699769,0.000000,-1.066433,0,1,0,0
370,-6.2,-16.5,10.3,0.0,54.3,0.9,6.2,0.00,63.9,-10.8,9.702660,0.000000,-1.082263,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23006,3.3,-7.3,10.6,0.0,69.8,1.8,8.8,10.25,91.7,-2.6,9.596510,1.164773,-0.928828,0,1,0,0
23007,0.1,-6.0,6.1,1.0,58.1,2.5,8.7,10.86,90.6,-3.3,9.602649,1.248276,-0.947047,0,1,0,0
23008,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.88,93.8,-2.9,9.594883,1.208889,-0.964986,0,1,0,0
23009,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.84,82.3,-1.8,9.599028,1.372152,-0.982638,0,1,0,0


In [209]:
output_size = 358
model = Model(data = train,
     target = ["평균기온"],
     input_size = output_size * 2,
     output_size = output_size,
     test_size = 0.3)
train_in, train_out, test_in, test_out = model.Data

In [210]:
def LSTM_fit(data):
    train_in, train_out, test_in, test_out = data
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                      mode = 'min',
                                                      patience = 5,
                                                      min_delta = 0)
   
    lstm_model = tf.keras.Sequential()
    lstm_model.add(tf.keras.layers.LSTM(128, return_sequences = True))
    lstm_model.add(tf.keras.layers.LSTM(128, return_sequences = True))
    lstm_model.add(tf.keras.layers.LSTM(128, return_sequences = False))
    lstm_model.add(tf.keras.layers.Dense(output_size))
   
    lstm_model.compile(loss = tf.keras.losses.MeanAbsoluteError(),
                      optimizer = tf.keras.optimizers.Adam(),
                      metrics = [tf.keras.metrics.mean_absolute_error])
    
    history = lstm_model.fit(train_in, train_out,
                             epochs = 100,
                             validation_data = [test_in, test_out],
                             callbacks = [early_stopping])
    
    return lstm_model, history

In [None]:
lstm_model, history = LSTM_fit(model.Data)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

In [207]:
_, _, test_in, _ = model.Split(train)
_, scaler_out = model.Scale(train)
pred = lstm_model.predict(test_in[-1].reshape(1, (output_size*2), 13))
pred = np.round(scaler.inverse_transform(pred))

ValueError: cannot reshape array of size 12410 into shape (1,730,13)

In [None]:
pred

In [182]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(train.iloc[-output_size:].평균기온.values, pred.reshape(-1))

2.693016759776536

In [None]:
# LSTM

# 1. 128, 128, 128 unit, out_put size = 358, input_size = 358 * 2  => 2.69

In [30]:
def GRU_fit(data):
    train_in, train_out, test_in, test_out = data
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                      mode = 'min',
                                                      patience = 5,
                                                      min_delta = 0)
   
    lstm_model = tf.keras.Sequential()
    lstm_model.add(tf.keras.layers.LSTM(128, return_sequences = True))
    lstm_model.add(tf.keras.layers.LSTM(128, return_sequences = True))
    lstm_model.add(tf.keras.layers.LSTM(128, return_sequences = False))
    lstm_model.add(tf.keras.layers.Dense(output_size))
   
    lstm_model.compile(loss = tf.keras.losses.MeanAbsoluteError(),
                      optimizer = tf.keras.optimizers.Adam(),
                      metrics = [tf.keras.metrics.mean_absolute_error])
    
    history = lstm_model.fit(train_in, train_out,
                             epochs = 100,
                             validation_data = [test_in, test_out],
                             callbacks = [early_stopping])
    
    return lstm_model, history

In [31]:
GRU_model, history = GRU_fit(model.Data)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [184]:
_, _, test_in, _ = model.Split(train)
_, scaler_out = model.Scale(train)
pred = GRU_model.predict(test_in[-1].reshape(1, (output_size*2), 13))
pred = np.round(scaler.inverse_transform(pred))
mean_absolute_error(train.iloc[-output_size:].평균기온.values, pred.reshape(-1))



2.787988826815643

In [None]:
# GRU

# 1. 128, 128, 128 unit, out_put size = 358, input_size = 358 * 2  => 2.78

In [None]:
sub = pd.read_csv("data/sample_submission.csv")
scale_data, scaler_in, scaler_out = model.Scale(train)
pred = GRU_model.predict(scale_data[-(output_size*2):].values.reshape(1, (output_size*2), 13))
pred = np.round(scaler.inverse_transform(pred))
sub.평균기온 = pred[0]
sub.to_csv("data/GRU_128_128_128.csv", index = False)