<a href="https://colab.research.google.com/github/minicks/BigI/blob/master/temp/temp3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. 라이브러리 및 데이터
Library & Data

In [0]:
# Mount Google Drive at /content/drive (Colab only); later cells change into a
# folder below this mount point and read the CSV files from there.
import os, sys 
from google.colab import drive 
drive.mount('/content/drive')

In [0]:
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import os

In [0]:
# Fix the random seeds for reproducibility.
# Seeding Python's `random` and NumPy does not affect TensorFlow, which uses
# its own generator (e.g. for the initial model weights), so seed it as well.
np.random.seed(7)
random.seed(7)
tf.random.set_seed(7)  # tf 2.x API


In [0]:
# Change the working directory to the project folder on the mounted Drive
os.chdir('/content/drive/My Drive/AI프렌즈/')

In [0]:
# Load the train and test tables. The 'id' column becomes the DataFrame index,
# so it is not available as a regular column afterwards (use `.index`).
train = pd.read_csv('data/train.csv', index_col = 'id')
test = pd.read_csv('data/test.csv', index_col = 'id')

#2. 데이터 전처리
Data Cleansing & Pre-Processing


In [0]:
# Keep only the columns 'X00' through 'X39' (label slice, both ends included);
# the original note describes these as the weather-agency data.
X_train = train.loc[:, 'X00':'X39']

# Per-column mean and standard deviation of the training features.
# They are reused later to scale the test set.
MEAN, STD = X_train.mean(), X_train.std()

# Standardize each column; 1e-07 is added to the standard deviation so that a
# column with zero deviation does not cause a division by zero.
X_train = X_train.sub(MEAN).div(STD.add(1e-07))

In [0]:
# Convert a DataFrame into the windowed time-series form an RNN model can take
def convert_to_timeseries(df, interval):
    """Build sliding-window sequences and their next-row targets.

    Every column of ``df`` except the last is used as a feature; the last
    column is used as the target.  Window ``i`` holds feature rows
    ``i .. i + interval - 1`` and is paired with the target value stored in
    row ``i + interval``.

    Args:
        df: DataFrame with at least one column; the last column is the target.
        interval: Number of consecutive rows in each window.

    Returns:
        A ``(sequence, target)`` tuple of NumPy arrays.  ``sequence`` has shape
        ``(n_windows, interval, n_features)`` and ``target`` has shape
        ``(n_windows,)`` with ``n_windows = len(df) - interval``.  When ``df``
        has no more than ``interval`` rows, both arrays are empty but keep
        these dimensions so they can still be stacked with other results.
    """
    # Leave pandas once, up front: positional `iloc` lookups inside the loop
    # are much slower than slicing plain NumPy arrays.
    features = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values

    sequence_list = []
    target_list = []

    for i in tqdm(range(df.shape[0] - interval)):
        sequence_list.append(features[i:i + interval])
        target_list.append(labels[i + interval])

    if not sequence_list:
        # np.array([]) would have shape (0,), which cannot be vstack-ed with
        # 3-D sequence arrays; return correctly shaped empties instead.
        empty_sequence = np.empty((0, interval, features.shape[1]), dtype=features.dtype)
        empty_target = np.empty((0,), dtype=labels.dtype)
        return empty_sequence, empty_target

    sequence = np.array(sequence_list)
    target = np.array(target_list)

    return sequence, target

In [14]:
y_columns = ['Y15','Y16']
# Build the pre-training set: for each target column, pair every window of 12
# consecutive rows (120 minutes, per the original note) with the target value
# of the row that follows the window.  Only the first 144*30 rows are used.
sequence_parts = [np.empty((0, 12, 40))]
target_parts = [np.empty((0,))]
for column in y_columns:
    # Features first, target as the last column, as convert_to_timeseries expects
    labelled = pd.concat([X_train, train[column]], axis=1).head(144*30)
    windows, labels = convert_to_timeseries(labelled, interval=12)
    sequence_parts.append(windows)
    target_parts.append(labels)

# Stack the windows of all target columns along the first axis
sequence = np.vstack(sequence_parts)
target = np.hstack(target_parts)

100%|██████████| 4308/4308 [00:01<00:00, 3578.69it/s]
100%|██████████| 4308/4308 [00:01<00:00, 3759.21it/s]


In [0]:
# Add a dummy last column: convert_to_timeseries treats the last column as the
# target and excludes it from the windows, so without the dummy the last real
# feature would be left out.
X_train['dummy'] = 0

In [0]:
# Standardize the test set with the mean and standard deviation derived from the train set
test = (test - MEAN) / (STD + 1e-07)

In [0]:
# Add the same dummy last column to the test set for convert_to_timeseries
test['dummy'] = 0

In [18]:
# Concatenate the train and test periods and rebuild the 12-row windows, so the
# first test rows get their history from the end of the train period.
# The returned targets come from the dummy column and are discarded.
X_test, _ = convert_to_timeseries(pd.concat([X_train, test], axis = 0), interval=12)

100%|██████████| 16260/16260 [00:05<00:00, 2788.75it/s]


In [0]:
# Keep only the trailing windows, one per test row (the original note calls
# this the latter 80 days, i.e. the test period).  The count is taken from
# `test` itself instead of the hard-coded 11520, so the slice stays correct if
# the test set size changes; max(..., 0) keeps every window when there are
# fewer windows than test rows.
X_test = X_test[max(X_test.shape[0] - len(test), 0):, :, :]

In [0]:
# Remove, in place, the dummy columns that were added for convert_to_timeseries
X_train.drop('dummy', axis = 1, inplace = True)
test.drop('dummy', axis = 1, inplace = True)

#4. 변수 선택 및 모델 구축 
Feature Engineering & Initial Modeling

In [0]:
# Metric define
def mse_keras(y_true, y_pred):
  """Keras metric: mean squared error that ignores absolute errors below 1.

  The computation is written with NumPy and wrapped in ``tf.py_function`` so
  that it can be passed to ``model.compile(metrics=[...])``.

  Args:
    y_true: Tensor of ground-truth values.
    y_pred: Tensor of predictions.

  Returns:
    A float32 tensor holding the score.
  """
  def mse_AIFrenz(y_true, y_pred):
    # Work on plain NumPy arrays from here on.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    diff = np.abs(y_true - y_pred)
    # Absolute errors smaller than 1 count as 0; larger ones are kept as-is.
    thresholded = np.where(diff < 1, 0, diff)
    # Average over rows first, then over the per-column results, so that
    # multi-column inputs are supported as well.
    try:
        score = np.average(np.average(thresholded ** 2, axis = 0))
    except ValueError:
        # Input without an axis 0 (0-d): fall back to a plain MSE.  The
        # original called `mean_squared_error` here, which is not imported
        # anywhere in the visible notebook and would raise NameError.
        score = np.mean((y_true - y_pred) ** 2)
    return score

  score = tf.py_function(func=mse_AIFrenz, inp=[y_true, y_pred], Tout=tf.float32,  name='custom_mse') # tf 2.x
  #score = tf.py_func( lambda y_true, y_pred : mse_AIFrenz(y_true, y_pred) , [y_true, y_pred], 'float32', stateful = False, name = 'custom_mse' ) # tf 1.x
  return score

# Model build
def build_model(lr):
  """Create and compile the LSTM regression model.

  The input shape is read from the last two dimensions of the global
  ``sequence`` array (rows per window, features per row).

  Args:
    lr: Learning rate handed to the Adam optimizer.

  Returns:
    The compiled ``tf.keras`` Sequential model (loss 'mse', metric mse_keras).
  """
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.LSTM(128, input_shape=sequence.shape[-2:]))
  # Two Dense layers with linear activation (no non-linearity), then one output unit
  for units in (128, 64):
    model.add(tf.keras.layers.Dense(units, activation='linear'))
  model.add(tf.keras.layers.Dense(1))

  # Alternative optimizer left disabled in the original:
  # tf.keras.optimizers.RMSprop(lr)
  optimizer = tf.keras.optimizers.Adam(lr)
  model.compile(optimizer=optimizer, loss='mse', metrics=[mse_keras])

  return model

# Build the model with a learning rate of 0.001 and print its layer summary
LEARNINGRATE = 0.001
simple_lstm_model = build_model(LEARNINGRATE)
simple_lstm_model.summary()

#5. 모델 학습 및 검증
Model Tuning & Evaluation

In [0]:
# Stop training once the epoch loss drops below a threshold (4 by default)
class myCallback(tf.keras.callbacks.Callback):
    """Keras callback that ends training when the loss is low enough.

    Args:
        threshold: Training stops after the first epoch whose reported
            ``loss`` is strictly below this value.  Defaults to 4.
    """

    def __init__(self, threshold=4):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs = None):
        """Check the epoch's loss and request a stop when it is below the threshold."""
        # `logs` defaults to None and may lack a 'loss' entry; in both cases
        # there is nothing to compare, so training simply continues.
        loss = (logs or {}).get('loss')
        if loss is not None and loss < self.threshold:
            print('\n Loss is under {}, cancelling training'.format(self.threshold))
            self.model.stop_training = True

In [0]:
# Instantiate the early-stop callback (class myCallback)
callbacks = myCallback()

In [0]:
# Train the model on the windows built from the y_columns targets.
# shuffle=False keeps the samples in their original order within each epoch.
simple_lstm_model.fit(    
    sequence, target,
    epochs=60,
    batch_size=128,
    verbose=2,
    shuffle=False,
    callbacks = [callbacks]
)

In [0]:
# Freeze the LSTM layer so that fine-tuning only updates the Dense layers
simple_lstm_model.layers[0].trainable = False
# A compiled Keras model keeps the `trainable` values it had when compile() was
# called, so the model must be compiled again for the freeze to take effect.
# Same loss and metric as in build_model(), with a fresh Adam optimizer at the
# same learning rate.
simple_lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(LEARNINGRATE),
    loss='mse',
    metrics=[mse_keras],
)

In [26]:
# Build the training data used for fine-tuning (target Y18): the last 432 rows
# of the features with Y18 as the last column, cut into 12-row windows
finetune_X, finetune_y = convert_to_timeseries(pd.concat([X_train.tail(432), train['Y18'].tail(432)], axis = 1), interval=12)


100%|██████████| 420/420 [00:00<00:00, 2348.73it/s]


In [0]:
# Fine-tune on the Y18 windows. The intent is that the LSTM layer stays frozen
# and only the Dense layers are updated (transfer learning).
finetune_history = simple_lstm_model.fit(
            finetune_X, finetune_y,
            epochs=20,
            batch_size=64,
            shuffle=False,
            verbose = 2)

In [0]:
# Predict Y18 for every test window
finetune_pred = simple_lstm_model.predict(X_test)

In [0]:
# Build the submission table: ids 144*33 .. 144*113 - 1 paired with the
# predictions flattened to one dimension
submit = pd.DataFrame({
    'id': range(144*33, 144*113),
    'Y18': finetune_pred.ravel(),
})

In [0]:
# Write the submission to the current working directory, without the row index
submit.to_csv('baseline_result2.csv', index = False)

In [0]:
# Display the test windows for inspection
X_test

In [0]:
# Flatten the predictions to one dimension and report how many there are
a = finetune_pred.flatten()
print(len(a))

plt.figure(figsize=(16,8))
plt.plot(a)
# NOTE(review): the original comment labelled these columns as precipitation,
# while the other plots of the same columns in this notebook label them
# '기온' (temperature) - confirm which is right.
sensor_columns = ['X00','X07','X28','X31','X32']
plt.plot(test[sensor_columns].values)
#test.loc[:,['X00','X07','X28','X31','X32']].plot(figsize=(16,8),title='기온')
# Dashed vertical guide every 144 samples, from x=144 up to (not including) 144*80
for x_ in range(144, 144*80, 144):
  plt.axvline(x=x_, color='b', linestyle='--', linewidth=1)
plt.show()

In [0]:
# Plot the five selected test columns on one figure (title: '기온', i.e. temperature)
test.loc[:,['X00','X07','X28','X31','X32']].plot(figsize=(16,8),title='기온')

In [0]:
# Display the (standardized) test frame for inspection
test

#TEST

In [0]:
# Y18 values for the rows where Y18 is present, renumbered from 0
y18 = train.loc[train['Y18'].notnull(), 'Y18'].reset_index(drop=True)
# The other target columns: 'Y00' .. 'Y17'
no_y18_target = ['Y{:02d}'.format(i) for i in range(18)]
# Rows without Y18 whose id is above 3887, renumbered from 0.
# `id` is the index of `train` (read_csv(..., index_col='id')), not a column,
# so the filter uses train.index; train['id'] would raise KeyError.
no_y18 = train.loc[train['Y18'].isnull() & (train.index > 3887), no_y18_target].reset_index(drop=True)
# Put both side by side, aligned on the renumbered row positions
check_target = pd.concat([no_y18, y18], axis=1)

In [0]:
# Pairwise correlations between the Y00..Y17 columns and Y18
correlations =  check_target.corr()
plt.figure(figsize = (14, 12))

# Heatmap of correlations, annotated with the values; colour scale limited to 0.2 .. 0.9
sns.heatmap(correlations, cmap = plt.cm.RdYlBu_r,  vmin = 0.2, annot = True, vmax = 0.9)
plt.title('Correlation Heatmap');

In [0]:
# Summary statistics of `trainY`.
# NOTE(review): `trainY` is not defined anywhere in the visible notebook, so
# this cell raises NameError unless it is created elsewhere - confirm.
trainY.describe(include='all')

In [0]:
# Print column dtypes and non-null counts of the training frame
train.info()

In [0]:
# Plot Y18 together with five selected X columns of the training frame
plt.figure(figsize=(16,8))
plt.plot(train[['Y18']].values)
# The original note labels these columns as temperature
sensor_columns = ['X00','X07','X28','X31','X32']
plt.plot(train[sensor_columns].values)
#test.loc[:,['X00','X07','X28','X31','X32']].plot(figsize=(16,8),title='기온')
# Dashed vertical guide every 144 samples, from x=144 up to (not including) 144*80
for x_ in range(144, 144*80, 144):
  plt.axvline(x=x_, color='b', linestyle='--', linewidth=1)
plt.show()