<a href="https://colab.research.google.com/github/minicks/BigI/blob/master/temp/temperature_estimation7_2ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#!pip install tensorflow==2.1.0

In [0]:
#!pip install tensorflow-gpu 

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import tensorflow as tf
import os
import pickle
import warnings 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from matplotlib import style
from sklearn.feature_selection import mutual_info_regression
from pandas import DataFrame as df

warnings.filterwarnings('ignore')
%matplotlib inline


In [0]:
print(tf.__version__)
print(tf.test.is_gpu_available())

In [0]:
np.random.seed(7)
random.seed(7)
tf.random.set_seed(7)

In [0]:
os.chdir('/content/drive/My Drive/Dacon/온도 추정')

# 데이터 로드 및 데이터 확인




> 데이터 설명 

모든 데이터는 시간순으로 정렬되어있으며, 10분 단위로 측정됨

*   train 데이터 


1.   X00 ~ X39 : 5개 지역 관측소에서 측정한 8가지 속성 칼럼 (30일 + 3일 전 기간 제공)
2.   Y00 ~ Y17 : 센서로 측정한 온도 (처음 30일만 제공, 마지막 3일은 결측)
3.   Y18 : 예측 대상 센서 (처음 30일은 결측, 마지막 3일만 제공)

*   test 데이터

1.  X00 ~ X39 : 80일간 관측한 8가지 속성에대한 5개 지역의 관측소 데이터



## 기본적인 데이터 정보 확인


In [0]:
train = pd.read_csv('/content/drive/My Drive/Dacon/온도 추정/train.csv', index_col = 'id')
test = pd.read_csv('/content/drive/My Drive/Dacon/온도 추정/test.csv',index_col = 'id')

In [0]:
all_data = pd.concat([train.loc[:,'X00':'X39'],test], sort = False)

In [0]:
train.columns

In [0]:
test.columns

In [0]:
train.head()

In [0]:
train.shape

In [0]:
train.info()

In [0]:
test.head()

In [0]:
test.shape

In [0]:
test.info()

## 주요 속성 살펴보기(X00~X39)

In [0]:
# Feature groups: every attribute is reported by five stations, one column each.
temperature_name = ["X00", "X07", "X28", "X31", "X32"]  # air temperature
localpress_name = ["X01", "X06", "X22", "X27", "X29"]   # local (station) pressure
speed_name = ["X02", "X03", "X18", "X24", "X26"]        # wind speed
water_name = ["X04", "X10", "X21", "X36", "X39"]        # daily cumulative precipitation
press_name = ["X05", "X08", "X09", "X23", "X33"]        # sea-level pressure
sun_name = ["X11", "X14", "X16", "X19", "X34"]          # daily cumulative insolation ("sun")
humidity_name = ["X12", "X20", "X30", "X37", "X38"]     # humidity
direction_name = ["X13", "X15", "X17", "X25", "X35"]    # wind direction

# Both pressure groups in one list: local pressure first, then sea-level pressure.
press_sum_name = localpress_name + press_name

# All eight groups, in the order they are iterated over for summaries and plots.
x_columns = [
    temperature_name,
    localpress_name,
    speed_name,
    water_name,
    press_name,
    sun_name,
    humidity_name,
    direction_name,
]

In [0]:
def show_plot(col_group):
    """Draw a line plot and then a box plot for the given columns of ``all_data``.

    col_group: list of column names to select from the module-level
    ``all_data`` frame. Each chart is shown as its own figure.
    """
    subset = all_data[col_group]

    # Line plot of every selected column against the frame's index.
    plt.plot(subset)
    plt.show()

    # Box plot per column; melt() reshapes to long form (variable, value).
    long_form = pd.melt(subset)
    sns.boxplot(x='variable', y='value', data=long_form)
    plt.show()

In [0]:
def kde_plot(col_group):
    """Overlay one kernel-density curve per column of ``col_group`` in a single figure.

    col_group: iterable of column names looked up in the module-level
    ``all_data`` frame. The figure is shown once, after all curves are drawn.
    """
    for col in col_group:
        # NOTE(review): `kernel='epa'` requests an Epanechnikov kernel; newer seaborn
        # releases reportedly dropped non-Gaussian kernels - confirm against the
        # installed seaborn version.
        sns.kdeplot(all_data[col], kernel='epa')
    plt.show()

### X00~X39 Plot

In [0]:
#컬럼에 대한 기초통계량 확인 (행별 계산)
for i in x_columns:
  print(i)
  print(train[i].describe())

#요소를 인덱스별로 합쳐서 하나의 컬럼으로 생성!  (열별 계산)
#pd.Series(train[temperature_name].mean(axis = 1))

In [0]:
# 기온
show_plot(temperature_name)
kde_plot(temperature_name)

In [0]:
# 현지 기압
show_plot(localpress_name )
kde_plot(localpress_name )

In [0]:
# 풍속
show_plot(speed_name)
kde_plot(speed_name)

In [0]:
# 강수량
show_plot(water_name)
kde_plot(water_name)

In [0]:
# 해면기압
show_plot(press_name)
kde_plot(press_name)

In [0]:
# 누적일사량
show_plot(sun_name)
kde_plot(sun_name)

In [0]:
# 습도
show_plot(humidity_name)
kde_plot(humidity_name)

In [0]:
# 풍향
show_plot(direction_name)
kde_plot(direction_name)

## 주요 속성 살펴보기(Y00~Y18)


In [0]:
# Line plot of the eighteen reference sensors Y00 ... Y17
col_y17 = ['Y{:02d}'.format(sensor_no) for sensor_no in range(18)]

plt.figure(figsize=(9, 6))
plt.plot(train[col_y17])
plt.show()

In [0]:
col_y18 = col_y17 + ['Y18']
plt.figure(figsize=(9, 6))
train[col_y18].boxplot()
plt.show()

In [0]:
# Y18
plt.figure(figsize=(9, 6))
sns.distplot(train['Y18'])
plt.show()

In [0]:
plt.figure(figsize=(9, 3))
sns.boxplot(train['Y18'])
plt.show()

## correlation 파악

### 전체 히트맵 그려보기


1.   X00~39와 Y00~17
2.   X00~39와 Y18
3.   각특성간 관계



In [0]:
train_for_heatmap = train.iloc[:,:-1]
train_for_heatmap = train_for_heatmap.dropna()
train_for_heatmap

In [0]:
plt.rcParams['figure.figsize'] = [60,30]
correlations = train_for_heatmap.corr()

sns.heatmap(correlations, cmap = plt.cm.RdYlBu_r, vmin = 0.2, annot= True, vmax = 0.9)
plt.title('Correlation Heatmap')

In [0]:
train_for_heatmap2 = train.iloc[4320:,0:40]
train_for_heatmap3 = train.iloc[:,-1]
train_for_heatmap3 = train_for_heatmap3.dropna()
train_for_heatmap_Y18 = pd.concat([train_for_heatmap2, train_for_heatmap3],axis = 1)

In [0]:
plt.rcParams['figure.figsize'] = [60,30]
correlations2 = train_for_heatmap_Y18.corr()

sns.heatmap(correlations2, cmap = plt.cm.RdYlBu_r, vmin = 0.2, annot= True, vmax = 0.9)
plt.title('Correlation Heatmap Y18')

In [0]:
# Y18과 관계없는 속성들
train.loc[:,['X04','X16','X19','X36']].T

In [0]:
# Humidity vs. precipitation: correlate the five humidity columns with the five
# daily-precipitation ("water") columns. Both halves previously selected
# humidity_name, so the heatmap titled "water" contained no precipitation data.
train_for_heatmap4 = train.loc[:,humidity_name]
train_for_heatmap5 = train.loc[:,water_name]
train_for_heatmap_water = pd.concat([train_for_heatmap4, train_for_heatmap5],axis = 1)

plt.rcParams['figure.figsize'] = [30,15]
correlations_water = train_for_heatmap_water.corr()

sns.heatmap(correlations_water, cmap = plt.cm.RdYlBu_r, vmin = 0.2, annot= True, vmax = 0.9)
plt.title('Correlation Heatmap water')

### Y00~Y18 사이의 관계 파악하기


In [0]:
target = ['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08','Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17', 'Y18']
train[target].head()

In [0]:
temp = train[train['Y18'].isnull()]
temp

In [0]:
temp = train[~train['Y18'].isnull()]
temp

In [0]:
y18 = train[~train['Y18'].isnull()]['Y18'].reset_index(drop = True)
y18

In [0]:
no_y18_target = ['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08','Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17']
no_y18 = train[(train['Y18'].isnull()) & (train.index > 3887)][no_y18_target].reset_index(drop=True)
no_y18

In [0]:
check_target = pd.concat([no_y18,y18],axis = 1)
check_target

In [0]:
correlations3 = check_target.corr()
plt.rcParams['figure.figsize'] = [14,12]

sns.heatmap(correlations3, cmap = plt.cm.RdYlBu_r, vmin = 0.2, annot = True, vmax = 0.9)
plt.title('Correlation Heatmap Y00~17~18')

In [0]:
check_target.reset_index(inplace = True)
check_target

In [0]:
# Input : the frame holding the reference column, the reference column name,
#         and the correlation threshold.
# Output: names of the columns whose correlation with the reference column
#         is strictly above the threshold.

def high_corr(df, col, ratio):
    """Return the columns of ``df`` that correlate with ``col`` above ``ratio``.

    df:    DataFrame whose pairwise correlations are computed with ``df.corr()``.
    col:   name of the reference column (must be a column of ``df``).
    ratio: threshold; a column is kept when its correlation with ``col`` is > ratio.

    Returns a pandas Index of column names, never including ``col`` itself.
    """
    corr_with_col = df.corr()[col]
    selected = corr_with_col.index[(corr_with_col > ratio).to_numpy()]
    # `col` is only among the selected names when its self-correlation passes the
    # threshold; errors='ignore' avoids a KeyError when it does not (ratio >= 1).
    return selected.drop(col, errors='ignore')

# Input: the Y00-Y17 sensor columns. Kept under its own name so the
# `DataFrame as df` alias imported at the top of the notebook is not rebound.
y_sensor_frame = train.loc[:,"Y00":"Y17"]
# Output: the Y columns whose correlation with Y17 is above 0.8
Y_high = high_corr(y_sensor_frame, "Y17", 0.8)
print("Y17와 상관계수가 높은 Y컬럼들 ", Y_high.tolist())

# Variation
#Y_high = high_corr(y_sensor_frame, "Y18", 0.8)

In [0]:
fig, ax = plt.subplots(figsize = (10,7))
sns.lineplot(data=check_target, x='index', y='Y18', color='green', ax=ax)
sns.lineplot(data=check_target, x='index', y='Y03', color='purple', ax=ax)
sns.lineplot(data=check_target, x='index', y='Y04', color='red', ax=ax)
plt.legend(['Y18', 'Y03', 'Y04'])
plt.show()

In [0]:
fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=check_target, x='index', y='Y18', color='green', ax=ax)
sns.lineplot(data=check_target, x='index', y='Y16', color='purple', ax=ax)
sns.lineplot(data=check_target, x='index', y='Y15', color='red', ax=ax)
plt.legend(['Y18', 'Y16', 'Y15'])
plt.show()

In [0]:
# 결론
#Y00~Y18을 종합하여 하나의 target attribute를 만들어야 한다.
#Y18이 우리가 실제로 예측해야 할 장소이다.
#Y03과 Y04는 Y18과 다른 양상을 띄고 있다.
#해당 장소들은 아예 삭제를 하고 분석을 진행하는 등의 어떠한 조치가 필요할 것으로 생각된다.

# Transfer Learning 기법 베이스 라인


## 1. 데이터 전처리

### 1-1 id 시간 변수 만들기


In [0]:
minute = (train.index%144).astype(int)
hour= pd.Series((train.index%144/6).astype(int))

In [0]:
min_in_day = 24*6    # ten-minute slots per day (144)
hour_in_day = 24

# Cyclical encoding over one FULL period (2*pi per day). With only pi the
# sin/cos pair covers half a circle, so the last slot of a day and the first
# slot of the next land on opposite ends instead of next to each other.
minute_sin = np.sin(2*np.pi*minute/min_in_day)
minute_cos = np.cos(2*np.pi*minute/min_in_day)

hour_sin  = np.sin(2*np.pi*hour/hour_in_day)
hour_cos  = np.cos(2*np.pi*hour/hour_in_day)

In [0]:
t1 = range(len(train.index[:144]))
plt.plot(t1, train.index[:144], 'r-')
plt.title("id")
plt.show()

In [0]:
t1 = range(len(minute_sin[:144]))
plt.plot(t1, minute_sin[:144], 
         t1, minute_cos[:144], 'r-')
plt.title("Sin & Cos")
plt.show()

In [0]:
t1 = range(len(minute_sin[:288]))
plt.plot(t1, minute_sin[:288], 
         t1, minute_cos[:288], 'r-')
plt.title("Sin & Cos")
plt.show()

### 1-2 X특성 처리


In [0]:
#온도 + 습도
X_all = ["X00","X07","X28","X31","X32","X12","X20","X30","X37","X38"]
X_1220 = ["X00","X31","X32","X30","X37","X38"]
X_3738 = ["X00","X31","X32","X12","X20","X30"]
X_2037 = ["X00","X31","X32","X12","X30","X38"]
X_1237 = ["X00","X31","X32","X20","X30","X38"]

In [0]:
#온도 + 습도 + 일사량 
M_all = ["X00","X07","X28","X31","X32","X12","X20","X30","X37","X38","X34","X11"]
M_1220 = ["X00","X31","X32","X30","X37","X38","X34","X11"]
M_122011 = ["X00","X31","X32","X30","X37","X38","X34"]
M_122034 = ["X00","X31","X32","X30","X37","X38","X11"]

In [0]:
#온도 + 습도 + 일사량 + 전부
A_04 = ["X00","X31","X32","X30","X37","X38","X11",
            "X01","X06","X22","X27","X29",
            "X02","X03","X18","X24","X26",
            "X04","X10","X21","X36","X39",
            "X05","X08","X09","X23","X33",
            "X13","X15","X17","X25","X35"]

In [0]:
#온도(07,28) + 습도(12,20) + 일사량(34) + 지면기압(6,22) + 해면기압(33) + 풍속(02) + 풍향(35) + 강수량(04,36)
# A_01 = ["X00","X31","X32","X30","X37","X38","X11","X01","X27","X29","X05","X08",
#         "X09","X23","X03","X18","X24","X26","X13","X15","X17","X25",
#         "X10","X21","X39"]

In [0]:
#온도(07,28) + 습도(12,20) + 일사량(34) + 풍향(35)
#A_02 = ["X00","X31","X32","X30","X37","X38","X11","X13","X15","X17","X25"]

In [0]:
#온도(07,28) + 습도(12,20) + 일사량(34) + 해면기압(33)
#A_03 = ["X00","X31","X32","X30","X37","X38","X11","X05","X08","X09","X23"]
#온도(07,28) + 습도(12,20) + 일사량(34) + 해면기압(33,5)
#A_03 = ["X00","X31","X32","X30","X37","X38","X11","X08","X09","X23"]
#온도(07,28) + 습도(12,20) + 일사량(34) + 해면기압(33,23)
#A_03 = ["X00","X31","X32","X30","X37","X38","X11","X05","X08","X09"]
#온도(07,28) + 습도(12,20) + 일사량(34) + 해면기압(33,23,5)
#A_03 = ["X00","X31","X32","X30","X37","X38","X11","X08","X09"]

In [0]:
#온도
T_all = ["X00","X07","X28","X31","X32"]
T_07 = ["X00","X28","X31","X32"]
T_28 = ["X00","X07","X31","X32"]
T_0728 = ["X00","X31","X32"]

In [0]:
#습도
H_all = ["X12","X20","X30","X37","X38"]
H_1220 = ["X30","X37","X38"]
H_3738 = ["X12","X20","X30"]
H_2037 = ["X12","X30","X38"]
H_1237 = ["X20","X30","X38"]

In [0]:
#일사량
sun = ["X34","X11"]

In [0]:
#기압
PL_all = ["X01","X06","X22","X27","X29","X05","X08","X09","X23","X33"]
            # "X01","X06","X22","X27","X29",
            # "X02","X03","X18","X24","X26",
            # "X04","X10","X21","X36","X39",
            # "X05","X08","X09","X23","X33",

In [0]:
#온도 + 습도 + 일사량 + 전부
A_05 = ["X00","X31","X32","X30","X37","X38","X11",
            "X13","X15","X17","X25","X35"]

In [0]:
X_train2 = train.loc[:,'X00':'X39'].copy()
X_test2 = test.copy()

In [0]:
X_train = train.loc[:,A_05].copy()
X_test = test.loc[:,A_05].copy()

In [0]:
# 제거된 특성은 일일 누적일사량, 모든 값이 0인 것으로 확인.
#removedFeature = ['X14','X16','X19']
# X_train.drop(removedFeature, axis = 1, inplace = True)
# X_test.drop(removedFeature, axis=1, inplace=True)

In [0]:
# 남은 누적일사량 X11, X34는 분포가 같고, 값의 크기 차이가 적다. 평균내서 단일 특성으로 활용하자.
'''X_train['sun'] = X_train[['X11','X34']].mean(axis=1)
X_test['sun'] = X_test[['X11','X34']].mean(axis=1)
X_train.drop(['X11','X34'], axis=1, inplace=True)
X_test.drop(['X11','X34'], axis=1, inplace=True)'''

In [0]:
# 기온의 분포가 거의 같아서 평균 내어 temp 특성으로 사용
X_train2['temp'] = X_train2[temperature_name].mean(axis=1)
X_test2['temp'] = X_test2[temperature_name].mean(axis=1)
X_train2.drop(temperature_name, axis=1, inplace=True)
X_test2.drop(temperature_name, axis=1, inplace=True)

In [0]:
# Pressure: average local and sea-level pressure into one 'press' feature.
# X_train / X_test only hold the columns of the selected feature list, which may
# contain no pressure columns at all (A_05 does not). Use only the columns that
# are present, so the cell is a no-op in that case instead of raising KeyError.
press_cols_present = [c for c in press_sum_name if c in X_train.columns]
if press_cols_present:
    X_train['press'] = X_train[press_cols_present].mean(axis=1)
    X_test['press'] = X_test[press_cols_present].mean(axis=1)
# errors='ignore' also makes the cell safe to re-run.
X_train.drop(columns=localpress_name + press_name, errors='ignore', inplace=True)
X_test.drop(columns=localpress_name + press_name, errors='ignore', inplace=True)

In [0]:
# 풍향, 제거
X_train.drop(direction_name, axis=1, inplace=True)
X_test.drop(direction_name, axis=1, inplace=True)

In [0]:
#풍속, 평균 
#X_train['speed'] = X_train[speed_name].mean(axis=1)
#X_test['speed'] = X_test[speed_name].mean(axis=1)
#X_train.drop(speed_name, axis=1, inplace=True)
#X_test.drop(speed_name, axis=1, inplace=True)

In [0]:
# Precipitation: keep station X39 as the single 'water' feature (the code uses
# X39 only; no maximum across stations is taken).
# X39 is absent when the selected feature list excludes precipitation (A_05
# does), so guard the lookup and the drop instead of raising KeyError.
if 'X39' in X_train.columns:
    X_train['water'] = X_train['X39']
    X_test['water'] = X_test['X39']
X_train.drop(columns=water_name, errors='ignore', inplace=True)
X_test.drop(columns=water_name, errors='ignore', inplace=True)

In [0]:
X_train2

In [0]:
X_train

In [0]:
X_test

### 1-3 input 데이터 처리



In [0]:
# Standardisation (z-score)
def standardization(df):
    """Standardise ``df`` and return ``(norm, mean, std)``.

    For a DataFrame the statistics are computed per column (population std,
    ddof=0), so ``mean`` and ``std`` are Series indexed by column name and can be
    reused to transform another frame with the same columns. Other inputs
    (e.g. ndarrays) keep NumPy's default behaviour of a single global mean/std.

    A small epsilon is added to the denominator so constant columns do not
    divide by zero.
    """
    if isinstance(df, (pd.DataFrame, pd.Series)):
        # Spell out axis/ddof: np.mean(df) / np.std(df) forward axis=None to
        # pandas, which newer pandas versions treat as "reduce over everything".
        mean = df.mean(axis=0)
        std = df.std(axis=0, ddof=0)
    else:
        mean = np.mean(df)
        std = np.std(df)
    norm = (df - mean) / (std + 1e-07)
    return norm, mean, std

In [0]:
# Min-max scaling
def Minmax(X):
  """Rescale every column of ``X`` with MinMaxScaler, in place, and return ``X``.

  The scaler is re-fitted on each column of the frame passed in, so the
  min/max used always come from ``X`` itself.
  NOTE(review): calling this separately on train and test therefore scales
  them with different statistics - confirm that is intended before using it.
  """
  x_scaler = MinMaxScaler()
  for col_ in X.columns:
    # Build the (n_samples, 1) array explicitly; indexing a Series with
    # [:, np.newaxis] is no longer supported by pandas.
    column_2d = X[col_].to_numpy().reshape(-1, 1)
    train_scaled = x_scaler.fit_transform(column_2d)
    X[col_] = train_scaled.flatten()
  return X


In [0]:
X_train_norm, MEAN, STD = standardization(X_train) 
#X_train_norm = Minmax(X_train)

In [0]:
# Convert a frame into the windowed (time-series) form an RNN expects
def convert_to_timeseries(df, interval):
  """Slice ``df`` into sliding windows and matching targets.

  df:       DataFrame whose last column is the target; all other columns are
            features. Rows are taken in their existing order.
  interval: window length in rows.

  Returns ``(sequence, target)``. Window ``i`` holds the feature rows
  ``i .. i+interval-1`` and its target is the last-column value of row
  ``i+interval``. There are ``len(df) - interval`` windows; when that is not
  positive both arrays are empty.
  """
  # Convert once up front: slicing ndarrays is far cheaper than calling
  # DataFrame.iloc for every single window.
  feature_values = df.iloc[:, :-1].to_numpy()
  target_values = df.iloc[:, -1].to_numpy()

  sequence_list = []
  target_list = []

  for i in tqdm(range(df.shape[0] - interval)):
    sequence_list.append(feature_values[i:i+interval])
    target_list.append(target_values[i+interval])

  sequence = np.array(sequence_list)
  target = np.array(target_list)

  return sequence, target

In [0]:
y_columns = ['Y15','Y16'] #'Y15','Y16'

In [0]:
# Build the pre-training set: the 12 steps (120 minutes) before time t form the
# input window, and the sensor reading at time t is the label.
sequence_parts = [np.empty((0, 12, len(X_train_norm.columns)))]
target_parts = [np.empty((0,))]

for column in y_columns:
  # Features plus this sensor's readings as the trailing (label) column.
  concat = pd.concat([X_train_norm, train[column]], axis=1)
  # One day is 144 ten-minute steps, so head(144*30) keeps the first 30 days.
  _sequence, _target = convert_to_timeseries(concat.head(144*30), interval=12)
  sequence_parts.append(_sequence)
  target_parts.append(_target)

# Stack the per-sensor windows in y_columns order.
sequence = np.vstack(sequence_parts)
target = np.hstack(target_parts)



In [0]:
# convert_to_timeseries 함수를 쓰기 위한 dummy feature 생성
X_train_norm['dummy'] = 0

In [0]:
# train set에서 도출된 평균과 표준편차로 standardization 실시 
X_test_norm = (X_test - MEAN) / (STD + 1e-07) 
#X_test_norm = Minmax(X_test)

In [0]:
X_test_norm['dummy'] = 0

In [0]:
# train과 test 기간을 합쳐서 120분 간격으로 학습데이터 재구축
X_test_ts, _ = convert_to_timeseries(pd.concat([X_train_norm,X_test_norm],axis = 0 ), interval = 12)


In [0]:
X_test_norm.shape

In [0]:
# test set 기간인 후반부 80일에 맞게 자르기 
X_test_ts = X_test_ts[-11520:,:,:]

In [0]:
X_test_ts.shape

In [0]:
# 만들어 두었던 dummy feature 제거
X_train_norm.drop('dummy',axis=1, inplace = True)
X_test_norm.drop('dummy', axis = 1 ,inplace=True)

### 1-4 특성 자동 탐색

#### 1-1 SelectPercentile


In [0]:
# The targets are continuous temperatures, so score features with a regression
# criterion. SelectPercentile's default score_func is f_classif, an ANOVA test
# meant for class labels.
select = SelectPercentile(score_func=mutual_info_regression, percentile=40)

In [0]:
#a = X_train_norm.tail(432)
#b= train['Y18'].tail(432)
a = X_train_norm[-864:-432]
b= train['Y04'][-864:-432]

In [0]:
select.fit(a,b)

In [0]:
X_train_selected = select.transform(a)

In [0]:
print("X_train.shape: {} ".format(a.shape))
print("X_train_selected.shape: {} ".format(X_train_selected.shape))

In [0]:
mask = select.get_support()
print(mask)

In [0]:
X_train

#### 1-2 mutual_info_regression


In [0]:
a1 = train.loc[:4319,["X00","X07","X28","X31","X32","X12","X20","X30","X37","X38"]] 
b1 = train['Y03'][:4320] 
b2 = train['Y04'][:4320] 
b3 = train['Y16'][:4320] 

In [0]:
#a1.drop(['X14','X16','X19'],axis=1,inplace=True)

In [0]:
mi = mutual_info_regression(a1, b1)
mi2 = mutual_info_regression(a1, b2)
mi3 = mutual_info_regression(a1, b3)

mi /= np.max(mi)
mi2 /= np.max(mi2)
mi3 /= np.max(mi3)

In [0]:
column_name = ['X00','X01','X02','X03','X04','X05','X06'
                                  ,'X07','X08','X09','X10','X11','X12','X13'
                                   ,'X15','X17','X18', 'X20','X21','X22'
                                   ,'X23','X24','X25','X26','X27','X28','X29','X30'
                                   ,'X31','X32','X33','X34','X35','X36','X37','X38','X39']

In [0]:
temp_hum_col = ["X00","X07","X28","X31","X32","X12","X20","X30","X37","X38"]

In [0]:
temp_hum_col = np.array(temp_hum_col)  #column_name = np.array(column_name)

In [0]:
arr1 = np.vstack([temp_hum_col,mi])
arr2 = np.vstack([temp_hum_col,mi2])
arr3 = np.vstack([temp_hum_col,mi3])

In [0]:
# Build the 2 x 10 frames (row 0: feature names, row 1: MI scores) straight from
# the numeric score arrays. The arr1..arr3 stacks are not used here: np.vstack
# with the name array casts the scores to strings, and sorting those later would
# be lexicographic rather than numeric.
feature_df = pd.DataFrame({'temp_hum_y03': temp_hum_col, 'Y03': mi}).T
feature_df2 = pd.DataFrame({'temp_hum_y04': temp_hum_col, 'Y04': mi2}).T
feature_df3 = pd.DataFrame({'temp_hum_y16': temp_hum_col, 'Y16': mi3}).T

In [0]:
feature_df.sort_values(by=['Y03'], axis=1,inplace=True, ascending=False)
feature_df2.sort_values(by=['Y04'], axis=1,inplace=True, ascending=False)
feature_df3.sort_values(by=['Y16'], axis=1,inplace=True, ascending=False)

In [0]:
feature_df = feature_df.T
feature_df2 = feature_df2.T
feature_df3 = feature_df3.T

In [0]:
feature_df.reset_index(drop=True,inplace=True) 
feature_df2.reset_index(drop=True,inplace=True) 
feature_df3.reset_index(drop=True,inplace=True) 

In [0]:
result = pd.concat([feature_df,feature_df2,feature_df3],axis=1)

In [0]:
result = result.loc[:,['temp_hum_y03','Y03','temp_hum_y04','Y04','temp_hum_y16','Y16']]

In [0]:
result

In [0]:
result.to_csv('temp_hum_f_score.csv', index =False)

## 2. 탐색적 자료분석

## 3. 변수 선택 및 모델 구축 

In [0]:
def mse_AIFrenz(y_true, y_pred):
  """Thresholded squared error: absolute errors below 1 count as zero.

  y_true, y_pred: array-likes of matching shape (anything np.asarray accepts).
  Returns the mean over columns of the per-column mean of the squared,
  thresholded errors. For inputs without an axis 0 (scalars) it falls back to
  the plain mean squared error.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  diff = np.abs(y_true - y_pred)
  less_then_one = np.where(diff < 1, 0, diff)
  # The outer np.average lets the same code handle multi-column targets.
  try:
    score = np.average(np.average(less_then_one ** 2, axis = 0))
  except ValueError:
    # The fallback used to call mean_squared_error, which is never imported;
    # compute the same quantity with NumPy.
    score = np.mean((y_true - y_pred) ** 2)
  return score


def mse_keras(y_true, y_pred):
  """Keras metric wrapper: evaluate ``mse_AIFrenz`` eagerly via tf.py_function.

  Returns a float32 tensor holding the thresholded MSE of the batch.
  """
  score = tf.py_function(func=mse_AIFrenz, inp=[y_true, y_pred], Tout=tf.float32,  name='custom_mse') # tf 2.x
  return score

In [0]:
# Build a simple LSTM model
def build_model(lr, input_shape=None):
  """Create and compile the LSTM regressor.

  lr:          learning rate passed to the Adam optimizer.
  input_shape: (timesteps, features) of one input window. When omitted it is
               taken from the module-level ``sequence`` array, as before.

  Returns a compiled Sequential model (LSTM -> Dense 256 -> Dense 128 ->
  Dense 1) trained with 'mse' loss and reporting ``mse_keras`` as a metric.
  """
  if input_shape is None:
    input_shape = sequence.shape[-2:]

  lstm_model = tf.keras.models.Sequential([
      tf.keras.layers.LSTM(128, input_shape=input_shape),
      tf.keras.layers.Dense(256, activation='linear'),
      tf.keras.layers.Dense(128, activation='linear'),
      tf.keras.layers.Dense(1)
  ])
  
  # opt = tf.keras.optimizers.RMSprop(lr)
  opt = tf.keras.optimizers.Adam(lr)
  
  lstm_model.compile(optimizer=opt, loss='mse', metrics=[mse_keras])

  return lstm_model

LEARNINGRATE = 0.001
lstm_model = build_model(LEARNINGRATE)
lstm_model.summary()

## 4.모델 학습 및 검증


In [0]:
# Stop training as soon as the training loss drops below 3.
# Pre-training is kept deliberately loose because the model is fine-tuned afterwards.
class myCallback(tf.keras.callbacks.Callback):
    """Early-stop callback: ends training once the epoch's 'loss' is under 3."""

    def on_epoch_end(self, epoch, logs = None):
        # `logs` defaults to None and may not contain 'loss'; skip the check
        # in that case instead of comparing None with a number.
        loss = (logs or {}).get('loss')
        if loss is not None and loss < 3:
            print('\n Loss is under 3, cancelling training')
            self.model.stop_training = True

In [0]:
callbacks = myCallback()

In [0]:
#X_train_T, X_valid, y_train_T, y_valid = train_test_split(sequence,target,test_size = 0.3, random_state = 0)
                                                            

In [0]:
#print(X_train_T.shape, X_train_T.dtype)
#print(X_valid.shape, X_valid.dtype)
#print(y_train_T.shape, y_train_T.dtype)
#print(y_valid.shape, y_valid.dtype)

In [0]:
import multiprocessing
multiprocessing.cpu_count()

In [0]:
# 모델 학습
score_list = []

lstm_model.fit(    
     sequence,target, # X_train_T, y_train_T, #
    epochs=100,
    batch_size=128,
    verbose=2,
    use_multiprocessing = True, 
    workers = multiprocessing.cpu_count(),
    shuffle=False,
    callbacks = [callbacks]
)
#score = lstm_model.evaluate(X_valid, y_valid, batch_size=128)
#score_list.append(score)

## Part4. 평균내서 정확도 확인
#print('-'*50); print('score:',score_list)
#np.array(score_list)[:,1].mean()

In [0]:
# Freeze the LSTM layer so that fine-tuning only updates the dense layers.
# (The attribute was misspelled `trainalbe`, which merely created an unused
# attribute and left the layer trainable.)
lstm_model.layers[0].trainable = False
# Keras picks up a changed `trainable` flag only when the model is compiled
# again, so recompile with the same optimizer type, loss and metric.
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(LEARNINGRATE), loss='mse', metrics=[mse_keras])

In [0]:
# fine tuning 할 때 사용할 학습데이터 생성 (Y18)
finetune_X, finetune_y = convert_to_timeseries(pd.concat([X_train_norm.tail(432), train['Y18'].tail(432)],axis = 1),interval = 12)

In [0]:
# Chronological split (first 70 % for training, last 30 % for validation).
# Neighbouring windows share 11 of their 12 rows, so a shuffled split would put
# near-duplicates of the training windows into the validation set.
ftrain_XT, fvalid_XT, ftrain_yT, fvalid_yT = train_test_split(finetune_X,finetune_y,test_size = 0.3, shuffle = False)
                                                            

In [0]:
# LSTM 레이어는 고정 시켜두고, DNN 레이어에 대해서 fine tuning 진행 (Transfer Learning
score_list = []

finetune_history = lstm_model.fit(
            ftrain_XT, ftrain_yT,#finetune_X, finetune_y, #
            epochs=100,
            batch_size=256,
            shuffle=False,
            use_multiprocessing = True, 
            workers = multiprocessing.cpu_count(),
            verbose = 2)#,
            #callbacks = callbacks2)
score = lstm_model.evaluate(fvalid_XT, fvalid_yT, batch_size=128)
score_list.append(score)
## Part4. 평균내서 정확도 확인
print('-'*50); print('score:',score_list)
np.array(score_list)[:,1].mean()

In [0]:
# 예측하기
finetune_pred = lstm_model.predict(X_test_ts)

# 그래프 그리기


In [0]:
# Observed Y18 (last 3 days of train) vs. the first 3 days of predictions
plt.figure(figsize=(15,5))
ff=finetune_pred.flatten()
ff= ff[:432]
t1 = np.arange(4752,5184,1)
# Positional slice: `.loc[-432:]` is label-based and selects the whole column.
plt.plot(train['Y18'].iloc[-432:])
plt.plot(t1,ff)

In [0]:
# Observed Y18 (last 3 days of train) vs. the first 30 days of predictions
plt.figure(figsize=(15,5))
ff=finetune_pred.flatten()
ff = ff[:4320]
t2 = np.arange(4752,9072,1)
# Positional slice: `.loc[-432:]` is label-based and selects the whole column.
plt.plot(train['Y18'].iloc[-432:])
plt.plot(t2,ff)

In [0]:
# Y16(3일)- Y18(3일)- Y18예측(3일)
plt.figure(figsize=(15,5))
ff= ff[:432]
plt.plot(train.loc[3888:4320,'Y16'])
plt.plot(train.loc[4320:4752,'Y18'])
plt.plot(t1,ff)

In [0]:
#Y16(30일) - Y18(3일) -  예측(30일)
plt.figure(figsize=(15,5))
ff=finetune_pred.flatten()
ff= ff[:4320]
plt.plot(train.loc[:4320,'Y16'])
plt.plot(train.loc[4320:4752,'Y18'])
plt.plot(t2,ff)

In [0]:
ff=finetune_pred.flatten()
style.use('ggplot')
t3 = np.arange(4752,16272,1)
temp_all = pd.concat([X_train2.loc[:,'temp'] ,X_test2.loc[:,'temp']], axis =0)
now = pd.concat([train.loc[:4320,'Y16'],train.loc[4320:4752,'Y18']],axis =0)

# red - y16(30) + y18(3), yellow - 예측, blue - X temp 데이터
plt.figure(figsize = (40,10))
plt.plot(now,'r')
plt.plot(t3,ff, 'y')
plt.plot(temp_all,'b')

In [0]:
# Compare the two insolation columns; the labels make the colours self-explanatory.
plt.figure(figsize = (40,10))
plt.plot(train.loc[:, ['X34']],'b', label='X34')
plt.plot(train.loc[:, ['X11']],'r', label='X11')
plt.legend()
# Show the figure here; a trailing plt.figure() call only opened an empty one.
plt.show()

## 6. 결과 및 결언

In [0]:
# Build the submission frame: ids run from 144*33 up to (not including) 144*113,
# paired with the flattened predictions.
submit = pd.DataFrame({
    'id': range(144 * 33, 144 * 113),
    'Y18': finetune_pred.ravel(),
})

In [0]:
submit.to_csv('test_temp3_hum3_sun_ori(X1220X34o).csv', index = False)