In [1]:
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tqdm import tqdm

import tensorflow as tf
import pandas as pd        #pandas 1.3.2 버전 권장! 현제 1.4.1 버전에서 오류있음
import numpy as np
import pickle
import json
import os

## 데이터 불러오기

In [2]:
# Directories for the input data, saved models and generated images.
data_path = './data/'
model_path = './saved/'
image_path = './images/'

# Lookup table (region names/keys), daily case counts and search-trend series.
index = pd.read_csv(data_path + 'us_index.csv')
epidemic = pd.read_csv(data_path + 'us_epidemiology.csv')
trends = pd.read_csv(data_path + 'us_google-search_trends.csv')

# Open the JSON file in a context manager so its handle is closed right away
# instead of being left for the garbage collector.
with open(data_path + 'us_adjacent_states.json', 'r') as states_file:
    states = json.load(states_file)

## 누락 데이터 처리

In [3]:
# Carry the previous row's value forward into gaps; whatever is still missing
# afterwards (no earlier row to copy from) is set to 0.
# NOTE(review): the forward fill runs over the whole table rather than per
# region key — confirm values cannot bleed from one region into the next.
epidemic = epidemic.ffill().fillna(0)
trends = trends.ffill().fillna(0)

## 편의를 위한 함수들

In [4]:
# name > code
def get_region_code(region_name, index=index):
    ''' Translate a region name into its region key.
    Args:
      region_name (str): name of the region, e.g. New York
      index (pandas.DataFrame): lookup table with 'subregion1_name',
        'aggregation_level' and 'key' columns

    Returns:
      region_code (str): key of the region, e.g. US_NY
        ('US' is returned unchanged)
    '''
    if region_name == 'US':
        return region_name

    name_matches = index['subregion1_name'] == region_name
    is_level_one = index['aggregation_level'] == 1
    matching_keys = index.loc[name_matches & is_level_one, 'key']
    # First match wins; an unknown name raises IndexError here.
    return matching_keys.values[0]


# code > name
def get_region_name(region_code, index=index):
    ''' Translate a region key back into its region name.
    Args:
      region_code (str): key of the region in '<country>_<subregion>' form
      index (pandas.DataFrame): lookup table with 'country_code',
        'subregion1_code' and 'subregion1_name' columns

    Returns:
      region_name (str): name of the region ('US' is returned unchanged)
    '''
    if region_code == 'US':
        return region_code

    # The key must contain exactly one underscore, otherwise unpacking fails.
    country_code, subregion_code = region_code.split('_')
    same_country = index['country_code'] == country_code
    same_subregion = index['subregion1_code'] == subregion_code
    matching_names = index.loc[same_country & same_subregion, 'subregion1_name']
    return matching_names.values[0]

def get_state_data(state, data_df, cols='new_confirmed', acc=False):
    ''' Select one region's rows from a table.
    Args:
      state (str): name of the region
      data_df (pandas.DataFrame): full table; must have a 'key' column
      cols (str, list): column(s) to select; a str yields a Series,
        a list yields a DataFrame
      acc (bool): when True, return the running total of the selection
        instead of the raw values

    Returns:
      pandas.Series / pandas.DataFrame: the region's data, re-indexed from 0
        (type depends on cols)
    '''
    region_code = get_region_code(state)
    belongs_to_region = data_df['key'] == region_code
    selection = data_df[belongs_to_region][cols].reset_index(drop=True)
    if not acc:
        return selection
    return np.add.accumulate(selection)

## 사용 가능한 데이터 날짜 

In [5]:
# First and last available date per state, for both source tables.
date_columns = ['epidemic_start', 'epidemic_end', 'trends_start', 'trends_end']

# Collect one single-row frame per state and concatenate once at the end;
# growing the frame with concat inside the loop copies it on every iteration.
date_rows = [pd.DataFrame(columns=date_columns)]

for state in states:
    region_code = get_region_code(state)
    # Filter each table once per state instead of once per column.
    epidemic_dates = epidemic.loc[epidemic['key'] == region_code, 'date']
    trends_dates = trends.loc[trends['key'] == region_code, 'date']
    date_rows.append(
        pd.DataFrame({'epidemic_start': epidemic_dates.iloc[:1].values,
                      'epidemic_end': epidemic_dates.iloc[-1:].values,
                      'trends_start': trends_dates.iloc[:1].values,
                      'trends_end': trends_dates.iloc[-1:].values},
                     index=[state]))

date_analysis = pd.concat(date_rows)
date_analysis

Unnamed: 0,epidemic_start,epidemic_end,trends_start,trends_end
Washington,2020-01-13,2022-06-15,2020-01-01,2021-12-31
Illinois,2020-01-22,2022-06-15,2020-01-01,2021-12-31
California,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Arizona,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Massachusetts,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Wisconsin,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Texas,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Nebraska,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Utah,2020-01-22,2022-06-15,2020-01-01,2021-12-31
Oregon,2020-01-22,2022-06-15,2020-01-01,2021-12-31


## 데이터 사용기간 설정, 데이터 자르기

In [6]:
# First and last calendar day (inclusive) of the window kept for analysis.
from_date = datetime(2020, 3, 1)
to_date = datetime(2021, 12, 31)

In [7]:
# Every day from from_date to to_date (inclusive) as a 'YYYY-MM-DD' string,
# matching the format of the tables' 'date' column.
selected_dates = []
start_date = from_date
while start_date <= to_date:
    selected_dates.append(start_date.strftime('%Y-%m-%d'))
    start_date += timedelta(1)

# .copy() detaches the filtered tables from the originals so that later
# in-place edits (epidemic.loc[...] = value) neither raise
# SettingWithCopyWarning nor depend on view/copy semantics.
epidemic = epidemic[epidemic['date'].isin(selected_dates)].copy()
trends = trends[trends['date'].isin(selected_dates)].copy()

## 데이터에서 이상값 수정

In [8]:
# Replace non-positive daily counts with the trailing 7-day mean of the state.
for state in states:
    region_code = get_region_code(state)
    state_cases = get_state_data(state, epidemic, ['date', 'new_confirmed'])

    # Roll over the numeric column only: applying rolling().mean() to the frame
    # that also holds the string 'date' column relies on pandas silently
    # dropping non-numeric columns, which newer pandas versions no longer do.
    # The mean is computed once, from the values as they are before any repair.
    rolling_mean = state_cases['new_confirmed'].rolling(7).mean()

    bad_rows = state_cases[state_cases['new_confirmed'] <= 0]

    for row_pos, date in zip(bad_rows.index, bad_rows['date']):
        target_idx = epidemic[(epidemic['key'] == region_code) &
                              (epidemic['date'] == date)].index
        replacement = rolling_mean.loc[row_pos]
        # The first 6 rows of a state have no full window yet -> use 0.
        replacement = 0 if np.isnan(replacement) else replacement
        epidemic.loc[target_idx, 'new_confirmed'] = replacement

## 데이터 보간

In [9]:
# Per state: keep every 7th observation and rebuild the days in between by
# linear interpolation. Frames are gathered in a list and concatenated once
# (concat inside the loop re-copies the accumulated frame every iteration).
interpolated_frames = [pd.DataFrame(columns=epidemic.columns[:3])]

for state in tqdm(states):
    df = epidemic[epidemic['key'] == get_region_code(state)][['date', 'key', 'new_confirmed']]
    df = df.reset_index(drop=True)

    # Blank out everything except rows 0, 7, 14, ... of the value column.
    df.loc[df.index % 7 != 0, df.columns[2:]] = None
    df['new_confirmed'] = df['new_confirmed'].interpolate(method='linear')

    interpolated_frames.append(df)
#end for

interpolated_epidemic = pd.concat(interpolated_frames).reset_index(drop=True)
epidemic = interpolated_epidemic

100%|█████████████████████████████████████████████████████████████████████████████████| 48/48 [00:00<00:00, 164.83it/s]


In [10]:
# Same weekly down-sampling + linear interpolation as for the case counts,
# applied to every search-trend column (all columns after the first two).
interpolated_frames = [pd.DataFrame(columns=trends.columns)]

for state in tqdm(states):
    df = trends[trends['key'] == get_region_code(state)]
    df = df.reset_index(drop=True)

    keyword_cols = df.columns[2:]
    # Blank out everything except rows 0, 7, 14, ... of the keyword columns.
    df.loc[df.index % 7 != 0, keyword_cols] = None
    # Interpolate all keyword columns in one vectorised call (column-wise,
    # linear) instead of looping over several hundred columns in Python.
    df[keyword_cols] = df[keyword_cols].interpolate(method='linear')

    interpolated_frames.append(df)
#end for

interpolated_trends = pd.concat(interpolated_frames).reset_index(drop=True)
trends = interpolated_trends

100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:14<00:00,  3.30it/s]


## Top 10 질병 

In [11]:
# Total confirmed cases per state over the selected period.
state_names = list(states.keys())
c19Cases = {
    "state": state_names,
    "cases": [
        int(np.sum(get_state_data(name, epidemic, ['new_confirmed']).values))
        for name in state_names
    ],
}

# Ranked from most to fewest cases; despite the name the table keeps every
# state (it is not truncated to 10 rows).
top10Cases = (
    pd.DataFrame.from_dict(c19Cases)
    .sort_values(by="cases", ascending=False)
    .reset_index(drop=True)
)
top10Cases

Unnamed: 0,state,cases
0,California,6005808
1,Florida,3718413
2,New York,3632815
3,Texas,2867871
4,Illinois,1776306
5,Pennsylvania,1692552
6,Ohio,1685846
7,North Carolina,1547082
8,Georgia,1424281
9,New Jersey,1382277


## 지역 데이터 정규화

In [12]:
# Min-max scale each state's case series independently and remember the
# fitted scaler per state (keyed by state name).
scalers = {}

# Gather the per-state frames and concatenate once; the leading empty frame
# fixes the output column order (first two epidemic columns, then the value).
scaled_frames = [pd.DataFrame(columns=list(epidemic.columns[:2])
                              + ['new_confirmed'])]

for state in states:
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(get_state_data(state, epidemic, ['new_confirmed']).values)
    tmp = pd.DataFrame(scaled, columns=['new_confirmed'])
    tmp['date'] = get_state_data(state, epidemic, 'date')
    tmp['key'] = get_region_code(state)
    scaled_frames.append(tmp)
    scalers[state] = scaler

scaled_epidemic = pd.concat(scaled_frames).reset_index(drop=True)
scaled_epidemic

Unnamed: 0,date,key,new_confirmed
0,2020-03-01,US_WA,0.000000
1,2020-03-02,US_WA,0.001185
2,2020-03-03,US_WA,0.002369
3,2020-03-04,US_WA,0.003554
4,2020-03-05,US_WA,0.004738
...,...,...,...
32203,2021-12-27,US_MT,0.102314
32204,2021-12-28,US_MT,0.102314
32205,2021-12-29,US_MT,0.102314
32206,2021-12-30,US_MT,0.102314


## google trend 정규화

In [13]:
# Keyword columns with the 'search_trends_' marker removed from their names;
# computed once instead of on every loop iteration.
keyword_names = [c.replace('search_trends_', '') for c in trends.columns[2:]]

# Min-max scale every keyword column per state. Frames are collected and
# concatenated once; the leading empty frame fixes the output column order.
scaled_frames = [pd.DataFrame(columns=['key', 'date'] + keyword_names)]

for state in states:
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(get_state_data(state, trends, trends.columns[2:]).values)
    tmp = pd.DataFrame(scaled, columns=keyword_names)
    tmp['date'] = get_state_data(state, trends, 'date')
    tmp['key'] = get_region_code(state)
    scaled_frames.append(tmp)

scaled_trends = pd.concat(scaled_frames).reset_index(drop=True)
scaled_trends

Unnamed: 0,key,date,abdominal_obesity,abdominal_pain,acne,actinic_keratosis,acute_bronchitis,adrenal_crisis,ageusia,alcoholism,...,visual_acuity,vomiting,wart,water_retention,weakness,weight_gain,wheeze,xeroderma,xerostomia,yawn
0,US_WA,2020-03-01,0.491713,0.686441,0.556818,0.235294,0.892473,0.080000,0.0,0.145482,...,0.538462,1.000000,0.492754,0.195122,0.687500,0.587097,0.552632,0.308290,0.631579,0.300000
1,US_WA,2020-03-02,0.481452,0.663438,0.541048,0.218487,0.886329,0.082857,0.0,0.135638,...,0.538462,0.967857,0.469979,0.188153,0.705357,0.570507,0.556391,0.301628,0.631579,0.328571
2,US_WA,2020-03-03,0.471192,0.640436,0.525278,0.201681,0.880184,0.085714,0.0,0.125793,...,0.538462,0.935714,0.447205,0.181185,0.723214,0.553917,0.560150,0.294967,0.631579,0.357143
3,US_WA,2020-03-04,0.460931,0.617433,0.509508,0.184874,0.874040,0.088571,0.0,0.115948,...,0.538462,0.903571,0.424431,0.174216,0.741071,0.537327,0.563910,0.288305,0.631579,0.385714
4,US_WA,2020-03-05,0.450671,0.594431,0.493738,0.168067,0.867896,0.091429,0.0,0.106104,...,0.538462,0.871429,0.401656,0.167247,0.758929,0.520737,0.567669,0.281643,0.631579,0.414286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32203,US_MT,2021-12-27,0.077348,0.595628,0.455172,0.173913,0.730769,0.000000,0.0,0.131213,...,0.133333,0.692308,0.137500,0.081081,0.172414,0.251282,0.100000,0.348684,0.733333,0.000000
32204,US_MT,2021-12-28,0.077348,0.595628,0.455172,0.173913,0.730769,0.000000,0.0,0.131213,...,0.133333,0.692308,0.137500,0.081081,0.172414,0.251282,0.100000,0.348684,0.733333,0.000000
32205,US_MT,2021-12-29,0.077348,0.595628,0.455172,0.173913,0.730769,0.000000,0.0,0.131213,...,0.133333,0.692308,0.137500,0.081081,0.172414,0.251282,0.100000,0.348684,0.733333,0.000000
32206,US_MT,2021-12-30,0.077348,0.595628,0.455172,0.173913,0.730769,0.000000,0.0,0.131213,...,0.133333,0.692308,0.137500,0.081081,0.172414,0.251282,0.100000,0.348684,0.733333,0.000000


### 교차 상관 계수 계산

1) 타겟 지역의 누적 확진자를 base
2) 다른 지역의 확진자 or 해당 지역의 쿼리를 0\~28(maxlag)칸씩 오른쪽으로 shift하며 base와 상관관계  계산
3) 각 lag별 가장 높은 상관 관계(max_corr)와 해당 데이터의 이름(max_name)을 기록
4) 타겟 지역을 변경해가며 1)~3)의 과정을 반복하고 테이블을 병합

In [14]:
# Largest shift (in rows) tried when computing the lagged correlations; lags 0..MAXLAG are evaluated.
MAXLAG = 28
# When True, the cached cross-correlation CSV files are ignored and recomputed.
update_force = False

In [15]:
# Lagged correlation between each state's case series and every state's
# series shifted by 0..MAXLAG rows. The result is cached as CSV.
xcorr_epidemic_fname = data_path + 'us_xcorr_epidemic.csv'

if os.path.exists(xcorr_epidemic_fname) and not update_force:
    xcorr_epidemic = pd.read_csv(xcorr_epidemic_fname)
else:
    # One column of scaled cases per state; identical for every target state,
    # so it is built once outside the loop.
    regions = pd.DataFrame({name: get_state_data(name, scaled_epidemic)
                            for name in states})

    per_state_frames = []
    for state in tqdm(states, position=0):
        base = get_state_data(state, scaled_epidemic)

        sub_epidemic = []
        for _, target_cases in regions.items():
            sub_epidemic.append(
                [base.corr(target_cases.shift(i)) for i in range(MAXLAG+1)])

        sub_epidemic = pd.DataFrame(np.array(sub_epidemic).T,
                                    columns=regions.columns)

        # Rank only the *other* states. Take this numeric slice before adding
        # the summary columns: otherwise the string 'max_name' column would be
        # part of the row-wise max used for 'max_corr'.
        other_states = [name for name in regions.columns if name != state]
        other_corr = sub_epidemic[other_states]
        sub_epidemic['max_name'] = other_corr.idxmax(axis=1)
        sub_epidemic['max_corr'] = other_corr.max(axis=1)

        per_state_frames.append(
            pd.concat([
                pd.DataFrame({'key': get_region_code(state),
                              'lag': np.arange(MAXLAG+1)}),
                sub_epidemic], axis=1))

    # Single concat at the end instead of growing the frame inside the loop.
    xcorr_epidemic = pd.concat(per_state_frames) if per_state_frames else pd.DataFrame()
    xcorr_epidemic = xcorr_epidemic.reset_index(drop=True)
    xcorr_epidemic.to_csv(xcorr_epidemic_fname, index=False)
xcorr_epidemic

Unnamed: 0,key,lag,Washington,Illinois,California,Arizona,Massachusetts,Wisconsin,Texas,Nebraska,...,Mississippi,New Mexico,North Dakota,Wyoming,Maine,Alabama,Idaho,Montana,max_name,max_corr
0,US_WA,0,1.000000,0.640142,0.548547,0.548336,0.517647,0.442406,0.421291,0.534291,...,0.745176,0.555321,0.387554,0.473245,0.081890,0.769372,0.665541,0.424970,Louisiana,0.795982
1,US_WA,1,0.993131,0.645826,0.546687,0.548219,0.515937,0.446634,0.426534,0.538606,...,0.739030,0.556455,0.398963,0.470997,0.076088,0.763412,0.674759,0.424036,Louisiana,0.792528
2,US_WA,2,0.975097,0.650421,0.544593,0.551544,0.513288,0.450701,0.432846,0.542271,...,0.731788,0.557472,0.410311,0.470382,0.073175,0.755888,0.686595,0.423105,Louisiana,0.787838
3,US_WA,3,0.948459,0.654043,0.542392,0.556990,0.509798,0.454667,0.439898,0.545389,...,0.723316,0.558487,0.421572,0.470981,0.072278,0.747143,0.699865,0.422136,Louisiana,0.781870
4,US_WA,4,0.915803,0.656809,0.540209,0.563232,0.505572,0.458593,0.447365,0.548061,...,0.713477,0.559614,0.432720,0.472374,0.072495,0.737523,0.713382,0.421087,Missouri,0.779518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,US_MT,24,0.367060,0.535234,0.213406,0.132617,0.097665,0.712957,0.323516,0.598627,...,0.492192,0.480247,0.717552,0.766459,0.188519,0.429146,0.626591,0.777607,Wyoming,0.766459
1388,US_MT,25,0.363019,0.522774,0.208193,0.124226,0.087108,0.702383,0.311417,0.588179,...,0.494590,0.468580,0.708804,0.752084,0.179641,0.430681,0.620271,0.763119,Wyoming,0.752084
1389,US_MT,26,0.358738,0.511085,0.203412,0.116039,0.076509,0.691694,0.299353,0.577883,...,0.496899,0.457065,0.699672,0.737539,0.170108,0.432220,0.613663,0.748388,Wyoming,0.737539
1390,US_MT,27,0.354307,0.500250,0.199061,0.107972,0.065858,0.681030,0.287383,0.567592,...,0.499107,0.445769,0.690160,0.722848,0.159889,0.433875,0.606578,0.733446,Wyoming,0.722848


In [16]:
# Lagged correlation between each state's case series and each of that
# state's search-trend series shifted by 0..MAXLAG rows. Cached as CSV.
xcorr_trends_fname = data_path + 'us_xcorr_trends.csv'

if os.path.exists(xcorr_trends_fname) and not update_force:
    xcorr_trends = pd.read_csv(xcorr_trends_fname)
else:
    per_state_frames = []

    for state in tqdm(states, position=0):
        base = get_state_data(state, scaled_epidemic)
        queries = get_state_data(state, scaled_trends, scaled_trends.columns[2:])

        lagged_corr = []
        for _, target_trends in queries.items():
            lagged_corr.append(
                [base.corr(target_trends.shift(i)) for i in range(MAXLAG+1)])

        # Numeric correlations only (rows = lag, columns = keyword).
        corr_values = pd.DataFrame(np.array(lagged_corr).T,
                                   columns=queries.columns)

        # Derive both summary columns from the numeric frame so the string
        # 'max_name' column is never part of the row-wise max.
        sub_trends = corr_values.copy()
        sub_trends['max_name'] = corr_values.idxmax(axis=1)
        sub_trends['max_corr'] = corr_values.max(axis=1)

        per_state_frames.append(
            pd.concat([
                pd.DataFrame({'key': get_region_code(state),
                              'lag': np.arange(MAXLAG+1)}),
                sub_trends], axis=1))

    # Single concat at the end instead of growing the frame inside the loop.
    xcorr_trends = pd.concat(per_state_frames) if per_state_frames else pd.DataFrame()
    xcorr_trends = xcorr_trends.reset_index(drop=True)
    xcorr_trends.to_csv(xcorr_trends_fname, index=False)
xcorr_trends

Unnamed: 0,key,lag,abdominal_obesity,abdominal_pain,acne,actinic_keratosis,acute_bronchitis,adrenal_crisis,ageusia,alcoholism,...,wart,water_retention,weakness,weight_gain,wheeze,xeroderma,xerostomia,yawn,max_name,max_corr
0,US_WA,0,-0.243077,0.209276,-0.008880,-0.211779,-0.138324,-0.004039,0.682359,-0.070868,...,-0.142557,-0.290486,0.034904,-0.115926,-0.135807,-0.018959,0.273540,-0.199511,ageusia,0.682359
1,US_WA,1,-0.249383,0.204992,-0.010742,-0.208923,-0.135582,-0.008853,0.678271,-0.072779,...,-0.142846,-0.293010,0.040824,-0.124565,-0.136286,-0.022109,0.252107,-0.197750,ageusia,0.678271
2,US_WA,2,-0.254733,0.201294,-0.012689,-0.205182,-0.132990,-0.014599,0.674076,-0.074536,...,-0.143312,-0.294598,0.049112,-0.130469,-0.136654,-0.025768,0.231344,-0.193526,dysgeusia,0.675467
3,US_WA,3,-0.259143,0.198224,-0.014681,-0.200473,-0.130457,-0.020750,0.669602,-0.075901,...,-0.143581,-0.295076,0.059217,-0.133978,-0.137243,-0.029619,0.211936,-0.187562,dysgeusia,0.676094
4,US_WA,4,-0.262630,0.195824,-0.016675,-0.194712,-0.127892,-0.026779,0.664678,-0.076634,...,-0.143268,-0.294272,0.070569,-0.135436,-0.138382,-0.033346,0.194568,-0.180585,dysgeusia,0.675456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,US_MT,24,-0.652879,0.006639,-0.416299,-0.186037,-0.009159,,,0.022683,...,-0.323825,-0.277325,-0.109233,-0.287486,-0.117260,-0.331406,-0.186611,-0.185607,hypoxemia,0.581717
1388,US_MT,25,-0.654586,-0.000973,-0.423838,-0.178643,-0.012427,,,0.016999,...,-0.319661,-0.272075,-0.109610,-0.288945,-0.113144,-0.341483,-0.196573,-0.190509,hypoxemia,0.573780
1389,US_MT,26,-0.656451,-0.008710,-0.431376,-0.171669,-0.016067,,,0.011287,...,-0.314475,-0.266876,-0.110027,-0.290815,-0.108219,-0.351798,-0.206291,-0.195096,hypoxemia,0.565827
1390,US_MT,27,-0.658486,-0.016472,-0.438915,-0.164849,-0.020103,,,0.005576,...,-0.308096,-0.261872,-0.110386,-0.293058,-0.102511,-0.362217,-0.215880,-0.199304,hypoxemia,0.558029


## 데이터 슬라이싱

In [17]:
def DataSplit(input_data, label_data, look_back, look_ahead, split_rate=(0.6, 0.2, 0.2), teacher_force=False, time_data=None):
    """Split aligned series into train/validation/test sets of sliding windows.

    Args:
      input_data (numpy.ndarray): 1-D series the encoder windows are cut from.
      label_data (numpy.ndarray): 1-D series the decoder windows are cut from;
        its length determines the split sizes.
      look_back (int): number of steps in each encoder window.
      look_ahead (int): number of steps in each decoder window.
      split_rate (tuple): fractions for (train, valid, test). Steps lost to
        integer truncation are added to the training part.
      teacher_force (bool): if True each set is ``([enc_in, dec_in], dec_out)``
        where ``dec_out`` is ``dec_in`` shifted one step ahead; otherwise each
        set is ``(enc_in, dec_in)``.
      time_data (numpy.ndarray or None): optional 2-D array (one row per step)
        of extra features. They are appended to the encoder windows when
        ``teacher_force`` is False and to the decoder windows when it is True.

    Returns:
      tuple: ``(trainset, validset, testset)``. A part too short to hold a
        single window yields empty arrays.
    """
    teacher_force = bool(teacher_force)

    # Builds the sliding-window dataset for one contiguous part of the data.
    def make_dataset(inputs, labels, time_dataset=None):
        window_size = look_back + look_ahead
        # Teacher forcing also reads the label one step past the window (the
        # shifted decoder target), so it produces one window fewer.
        window_count = len(labels) - window_size + (0 if teacher_force else 1)
        enc_in, dec_in, dec_out = [], [], []

        for start in range(window_count):
            enc_end = start + look_back
            dec_end = enc_end + look_ahead
            enc_in_data = inputs[start:enc_end].reshape(-1, 1)
            dec_in_data = labels[enc_end:dec_end].reshape(-1, 1)
            dec_out_data = labels[enc_end + 1:dec_end + 1].reshape(-1, 1)

            if time_dataset is not None:
                if teacher_force:
                    dec_in_data = np.concatenate(
                        [dec_in_data, time_dataset[enc_end:dec_end]], axis=1)
                else:
                    enc_in_data = np.concatenate(
                        [enc_in_data, time_dataset[start:enc_end]], axis=1)
                #end if
            #end if

            enc_in.append(enc_in_data)
            dec_in.append(dec_in_data)
            dec_out.append(dec_out_data)
        #end for

        if teacher_force:
            return [np.array(enc_in), np.array(dec_in)], np.array(dec_out)
        return np.array(enc_in), np.array(dec_in)
    #end def

    # Size of each part; truncation leftovers go to the training part so the
    # three sizes always add up to the full length.
    total_len = len(label_data)
    train_size = int(total_len * split_rate[0])
    valid_size = int(total_len * split_rate[1])
    test_size = int(total_len * split_rate[2])
    train_size += total_len - (train_size + valid_size + test_size)
    valid_end = train_size + valid_size

    # Cuts one series into its three parts (None stays None).
    def split_parts(series):
        if series is None:
            return None, None, None
        # Slice the tail from an explicit start index: series[-test_size:]
        # would return the WHOLE series when test_size is 0.
        test_start = max(len(series) - test_size, 0)
        return series[:train_size], series[train_size:valid_end], series[test_start:]
    #end def

    input_parts = split_parts(input_data)
    label_parts = split_parts(label_data)
    time_parts = split_parts(time_data)

    trainset = make_dataset(input_parts[0], label_parts[0], time_parts[0])
    validset = make_dataset(input_parts[1], label_parts[1], time_parts[1])
    testset = make_dataset(input_parts[2], label_parts[2], time_parts[2])

    return trainset, validset, testset
#end def
    

#미국의 각 주에 대하여 다른 주와 최적의 시간차를 구함
def get_timepred_infomation(state, xcorr_data, corr_limit=0.3):
    """Pick, for each lag, the series that correlates best with a state.

    Args:
      state (str): name of the target state.
      xcorr_data (pandas.DataFrame): lagged-correlation table with a 'key'
        column, one row per lag for each key (ordered by lag), the candidate
        series in columns 2..-3, and two trailing summary columns that are
        ignored here.
      corr_limit (float): candidates whose best correlation is below this
        value are discarded.

    Returns:
      pandas.DataFrame: one row per selected lag with the columns 'step',
        'best_name' and 'best_corr'. The frame is empty (no columns) when
        nothing qualifies. Lag 0 is never selected.
    """
    state_rows = xcorr_data[xcorr_data['key'] == get_region_code(state)]
    state_rows = state_rows.reset_index(drop=True)

    # Step 1: for every candidate series, the lag with the highest correlation.
    best_per_series = {}
    for col in state_rows.columns[2:-2]:
        series = state_rows[col]
        # Nothing to rank when the column is empty or holds only NaN.
        if series.isna().all():
            continue
        try:
            best_step = int(np.argmax(series))
        except (TypeError, ValueError):
            # Column cannot be ranked (e.g. non-numeric content): skip it
            # instead of hiding every possible error behind a bare except.
            continue
        #end try
        best_per_series[col] = {'best_step': best_step,
                                'best_corr': series.iloc[best_step]}
    #end for

    if not best_per_series:
        return pd.DataFrame([])

    indicators = pd.DataFrame(best_per_series)
    best_steps = indicators.loc['best_step']
    best_corrs = indicators.loc['best_corr']

    # Step 2: among the series sharing the same best lag keep the strongest.
    selected = []
    for step in set(best_steps):
        candidates = best_corrs[best_steps == step]
        winner = candidates.argmax()
        best_name = candidates.index[winner]
        # candidates is indexed by series name, so use explicit positional access.
        best_corr = candidates.iloc[winner]
        if step == 0 or best_corr < corr_limit:
            continue

        selected.append({
            'step' : step,
            'best_name' : best_name,
            'best_corr' : best_corr
        })
    #end for
    return pd.DataFrame(selected)
#end def


#만들어진 시간차 정보 데이터를 통하여 데이터를 모으고 shift 시킴
def make_timedata(state, datatype='region', corr_limit=0.3):
    """Assemble the lag-shifted auxiliary features for one state.

    Args:
      state (str): name of the target state.
      datatype (str): 'no_timedata', 'region', 'trend' or 'region+trend'.
      corr_limit (float): minimum correlation passed on to
        get_timepred_infomation.

    Returns:
      numpy.ndarray or None: one column per selected series, each shifted by
        its selected lag with the leading gap filled with 0; None when
        datatype is 'no_timedata' or no series was selected.

    Raises:
      Exception: for any other datatype value.
    """
    def _shifted_column(series, step):
        # Shift by `step` rows, zero-fill the gap, return as an (n, 1) array.
        return series.shift(step).fillna(0).to_numpy().reshape(-1, 1)
    #end def

    def _stack(columns):
        # Join the columns side by side; None when there is nothing to join.
        if len(columns) > 0:
            return np.concatenate(columns, axis=1)
        return None
    #end def

    def _make_region_data(state):
        # Case series of the other states selected from xcorr_epidemic.
        lag_table = get_timepred_infomation(state, xcorr_epidemic, corr_limit)
        columns = []
        for row in range(len(lag_table)):
            entry = lag_table.loc[row]
            other_code = get_region_code(entry['best_name'])
            other_cases = scaled_epidemic[scaled_epidemic['key'] == other_code]['new_confirmed']
            columns.append(_shifted_column(other_cases, int(entry['step'])))
        #end for
        return _stack(columns)
    #end def

    def _make_trend_data(state):
        # Search-trend series of this state selected from xcorr_trends.
        lag_table = get_timepred_infomation(state, xcorr_trends, corr_limit)
        own_trends = scaled_trends[scaled_trends['key'] == get_region_code(state)]
        columns = []
        for row in range(len(lag_table)):
            entry = lag_table.loc[row]
            columns.append(_shifted_column(own_trends[entry['best_name']], int(entry['step'])))
        #end for
        return _stack(columns)
    #end def

    if datatype == 'no_timedata':
        return None
    if datatype == "region":
        return _make_region_data(state)
    if datatype == "trend":
        return _make_trend_data(state)
    if datatype != "region+trend":
        raise Exception('Invalid_datatype')

    # 'region+trend': use whichever parts exist, regions first.
    available = [part for part in (_make_region_data(state), _make_trend_data(state))
                 if part is not None]
    if not available:
        return None
    if len(available) == 1:
        return available[0]
    return np.concatenate(available, axis=1)
#end def

## 모델 튜닝

In [18]:
import importlib
from collections import defaultdict
from models import seq2seq, lstm
from tcn import compiled_tcn
from utils.callbacks import EarlyStopping, ModelCheckpoint
from silence_tensorflow import silence_tensorflow
import keras_tuner as kt
import pickle

# Call the helper from the silence_tensorflow package (presumably mutes TensorFlow's log output — confirm against the package docs).
silence_tensorflow()

In [19]:
# Re-import the local model modules so that edits to their source files take effect without restarting the kernel.
importlib.reload(lstm)
importlib.reload(seq2seq)

<module 'models.seq2seq' from 'C:\\Users\\admin\\Desktop\\미국 코로나 예측\\models\\seq2seq.py'>

## 모델 평가 방법

In [20]:
def RMSE(x, y):
    """Root-mean-squared error of y against x."""
    return mean_squared_error(x, y, squared=False)


# Thin aliases for the scikit-learn metrics.
MAE = mean_absolute_error
MAPE = mean_absolute_percentage_error


def CORR(x, y):
    """Correlation (pandas default method) between x and y after squeezing both."""
    first = pd.Series(tf.squeeze(x).numpy())
    second = pd.Series(tf.squeeze(y).numpy())
    return first.corr(second)

def evaluation(label, outputs, label_width=20):
    """Score predictions separately for each of the first label_width steps.

    Args:
      label: ground truth, indexed as label[:, step].
      outputs: predictions, indexed the same way as label.
      label_width (int): number of leading steps (second axis) to score.

    Returns:
      dict: metric name ('RMSE', 'MAE', 'MAPE', 'CORR') -> list holding one
        score per step.
    """
    metrics = (("RMSE", RMSE), ("MAE", MAE), ("MAPE", MAPE), ("CORR", CORR))
    return {
        name: [metric(label[:, step], outputs[:, step]) for step in range(label_width)]
        for name, metric in metrics
    }

## 예측할 질병

In [21]:
# States to forecast: the first 25 entries of top10Cases['state'] (not 10, despite the variable name).
target_states = top10Cases['state'][:25]

## 하이퍼 파라메터 불러오기

In [22]:
# Load previously tuned hyper-parameters; start from an empty dict when the
# cache file is missing or unreadable. Only the errors that mean "no usable
# cache" are caught, so unrelated failures are no longer silently swallowed.
# NOTE: pickle.load can execute arbitrary code — only load a cache file that
# this notebook wrote itself.
try:
    with open("./turning/hparams.pkl","rb") as fr:
        hparams = pickle.load(fr)
    #end with
except (OSError, EOFError, pickle.UnpicklingError):
    hparams = {}
#end try

In [23]:
# Display the hyper-parameter dictionary currently held in memory.
hparams

{'California': {'lstm_no_timedata': {'units': 16,
   'dropout': 0.1,
   'learning_rate': 0.0353},
  'lstm_region': {'units': 128, 'dropout': 0.3, 'learning_rate': 0.0304},
  'lstm_trend': {'units': 64, 'dropout': 0.3, 'learning_rate': 0.0304},
  'lstm_region+trend': {'units': 32, 'dropout': 0.1, 'learning_rate': 0.0304},
  'seq2seq_no_timedata': {'units': 128,
   'dropout': 0.0,
   'learning_rate': 0.0059},
  'seq2seq_region': {'units': 16, 'dropout': 0.1, 'learning_rate': 0.0451},
  'seq2seq_trend': {'units': 16, 'dropout': 0.1, 'learning_rate': 0.0304},
  'seq2seq_region+trend': {'units': 32,
   'dropout': 0.0,
   'learning_rate': 0.0157},
  'tcn': {'nb_filters': 32,
   'kernel_size': 8,
   'dropout': 0.1,
   'learning_rate': 0.007925000000000001}},
 'Florida': {'lstm_no_timedata': {'units': 16,
   'dropout': 0.0,
   'learning_rate': 0.0402},
  'lstm_region': {'units': 128, 'dropout': 0.0, 'learning_rate': 0.0206},
  'lstm_trend': {'units': 128,
   'dropout': 0.1,
   'learning_rate':

## 튜닝 파라메터

In [24]:
# Training budget settings and the mini-batch size handed to the tuner search.
max_epochs = 20
epochs = 20
batch_size = 20

# Window lengths (in rows of the daily series) for model input and forecast.
look_back = 28
look_ahead = 28

# Minimum cross-correlation an auxiliary series needs to be used as a feature.
corr_limit = 0.5

# (train, valid, test) fractions passed to DataSplit as split_rate.
dataset_split = (0.6, 0.2, 0.2)

## 튜닝할 모델

In [25]:
from tensorflow.keras.optimizers import Adam

class model_lstm(kt.HyperModel):
    """Tunable LSTM forecaster: LSTM -> Dense(look_ahead) -> (look_ahead, 1)."""

    def build(self, hp):
        """Build and compile the model for one hyper-parameter sample.

        Tuned values: 'units', 'dropout' and 'learning_rate'.
        """
        # Register the hyper-parameters in a fixed order.
        n_units = hp.Choice('units', [16, 32, 64, 128, 256])
        drop_rate = hp.Choice('dropout', [0.0, 0.1, 0.2, 0.3])
        lr = hp.Float('learning_rate', min_value=0.001, max_value=0.05, step=0.0049)

        network = tf.keras.Sequential()
        network.add(tf.keras.layers.LSTM(n_units, dropout=drop_rate))
        # One output per forecast step, reshaped to (look_ahead, 1).
        network.add(tf.keras.layers.Dense(look_ahead))
        network.add(tf.keras.layers.Reshape((look_ahead, 1)))

        network.compile(loss='mse', optimizer=Adam(learning_rate=lr))
        return network
    #end def
#end class

class model_seq2seq(kt.HyperModel):
    """Tunable seq2seq forecaster built by seq2seq.get_model."""

    def __init__(self, feature_count, **kwargs):
        """Store the feature count and initialise the HyperModel base.

        Args:
          feature_count (int): value forwarded to seq2seq.get_model.
          **kwargs: optional keyword arguments forwarded to kt.HyperModel.
        """
        # Run the base initialiser so the attributes KerasTuner sets up on
        # every HyperModel exist on this instance as well.
        super().__init__(**kwargs)
        self.feature_count = feature_count
    #end def
    
    def build(self, hp):
        """Build the model for one sample of 'units', 'dropout', 'learning_rate'."""
        units = hp.Choice('units', [16, 32, 64, 128, 256])
        dropout = hp.Choice('dropout', [0.0, 0.1, 0.2, 0.3])
        learning_rate = hp.Float('learning_rate', min_value=0.001, max_value=0.05, step=0.0049)
        seq2seq_model = seq2seq.get_model(units, look_back, look_ahead, self.feature_count, dropout, learning_rate=learning_rate)
        return seq2seq_model
    #end def
#end class

class model_tcn(kt.HyperModel):
    """Tunable TCN forecaster built by tcn.compiled_tcn."""

    def __init__(self, feature_count, **kwargs):
        """Store the feature count and initialise the HyperModel base.

        Args:
          feature_count (int): passed to compiled_tcn as num_feat.
          **kwargs: optional keyword arguments forwarded to kt.HyperModel.
        """
        # Run the base initialiser so the attributes KerasTuner sets up on
        # every HyperModel exist on this instance as well.
        super().__init__(**kwargs)
        self.feature_count = feature_count
    #end def
    
    def build(self, hp):
        """Build the model for one sample of the tuned hyper-parameters.

        Tuned values: 'nb_filters', 'kernel_size', 'dropout', 'learning_rate'.
        """
        nb_filters = hp.Choice('nb_filters', [4, 8, 16, 32])
        kernel_size = hp.Choice('kernel_size', [4, 8, 16, 32])
        dropout = hp.Choice('dropout', [0.0, 0.1, 0.2, 0.3])
        learning_rate = hp.Float('learning_rate', min_value=0.0005, max_value=0.05, step=0.002475)
        
        return compiled_tcn(
            return_sequences=False,
            num_feat=self.feature_count,
            nb_filters=nb_filters,
            num_classes=0,
            kernel_size=kernel_size,
            dilations=[2 ** i for i in range(9)], # 1, 2, 4, ..., 256
            nb_stacks=1,
            max_len=look_back,
            use_skip_connections=True,
            regression=True,
            dropout_rate=dropout,
            output_len=look_ahead,
            lr=learning_rate,
        )
    #end def
#end class

## LSTM 튜닝

In [80]:
model_name = 'lstm'

# Tune the LSTM once per (state, auxiliary-data variant) and persist the best
# hyper-parameters after every search so an interrupted run loses nothing.
for state_rank, state in enumerate(target_states, start=1):
    hparams.setdefault(state, {})
    for additional_data in ['no_timedata', 'region', 'trend', 'region+trend']:
        # Progress is the state's position in target_states. Deriving it from
        # the key order of hparams is wrong once hparams was loaded from a
        # cache holding states in a different order or number.
        progress = '[' + str(state_rank) + '/' + str(len(target_states)) + ']'
        print('='*20, state + "_" + additional_data + ' ' + progress, '='*20)
        
        # Name of the tuner project directory for this run.
        save_name = state + "_" + model_name + "_" + additional_data
    
        # Scaled case series of the target state.
        states_data = scaled_epidemic[scaled_epidemic['key'] == get_region_code(state)]['new_confirmed'].to_numpy()
        
        # Auxiliary features are only built for the variants that use them.
        if not additional_data == 'no_timedata':
            time_data = make_timedata(state, datatype=additional_data, corr_limit=corr_limit)
            
            # make_timedata returns None when corr_limit filtered out every
            # candidate; the run then falls back to the plain series.
            if time_data is None:
                feature_count = 1
            else:
                feature_count = time_data.shape[1] + 1
            #end if
        else:
            time_data = None
            feature_count = 1
        #end if

        # Sliding-window datasets.
        train, valid, test = DataSplit(
            states_data, 
            states_data, 
            look_back, 
            look_ahead, 
            split_rate=dataset_split, 
            time_data=time_data,
            teacher_force=False, 
        )
        
        # Keras Tuner; the epoch budget comes from the tuning-parameter cell
        # instead of a repeated literal.
        tuner = kt.Hyperband(
            model_lstm(), 
            objective = 'val_loss', 
            max_epochs=max_epochs,
            directory="./turning",
            project_name = save_name
        )
        
        # Hyper-parameter search.
        tuner.search(
            train[0],
            train[1], 
            validation_data=(valid[0], valid[1]),
            batch_size=batch_size,
            epochs=epochs,
            shuffle = False, 
            verbose = 0, 
            workers = 4,
        )
        
        # Keep the best hyper-parameters of this run.
        best_hps = tuner.get_best_hyperparameters()[0]
        hparams[state][model_name + "_" + additional_data] = {
            'units' : best_hps.get('units'),
            'dropout' : best_hps.get('dropout'),
            'learning_rate' : best_hps.get('learning_rate')
        }
        
        # Persist after every run.
        with open("./turning/hparams.pkl","wb") as fw:
            pickle.dump(hparams, fw)
        #end with
            
        # Reset Keras global state before the next run.
        tf.keras.backend.clear_session() 
        print(hparams[state][model_name + "_" + additional_data])
        print()
    #end for
#end for

{'units': 16, 'dropout': 0.1, 'learning_rate': 0.0353}

{'units': 128, 'dropout': 0.3, 'learning_rate': 0.0304}

{'units': 64, 'dropout': 0.3, 'learning_rate': 0.0304}

{'units': 32, 'dropout': 0.1, 'learning_rate': 0.0304}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0402}

{'units': 128, 'dropout': 0.0, 'learning_rate': 0.0206}

{'units': 128, 'dropout': 0.1, 'learning_rate': 0.025500000000000002}

{'units': 256, 'dropout': 0.0, 'learning_rate': 0.001}

{'units': 128, 'dropout': 0.1, 'learning_rate': 0.001}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0304}

{'units': 128, 'dropout': 0.1, 'learning_rate': 0.001}

{'units': 64, 'dropout': 0.1, 'learning_rate': 0.001}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.001}

{'units': 128, 'dropout': 0.3, 'learning_rate': 0.0304}

{'units': 64, 'dropout': 0.2, 'learning_rate': 0.0402}

{'units': 256, 'dropout': 0.0, 'learning_rate': 0.025500000000000002}

{'units': 128, 'dropout': 0.0, 'learning_rate': 0.0059}

{'units': 64, 'd

{'units': 128, 'dropout': 0.3, 'learning_rate': 0.0304}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0353}

{'units': 64, 'dropout': 0.3, 'learning_rate': 0.0157}

{'units': 16, 'dropout': 0.1, 'learning_rate': 0.001}

{'units': 64, 'dropout': 0.0, 'learning_rate': 0.0059}

{'units': 32, 'dropout': 0.2, 'learning_rate': 0.0059}

{'units': 256, 'dropout': 0.3, 'learning_rate': 0.0108}

{'units': 32, 'dropout': 0.1, 'learning_rate': 0.0304}

{'units': 256, 'dropout': 0.2, 'learning_rate': 0.0304}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.0353}

{'units': 128, 'dropout': 0.0, 'learning_rate': 0.001}

{'units': 16, 'dropout': 0.3, 'learning_rate': 0.001}

{'units': 256, 'dropout': 0.2, 'learning_rate': 0.0059}

{'units': 64, 'dropout': 0.0, 'learning_rate': 0.0402}

{'units': 256, 'dropout': 0.1, 'learning_rate': 0.0206}

{'units': 128, 'dropout': 0.3, 'learning_rate': 0.0206}

{'units': 256, 'dropout': 0.0, 'learning_rate': 0.001}

{'units': 128, 'dropout': 0.1, 'learning_rat

## Seq2Seq 튜닝

In [81]:
model_name = 'seq2seq'

# Hyperband search for the seq2seq model: one search per target state and per
# auxiliary-input variant. The best values of every search are stored in
# hparams[state][...] and written to disk right away, so finished searches
# survive an interrupted run.
for state_position, state in enumerate(target_states, start=1):
    # Create the per-state result slot on first use; entries already stored for this state are kept.
    hparams.setdefault(state, {})

    for additional_data in ['no_timedata', 'region', 'trend', 'region+trend']:
        # Show which state / variant is being tuned and how far along we are.
        # The counter is the state's position in target_states (not its position
        # among the keys of hparams), so it stays correct even when hparams
        # already contains other states or was filled in a different order.
        progress = f"[{state_position}/{len(target_states)}]"
        print('=' * 20, f"{state}_{additional_data} {progress}", '=' * 20)

        # Key under which this run is stored in hparams[state].
        result_key = f"{model_name}_{additional_data}"

        # Project name handed to the tuner for this run.
        save_name = f"{state}_{result_key}"

        # 'new_confirmed' series of this state taken from the scaled epidemic table.
        states_data = scaled_epidemic[scaled_epidemic['key'] == get_region_code(state)]['new_confirmed'].to_numpy()

        # Auxiliary time-series features are only built for the variants that use them.
        if additional_data == 'no_timedata':
            time_data = None
        else:
            time_data = make_timedata(state, datatype=additional_data, corr_limit=corr_limit)
        #end if

        # make_timedata can return None (original note: nothing is left because of
        # corr_limit); then only the target series itself is used as a feature.
        feature_count = 1 if time_data is None else time_data.shape[1] + 1

        # Build the train / validation / test splits (teacher forcing enabled for seq2seq).
        train, valid, test = DataSplit(
            states_data,
            states_data,
            look_back,
            look_ahead,
            split_rate=dataset_split,
            time_data=time_data,
            teacher_force=True,
        )

        # Keras Tuner (Hyperband), minimising the validation loss.
        tuner = kt.Hyperband(
            model_seq2seq(feature_count),
            objective='val_loss',
            max_epochs=20,
            directory="./turning",
            project_name=save_name,
        )

        # Hyperparameter search.
        # NOTE(review): the keyword arguments below are presumably forwarded to the
        # model's fit() call by the tuner — confirm against the Keras Tuner docs.
        tuner.search(
            train[0],
            train[1],
            validation_data=(valid[0], valid[1]),
            batch_size=batch_size,
            epochs=20,
            shuffle=False,
            verbose=0,
            workers=4,
        )

        # Keep only the tuned values of the best trial.
        best_hps = tuner.get_best_hyperparameters()[0]
        hparams[state][result_key] = {
            hp_name: best_hps.get(hp_name)
            for hp_name in ('units', 'dropout', 'learning_rate')
        }

        # Checkpoint everything collected so far (the file is overwritten each time).
        with open("./turning/hparams.pkl", "wb") as fw:
            pickle.dump(hparams, fw)
        #end with

        # Reset the Keras session before the next search.
        tf.keras.backend.clear_session()
        print(hparams[state][result_key])
        print()
    #end for
#end for

{'units': 128, 'dropout': 0.0, 'learning_rate': 0.0059}

{'units': 16, 'dropout': 0.1, 'learning_rate': 0.0451}

{'units': 16, 'dropout': 0.1, 'learning_rate': 0.0304}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.0157}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.025500000000000002}

{'units': 64, 'dropout': 0.0, 'learning_rate': 0.0353}

{'units': 128, 'dropout': 0.0, 'learning_rate': 0.0108}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.0157}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.0304}

{'units': 32, 'dropout': 0.3, 'learning_rate': 0.0157}

{'units': 128, 'dropout': 0.3, 'learning_rate': 0.0157}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0059}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.0304}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0108}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0304}

{'units': 16, 'dropout': 0.3, 'learning_rate': 0.0353}

{'units': 64, 'dropout': 0.1, 'learning_rate': 0.0206}

{'units': 16, 'dropout': 0.1, '

{'units': 16, 'dropout': 0.1, 'learning_rate': 0.0402}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0402}

{'units': 32, 'dropout': 0.2, 'learning_rate': 0.025500000000000002}

{'units': 32, 'dropout': 0.3, 'learning_rate': 0.0206}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.0108}

{'units': 64, 'dropout': 0.1, 'learning_rate': 0.0353}

{'units': 16, 'dropout': 0.2, 'learning_rate': 0.0206}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0402}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.0206}

{'units': 32, 'dropout': 0.0, 'learning_rate': 0.05}

{'units': 16, 'dropout': 0.0, 'learning_rate': 0.025500000000000002}

{'units': 128, 'dropout': 0.0, 'learning_rate': 0.0059}

{'units': 64, 'dropout': 0.1, 'learning_rate': 0.0157}

{'units': 128, 'dropout': 0.2, 'learning_rate': 0.0059}

{'units': 64, 'dropout': 0.0, 'learning_rate': 0.0108}

{'units': 128, 'dropout': 0.2, 'learning_rate': 0.001}

{'units': 64, 'dropout': 0.2, 'learning_rate': 0.0157}

{'units': 128, 'drop

## TCN 튜닝

In [26]:
model_name = 'tcn'

# Hyperband search for the TCN model: one search per target state, using only
# the state's own series (no auxiliary time data). The best values are stored
# in hparams[state]['tcn'] and written to disk after every state.
for state_position, state in enumerate(target_states, start=1):
    # Create the per-state result slot on first use; entries already stored for this state are kept.
    hparams.setdefault(state, {})

    # Show which state is being tuned and how far along we are.
    # The counter is the state's position in target_states (not its position
    # among the keys of hparams), so it stays correct even when hparams
    # already contains other states or was filled in a different order.
    progress = f"[{state_position}/{len(target_states)}]"
    print('=' * 20, f"{state}_{progress}", '=' * 20)

    # Project name handed to the tuner for this run.
    save_name = f"{state}_{model_name}"

    # 'new_confirmed' series of this state taken from the scaled epidemic table.
    states_data = scaled_epidemic[scaled_epidemic['key'] == get_region_code(state)]['new_confirmed'].to_numpy()

    # Build the train / validation / test splits (no time data, no teacher forcing).
    train, valid, test = DataSplit(
        states_data,
        states_data,
        look_back,
        look_ahead,
        split_rate=dataset_split,
        time_data=None,
        teacher_force=False,
    )

    # Keras Tuner (Hyperband), minimising the validation loss; the model is built for a single feature.
    tuner = kt.Hyperband(
        model_tcn(1),
        objective='val_loss',
        max_epochs=max_epochs,
        directory="./turning",
        project_name=save_name,
    )

    # Hyperparameter search.
    # NOTE(review): the keyword arguments below are presumably forwarded to the
    # model's fit() call by the tuner — confirm against the Keras Tuner docs.
    tuner.search(
        train[0],
        train[1],
        validation_data=(valid[0], valid[1]),
        batch_size=batch_size,
        epochs=epochs,
        shuffle=False,
        verbose=0,
        workers=4,
    )

    # Keep only the tuned values of the best trial.
    best_hps = tuner.get_best_hyperparameters()[0]
    hparams[state][model_name] = {
        hp_name: best_hps.get(hp_name)
        for hp_name in ('nb_filters', 'kernel_size', 'dropout', 'learning_rate')
    }

    # Checkpoint everything collected so far (the file is overwritten each time).
    with open("./turning/hparams.pkl", "wb") as fw:
        pickle.dump(hparams, fw)
    #end with

    # Reset the Keras session before the next search.
    tf.keras.backend.clear_session()
    print(hparams[state][model_name])
    print()
#end for



  super(Adam, self).__init__(name, **kwargs)


{'nb_filters': 32, 'kernel_size': 8, 'dropout': 0.1, 'learning_rate': 0.007925000000000001}

{'nb_filters': 16, 'kernel_size': 32, 'dropout': 0.2, 'learning_rate': 0.017825}

{'nb_filters': 32, 'kernel_size': 16, 'dropout': 0.1, 'learning_rate': 0.007925000000000001}

{'nb_filters': 4, 'kernel_size': 32, 'dropout': 0.1, 'learning_rate': 0.0005}

{'nb_filters': 4, 'kernel_size': 32, 'dropout': 0.0, 'learning_rate': 0.002975}

{'nb_filters': 16, 'kernel_size': 32, 'dropout': 0.3, 'learning_rate': 0.0005}

{'nb_filters': 16, 'kernel_size': 8, 'dropout': 0.3, 'learning_rate': 0.0005}

{'nb_filters': 8, 'kernel_size': 16, 'dropout': 0.0, 'learning_rate': 0.00545}

{'nb_filters': 8, 'kernel_size': 4, 'dropout': 0.0, 'learning_rate': 0.010400000000000001}

{'nb_filters': 32, 'kernel_size': 4, 'dropout': 0.1, 'learning_rate': 0.002975}

{'nb_filters': 16, 'kernel_size': 8, 'dropout': 0.1, 'learning_rate': 0.002975}

{'nb_filters': 4, 'kernel_size': 16, 'dropout': 0.1, 'learning_rate': 0.002975

In [28]:
# Write the current hparams dict to the shared pickle file ("wb" replaces any previous content).
with open("./turning/hparams.pkl", "wb") as pickle_out:
    pickle.dump(hparams, pickle_out)
#end with

In [27]:
# Display the collected hyperparameter table (nested dict: state -> model key -> tuned values).
hparams

{'California': {'lstm_no_timedata': {'units': 16,
   'dropout': 0.1,
   'learning_rate': 0.0353},
  'lstm_region': {'units': 128, 'dropout': 0.3, 'learning_rate': 0.0304},
  'lstm_trend': {'units': 64, 'dropout': 0.3, 'learning_rate': 0.0304},
  'lstm_region+trend': {'units': 32, 'dropout': 0.1, 'learning_rate': 0.0304},
  'seq2seq_no_timedata': {'units': 128,
   'dropout': 0.0,
   'learning_rate': 0.0059},
  'seq2seq_region': {'units': 16, 'dropout': 0.1, 'learning_rate': 0.0451},
  'seq2seq_trend': {'units': 16, 'dropout': 0.1, 'learning_rate': 0.0304},
  'seq2seq_region+trend': {'units': 32,
   'dropout': 0.0,
   'learning_rate': 0.0157},
  'tcn': {'nb_filters': 32,
   'kernel_size': 8,
   'dropout': 0.1,
   'learning_rate': 0.007925000000000001}},
 'Florida': {'lstm_no_timedata': {'units': 16,
   'dropout': 0.0,
   'learning_rate': 0.0402},
  'lstm_region': {'units': 128, 'dropout': 0.0, 'learning_rate': 0.0206},
  'lstm_trend': {'units': 128,
   'dropout': 0.1,
   'learning_rate':