# Long-Term Influenza Forecasting Using Web Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import shutil
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
from tqdm import tqdm
from datetime import datetime, timedelta
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

from utils.preprocessing import normalize
from model.proposed import Proposed

In [3]:
# Load the weekly influenza (ILI) tables for US, AU and KR,
# keyed by lower-case nation code.

flu = {
    nation: pd.read_csv(f'./data/{nation.upper()}_flu.csv')
    for nation in ('us', 'au', 'kr')
}

In [4]:
flu['us']

Unnamed: 0,weeks,ILI
0,10-01,19.8284
1,10-02,18.2749
2,10-03,19.2606
3,10-04,19.2495
4,10-05,20.8877
...,...,...
535,20-16,25.7453
536,20-17,20.2111
537,20-18,16.2086
538,20-19,13.5240


In [5]:
# Report the (rows, columns) shape of every nation's flu table.
for nation, flu_frame in flu.items():
    print(f'{nation} flu data shape: {flu_frame.shape}')

us flu data shape: (540, 2)
au flu data shape: (540, 2)
kr flu data shape: (540, 2)


In [6]:
# Load the Google Trends keyword tables for US, AU and KR,
# keyed by lower-case nation code.

trends = {
    nation: pd.read_csv(f'./data/{nation.upper()}_trends.csv')
    for nation in ('us', 'au', 'kr')
}

In [7]:
trends['us']

Unnamed: 0,weeks,influenza,H1N1 flu,swine flu,Flu,H1N1,seasonal flu,H1N1 virus,flu virus,H1N1 swine flu,...,polio epidemic,Afluria seasonal flu,hemolytic uremic syndrome HUS,croup,Jeff Duchin,inactivated vaccine,fevers aches,invasive pneumococcal disease,Sudden Acute Respiratory Syndrome,commonly infect humans
0,17-35,0,0,0,6,0,0,0,1,0,...,1,0,0,40,0,46,0,0,0,0
1,17-36,1,0,0,8,0,0,0,2,0,...,0,0,0,50,0,50,14,14,0,0
2,17-37,1,0,0,9,0,0,0,2,0,...,0,0,0,51,0,46,14,14,0,0
3,17-38,1,0,0,11,0,0,0,2,0,...,0,0,0,58,0,87,0,0,0,0
4,17-39,1,0,0,12,0,0,0,2,0,...,0,0,0,68,0,54,14,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,20-16,2,0,1,24,2,3,1,10,1,...,5,0,0,8,1,76,0,0,0,0
138,20-17,2,0,1,19,1,2,1,10,1,...,8,0,0,10,0,41,0,0,0,0
139,20-18,2,0,1,18,1,2,1,8,1,...,6,0,0,5,0,56,0,11,0,0
140,20-19,2,0,1,16,1,2,1,6,1,...,7,0,0,7,0,28,0,0,0,0


In [8]:
# Report the (rows, columns) shape of every nation's trends table.
for nation, trends_frame in trends.items():
    print(f'{nation} trends data shape: {trends_frame.shape}')

us trends data shape: (142, 1001)
au trends data shape: (142, 966)
kr trends data shape: (142, 1001)


In [9]:
MIN_PRECIDENCE = 3  # smallest lead (in rows, i.e. weeks) of trends over flu
MAX_PRECIDENCE = 15  # largest lead (in rows, i.e. weeks) of trends over flu
precidences = list(range(MIN_PRECIDENCE, MAX_PRECIDENCE + 1))

# corrs[nation] is a DataFrame with one row per lead time: a 'prcd' column
# followed by one column per keyword holding the flu/trend correlation
# obtained when the keyword series is shifted forward by that lead.
corrs = {}

for nation in ['us', 'au', 'kr']:
    print('='*10, nation, '='*10)
    flu_data = flu[nation]
    trends_data = trends[nation]

    # ILI values restricted to the weeks that also appear in the trends
    # table. This does not depend on the keyword, so it is built once per
    # nation instead of once per keyword.
    base = flu_data[flu_data.weeks.isin(trends_data.weeks)].ILI.reset_index(drop=True)

    keywords = trends_data.columns[1:]  # skip the first column ('weeks')
    corrs_by_prcds = []
    for keyword in tqdm(keywords):
        target = trends_data[keyword]  # trend series of this keyword
        # Shift the trend forward by each lead so that row t holds the trend
        # value from `prcd` rows earlier, then correlate it with the flu series.
        corrs_by_prcds.append([base.corr(target.shift(prcd)) for prcd in precidences])

    # One row per keyword so far; transpose to one row per lead time.
    corrs_by_prcds = pd.DataFrame(corrs_by_prcds).T
    corrs_by_prcds.columns = keywords
    corrs_by_prcds.insert(0, 'prcd', precidences)  # lead time as the first column
    corrs[nation] = corrs_by_prcds



100%|██████████| 1000/1000 [00:01<00:00, 750.38it/s]




100%|██████████| 965/965 [00:01<00:00, 757.11it/s]




100%|██████████| 1000/1000 [00:01<00:00, 763.02it/s]


In [10]:
def slice_data(flu_data, trends_data, corrs_data,
               lookback=5, lookahead=5, topn=1, historic=7):
    """Cut the flu and web-trend tables into one sample per target week.

    Args:
        flu_data: DataFrame with a 'weeks' column ('YY-WW' strings) and an
            'ILI' column.
        trends_data: DataFrame with a 'weeks' column and one column per
            keyword.
        corrs_data: DataFrame with a 'prcd' column (lead time of each row)
            and one correlation column per keyword.
        lookback: number of rows before the target week used as input.
        lookahead: number of rows, starting at the target week, to predict.
        topn: number of keywords to keep, ranked by their best correlation
            over all lead times.
        historic: number of previous years whose same-week flu window is
            appended as extra input columns.

    Returns:
        Four parallel lists with one entry per target week:
        ``flu_inputs`` (arrays of shape ``(lookback, 1 + historic)``),
        ``flu_labels`` (the ILI values of the ``lookahead`` rows starting
        at the target week), ``web_past_inputs`` (``(lookback, topn)``)
        and ``web_future_inputs`` (``(lookahead, topn)``). In the future
        block only the first ``min(prcd, lookahead)`` rows of a keyword are
        real trend values; the remaining rows are uniform noise drawn from
        [1e-6, 1e-4).

    Raises:
        KeyError: if ``corrs_data`` has no 'prcd' column.
        IndexError: if a required week is missing from one of the tables.
    """
    def week_position(frame, year, week):
        # Row position of the 'YY-WW' entry (both parts zero-padded, like
        # the data); indexing [0] raises IndexError when the week is absent.
        key = f'{year:02}-{week:02}'
        return np.flatnonzero((frame['weeks'] == key).to_numpy())[0]

    flu_inputs = []
    flu_labels = []
    web_past_inputs = []
    web_future_inputs = []

    # Rank keywords by their best correlation. 'prcd' is dropped explicitly
    # rather than relying on it sorting first, and each keyword's lead time
    # is read from the 'prcd' row where its correlation peaks, so the result
    # does not depend on notebook-level constants.
    lead_times = corrs_data['prcd']
    keyword_corrs = corrs_data.drop(columns='prcd')
    best_keywords = keyword_corrs.max().sort_values(ascending=False).iloc[:topn].index
    keywords_rank = [(keyword, int(lead_times.iloc[keyword_corrs[keyword].argmax()]))
                     for keyword in best_keywords]

    # A target week needs `lookback` rows plus the largest lead before it
    # and `lookahead` rows from it onwards. The stop index is computed
    # explicitly: the negative-slice form `[:-lookahead+1]` degenerates to
    # `[:0]` (no samples at all) when lookahead == 1.
    weeks = trends_data['weeks'].to_numpy()
    first = int(lead_times.max()) + lookback
    last = max(len(weeks) - lookahead + 1, 0)
    available_weeks = weeks[first:last]

    ili = flu_data['ILI']
    for year_week in available_weeks:
        year, week = map(int, year_week.split('-'))

        flu_pos = week_position(flu_data, year, week)
        curr_lbl = ili.iloc[flu_pos:flu_pos+lookahead].to_numpy()

        # Column 0: the window right before the target week; column h: the
        # same window h years earlier.
        flu_columns = [ili.iloc[flu_pos-lookback:flu_pos].to_numpy().reshape(-1, 1)]
        for h in range(1, historic+1):
            past_pos = week_position(flu_data, year-h, week)
            flu_columns.append(ili.iloc[past_pos-lookback:past_pos].to_numpy().reshape(-1, 1))
        curr_flu = np.hstack(flu_columns)

        # The trends row of the target week is the same for every keyword.
        web_pos = week_position(trends_data, year, week)
        web_past = []
        web_future = []
        for keyword, prcd in keywords_rank:
            series = trends_data[keyword]
            shifted_pos = web_pos - prcd  # target week moved back by the lead
            curr_web_past = series.iloc[shifted_pos-lookback:shifted_pos].to_numpy().reshape(-1, 1)

            # Start from small random filler, then overwrite the first
            # `prcd` rows with the real trend values from shifted_pos on.
            curr_web_future = np.random.uniform(1e-6, 1e-4, (lookahead, 1))
            known = series.iloc[shifted_pos:shifted_pos+lookahead].to_numpy().reshape(-1, 1)[:prcd]
            curr_web_future[:prcd] = known

            web_past.append(curr_web_past)
            web_future.append(curr_web_future)

        flu_inputs.append(curr_flu)
        flu_labels.append(curr_lbl)
        web_past_inputs.append(np.hstack(web_past))
        web_future_inputs.append(np.hstack(web_future))

    return flu_inputs, flu_labels, web_past_inputs, web_future_inputs

In [11]:
# Run the project's normalize() on every nation's flu and trends table.
# Later cells index element [0] of each stored result.
flu_norm = {nation: normalize(flu[nation]) for nation in flu}
trends_norm = {nation: normalize(trends[nation]) for nation in trends}

In [12]:
# Build the US samples: 8 input rows, 10 rows to predict, top-10 keywords.
# `historic` keeps its default of 7.
flu_inputs, flu_labels, web_past_inputs, web_future_inputs = slice_data(
    flu_data=flu_norm['us'][0],
    trends_data=trends_norm['us'][0],
    corrs_data=corrs['us'],
    lookback=8,
    lookahead=10,
    topn=10,
)

In [13]:
def _three_way_split(samples, holdout=0.2):
    """Split `samples` in order (no shuffling) into (train, val, test).

    The last `holdout` fraction becomes the test set; the same fraction is
    then taken from the end of the remainder as the validation set.
    """
    train_val, test = train_test_split(samples, test_size=holdout, shuffle=False)
    train, val = train_test_split(train_val, test_size=holdout, shuffle=False)
    return train, val, test


# Every sample list is split the same way, so entries stay aligned across
# the four lists.
train_flu_inputs, val_flu_inputs, test_flu_inputs = _three_way_split(flu_inputs)
train_flu_labels, val_flu_labels, test_flu_labels = _three_way_split(flu_labels)
train_web_past_inputs, val_web_past_inputs, test_web_past_inputs = _three_way_split(web_past_inputs)
train_web_future_inputs, val_web_future_inputs, test_web_future_inputs = _three_way_split(web_future_inputs)

In [14]:
# Reset Keras' global state so re-running this cell starts from a clean slate,
# then build the project model and compile it with MSE loss and Adam.
tf.keras.backend.clear_session()

proposed_model = Proposed()
proposed_model.compile(optimizer='adam', loss='mse')

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2021-12-22 00:21:28.566826: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-22 00:21:28.566956: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [16]:
# Forward-pass smoke test on random tensors (batch of 2).
# NOTE(review): the shapes presumably mirror the slice_data call above --
# 8 lookback rows x (1 + 7 historic) flu columns, 8 x 10 keyword columns,
# and 10 lookahead rows x 10 keyword columns -- confirm against Proposed.
dummy_inputs = (
    tf.random.uniform((2, 8, 8)),
    tf.random.uniform((2, 8, 10)),
    tf.random.uniform((2, 10, 10)),
)
proposed_model(dummy_inputs)

<tf.Tensor: shape=(2, 10, 1), dtype=float32, numpy=
array([[[0.04667272],
        [0.04579561],
        [0.04563766],
        [0.04543255],
        [0.04524018],
        [0.0450242 ],
        [0.04517508],
        [0.04517188],
        [0.04514723],
        [0.04481198]],

       [[0.02621363],
        [0.02701101],
        [0.02753377],
        [0.0276632 ],
        [0.02777492],
        [0.02800976],
        [0.02799149],
        [0.02794026],
        [0.02790291],
        [0.02803956]]], dtype=float32)>

In [17]:
proposed_model.summary()

Model: "proposed"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (LSTM)             multiple                  70144     
                                                                 
 encoder2 (LSTM)             multiple                  71168     
                                                                 
 attention (Attention)       multiple                  0         
                                                                 
 decoder (LSTM)              multiple                  273408    
                                                                 
 leaky_re_lu (LeakyReLU)     multiple                  0         
                                                                 
 fc (Dense)                  multiple                  257       
                                                                 
Total params: 414,977
Trainable params: 414,977
Non-traina

In [23]:
# Train on the chronological training split and validate on the validation
# split that was built alongside it. Passing `validation_split=0.2` here
# would ignore those val_* lists and carve a further 20% off the end of the
# training data instead.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: 'NoneType' object is not callable"; its cause is not visible
# from this cell and is not addressed by this change.
proposed_model.fit(
    x=(np.array(train_flu_inputs),
       np.array(train_web_past_inputs),
       np.array(train_web_future_inputs)),
    y=np.array(train_flu_labels),
    validation_data=(
        (np.array(val_flu_inputs),
         np.array(val_web_past_inputs),
         np.array(val_web_future_inputs)),
        np.array(val_flu_labels),
    ),
    epochs=20, batch_size=32)

Epoch 1/20


TypeError: 'NoneType' object is not callable

In [None]:
# Hyperparameters
# NOTE(review): none of these names is referenced by the cells above, which
# hard-code their own values; the meanings suggested by the names are
# unconfirmed.

# Window sizes (same values as the lookback=8 / lookahead=10 passed to
# slice_data earlier).
look_back, look_ahead = 8, 10
past_year_num = 6

max_lag, collect_err = 14, 22

# Split fractions -- presumably for the train/val/test split; verify usage.
test_split_size, val_split_size = 0.65, 0.8

# Training-loop settings.
epochs, batch_size = 300, 32