# library

In [7]:
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
from fbprophet import Prophet
import datetime
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import warnings
# Suppress every warning category for the rest of the session, including
# warnings emitted by the libraries used in later cells.
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
# Use the fully qualified option name. The abbreviated 'max_columns' pattern
# is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where it raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): AppleGothic is presumably selected so Korean labels render;
# confirm the font is installed on the machine running this notebook.
rc('font', family='AppleGothic')

In [3]:
# Directory the input CSVs are read from, and directory the generated
# submission files are written to.
base_path, sub_path = './open_data', './submission'

In [4]:
# Read the training data and the sample submission. Both files are parsed
# with the pure-Python engine and decoded as cp949.
csv_read_opts = dict(engine='python', encoding='cp949')
train_csv = os.path.join(base_path, 'train.csv')
sample_submission_csv = os.path.join(base_path, 'submission.csv')

df = pd.read_csv(train_csv, **csv_read_opts)
test = pd.read_csv(sample_submission_csv, **csv_read_opts)

# Benchmark code
- https://www.kaggle.com/eswarchandt/timeseries-forecasting-of-covid-19-arima#Time-evaluation

In [None]:
# Build the submission skeleton (one row per forecast day) and fit a Prophet
# model on the hourly '사용자' series to obtain the hourly forecast frame.

# Number of hourly steps to forecast: 61 days (2020-11-09 .. 2021-01-08) x 24.
n_forecast_hours = 1464

# Submission dates, stored as 'YYYY-MM-DD' strings so they can be used as a
# merge key against per-day aggregates.
submission_df = pd.DataFrame()
submission_df['DateTime'] = pd.date_range('2020-11-09', '2021-01-08')
submission_df['DateTime'] = submission_df['DateTime'].dt.strftime('%Y-%m-%d')

# Prophet expects the columns to be named 'ds' and 'y'. `rename` returns a new
# frame, which avoids assigning column labels onto a slice of `df`
# (SettingWithCopyWarning).
df2 = df[['DateTime', '사용자']].rename(columns={'DateTime': 'ds', '사용자': 'y'})

model = Prophet()
model.fit(df2)

# Hourly forecast -> try averaging it before submitting (error correction is
# still to be done).
future = model.make_future_dataframe(periods=n_forecast_hours, freq='H')
fcst = model.predict(future)
# The future rows are the last `n_forecast_hours` rows of the forecast;
# selecting them from the end avoids hard-coding the history length.
pred_df = fcst.iloc[-n_forecast_hours:]

In [54]:
# Fit one SARIMAX model per target column, forecast the hourly horizon,
# aggregate the hourly forecasts to daily sums and merge them into
# `submission_df` under the column's own name.

n_train = len(df)        # index of the first out-of-sample step
horizon = len(pred_df)   # number of hourly steps covered by `pred_df`

# Day label ('YYYY-MM-DD') of every forecast hour. It is identical for all
# columns, so it is computed once outside the loop.
day_labels = pred_df['ds'].dt.strftime('%Y-%m-%d').reset_index(drop=True)

predict_ls = []
for col in df.columns[1:]:
    # NOTE(review): the seasonal period is 12 although the forecast frame is
    # hourly (freq='H' in the Prophet cell) -- confirm 12 is intended.
    fit = sm.tsa.statespace.SARIMAX(
        df[col],
        order=(1, 1, 1),
        seasonal_order=(1, 1, 1, 12),
    ).fit()
    predict = fit.predict(
        start=n_train,
        end=n_train + horizon - 1,
        dynamic=True,
    ).tolist()
    predict_ls.append(predict)

    # Hourly forecast -> daily total for this column.
    hourly_fcst = pd.DataFrame({'DateTime': day_labels, col: predict})
    daily_fcst = hourly_fcst.groupby('DateTime')[col].sum().reset_index(name=col)

    # Drop a column left over from a previous run of this cell so re-running
    # replaces it instead of producing '_x' / '_y' suffixed duplicates.
    submission_df = pd.merge(
        submission_df.drop(columns=[col], errors='ignore'),
        daily_fcst,
        on='DateTime',
        how='left',
    )

In [55]:
# Display the submission frame after the per-column daily forecasts were merged in.
submission_df

Unnamed: 0,DateTime,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,3179.4295,3305.6829,819.1465,71791.6369
1,2020-11-10,3341.9845,3450.8899,836.6550,75967.2926
2,2020-11-11,3376.5166,3486.1573,840.5242,76270.4711
3,2020-11-12,3399.1182,3513.5876,844.2555,76380.0670
4,2020-11-13,3420.4194,3540.2517,847.9849,76476.5444
...,...,...,...,...,...
56,2021-01-04,4518.9272,4922.0639,1041.9115,81440.1022
57,2021-01-05,4540.0519,4948.6370,1045.6408,81535.5537
58,2021-01-06,4561.1766,4975.2101,1049.3702,81631.0052
59,2021-01-07,4582.3013,5001.7832,1053.0995,81726.4567


In [57]:
# Write the SARIMA submission to a file whose name carries the current
# timestamp (second resolution).
time_str = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
sarima_csv_path = os.path.join(sub_path, f'submission_sarima_{time_str}.csv')
submission_df.to_csv(sarima_csv_path, index=False, encoding='cp949')

# ensemble

In [60]:
# arima + prophet

# Read back the two submissions to blend. The SARIMA file name is rebuilt
# from `time_str`, so `time_str` must still hold the timestamp that was used
# when that file was written.
sarima_submission_path = os.path.join(sub_path, f'submission_sarima_{time_str}.csv')
prophet_submission_path = os.path.join(
    sub_path, 'submission_prophet_2020_12_21_00_53_00.csv'
)

am = pd.read_csv(sarima_submission_path, encoding='cp949')
pp = pd.read_csv(prophet_submission_path, encoding='cp949')

In [67]:
# Equal-weight blend of the two submissions: every column after the first
# (the date column) is halved and the halves are added together.
prophet_half = pp.iloc[:, 1:].mul(0.5)
sarima_half = am.iloc[:, 1:].mul(0.5)
eb = prophet_half.add(sarima_half)

# Put the Prophet submission's date column back in front of the blended values.
sub = pd.concat([pp[['DateTime']], eb], axis=1)

In [70]:
# Write the blended submission to a file whose name carries the current
# timestamp (second resolution). This reassigns `time_str`.
time_str = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
ensemble_csv_path = os.path.join(sub_path, f'submission_ensemble_{time_str}.csv')
sub.to_csv(ensemble_csv_path, index=False, encoding='cp949')