In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline

In [2]:
# Monthly card-spending history; REG_YYMM is parsed from its 'YYYYMM' form
# into a datetime column right after loading.
train = pd.read_csv("../../Data/Jeju/201901-202003.csv").assign(
    REG_YYMM=lambda frame: pd.to_datetime(frame["REG_YYMM"], format='%Y%m')
)

# Submission template whose AMT column is filled in further down.
submission = pd.read_csv("../../Data/Jeju/submission.csv")

In [3]:
# Preview the submission template loaded from submission.csv.
submission

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,0,202004,강원,건강보조식품 소매업,0
1,1,202004,강원,골프장 운영업,0
2,2,202004,강원,과실 및 채소 소매업,0
3,3,202004,강원,관광 민예품 및 선물용품 소매업,0
4,4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0
...,...,...,...,...,...
1389,1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,0
1390,1390,202007,충북,한식 음식점업,0
1391,1391,202007,충북,호텔업,0
1392,1392,202007,충북,화장품 및 방향제 소매업,0


In [4]:
def train_change(location):
    """Build a month-by-industry table of total spending for one region.

    Rows of the module-level ``train`` frame whose ``CARD_SIDO_NM`` equals
    ``location`` are summed over ``AMT`` for every
    (``REG_YYMM``, ``STD_CLSS_NM``) pair, and the result is reshaped so that
    each industry becomes its own column.

    Parameters
    ----------
    location
        Value compared against ``train["CARD_SIDO_NM"]``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``REG_YYMM`` found anywhere in ``train`` (not
        only in this region), sorted in ascending order, and one column per
        ``STD_CLSS_NM`` observed for ``location``. Cells for which the region
        has no rows are NaN.
    """
    in_region = train["CARD_SIDO_NM"] == location

    # A single aggregation followed by unstack replaces the per-industry
    # filter + groupby of already aggregated data.
    totals = (
        train.loc[in_region]
        .groupby(["REG_YYMM", "STD_CLSS_NM"])["AMT"]
        .sum()
    )
    wide = totals.unstack("STD_CLSS_NM")

    # Sort the months explicitly: callers join other series to these rows by
    # position, so the row order must not depend on the order of the raw file.
    all_months = pd.Index(train["REG_YYMM"].unique()).sort_values()

    # Drop the axis names left behind by groupby/unstack so the frame has
    # plain, unnamed row and column labels.
    return wide.reindex(all_months).rename_axis(index=None, columns=None)

In [38]:
# Work on an independent copy so the loaded template stays untouched.
sub = submission.copy(deep=True)

In [39]:
# Add zero-filled placeholder columns for the forecast and its bounds.
for _col in ('yhat', 'yhat_lower', 'yhat_upper'):
    sub[_col] = 0

In [40]:
# Preview the working copy after the placeholder columns were added.
sub

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT,yhat,yhat_lower,yhat_upper
0,0,202004,강원,건강보조식품 소매업,0,0,0,0
1,1,202004,강원,골프장 운영업,0,0,0,0
2,2,202004,강원,과실 및 채소 소매업,0,0,0,0
3,3,202004,강원,관광 민예품 및 선물용품 소매업,0,0,0,0
4,4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0,0,0,0
...,...,...,...,...,...,...,...,...
1389,1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,0,0,0,0
1390,1390,202007,충북,한식 음식점업,0,0,0,0
1391,1391,202007,충북,호텔업,0,0,0,0
1392,1392,202007,충북,화장품 및 방향제 소매업,0,0,0,0


In [22]:
# External regressor data, downloaded from:
# https://data.oecd.org/trade/trade-in-goods-and-services-forecast.htm
other_resource = pd.read_csv(
    "data/Trade in goods and services forecast.csv"
)

In [23]:
# Preview the external regressor table read from the OECD CSV.
other_resource

Unnamed: 0.1,Unnamed: 0,Couyntry,Value
0,2019-01-01,KOR,596000000000.0
1,2019-02-01,KOR,596000000000.0
2,2019-03-01,KOR,596000000000.0
3,2019-04-01,KOR,613000000000.0
4,2019-05-01,KOR,613000000000.0
5,2019-06-01,KOR,613000000000.0
6,2019-07-01,KOR,621000000000.0
7,2019-08-01,KOR,621000000000.0
8,2019-09-01,KOR,621000000000.0
9,2019-10-01,KOR,623000000000.0


In [24]:
# Keep only the regressor column (still as a one-column DataFrame).
other_resource = other_resource.loc[:, ['Value']]

# "Until March": every row except the last four.
# NOTE(review): presumably the four trailing rows are the forecast months
# that follow the training period - confirm against the CSV.
until_March = other_resource.head(-4)

## Korea GDP

In [49]:
from fbprophet import Prophet # ds, y
import datetime as dt

import warnings
warnings.filterwarnings("ignore")


def prophet_kor_gdp(loc):
    """Fit one Prophet model per industry of a region and store the forecasts.

    For every industry column returned by ``train_change(loc)`` (missing
    months filled with 0) a separate Prophet model is fitted on the monthly
    totals, with the module-level ``until_March['Value']`` series attached as
    an additional multiplicative regressor. The model is then extended by
    four month-start periods, using ``other_resource['Value']`` as the
    regressor for the extended frame. The ``yhat_upper`` of the first and the
    last added period is written to the ``AMT`` column of the module-level
    ``sub`` frame, for the rows labelled ``REG_YYMM`` 202004 and 202007
    respectively.

    Parameters
    ----------
    loc
        Region name; passed to ``train_change`` and matched against
        ``sub["CARD_SIDO_NM"]``.

    Returns
    -------
    None
        The function only mutates ``sub`` in place.

    Notes
    -----
    The regressor values are attached by row position (``pd.concat`` along
    ``axis=1`` on default integer indexes), not by date.
    NOTE(review): this assumes ``until_March`` has one row per training month
    and ``other_resource`` four more, in chronological order - confirm.
    """
    monthly = train_change(loc).fillna(0)

    # Capture the industry names BEFORE adding the helper 'ds' column.
    # Taking them afterwards makes the loop try to fit a model with the
    # timestamps themselves as the target.
    names = list(monthly.columns)

    monthly["ds"] = monthly.index
    monthly.reset_index(inplace=True, drop=True)

    for name in names:
        # Prophet expects the columns 'ds' (timestamp) and 'y' (target).
        history = pd.DataFrame({'ds': monthly['ds'], 'y': monthly[name]})
        history = pd.concat([history, until_March['Value']], axis=1)

        model = Prophet(seasonality_mode='multiplicative',
                        weekly_seasonality=False,
                        yearly_seasonality=True,
                        mcmc_samples=1000,
                        growth='linear',
                        changepoint_prior_scale=0.001)

        # External regressor; it must be present in both the fit and the
        # prediction frames under the same column name.
        model.add_regressor('Value', prior_scale=0.5, mode='multiplicative')

        model.fit(history)

        # History plus four additional month-start rows, with the regressor
        # values for the full range.
        future = model.make_future_dataframe(periods=4, freq='MS')
        future = pd.concat([future, other_resource['Value']], axis=1)

        forecast = model.predict(future)

        # The upper bound of the predictive interval (not yhat) is what gets
        # submitted: position -4 is the first added month, -1 the last.
        in_cell = (sub["CARD_SIDO_NM"] == loc) & (sub["STD_CLSS_NM"] == name)
        for yymm, position in ((202004, -4), (202007, -1)):
            sub.loc[in_cell & (sub["REG_YYMM"] == yymm), 'AMT'] = (
                forecast["yhat_upper"].iloc[position]
            )

In [50]:
# Every region present in the training data gets its own set of models.
location = train["CARD_SIDO_NM"].unique()

# Each call fits one MCMC model per industry, so this loop is slow.
for i in location:
    prophet_kor_gdp(i)

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 11.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 11.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 11.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 11.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:n_changepoints greater than number of observations. Using 11.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INF

KeyboardInterrupt: 

In [None]:
# Keep an independent snapshot of the forecasts as they are at this point.
sub_copy = sub.copy(deep=True)

In [29]:
# Floor negative forecasts at zero. Only the AMT column is written:
# `sub[mask] = 0` would overwrite every column of the matching rows,
# wiping out their id, REG_YYMM, region and industry values as well.
sub.loc[sub["AMT"] < 0, "AMT"] = 0

In [31]:
# The Korean text came out completely garbled, so only the values were
# copied over into the submission file as they are.
sub.to_csv(
    'submission/submission_prophet_kor_trade_amount.csv',
    encoding="CP949",
    index=False,
)

In [33]:
# Inspect the snapshot held in sub_copy.
# NOTE(review): the execution counts of these cells are out of order, so the
# displayed values may not match a fresh top-to-bottom run - confirm.
sub_copy

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,0,202004,강원,건강보조식품 소매업,-2.505293e+08
1,1,202004,강원,골프장 운영업,-8.588634e+09
2,2,202004,강원,과실 및 채소 소매업,-2.256982e+09
3,3,202004,강원,관광 민예품 및 선물용품 소매업,-6.623946e+07
4,4,202004,강원,그외 기타 분류안된 오락관련 서비스업,-6.287464e+03
...,...,...,...,...,...
1389,1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,-2.760208e+09
1390,1390,202007,충북,한식 음식점업,-4.456765e+10
1391,1391,202007,충북,호텔업,-8.430192e+07
1392,1392,202007,충북,화장품 및 방향제 소매업,-1.184648e+09
