# Rule based Predict Price

## load df from s3 origin bucket

In [6]:
from util.s3_manager.manager import S3Manager

date = "201908"

manager = S3Manager(bucket_name="production-bobsim")
origin_df = manager.fetch_objects(key="public_data/open_data_raw_material_price/origin/csv/{filename}.csv".format(
            filename=date
        ))[0]

2020-03-30 12:16:03:manager:<INFO> 1 files is loaded from public_data/open_data_raw_material_price/origin/csv/201908.csv in s3 'production-bobsim'


## RMSE/MAE one day and one day after

In [8]:
filtered = origin_df[origin_df.조사구분명 == "소비자가격"].drop("조사구분명", axis=1).groupby([
            "조사일자", "조사지역명", "조사단위명",
            "표준품목명", "조사가격품목명", "표준품종명", "조사가격품종명"
        ]).mean().reset_index()

In [39]:
import functools
import pandas as pd
import numpy as np

date_list = filtered.조사일자.drop_duplicates().tolist()

def combine_categories(df: pd.DataFrame):
    """
        starting point of process
        combine categories into one category
    :return: combined pd DataFrame
    """
    return df.assign(
        품목명=lambda x: x.표준품목명 + x.조사가격품목명 + x.표준품종명 + x.조사가격품종명
    ).drop(columns=["표준품목명", "조사가격품목명", "표준품종명", "조사가격품종명"], axis=1)


def func(x, df):
    return combine_categories(df[df.조사일자 == x])[["조사일자", "품목명", "당일조사가격"]]
    
df_list = list(map(functools.partial(func, df=filtered), date_list))

In [45]:
f_df = df_list[1]
s_df = df_list[2]


merged = pd.merge(f_df, s_df, how='inner', on="품목명")
merged.head(5)

Unnamed: 0,조사일자_x,품목명,당일조사가격_x,조사일자_y,당일조사가격_y
0,2019-08-02,감자감자수미수미,238.0,2019-08-05,238.0
1,2019-08-02,감자감자수미수미,238.0,2019-08-05,203.333333
2,2019-08-02,감자감자수미수미,238.0,2019-08-05,210.0
3,2019-08-02,감자감자수미수미,238.0,2019-08-05,190.0
4,2019-08-02,감자감자수미수미,238.0,2019-08-05,195.0


### RMSE

In [46]:
se = merged.apply(lambda r: (r.당일조사가격_x - r.당일조사가격_y) **2, axis=1)
se

0        0.000000e+00
1        1.201778e+03
2        7.840000e+02
3        2.304000e+03
4        1.849000e+03
             ...     
41444    2.310400e+06
41445    4.624000e+05
41446    2.250000e+06
41447    0.000000e+00
41448    3.062500e+06
Length: 41449, dtype: float64

In [47]:
np.sqrt(se.mean())

2756.3840051391344

### RAE

In [49]:
ae = merged.apply(lambda r: abs(r.당일조사가격_x - r.당일조사가격_y), axis=1)
ae.mean()

1145.5258745400256

In [60]:
e = merged.apply(lambda r: r.당일조사가격_x - r.당일조사가격_y, axis=1)
e.mean()

-26.479058314922877

## last year's inflation rate (2.8%)

In [62]:
inflation_rate = 1 + 0.028/365

se1 = merged.apply(lambda r: (r.당일조사가격_x*inflation_rate - r.당일조사가격_y) **2, axis=1)
print("RMSE: ", np.sqrt(se1.mean()))

ae1 = merged.apply(lambda r: abs(r.당일조사가격_x*inflation_rate - r.당일조사가격_y), axis=1)
print("MAE: ", ae1.mean())

RMSE:  2756.488692793273
MAE:  1145.5904849885153
