# Rule based Predict Price

## load df from s3 origin bucket

In [6]:
from util.s3_manager.manager import S3Manager

date = "201908"

manager = S3Manager(bucket_name="production-bobsim")
origin_df = manager.fetch_objects(key="public_data/open_data_raw_material_price/origin/csv/{filename}.csv".format(
            filename=date
        ))[0]

2020-03-30 12:16:03:manager:<INFO> 1 files is loaded from public_data/open_data_raw_material_price/origin/csv/201908.csv in s3 'production-bobsim'


## RMSE/MAE one day and one day after

In [8]:
filtered = origin_df[origin_df.조사구분명 == "소비자가격"].drop("조사구분명", axis=1).groupby([
            "조사일자", "조사지역명", "조사단위명",
            "표준품목명", "조사가격품목명", "표준품종명", "조사가격품종명"
        ]).mean().reset_index()

In [39]:
import functools
import pandas as pd
import numpy as np

date_list = filtered.조사일자.drop_duplicates().tolist()

def combine_categories(df: pd.DataFrame):
    """
        starting point of process
        combine categories into one category
    :return: combined pd DataFrame
    """
    return df.assign(
        품목명=lambda x: x.표준품목명 + x.조사가격품목명 + x.표준품종명 + x.조사가격품종명
    ).drop(columns=["표준품목명", "조사가격품목명", "표준품종명", "조사가격품종명"], axis=1)


def func(x, df):
    return combine_categories(df[df.조사일자 == x])[["조사일자", "품목명", "당일조사가격"]]
    
df_list = list(map(functools.partial(func, df=filtered), date_list))

In [73]:
f_df = df_list[-1]
s_df = df_list[-2]


merged = pd.merge(f_df, s_df, how='inner', on="품목명")
merged.head(5)

Unnamed: 0,조사일자_x,품목명,당일조사가격_x,조사일자_y,당일조사가격_y
0,2019-08-30,감자감자수미수미,238.0,2019-08-29,238.0
1,2019-08-30,감자감자수미수미,238.0,2019-08-29,216.666667
2,2019-08-30,감자감자수미수미,238.0,2019-08-29,210.0
3,2019-08-30,감자감자수미수미,238.0,2019-08-29,190.0
4,2019-08-30,감자감자수미수미,238.0,2019-08-29,220.0


### RMSE

In [74]:
se = merged.apply(lambda r: (r.당일조사가격_y - r.당일조사가격_x)**2, axis=1)
se

0        0.000000e+00
1        4.551111e+02
2        7.840000e+02
3        2.304000e+03
4        3.240000e+02
             ...     
39882    1.199025e+06
39883    2.635211e+06
39884    3.980025e+06
39885    3.980025e+06
39886    1.960000e+04
Length: 39887, dtype: float64

In [75]:
np.sqrt(se.mean())

2777.7423372246144

### RAE

In [76]:
ae = merged.apply(lambda r: abs(r.당일조사가격_y - r.당일조사가격_x), axis=1)
ae.mean()

1129.36896721333

In [77]:
e = merged.apply(lambda r: r.당일조사가격_y - r.당일조사가격_x, axis=1)
e.mean()

14.430994344738169

## last year's inflation rate (2.8%)

In [78]:
inflation_rate = 1 + 0.028/365

se1 = merged.apply(lambda r: (r.당일조사가격_y - r.당일조사가격_x*inflation_rate) **2, axis=1)
print("RMSE: ", np.sqrt(se1.mean()))

ae1 = merged.apply(lambda r: abs(r.당일조사가격_y - r.당일조사가격_x*inflation_rate), axis=1)
print("MAE: ", ae1.mean())

RMSE:  2777.842454336399
MAE:  1129.4427524424468
