In [82]:
from data_pipeline.open_data_raw_material_price.core import OpenDataRawMaterialPrice
from data_pipeline.open_data_marine_weather.core import OpenDataMarineWeather
from data_pipeline.open_data_terrestrial_weather.core import OpenDataTerrestrialWeather
import pandas as pd
import functools
from sklearn.preprocessing import QuantileTransformer, PowerTransformer ,MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import TimeSeriesSplit

# transformer
def log_transform(df):
    return np.log1p(df)

def sqrt_transform(df):
    return np.sqrt(df)



log = log_transform
sqrt = sqrt_transform
min_max = MinMaxScaler()
standard = StandardScaler()
robust = RobustScaler()
max_abs = MaxAbsScaler()
normal = Normalizer()
quantile = QuantileTransformer()
box_cox = PowerTransformer(method='box-cox')
yeo_johnson = PowerTransformer(method='yeo-johnson')

t_names = ['log', 'sqrt', 'min_max', 'standard', 'robust', 'max_abs', 'quantile', 'yeo_johnson', 'None']
transformers = [log, sqrt, min_max, standard, robust, max_abs, quantile, yeo_johnson, 'None']


In [92]:

def sum_corr(df):
    # default: method=pearson, min_periods=1
    # method{‘pearson’, ‘kendall’, ‘spearman’} 
    corr = df.corr()
    return abs(corr['당일조사가격'].drop('당일조사가격')).sum()

def analyze_skew(df):
    return get_skews(df)

def analyze_coef(df):
    return sum_coef(df)

def transform(transformer, df):
    if isinstance(transformer, TransformerMixin):
        return pd.DataFrame(transformer.fit_transform(df), columns=df.columns)
    elif transformer == 'None':
        return df
    else: 
        return transformer(df)


def build_dataset(date="201908"):
    t = OpenDataTerrestrialWeather(
        date=date
    )
    t_df = t.clean(t.filter(t.input_df))

    m = OpenDataMarineWeather(
        date=date
    )
    m_df = m.clean(m.filter(m.input_df))

    p = OpenDataRawMaterialPrice(
        date=date
    )
    p_df = p.clean(p.filter(p.input_df))
    print(p_df)


    w_df = pd.merge(
                t_df, m_df,
                how='inner', on="일시"
            )
    
    origin_df = pd.merge(
                    p_df, w_df, how="inner", left_on="조사일자", right_on="일시"
                ).drop("일시", axis=1).astype(dtype={"조사일자": "datetime64"})

    return origin_df

def split_xy(df):
    X = df.drop("당일조사가격" ,axis=1)
    y = df['당일조사가격'].rename('price')
    return X, y

def corr_xy(x, y):
    corr = pd.concat([x,y] ,axis=1).corr()
    return abs(corr['price']).drop('price').sum()

def search_transformers(column, X: pd.DataFrame, y: pd.Series):
    """
        iterate transformer for X and compare with y (corr_xy)
    """
    x = X[column]
    l_tx = list(map(functools.partial(transform, df=pd.DataFrame(x)), transformers))

    l_coef = list(map(functools.partial(corr_xy, y=y), l_tx))
 
    # find max coef and index 
    max_coef = max(l_coef)
    max_index = l_coef.index(max_coef)
    transformed_column = l_tx[max_index]
    
    # proper_transformer = t_names[max_index]
    return transformed_column
                
def iterate_x(y: pd.Series, X: pd.DataFrame):
    # iterate X
    return pd.concat([pd.concat(list(map(functools.partial(search_transformers, X=X, y=y), X.columns.tolist())), axis=1), y], axis=1)

def grid_search(X: pd.DataFrame, y: pd.Series):
    """
    return: result grid, pd DataFrame
    """
    l_ty = list(map(functools.partial(transform, df=pd.DataFrame(y)), transformers))
    
    # iterate y
    result = list(map(functools.partial(iterate_x, X=X), l_ty))
    #print(result)
    return result


def customized_rmse(y, y_pred):
    error = y - y_pred
    def penalize(x):
        if x > 0:
            # if y > y_pred, penalize 10%
            return x * 1.1
        else:
            return x
    X = np.vectorize(penalize)(error)
    return np.sqrt(np.square(X).mean())

def set_train_test(df:pd.DataFrame):
    """
        TODO: search grid to find proper train test volume
    :param df: dataset
    :return: train Xy, test Xy
    """
    predict_days = 7
    # TODO: it should be processed in data_pipeline
    reversed_time = df["조사일자"].drop_duplicates().sort_values(ascending=False).tolist()
    standard_date = reversed_time[predict_days]

    train = df[df.조사일자.dt.date < standard_date]
    test = df[df.조사일자.dt.date >= standard_date]
    return train, test



def f(df:pd.DataFrame):
    X = df.drop(columns =["price","조사일자"])
    y = df['price']
    return X, y

def inverser_transform(step, x, transformer):
    """
            log = log_transform
            sqrt = sqrt_transform
            min_max = MinMaxScaler()
            standard = StandardScaler()
            robust = RobustScaler()
            max_abs = MaxAbsScaler()
            normal = Normalizer()
            quantile = QuantileTransformer()
            box_cox = PowerTransformer(method='box-cox')
            yeo_johnson = PowerTransformer(method='yeo-johnson')
    
    """
#     inverse_log(Y) = np.e**Y
#     inverse_sqrt(x) = ""
#     inverse_min_max(x) = min_max.inverse_transform
    if step == 'log':
        return np.expm1(x)
    elif step == 'sqrt':
        return x**2
    else:
        transformer.inverse_transform(x)

    
def grid_search_matrix(df:pd.DataFrame, c, t_name, transformer):
    """
    params: df(already gridsearched df about tansfomer)
    return: score(rmse)
    """


    print(pd.concat([c,df], axis=1))
    train, test = set_train_test(pd.concat([c,df], axis=1))
    train_X, train_y = f(train)
    test_X, test_y = f(test)
    enet = ElasticNet()
    tscv = TimeSeriesSplit(n_splits=2)
    parametersGrid = {"max_iter": [1, 5, 10],
                      "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                      "l1_ratio": np.arange(0.0, 1.0, 0.1)}
    
    grid = GridSearchCV(enet, parametersGrid, scoring=make_scorer(customized_rmse, greater_is_better=False), cv=tscv.split(train_X)  )
    grid.fit(train_X,train_y)
    
    pred_y = grid.predict(test_X)
    
    r_test = inverser_transform(t_name, test_y.to_numpy().reshape(-1,1), transformer)
    r_pred = inverser_transform(t_name, pred_y.reshape(-1,1), transformer)
    return customized_rmse(r_test, r_pred)
    

def get_final_df(df):
    sum_df = pd.DataFrame(np.array(df.values.tolist())[:, :, 1], df.index, df.columns).astype("float").sum(axis=1).rename("sum")
    transformer_df = pd.DataFrame(np.array(df.values.tolist())[:, :, 0], df.index, df.columns)
    return pd.concat([sum_df, transformer_df], axis=1)


# main: pipeline
def pipeline(date="201908"):
    origin_df = build_dataset(date=date)
    numeric_df = origin_df.select_dtypes(exclude=['object', 'datetime64[ns]'])
    X, y = split_xy(numeric_df)
    return grid_search(X, y), origin_df['조사일자']
    
   # rmse = grid_search_matrix()
    
    # return get_final_df(result_df),sum_corr(numeric_df), rmse
    

In [None]:
result, date_series = pipeline("201908")

In [88]:
result[1]

Unnamed: 0,평균기온(°C),최저기온(°C),최고기온(°C),최대 풍속(m/s),평균 풍속(m/s)_x,최소 상대습도(pct),평균 상대습도(pct)_x,강수 계속시간(hr),일강수량(mm),평균 풍속(m/s)_y,...,평균 상대습도(pct)_y,평균 기온(°C),평균 수온(°C),평균 최대 파고(m),평균 유의 파고(m),최고 유의 파고(m),최고 최대 파고(m),평균 파주기(sec),최고 파주기(sec),price
0,3.368964,1.078399,3.517529,0.548048,0.704989,0.405388,0.876138,0.549556,0.518922,1.534968,...,4.522468,0.259760,3.247635,0.168669,0.168669,0.120621,0.120621,0.000000,-2.264291,15.427249
1,3.368964,1.078399,3.517529,0.548048,0.704989,0.405388,0.876138,0.549556,0.518922,1.534968,...,4.522468,0.259760,3.247635,0.168669,0.168669,0.120621,0.120621,0.000000,-2.264291,38.078866
2,3.368964,1.078399,3.517529,0.548048,0.704989,0.405388,0.876138,0.549556,0.518922,1.534968,...,4.522468,0.259760,3.247635,0.168669,0.168669,0.120621,0.120621,0.000000,-2.264291,31.464265
3,3.368964,1.078399,3.517529,0.548048,0.704989,0.405388,0.876138,0.549556,0.518922,1.534968,...,4.522468,0.259760,3.247635,0.168669,0.168669,0.120621,0.120621,0.000000,-2.264291,32.403703
4,3.368964,1.078399,3.517529,0.548048,0.704989,0.405388,0.876138,0.549556,0.518922,1.534968,...,4.522468,0.259760,3.247635,0.168669,0.168669,0.120621,0.120621,0.000000,-2.264291,25.884358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40507,3.163273,-1.671776,3.365129,0.881882,0.811321,0.000000,0.816002,-1.290556,-0.894013,1.846136,...,4.285613,0.165165,3.268380,0.452452,0.501001,0.382382,0.358358,0.073857,-0.745060,39.572718
40508,3.163273,-1.671776,3.365129,0.881882,0.811321,0.000000,0.816002,-1.290556,-0.894013,1.846136,...,4.285613,0.165165,3.268380,0.452452,0.501001,0.382382,0.358358,0.073857,-0.745060,34.146742
40509,3.163273,-1.671776,3.365129,0.881882,0.811321,0.000000,0.816002,-1.290556,-0.894013,1.846136,...,4.285613,0.165165,3.268380,0.452452,0.501001,0.382382,0.358358,0.073857,-0.745060,26.720778
40510,3.163273,-1.671776,3.365129,0.881882,0.811321,0.000000,0.816002,-1.290556,-0.894013,1.846136,...,4.285613,0.165165,3.268380,0.452452,0.501001,0.382382,0.358358,0.073857,-0.745060,32.526912


In [93]:
idx = 0
for X in result:
    t_name = t_names[idx]
    grid_search_matrix(X, c=date_series, t_name=t_name, transformer = transformers[idx])
    idx += 1

            조사일자  평균기온(°C)  최저기온(°C)  최고기온(°C)  최대 풍속(m/s)  평균 풍속(m/s)_x  \
0     2019-08-01  0.736236  0.881381  0.736236    0.548048      0.479174   
1     2019-08-01  0.736236  0.881381  0.736236    0.548048      0.479174   
2     2019-08-01  0.736236  0.881381  0.736236    0.548048      0.479174   
3     2019-08-01  0.736236  0.881381  0.736236    0.548048      0.479174   
4     2019-08-01  0.736236  0.881381  0.736236    0.548048      0.479174   
...          ...       ...       ...       ...         ...           ...   
40507 2019-08-30  0.070070  0.000000  0.165165    0.881882      1.047222   
40508 2019-08-30  0.070070  0.000000  0.165165    0.881882      1.047222   
40509 2019-08-30  0.070070  0.000000  0.165165    0.881882      1.047222   
40510 2019-08-30  0.070070  0.000000  0.165165    0.881882      1.047222   
40511 2019-08-30  0.070070  0.000000  0.165165    0.881882      1.047222   

       최소 상대습도(pct)  평균 상대습도(pct)_x  강수 계속시간(hr)  일강수량(mm)  ...  \
0          0.549049 

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


            조사일자  평균기온(°C)  최저기온(°C)  최고기온(°C)  최대 풍속(m/s)  평균 풍속(m/s)_x  \
0     2019-08-01  3.368964  1.078399  3.517529    0.548048      0.704989   
1     2019-08-01  3.368964  1.078399  3.517529    0.548048      0.704989   
2     2019-08-01  3.368964  1.078399  3.517529    0.548048      0.704989   
3     2019-08-01  3.368964  1.078399  3.517529    0.548048      0.704989   
4     2019-08-01  3.368964  1.078399  3.517529    0.548048      0.704989   
...          ...       ...       ...       ...         ...           ...   
40507 2019-08-30  3.163273 -1.671776  3.365129    0.881882      0.811321   
40508 2019-08-30  3.163273 -1.671776  3.365129    0.881882      0.811321   
40509 2019-08-30  3.163273 -1.671776  3.365129    0.881882      0.811321   
40510 2019-08-30  3.163273 -1.671776  3.365129    0.881882      0.811321   
40511 2019-08-30  3.163273 -1.671776  3.365129    0.881882      0.811321   

       최소 상대습도(pct)  평균 상대습도(pct)_x  강수 계속시간(hr)  일강수량(mm)  ...  \
0          0.405388 

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


            조사일자  평균기온(°C)  최저기온(°C)  최고기온(°C)  최대 풍속(m/s)  평균 풍속(m/s)_x  \
0     2019-08-01  3.368964  1.078399  3.517529    0.548048      1.292081   
1     2019-08-01  3.368964  1.078399  3.517529    0.548048      1.292081   
2     2019-08-01  3.368964  1.078399  3.517529    0.548048      1.292081   
3     2019-08-01  3.368964  1.078399  3.517529    0.548048      1.292081   
4     2019-08-01  3.368964  1.078399  3.517529    0.548048      1.292081   
...          ...       ...       ...       ...         ...           ...   
40507 2019-08-30  3.163273 -1.671776  3.365129    0.881882      1.386101   
40508 2019-08-30  3.163273 -1.671776  3.365129    0.881882      1.386101   
40509 2019-08-30  3.163273 -1.671776  3.365129    0.881882      1.386101   
40510 2019-08-30  3.163273 -1.671776  3.365129    0.881882      1.386101   
40511 2019-08-30  3.163273 -1.671776  3.365129    0.881882      1.386101   

       최소 상대습도(pct)  평균 상대습도(pct)_x  강수 계속시간(hr)  일강수량(mm)  ...  \
0          0.405388 

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'

In [None]:
a = list(map(functools.partial(grid_search_matrix, c=date_series), result))
a