# When we have to do log transformation?

## find threshold

### define functions

In [25]:
import pandas as pd
import functools
from sklearn.preprocessing import QuantileTransformer, PowerTransformer ,MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew
from util.build_dataset import build_origin_fmp


# transformer
def log_transform(df):
    return np.log1p(df)

def sqrt_transform(df):
    return np.sqrt(df)


log = log_transform
sqrt = sqrt_transform
standard = StandardScaler()

t_names_X = ['log', 'None']
t_names_y = ['log', 'standard', 'None']
transformers_X = [log, 'None']
transformers_y = [log, standard, 'None']



def get_skews(df):
    return df.apply(lambda x : skew(x))

def sum_corr(df):
    # default: method=pearson, min_periods=1
    # method{‘pearson’, ‘kendall’, ‘spearman’} 
    corr = df.corr()
    return abs(corr['price'].drop('price')).sum()

def transform(transformer, df):
    if isinstance(transformer, TransformerMixin):
        return pd.DataFrame(transformer.fit_transform(df), columns=df.columns)
    elif transformer == 'None':
        return df
    else: 
        return transformer(df)

def split_xy(df):
    X = df.drop("price" ,axis=1)
    y = df['price']
    return X, y


def corr_xy(x, y):
    corr = pd.concat([x,y] ,axis=1).corr()
    return abs(corr['price']).drop('price').sum()

def search_transformers(column, X: pd.DataFrame, y: pd.Series):
    """
        iterate transformer for X and compare with y (corr_xy)
    """
    x = X[column]
    l_tx = list(map(functools.partial(transform, df=pd.DataFrame(x)), transformers_X))

    l_coef = list(map(functools.partial(corr_xy, y=y), l_tx))
    
    # find max coef and index 
    max_coef = max(l_coef)
    max_index = l_coef.index(max_coef)
    
    proper_transformer = t_names_X[max_index]
    return proper_transformer, max_coef


                
def iterate_x(y: pd.Series, X: pd.DataFrame):
    # iterate X
    return list(map(functools.partial(search_transformers, X=X, y=y), X.columns.tolist()))

def grid_search(X: pd.DataFrame, y: pd.Series):
    """
    return: result grid, pd DataFrame
    """
    l_ty = list(map(functools.partial(transform, df=pd.DataFrame(y)), transformers_y))
    print(l_ty)
    
    # iterate y
    result = list(map(functools.partial(iterate_x, X=X), l_ty))
    return pd.DataFrame(result, columns=X.columns, index=t_names_y)

def get_final_df(df):
    sum_df = pd.DataFrame(np.array(df.values.tolist())[:, :, 1], df.index, df.columns).astype("float").sum(axis=1).rename("corr_sum")
    transformer_df = pd.DataFrame(np.array(df.values.tolist())[:, :, 0], df.index, df.columns)
    return pd.concat([sum_df, transformer_df], axis=1)



# main: pipeline
def pipeline(df):
    # have to make numeric df
    numeric_df = df.select_dtypes(exclude=['object', 'datetime64[ns]'])
    X, y = split_xy(numeric_df)
    result_df = grid_search(X, y)
    return get_final_df(result_df),sum_corr(numeric_df), get_skews(X)

In [26]:
origin_df = build_origin_fmp(date="201908", prefix='clean')

2020-04-09 03:37:32:manager:<INFO> 1 files is loaded from public_data/open_data_raw_material_price/origin/csv/201908.csv in s3 'production-bobsim'
2020-04-09 03:37:32:core:<INFO> missing values: 
 None
2020-04-09 03:37:34:manager:<INFO> 1 files is loaded from public_data/open_data_terrestrial_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'
2020-04-09 03:37:36:manager:<INFO> 1 files is loaded from public_data/open_data_marine_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'


In [27]:
result08, ori_corr08, skews08 = pipeline(origin_df)

[          price
0      5.476464
1      7.280008
2      6.807198
3      7.848153
4      7.365813
...         ...
32818  7.356918
32819  7.062192
32820  6.572283
32821  6.965080
32822  7.609697

[32823 rows x 1 columns],           price
0     -0.656426
1     -0.177054
2     -0.393273
3      0.261974
4     -0.125637
...         ...
32818 -0.131174
32819 -0.289382
32820 -0.468158
32821 -0.332099
32822  0.047074

[32823 rows x 1 columns],              price
0       238.000000
1      1450.000000
2       903.333333
3      2560.000000
4      1580.000000
...            ...
32818  1566.000000
32819  1166.000000
32820   714.000000
32821  1058.000000
32822  2016.666667

[32823 rows x 1 columns]]


In [28]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
print(ori_corr08)
result08.T

0.05087778034899348


Unnamed: 0,log,standard,None
corr_sum,0.0333295,0.0516301,0.0516301
t_dur_preci,,,
t_daily_preci,,log,log
t_temper_avg,,log,log
t_temper_lowest,log,,
t_temper_high,,log,log
t_wind_spd_max,log,,
t_wind_spd_avg,log,,
t_rel_hmd_min,log,,
t_rel_hmd_avg,,,


In [29]:
temp_df = result08.reset_index().rename(columns={"index": "price"})

### standard(for 

In [30]:
temp_df[temp_df['price']=='standard'].T.rename(columns={1:"transformer"})

Unnamed: 0,transformer
price,standard
corr_sum,0.0516301
t_dur_preci,
t_daily_preci,log
t_temper_avg,log
t_temper_lowest,
t_temper_high,log
t_wind_spd_max,
t_wind_spd_avg,
t_rel_hmd_min,


In [12]:
skews08

평균기온(°C)         -0.449520
최저기온(°C)         -0.560311
최고기온(°C)         -0.641829
최대 풍속(m/s)        0.341646
평균 풍속(m/s)_x      0.750572
최소 상대습도(pct)      0.565822
평균 상대습도(pct)_x    0.673415
강수 계속시간(hr)       0.864804
일강수량(mm)          1.109568
평균 풍속(m/s)_y      0.097296
평균기압(hPa)        -1.146476
평균 상대습도(pct)_y   -1.212847
평균 기온(°C)        -0.860780
평균 수온(°C)        -0.919219
평균 최대 파고(m)       0.648147
평균 유의 파고(m)       0.735840
최고 유의 파고(m)       0.643821
최고 최대 파고(m)       0.427031
평균 파주기(sec)       0.761779
최고 파주기(sec)       0.650867
dtype: float64

In [19]:
result07, ori_corr07, skews07 = pipeline(date="201907")

2020-04-02 04:47:19:manager:<INFO> 1 files is loaded from public_data/open_data_terrestrial_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'
2020-04-02 04:47:19:core:<INFO> 강수 계속시간(hr)    1
dtype: int64
2020-04-02 04:47:28:manager:<INFO> 1 files is loaded from public_data/open_data_marine_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'
2020-04-02 04:47:28:core:<INFO> Series([], dtype: int64)
2020-04-02 04:48:25:manager:<INFO> 1 files is loaded from public_data/open_data_raw_material_price/origin/csv/201907.csv in s3 'production-bobsim'
2020-04-02 04:48:26:core:<INFO> no missing value at raw material price


In [20]:
print(ori_corr07)
result07.reset_index().rename(columns={"index": "price"})

0.05504804034174497


Unnamed: 0,price,sum,평균기온(°C),최저기온(°C),최고기온(°C),최대 풍속(m/s),평균 풍속(m/s)_x,최소 상대습도(pct),평균 상대습도(pct)_x,강수 계속시간(hr),일강수량(mm),평균 풍속(m/s)_y,평균기압(hPa),평균 상대습도(pct)_y,평균 기온(°C),평균 수온(°C),평균 최대 파고(m),평균 유의 파고(m),최고 유의 파고(m),최고 최대 파고(m),평균 파주기(sec),최고 파주기(sec)
0,log,0.068223,,log,log,,,log,log,log,log,log,log,log,log,log,log,log,,log,log,log
1,,0.061753,log,log,log,,log,log,log,log,log,log,,log,log,log,log,log,log,log,log,log


In [25]:
skews07.to_frame().T

Unnamed: 0,평균기온(°C),최저기온(°C),최고기온(°C),최대 풍속(m/s),평균 풍속(m/s)_x,최소 상대습도(pct),평균 상대습도(pct)_x,강수 계속시간(hr),일강수량(mm),평균 풍속(m/s)_y,평균기압(hPa),평균 상대습도(pct)_y,평균 기온(°C),평균 수온(°C),평균 최대 파고(m),평균 유의 파고(m),최고 유의 파고(m),최고 최대 파고(m),평균 파주기(sec),최고 파주기(sec)
0,0.165048,0.279966,-0.440246,0.529074,0.456861,-0.400158,-0.386126,0.957824,0.80754,0.191083,0.504193,-0.85356,0.459598,0.428036,0.601955,0.538537,1.22191,1.178716,0.777665,1.303977
