# When do we have to apply a log transformation?

## find threshold

### define functions

In [28]:
import pandas as pd
import functools
from sklearn.preprocessing import QuantileTransformer, PowerTransformer ,MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew
from util.build_dataset import build_master


# transformer
def log_transform(df):
    """Return ``log(1 + x)`` applied element-wise to *df* via ``np.log1p``."""
    logged = np.log1p(df)
    return logged

def sqrt_transform(df):
    """Return the element-wise square root of *df* via ``np.sqrt``."""
    rooted = np.sqrt(df)
    return rooted


# Short aliases for the candidate transformations. `sqrt` is defined but is
# not part of any candidate list below.
log, sqrt = log_transform, sqrt_transform
standard = StandardScaler()

# Parallel lists: position i of a `t_names_*` list labels position i of the
# matching `transformers_*` list. The string 'None' is the sentinel that
# transform() interprets as "leave the data unchanged".
t_names_X, transformers_X = ['log', 'None'], [log, 'None']
t_names_y, transformers_y = ['None', 'standard'], ['None', standard]



def get_skews(df):
    """Return the ``scipy.stats.skew`` value of every column of *df*.

    The result is what ``DataFrame.apply`` yields for a scalar-returning
    function: one entry per column, labelled by column name.
    """
    return df.apply(skew)

def sum_corr(df):
    """Sum the absolute correlations between 'price' and every other column.

    The correlation matrix comes from ``df.corr()`` with its defaults
    (method='pearson', min_periods=1; 'kendall' and 'spearman' are the other
    supported methods). The self-correlation of 'price' is excluded.
    """
    price_corr = df.corr()['price']
    return price_corr.drop('price').abs().sum()

def transform(transformer, df):
    """Apply one candidate transformation to *df* and return the result.

    Parameters
    ----------
    transformer
        One of:
        * an object exposing ``fit_transform`` (e.g. a scikit-learn scaler):
          it is fitted on *df* and its output is wrapped back into a
          DataFrame carrying *df*'s column labels and index;
        * the string ``'None'``: *df* is returned unchanged;
        * any other callable: the result of ``transformer(df)`` is returned.
    df : pd.DataFrame
        Data to transform.

    Returns
    -------
    The transformed data (a DataFrame for the ``fit_transform`` branch,
    *df* itself for ``'None'``, otherwise whatever the callable returns).
    """
    # Duck-type on fit_transform so any estimator-like object is handled,
    # not only subclasses of TransformerMixin.
    if hasattr(transformer, "fit_transform"):
        # Preserve the original index: callers later combine the result with
        # other frames through pd.concat(axis=1), which aligns rows by index.
        # Without it the result would get a fresh 0..n-1 index and silently
        # misalign whenever df is not indexed 0..n-1.
        return pd.DataFrame(
            transformer.fit_transform(df),
            columns=df.columns,
            index=df.index,
        )
    if transformer == 'None':
        return df
    return transformer(df)

def split_xy(df):
    """Split *df* into features and target.

    Returns a tuple ``(X, y)`` where ``X`` is *df* without the 'price'
    column and ``y`` is the 'price' column itself.
    """
    return df.drop(columns="price"), df['price']


def corr_xy(x, y):
    """Sum the absolute correlations between 'price' and the columns of *x*.

    *x* and *y* are joined column-wise with ``pd.concat(axis=1)``; the joined
    frame must contain a column labelled 'price', whose self-correlation is
    excluded from the sum.
    """
    joined = pd.concat([x, y], axis=1)
    price_corr = joined.corr()['price']
    return price_corr.abs().drop('price').sum()

def search_transformers(column, X: pd.DataFrame, y: pd.Series):
    """
        Try every transformer in ``transformers_X`` on one feature column and
        keep the one whose output scores highest with ``corr_xy`` against y.

        Returns a tuple ``(transformer_name, score)`` where the name comes
        from ``t_names_X``. On ties the earliest candidate wins.
    """
    feature = pd.DataFrame(X[column])

    # Transform first, then score, one entry per candidate in list order.
    candidates = [transform(t, df=feature) for t in transformers_X]
    scores = [corr_xy(candidate, y=y) for candidate in candidates]

    best_score = max(scores)
    best_name = t_names_X[scores.index(best_score)]
    return best_name, best_score


                
def iterate_x(y: pd.Series, X: pd.DataFrame):
    """Run ``search_transformers`` for every column of X against y.

    Returns a list with one ``(transformer_name, score)`` tuple per column,
    in column order.
    """
    return [search_transformers(col, X=X, y=y) for col in X.columns.tolist()]

def grid_search(X: pd.DataFrame, y: pd.Series):
    """
    Evaluate every target transformation against every feature column.

    return: result grid, pd DataFrame with one row per entry of ``t_names_y``
    and one column per column of X; each row holds the per-column results
    produced by ``iterate_x``.
    """
    target = pd.DataFrame(y)
    l_ty = [transform(t, df=target) for t in transformers_y]
    print(l_ty)

    # One grid row per transformed target.
    rows = [iterate_x(transformed_y, X=X) for transformed_y in l_ty]
    return pd.DataFrame(rows, columns=X.columns, index=t_names_y)

def get_final_df(df):
    """Unpack a grid of ``(transformer_name, score)`` cells into a flat table.

    The cells are stacked into a 3-D array whose last axis holds the name
    (position 0) and the score (position 1). The result has a leading
    'corr_sum' column (row-wise sum of the scores cast to float) followed by
    the transformer names, keeping *df*'s index and column labels.
    """
    cells = np.array(df.values.tolist())
    names = pd.DataFrame(cells[:, :, 0], df.index, df.columns)
    scores = pd.DataFrame(cells[:, :, 1], df.index, df.columns).astype("float")
    corr_sum = scores.sum(axis=1).rename("corr_sum")
    return pd.concat([corr_sum, names], axis=1)



# main: pipeline
def pipeline(df):
    """Run the full transformer search on *df*.

    Columns of dtype 'object' and 'datetime64[ns]' are excluded first.
    Returns a 3-tuple: the flattened search result from ``get_final_df``,
    the ``sum_corr`` baseline of the filtered frame, and the per-feature
    skewness from ``get_skews``.
    """
    # Only the remaining (non-object, non-datetime) columns are analysed.
    numeric_df = df.select_dtypes(exclude=['object', 'datetime64[ns]'])
    X, y = split_xy(numeric_df)

    final_df = get_final_df(grid_search(X, y))
    baseline_corr = sum_corr(numeric_df)
    skews = get_skews(X)
    return final_df, baseline_corr, skews

In [2]:
# Load the "clean_origin_fmp" dataset through the project helper build_master
# (its internals are not visible in this notebook).
origin_df = build_master(dataset="clean_origin_fmp")

2020-04-14 16:25:05:manage:<INFO> 1 files is loaded from public_data/open_data_raw_material_price/origin/csv/201908.csv in s3 'production-bobsim'
2020-04-14 16:25:05:core:<INFO> missing values: 
 None
2020-04-14 16:25:08:manage:<INFO> 1 files is loaded from public_data/open_data_terrestrial_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'
2020-04-14 16:25:09:manage:<INFO> 1 files is loaded from public_data/open_data_marine_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'


In [29]:
# Run the transformer search. pipeline() returns, in order: the result table,
# the summed |corr| with 'price' on the untransformed numeric columns, and the
# per-feature skewness.
# NOTE(review): the "08" suffix presumably refers to the 201908 data file
# named in the load log - confirm.
result08, ori_corr08, skews08 = pipeline(origin_df)

[             price
0       238.000000
1      1450.000000
2       903.333333
3      2560.000000
4      1580.000000
...            ...
32818  1566.000000
32819  1166.000000
32820   714.000000
32821  1058.000000
32822  2016.666667

[32823 rows x 1 columns],           price
0     -0.656426
1     -0.177054
2     -0.393273
3      0.261974
4     -0.125637
...         ...
32818 -0.131174
32819 -0.289382
32820 -0.468158
32821 -0.332099
32822  0.047074

[32823 rows x 1 columns]]


In [31]:
# Widen pandas' display limits so the result table is not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
print(ori_corr08)
# Transposed so each feature is a row and each target transformer a column.
result08.T

0.04909775241001403


Unnamed: 0,None,standard
corr_sum,0.0497395,0.0497395
t_dur_preci,,
t_daily_preci,log,log
t_temper_avg,log,log
t_temper_lowest,,
t_temper_high,log,log
t_wind_spd_max,,
t_wind_spd_avg,,
t_rel_hmd_min,,
t_rel_hmd_avg,,


corr_sum              0.0497395
t_dur_preci                None
t_daily_preci               log
t_temper_avg                log
t_temper_lowest            None
t_temper_high               log
t_wind_spd_max             None
t_wind_spd_avg             None
t_rel_hmd_min              None
t_rel_hmd_avg              None
m_wave_p_avg               None
m_wind_spd_avg              log
m_atm_press_avg            None
m_rel_hmd_avg              None
m_temper_avg               None
m_water_temper_avg         None
m_max_wave_h_avg           None
m_sign_wave_h_avg          None
m_sign_wave_h_high         None
m_max_wave_h_high          None
Name: None, dtype: object

In [32]:
# Move the index (the target-transformer names) into a regular column and
# label that column "price".
temp_df = result08.reset_index().rename(columns={"index": "price"})

### y: no transformation ('None')

In [39]:
# Keep only the row where the target transformer is 'None', transpose it so
# features become rows, and label the single value column "transformer".
temp_df[temp_df['price']=='None'].T.rename(columns={0:"transformer"})

Unnamed: 0,transformer
price,
corr_sum,0.0497395
t_dur_preci,
t_daily_preci,log
t_temper_avg,log
t_temper_lowest,
t_temper_high,log
t_wind_spd_max,
t_wind_spd_avg,
t_rel_hmd_min,


In [33]:
# Display the per-feature skewness returned by pipeline().
skews08

t_dur_preci           1.217864
t_daily_preci         1.359788
t_temper_avg         -0.450662
t_temper_lowest      -0.560998
t_temper_high        -0.642173
t_wind_spd_max        0.341669
t_wind_spd_avg        0.751093
t_rel_hmd_min         0.565466
t_rel_hmd_avg         0.673339
m_wave_p_avg          0.408102
m_wind_spd_avg        0.097859
m_atm_press_avg      -1.145962
m_rel_hmd_avg        -1.213207
m_temper_avg         -0.859559
m_water_temper_avg   -0.917022
m_max_wave_h_avg      0.648479
m_sign_wave_h_avg     0.736192
m_sign_wave_h_high    0.644133
m_max_wave_h_high     0.427066
dtype: float64

In [None]:
# NOTE(review): pipeline() is defined as pipeline(df) and has no `date`
# parameter, so this call would raise TypeError as written. A DataFrame for
# the intended period presumably needs to be built first and passed in -
# confirm how that data should be loaded.
result07, ori_corr07, skews07 = pipeline(date="201907")

In [None]:
# Print the baseline correlation sum, then show the result table with the
# target-transformer names moved into a column labelled "price".
print(ori_corr07)
result07.reset_index().rename(columns={"index": "price"})

In [None]:
# Show the skewness values as a single-row table (one column per feature).
skews07.to_frame().T