# When we have to do log transformation?

## find threshold

### define functions

In [1]:
import pandas as pd
import functools
from sklearn.preprocessing import QuantileTransformer, PowerTransformer ,MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew
from util.build_dataset import build_master


# transformer
def log_transform(df):
    return np.log1p(df)

def sqrt_transform(df):
    return np.sqrt(df)


log = log_transform
sqrt = sqrt_transform
standard = StandardScaler()

t_names_X = ['log', 'None']
t_names_y = ['None', 'standard']
transformers_X = [log, 'None']
transformers_y = ['None', standard]



def get_skews(df):
    return df.apply(lambda x : skew(x))

def sum_corr(df):
    # default: method=pearson, min_periods=1
    # method{‘pearson’, ‘kendall’, ‘spearman’} 
    corr = df.corr()
    return abs(corr['price'].drop('price')).sum()

def transform(transformer, df):
    if isinstance(transformer, TransformerMixin):
        return pd.DataFrame(transformer.fit_transform(df), columns=df.columns)
    elif transformer == 'None':
        return df
    else: 
        return transformer(df)

def split_xy(df):
    X = df.drop("price" ,axis=1)
    y = df['price']
    return X, y


def corr_xy(x, y):
    corr = pd.concat([x,y] ,axis=1).corr()
    return abs(corr['price']).drop('price').sum()

def search_transformers(column, X: pd.DataFrame, y: pd.Series):
    """
        iterate transformer for X and compare with y (corr_xy)
    """
    x = X[column]
    l_tx = list(map(functools.partial(transform, df=pd.DataFrame(x)), transformers_X))

    l_coef = list(map(functools.partial(corr_xy, y=y), l_tx))
    
    # find max coef and index 
    max_coef = max(l_coef)
    max_index = l_coef.index(max_coef)
    
    proper_transformer = t_names_X[max_index]
    return proper_transformer, max_coef


                
def iterate_x(y: pd.Series, X: pd.DataFrame):
    # iterate X
    return list(map(functools.partial(search_transformers, X=X, y=y), X.columns.tolist()))

def grid_search(X: pd.DataFrame, y: pd.Series):
    """
    return: result grid, pd DataFrame
    """
    l_ty = list(map(functools.partial(transform, df=pd.DataFrame(y)), transformers_y))
    print(l_ty)
    
    # iterate y
    result = list(map(functools.partial(iterate_x, X=X), l_ty))
    return pd.DataFrame(result, columns=X.columns, index=t_names_y)

def get_final_df(df):
    sum_df = pd.DataFrame(np.array(df.values.tolist())[:, :, 1], df.index, df.columns).astype("float").sum(axis=1).rename("corr_sum")
    transformer_df = pd.DataFrame(np.array(df.values.tolist())[:, :, 0], df.index, df.columns)
    return pd.concat([sum_df, transformer_df], axis=1)



# main: pipeline
def pipeline(df):
    # have to make numeric df
    numeric_df = df.select_dtypes(exclude=['object', 'datetime64[ns]'])
    X, y = split_xy(numeric_df)
    result_df = grid_search(X, y)
    return get_final_df(result_df),sum_corr(numeric_df), get_skews(X)

In [2]:
origin_df = build_master(dataset="clean_origin_fmp")

2020-04-19 18:06:59:manage:<INFO> 1 files is loaded from public_data/open_data_raw_material_price/origin/csv/201908.csv in s3 'production-bobsim'
2020-04-19 18:06:59:core:<INFO> missing values: 
 None


            date region   grade standard_item_name   price
0     2019-08-01   서울서부  상(1등급)                  벼   271.5
1     2019-08-01   서울서부  상(1등급)                  벼   266.0
2     2019-08-01   서울서부  상(1등급)                  벼   253.0
5     2019-08-01     서울  상(1등급)                  벼   270.0
6     2019-08-01     서울  상(1등급)                  벼   239.5
...          ...    ...     ...                ...     ...
97404 2019-08-30   경북포항  중(2등급)             others  1124.0
97405 2019-08-30   경북안동  중(2등급)             others   960.0
97406 2019-08-30   경남창원  중(2등급)             others   950.0
97407 2019-08-30   경남창원  중(2등급)             others   880.0
97408 2019-08-30     제주  중(2등급)             others   954.0

[81737 rows x 5 columns]


2020-04-19 18:07:02:manage:<INFO> 1 files is loaded from public_data/open_data_terrestrial_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'
2020-04-19 18:07:03:manage:<INFO> 1 files is loaded from public_data/open_data_marine_weather/origin/csv/2014-2020.csv in s3 'production-bobsim'


In [3]:
result08, ori_corr08, skews08 = pipeline(origin_df)

[             price
0      3151.285714
1       238.000000
2       288.000000
3      1450.000000
4       690.000000
...            ...
23666  2344.000000
23667   765.000000
23668  7020.000000
23669  1830.000000
23670  1130.000000

[23671 rows x 1 columns],           price
0      0.492279
1     -0.589336
2     -0.570772
3     -0.139357
4     -0.421522
...         ...
23666  0.192559
23667 -0.393677
23668  1.928616
23669  0.001726
23670 -0.258163

[23671 rows x 1 columns]]


In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
print(ori_corr08)
result08.T

0.07727662373646337


Unnamed: 0,None,standard
corr_sum,0.078052,0.078052
t_dur_preci,,
t_daily_preci,log,log
t_temper_avg,log,log
t_temper_lowest,,
t_temper_high,log,log
t_wind_spd_max,,
t_wind_spd_avg,,
t_rel_hmd_min,,
t_rel_hmd_avg,,


In [9]:
skews08.sum()

2.0974950045347214

In [10]:
temp_df = result08.reset_index().rename(columns={"index": "price"})

### y None 

In [11]:
temp_df[temp_df['price']=='None'].T.rename(columns={0:"transformer"})

Unnamed: 0,transformer
price,
corr_sum,0.078052
t_dur_preci,
t_daily_preci,log
t_temper_avg,log
t_temper_lowest,
t_temper_high,log
t_wind_spd_max,
t_wind_spd_avg,
t_rel_hmd_min,


In [15]:
skews08

t_dur_preci           1.214518
t_daily_preci         1.360846
t_temper_avg         -0.444567
t_temper_lowest      -0.553368
t_temper_high        -0.638396
t_wind_spd_max        0.341917
t_wind_spd_avg        0.750654
t_rel_hmd_min         0.560732
t_rel_hmd_avg         0.669528
m_wave_p_avg          0.402587
m_wind_spd_avg        0.101123
m_atm_press_avg      -1.143436
m_rel_hmd_avg        -1.207113
m_temper_avg         -0.857826
m_water_temper_avg   -0.920824
m_max_wave_h_avg      0.648813
m_sign_wave_h_avg     0.736340
m_sign_wave_h_high    0.646174
m_max_wave_h_high     0.429791
dtype: float64

In [16]:
origin_df

Unnamed: 0,date,region,standard_item_name,price,t_dur_preci,t_daily_preci,t_temper_avg,t_temper_lowest,t_temper_high,t_wind_spd_max,t_wind_spd_avg,t_rel_hmd_min,t_rel_hmd_avg,m_wave_p_avg,m_wind_spd_avg,m_atm_press_avg,m_rel_hmd_avg,m_temper_avg,m_water_temper_avg,m_max_wave_h_avg,m_sign_wave_h_avg,m_sign_wave_h_high,m_max_wave_h_high
0,2019-08-01,강원,others,3151.285714,0.345263,3.237895,28.048422,24.331579,32.701054,3.921053,1.669474,58.642105,79.241051,4.458824,3.641176,1011.594116,91.062500,25.694117,24.729412,0.882353,0.529412,0.735294,1.252941
1,2019-08-01,강원,감자,238.000000,0.345263,3.237895,28.048422,24.331579,32.701054,3.921053,1.669474,58.642105,79.241051,4.458824,3.641176,1011.594116,91.062500,25.694117,24.729412,0.882353,0.529412,0.735294,1.252941
2,2019-08-01,강원,건가공(해조)류,288.000000,0.345263,3.237895,28.048422,24.331579,32.701054,3.921053,1.669474,58.642105,79.241051,4.458824,3.641176,1011.594116,91.062500,25.694117,24.729412,0.882353,0.529412,0.735294,1.252941
3,2019-08-01,강원,건제품,1450.000000,0.345263,3.237895,28.048422,24.331579,32.701054,3.921053,1.669474,58.642105,79.241051,4.458824,3.641176,1011.594116,91.062500,25.694117,24.729412,0.882353,0.529412,0.735294,1.252941
4,2019-08-01,강원,고구마,690.000000,0.345263,3.237895,28.048422,24.331579,32.701054,3.921053,1.669474,58.642105,79.241051,4.458824,3.641176,1011.594116,91.062500,25.694117,24.729412,0.882353,0.529412,0.735294,1.252941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23666,2019-08-30,충북청주,포도,2344.000000,0.057660,0.109574,22.647873,18.348936,27.937235,4.739362,1.921277,47.148937,73.802132,4.723529,5.335294,1009.358826,71.647057,24.823530,25.268749,1.241176,0.758824,0.976471,1.647059
23667,2019-08-30,충북청주,피망(단고추),765.000000,0.057660,0.109574,22.647873,18.348936,27.937235,4.739362,1.921277,47.148937,73.802132,4.723529,5.335294,1009.358826,71.647057,24.823530,25.268749,1.241176,0.758824,0.976471,1.647059
23668,2019-08-30,충북청주,한우,7020.000000,0.057660,0.109574,22.647873,18.348936,27.937235,4.739362,1.921277,47.148937,73.802132,4.723529,5.335294,1009.358826,71.647057,24.823530,25.268749,1.241176,0.758824,0.976471,1.647059
23669,2019-08-30,충북청주,호두,1830.000000,0.057660,0.109574,22.647873,18.348936,27.937235,4.739362,1.921277,47.148937,73.802132,4.723529,5.335294,1009.358826,71.647057,24.823530,25.268749,1.241176,0.758824,0.976471,1.647059


TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
result07, ori_corr07, skews07 = pipeline(date="201907")

In [None]:
print(ori_corr07)
result07.reset_index().rename(columns={"index": "price"})

In [None]:
skews07.to_frame().T