![image.png](attachment:image.png)

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [4]:
# Nine evenly spaced values from -3 to 5, shaped as a single-column matrix.
# The builtin `float` is used instead of the `np.float` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24; the dtype is still float64.
X = (np.arange(9, dtype=float) - 3).reshape(-1, 1)
X = np.vstack([X, [100]])  # append a single outlier row
pd.DataFrame(X).describe()

Unnamed: 0,0
count,10.0
mean,10.9
std,31.412842
min,-3.0
25%,-0.75
50%,1.5
75%,3.75
max,100.0


In [5]:
# Show the raw column: the values -3..5 followed by the outlier 100.
X

array([[ -3.],
       [ -2.],
       [ -1.],
       [  0.],
       [  1.],
       [  2.],
       [  3.],
       [  4.],
       [  5.],
       [100.]])

#### Standard Scaler

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize X; fit_transform learns the column mean/std and applies them
# in a single call. The result should report mean 0 and std 1.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled.mean(), X_scaled.std()

(0.0, 1.0)

In [8]:
# Inspect the standardized values: the outlier inflates the mean and std, so
# the nine regular points are squeezed into a narrow negative band.
X_scaled

array([[-0.46642982],
       [-0.43287372],
       [-0.39931762],
       [-0.36576152],
       [-0.33220541],
       [-0.29864931],
       [-0.26509321],
       [-0.23153711],
       [-0.197981  ],
       [ 2.98984872]])

#### RobustScaler

In [9]:
from sklearn.preprocessing import RobustScaler

# RobustScaler (by default) centers on the median and scales by the
# interquartile range, so a single outlier barely affects the statistics.
robust_scaler = RobustScaler()
X_robust_scaled = robust_scaler.fit_transform(X)
X_robust_scaled.mean(), X_robust_scaled.std()

(2.088888888888889, 6.622408647636923)

In [10]:
# Inspect the robust-scaled values: the regular points keep an even spread
# while the outlier stays far away from them.
X_robust_scaled

array([[-1.        ],
       [-0.77777778],
       [-0.55555556],
       [-0.33333333],
       [-0.11111111],
       [ 0.11111111],
       [ 0.33333333],
       [ 0.55555556],
       [ 0.77777778],
       [21.88888889]])

In [11]:
# Side-by-side order statistics (min / quartiles / max) of both scalings on
# the non-outlier part of the data.
# NOTE(review): [:-2] drops the last TWO rows (the outlier and the value 5);
# confirm whether [:-1] (outlier only) was intended.
pd.DataFrame(
    data=np.concatenate((X_scaled[:-2], X_robust_scaled[:-2]), axis=1),
    columns=["Standard Scaler", "Robust Scaler"],
).describe().iloc[3:]  # skip the count / mean / std rows

Unnamed: 0,Standard Scaler,Robust Scaler
min,-0.46643,-1.0
25%,-0.407707,-0.611111
50%,-0.348983,-0.222222
75%,-0.29026,0.166667
max,-0.231537,0.555556


## 파이프라인
- 전처리용 객체는 scikit-learn의 파이프라인 기능을 이용하여 분류·회귀 모형과 합칠 수 있다. 예를 들어 스케일러와 선형회귀모형은 다음처럼 합친다.

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Chain standardization and linear regression into one estimator; the step
# names ("scaler", "regressor") are the keys used to address each stage.
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ]
)

![image.png](attachment:image.png)

## 다항 변환

![image.png](attachment:image.png)

In [13]:
from sklearn.preprocessing import PolynomialFeatures
# Toy integer matrix with 3 samples and 2 features: [[0, 1], [2, 3], [4, 5]].
X = np.arange(6).reshape(-1, 2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [14]:
# Degree-2 expansion: for feature columns (a, b) the output columns are
# [1, a, b, a^2, a*b, b^2].
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [15]:
# Interaction-only expansion (default degree 2): pure powers are dropped,
# leaving [1, a, b, a*b] for feature columns (a, b).
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.],
       [ 1.,  2.,  3.,  6.],
       [ 1.,  4.,  5., 20.]])

In [16]:
from sklearn.preprocessing import FunctionTransformer

def kernel(X):
    """Map the first three columns of ``X`` to four hand-crafted features.

    For input columns ``(x0, x1, x2)`` the output columns are
    ``[x0, 2 * x1, x2 ** 2, log(x1)]``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features), n_features >= 3
        Input matrix; only the first three columns are used. Column 1 should
        be strictly positive, otherwise ``np.log`` yields ``-inf`` or ``nan``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 4)
        The transformed feature matrix.

    Raises
    ------
    ValueError
        If ``X`` is two-dimensional but has fewer than three columns.
    """
    # Accept lists and other array-likes, not only ndarrays.
    X = np.asarray(X)
    if X.ndim == 2 and X.shape[1] < 3:
        # Without this guard the out-of-range column slices are empty and
        # hstack silently returns fewer than four feature columns.
        raise ValueError(
            "kernel expects at least 3 columns, got %d" % X.shape[1]
        )
    # Slice with ranges (not integers) so each piece stays two-dimensional.
    x0 = X[:, :1]
    x1 = X[:, 1:2]
    x2 = X[:, 2:3]
    X_new = np.hstack([x0, 2 * x1, x2 ** 2, np.log(x1)])
    return X_new

In [17]:
# Toy integer matrix with 4 samples and 3 features (values 0..11, row-major).
X = np.arange(12).reshape(-1, 3)
X

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [18]:
# Apply the hand-written feature map directly to the 4x3 array.
kernel(X)

array([[  0.        ,   2.        ,   4.        ,   0.        ],
       [  3.        ,   8.        ,  25.        ,   1.38629436],
       [  6.        ,  14.        ,  64.        ,   1.94591015],
       [  9.        ,  20.        , 121.        ,   2.30258509]])

In [19]:
# Wrap `kernel` as a scikit-learn transformer so it can be used like any
# other preprocessing step; the result matches calling kernel(X) directly.
FunctionTransformer(func=kernel).fit_transform(X)

array([[  0.        ,   2.        ,   4.        ,   0.        ],
       [  3.        ,   8.        ,  25.        ,   1.38629436],
       [  6.        ,  14.        ,  64.        ,   1.94591015],
       [  9.        ,  20.        , 121.        ,   2.30258509]])