# Python for Data Analysis - part13

##### Python의 numpy, pandas 등을 정리하였으며 파이썬 라이브러리를 활용한 데이터분석(2판)을 참고하여 작성하였습니다.
##### 해당 자료는 python 3.6 기반으로 작성되었습니다.

## 13. 파이썬 모데링 라이브러리

### 13.1 pandas와 모델 코드의 인터페이스
#### - 피처 엔지니어링 : 특징을 선택하고 추출 / 모델링에서 유용할 수 있는 정보를 추출하는 변환이나 분석 과정

In [1]:
import pandas as pd
import numpy as np

data = pd.DataFrame({
    'x0' : [1,2,3,4,5],
    'x1' : [0.01, -0.01, 0.25, -4.1, 0.],
    'y' : [-1.5, 0., 3.6, 1.3, -2]})
print(data)
print("------------------------------------")

print(data.columns)
print("------------------------------------")

# numpy array로 변환
print(data.values)
print("------------------------------------")

   x0    x1    y
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.0
------------------------------------
Index(['x0', 'x1', 'y'], dtype='object')
------------------------------------
[[ 1.    0.01 -1.5 ]
 [ 2.   -0.01  0.  ]
 [ 3.    0.25  3.6 ]
 [ 4.   -4.1   1.3 ]
 [ 5.    0.   -2.  ]]
------------------------------------


In [3]:
df2 = pd.DataFrame(data.values, columns = ['one', 'two', 'three'])
print(df2)
print("------------------------------------")

df3 = data.copy()
df3['strings'] = ['a', 'b', 'c', 'd', 'e']
print(df3)
print("------------------------------------")

# data type이 다르면 numpy가 아닌 python 기본 array 반환
print(df3.values)
print("------------------------------------")

   one   two  three
0  1.0  0.01   -1.5
1  2.0 -0.01    0.0
2  3.0  0.25    3.6
3  4.0 -4.10    1.3
4  5.0  0.00   -2.0
------------------------------------
   x0    x1    y strings
0   1  0.01 -1.5       a
1   2 -0.01  0.0       b
2   3  0.25  3.6       c
3   4 -4.10  1.3       d
4   5  0.00 -2.0       e
------------------------------------
[[1 0.01 -1.5 'a']
 [2 -0.01 0.0 'b']
 [3 0.25 3.6 'c']
 [4 -4.1 1.3 'd']
 [5 0.0 -2.0 'e']]
------------------------------------


In [4]:
data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'], categories=['a', 'b'])
print(data)
print("------------------------------------")

dummies = pd.get_dummies(data.category, prefix = 'category')
data_with_dummies = data.drop('category', axis = 1).join(dummies)
print(data_with_dummies)
print("------------------------------------")

   x0    x1    y category
0   1  0.01 -1.5        a
1   2 -0.01  0.0        b
2   3  0.25  3.6        a
3   4 -4.10  1.3        a
4   5  0.00 -2.0        b
------------------------------------
   x0    x1    y  category_a  category_b
0   1  0.01 -1.5           1           0
1   2 -0.01  0.0           0           1
2   3  0.25  3.6           1           0
3   4 -4.10  1.3           1           0
4   5  0.00 -2.0           0           1
------------------------------------


### 13.2 Pasty를 이용해서 모델 생성하기
#### - patsy(팻시)는 통계 모델을 위한 파이썬 라이브러리 / R에서 사용하는 수식 문법과 비슷한 형식의 문자열 기반 '수식 문법' 제공

In [7]:
data = pd.DataFrame({
    'x0' : [1,2,3,4,5],
    'x1' : [0.01, -0.01, 0.25, -4.1, 0.],
    'y' : [-1.5, 0., 3.6, 1.3, -2]})
print(data)
print("------------------------------------")

import patsy

y, X = patsy.dmatrices('y ~ x0 + x1', data)

print(y)
print("------------------------------------")

print(X)
print("------------------------------------")

print(np.asarray(y))
print("------------------------------------")

print(np.asarray(X))
print("------------------------------------")

# 모델에 0을 더하면 intercept term이 제거
print(patsy.dmatrices('y ~ x0 + x1 + 0', data)[1])
print("------------------------------------")

   x0    x1    y
0   1  0.01 -1.5
1   2 -0.01  0.0
2   3  0.25  3.6
3   4 -4.10  1.3
4   5  0.00 -2.0
------------------------------------
[[-1.5]
 [ 0. ]
 [ 3.6]
 [ 1.3]
 [-2. ]]
------------------------------------
[[ 1.    1.    0.01]
 [ 1.    2.   -0.01]
 [ 1.    3.    0.25]
 [ 1.    4.   -4.1 ]
 [ 1.    5.    0.  ]]
------------------------------------
[[-1.5]
 [ 0. ]
 [ 3.6]
 [ 1.3]
 [-2. ]]
------------------------------------
[[ 1.    1.    0.01]
 [ 1.    2.   -0.01]
 [ 1.    3.    0.25]
 [ 1.    4.   -4.1 ]
 [ 1.    5.    0.  ]]
------------------------------------
[[ 1.    0.01]
 [ 2.   -0.01]
 [ 3.    0.25]
 [ 4.   -4.1 ]
 [ 5.    0.  ]]
------------------------------------


In [8]:
coef, resid, _, _ = np.linalg.lstsq(X, y)
print(coef)
print("------------------------------------")

coef = pd.Series(coef.squeeze(), index = X.design_info.column_names)
print(coef)
print("------------------------------------")

[[ 0.31290976]
 [-0.07910564]
 [-0.26546384]]
------------------------------------
Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64
------------------------------------


  """Entry point for launching an IPython kernel.


#### 13.2.1 Patsy 용법으로 데이터 변환하기

In [16]:
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)
print(X)
print("------------------------------------")

# 표준화(평균0, 분산1), 센터링(평균값을 뺌) - standardize(), center()
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)
print(X)
print("------------------------------------")

# patsy.build_design_matrices() - 입력으로 사용되는 원본 데이터셋에서 저장한 정보를 사용해서 출력 데이터를 만들어내는 변환에 적용
new_data = pd.DataFrame({
    'x0' : [6,7,8,9],
    'x1' : [3.1, -0.5, 0, 2.3],
    'y' : [1,2,3,4]})

new_X = patsy.build_design_matrices([X.design_info], new_data) # design_info - 모델 메타 데이터
print(new_X)
print("------------------------------------")

# I() - 데이터셋에서 이름으로 컬럼을 추가
y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)
print(X.design_info)
print("------------------------------------")

[[1.         1.         0.00995033]
 [1.         2.         0.00995033]
 [1.         3.         0.22314355]
 [1.         4.         1.62924054]
 [1.         5.         0.        ]]
------------------------------------
[[ 1.         -1.41421356  0.78      ]
 [ 1.         -0.70710678  0.76      ]
 [ 1.          0.          1.02      ]
 [ 1.          0.70710678 -3.33      ]
 [ 1.          1.41421356  0.77      ]]
------------------------------------
[DesignMatrix with shape (4, 3)
  Intercept  standardize(x0)  center(x1)
          1          2.12132        3.87
          1          2.82843        0.27
          1          3.53553        0.77
          1          4.24264        3.07
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)]
------------------------------------
DesignInfo(['Intercept', 'I(x0 + x1)'],
           factor_infos={EvalFactor('I(x0 + x1)'): FactorInfo(factor=EvalFactor('I(x0 + x1)'),
                                    type='

#### 13.2.2 범주형 데이터와 Patsy

In [29]:
data = pd.DataFrame({
    'key1' : ['a','a','b','b','a','b','a','b'],
    'key2' : [0,1,0,1,0,1,0,0],
    'v1' : [1,2,3,4,5,6,7,8],
    'v2' : [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]
})

y, X = patsy.dmatrices('v2 ~ key1', data)
print(X.design_info)
print("------------------------------------")

y, X = patsy.dmatrices('v2 ~ key1 +0', data)
print(X.design_info)
print("------------------------------------")

# C() -산술 컬럼을 범주형으로 해석
y, X = patsy.dmatrices('v2 ~ C(key2)', data)
print(X.design_info)
print("------------------------------------")

DesignInfo(['Intercept', 'key1[T.b]'],
           factor_infos={EvalFactor('key1'): FactorInfo(factor=EvalFactor('key1'),
                                    type='categorical',
                                    state=<factor state>,
                                    categories=('a', 'b'))},
           term_codings=OrderedDict([(Term([]),
                                      [SubtermInfo(factors=(),
                                                   contrast_matrices={},
                                                   num_columns=1)]),
                                     (Term([EvalFactor('key1')]),
                                      [SubtermInfo(factors=(EvalFactor('key1'),),
                                                   contrast_matrices={EvalFactor('key1'): ContrastMatrix(array([[0.],
                                                                                            [1.]]),
                                                                                    

In [30]:
data['key2'] = data['key2'].map({0:'zero', 1:'one'})
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X

DesignMatrix with shape (8, 3)
  Intercept  key1[T.b]  key2[T.zero]
          1          0             1
          1          0             0
          1          1             1
          1          1             0
          1          0             1
          1          1             0
          1          0             1
          1          1             1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)

In [32]:
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X

DesignMatrix with shape (8, 4)
  Intercept  key1[T.b]  key2[T.zero]  key1[T.b]:key2[T.zero]
          1          0             1                       0
          1          0             0                       0
          1          1             1                       1
          1          1             0                       0
          1          0             1                       0
          1          1             0                       0
          1          0             1                       0
          1          1             1                       1
  Terms:
    'Intercept' (column 0)
    'key1' (column 1)
    'key2' (column 2)
    'key1:key2' (column 3)

### 13.3 statsmodels 소개
#### - statsmodels는 다양한 종류의 통계 모델 피팅, 통계 테스트 수행, 데이터 탐색, 시각화를 위한 라이브러리
#### - 선형모델, 일반선형모델, 로버스트선형모델, 선형복합효과모델, 아노바 메서드, 시계열 처리 및 상태 공간 모델, 일반적률추정법

#### 13.3.1 선형 모델 예측하기

In [34]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

In [37]:
def dnorm(mean, variance, size = 1):
    if isinstance(size, int):
        size = size,
    return mean + np.sqrt(variance) * np.random.randn(*size)

np.random.randn(12345)
N = 100
X = np.c_[dnorm(0, 0.4, size = N),
         dnorm(0, 0.6, size = N), 
         dnorm(0, 0.2, size = N)]
eps = dnorm(0, 0.1, size = N)
beta = [0.1, 0.3, 0.5]

y = np.dot(X, beta) + eps

In [38]:
print(X[:5])
print("------------------------------------")

print(y[:5])
print("------------------------------------")

[[ 0.41330133 -0.72103493 -0.62054437]
 [-0.1775228  -0.77948667 -0.20781845]
 [ 1.1802563  -0.59658076  0.18706073]
 [ 0.64533176 -0.81869486  0.05192086]
 [ 0.95312359  0.02255852 -0.06609892]]
------------------------------------
[-0.74360246 -0.44509569 -0.08059897 -0.5200937   0.24126448]
------------------------------------


In [39]:
# sm.add_constant() - intercetp와 함께 피팅
X_model = sm.add_constant(X)
print(X_model[:5])
print("------------------------------------")

[[ 1.          0.41330133 -0.72103493 -0.62054437]
 [ 1.         -0.1775228  -0.77948667 -0.20781845]
 [ 1.          1.1802563  -0.59658076  0.18706073]
 [ 1.          0.64533176 -0.81869486  0.05192086]
 [ 1.          0.95312359  0.02255852 -0.06609892]]
------------------------------------


In [40]:
# sm.OLS 클래스 - 최소자승 선형회귀 피팅
model = sm.OLS(y, X)
results = model.fit()
print(results.params)
print("------------------------------------")

print(results.summary())
print("------------------------------------")

[0.11509916 0.33325065 0.60303137]
------------------------------------
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.507
Model:                            OLS   Adj. R-squared (uncentered):              0.492
Method:                 Least Squares   F-statistic:                              33.26
Date:                Thu, 03 Jun 2021   Prob (F-statistic):                    7.19e-15
Time:                        21:58:59   Log-Likelihood:                         -38.750
No. Observations:                 100   AIC:                                      83.50
Df Residuals:                      97   BIC:                                      91.32
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t     

In [41]:
data = pd.DataFrame(X, columns = ['col0', 'col1', 'col2'])
data['y'] = y
print(data[:5])
print("------------------------------------")

       col0      col1      col2         y
0  0.413301 -0.721035 -0.620544 -0.743602
1 -0.177523 -0.779487 -0.207818 -0.445096
2  1.180256 -0.596581  0.187061 -0.080599
3  0.645332 -0.818695  0.051921 -0.520094
4  0.953124  0.022559 -0.066099  0.241264
------------------------------------


In [43]:
results = smf.ols('y ~ col0 + col1 + col2', data = data).fit()
print(results.params)
print("------------------------------------")

print(results.tvalues)
print("------------------------------------")

print(results.predict(data[:5]))
print("------------------------------------")

Intercept    0.023453
col0         0.112212
col1         0.334259
col2         0.600675
dtype: float64
------------------------------------
Intercept    0.642201
col0         2.177975
col1         6.730378
col2         6.750415
dtype: float64
------------------------------------
0   -0.543928
1   -0.381849
2    0.068842
3   -0.146602
4    0.098241
dtype: float64
------------------------------------


#### 13.3.2 시계열 처리 예측

In [45]:
init_x = 4

import random 
values = [init_x, init_x]
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)

MAXLAGS = 5
model = sm.tsa.AR(values)
results = model.fit(MAXLAGS)
print(results.params)
print("------------------------------------")

[-0.00687915  0.81400155 -0.39092489 -0.02428872  0.01236633  0.00517477]
------------------------------------


statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.





### 13.4 scikit-learn 소개 - 다른 책을 활용하여 정리 예정
#### - 가장 널리 쓰이는 파이썬 머신러닝 툴
#### - 지도학습, 비지도학습을 포함 / 모델 선택, 평가, 데이터 변형, 데이터 적재, 모델 유지 및 기타 적업들을 위한 도구들을 제공