# 지도학습 (5) - 회귀분석 결과보고

논문등의 보고서에서 결과 보고를 위해 회귀계수 표를 구성하기 위한 값을 구하는 과정

## #01. 작업 준비

### 패키지 가져오기

In [27]:
import sys

sys.path.append("../../")
import helper

import numpy as np
from scipy import stats
from pandas import read_excel, DataFrame
from statsmodels.stats.outliers_influence import variance_inflation_factor

### 데이터 가져오기

In [28]:
origin = read_excel("https://data.hossam.kr/F02/fish2.xlsx")
origin.head()

Unnamed: 0,길이,높이,두께,무게
0,8.4,2.11,1.41,5.9
1,13.7,3.53,2.0,32.0
2,15.0,3.82,2.43,40.0
3,16.2,4.59,2.63,51.5
4,17.4,4.59,2.94,70.0


 #02. 머신러닝에 의한 회귀분석 수행

In [29]:
xnames = ['길이','높이','두께']
yname = '무게'

In [30]:
result = helper.ml_ols(origin, xnames, yname, degree=1, test_size=0)

print("계수: ", result.coef)
print("절편: ", result.intercept)

계수:  [ 2.9082713  67.20469902 67.26029602]
절편:  -546.4397914448656


## #03. 결과보고에 필요한 값 구하기

### 1) 절편과 계수를 하나의 배열로 결합

In [31]:
params = np.append(result.intercept, result.coef)
params

array([-546.43979144,    2.9082713 ,   67.20469902,   67.26029602])

### 2) 상수항 추가하기

In [32]:
# 독립변수 추출
x = origin.filter(xnames)

# 종속변수 추출
y = origin[yname]

# 상수항 추가
designX = x.copy()
designX.insert(0, '상수', 1)
designX.head()

Unnamed: 0,상수,길이,높이,두께
0,1,8.4,2.11,1.41
1,1,13.7,3.53,2.0
2,1,15.0,3.82,2.43
3,1,16.2,4.59,2.63
4,1,17.4,4.59,2.94


### 3) 행렬곱 구하기

In [33]:
dot = np.dot(designX.T,designX)
dot

array([[   56.    ,  1562.    ,   440.28  ,   265.75  ],
       [ 1562.    , 48045.12  , 13688.339 ,  8270.876 ],
       [  440.28  , 13688.339 ,  3917.2114,  2365.5425],
       [  265.75  ,  8270.876 ,  2365.5425,  1434.4117]])

### 4) 행렬곱에 대한 역행렬

In [34]:
inv = np.linalg.inv(dot)
inv

array([[ 0.25997581, -0.02937614,  0.05587393,  0.02907514],
       [-0.02937614,  0.00811062, -0.0207489 , -0.00710593],
       [ 0.05587393, -0.0207489 ,  0.11758923, -0.08463348],
       [ 0.02907514, -0.00710593, -0.08463348,  0.17585582]])

### 5) 역행렬의 대각선 반환

In [35]:
dia = inv.diagonal()
dia

array([0.25997581, 0.00811062, 0.11758923, 0.17585582])

### 6) 평균 제곱오차 구하기

상수항이 적용된 형태이므로 API를 통한 값이 아닌 직접 구한 값이 필요하다

In [36]:
predictions = result.fit.predict(x)
MSE = (sum((y-predictions)**2)) / (len(designX)-len(designX.iloc[0]))
MSE

7374.273394715794

In [37]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 56 entries, 0 to 55
Series name: 무게
Non-Null Count  Dtype  
--------------  -----  
56 non-null     float64
dtypes: float64(1)
memory usage: 580.0 bytes


In [38]:
predictions

array([-285.37138021, -134.84329507,  -82.65125237,  -13.96164937,
         10.37896796,   80.02180361,   67.26144033,   92.99576857,
         60.46577888,   39.43652554,  136.47529709,  135.11730386,
        143.87392965,  181.70121192,  157.52049758,  133.3329216 ,
        164.91913014,  149.46204936,  157.5532998 ,  156.29812063,
        218.79849072,  163.60080077,  115.41385187,  194.15839512,
        263.48927678,  209.05903028,  257.86674132,  206.58314208,
        254.05843451,  290.94471548,  437.8170324 ,  307.30504598,
        299.2404821 ,  330.16743096,  343.02579455,  359.19903393,
        373.73976608,  627.92162037,  673.20907188,  852.16200471,
        752.304867  ,  724.63422709,  710.71463708,  699.96800091,
        819.42275349,  719.20491242,  896.70037378,  849.86382452,
        843.82151728,  903.64704269,  764.48717021,  898.00598423,
        870.02288875,  918.41805562,  974.34802731,  932.08908622])

### 7) 표준오차

In [39]:
se_b = np.sqrt(MSE * dia)
se_b

array([43.78507388,  7.73368804, 29.44715768, 36.0112326 ])

### 8) t-value 구하기

In [40]:
ts_b = params / se_b
ts_b

array([-12.48004726,   0.37605232,   2.28221344,   1.86775878])

### 9) p-value 구하기

In [41]:
p_values = [2*(1-stats.t.cdf(np.abs(i),(len(designX)-len(designX.iloc[0])))) for i in ts_b]
p_values

[0.0, 0.7084079152880327, 0.026597717787692154, 0.06743585337091651]

### 10) VIF 구하기

In [42]:
vif = []

for i, v in enumerate(xnames):
    j = list(origin.columns).index(v)
    vif.append(variance_inflation_factor(origin, j))
    
vif

[338.76030542544714, 500.757055790855, 263.01505845905143]

### 11) 결과표 구성하기

In [43]:
resultDf = DataFrame({
    "종속변수": [yname] * len(xnames),
    "독립변수": xnames,
    "B": result.coef,
    "표준오차": se_b[1:],
    "β": 0,
    "t": ts_b[1:],
    "유의확률": p_values[1:],
    "VIF": vif,
})

helper.prettyPrint(resultDf)

+----+------------+------------+---------+------------+-----+----------+------------+---------+
|    | 종속변수   | 독립변수   |       B |   표준오차 |   β |        t |   유의확률 |     VIF |
|----+------------+------------+---------+------------+-----+----------+------------+---------|
|  0 | 무게       | 길이       | 2.90827 |    7.73369 |   0 | 0.376052 |   0.708408 |  338.76 |
|  1 | 무게       | 높이       | 67.2047 |    29.4472 |   0 |  2.28221 |  0.0265977 | 500.757 |
|  2 | 무게       | 두께       | 67.2603 |    36.0112 |   0 |  1.86776 |  0.0674359 | 263.015 |
+----+------------+------------+---------+------------+-----+----------+------------+---------+


### 11) statsmodels 패키지의 결과와 비교하기

In [44]:
result = helper.myOls(origin, x=['길이','높이','두께'], y='무게')
helper.prettyPrint(result.table)

+------------------+---------+------------+-----+--------+------------+---------+
|                  |       B |   표준오차 |   β | t      |   유의확률 |     VIF |
|------------------+---------+------------+-----+--------+------------+---------|
| ('무게', '길이') |  2.9083 |      7.734 |   0 | 0.376* |      0.708 |  338.76 |
| ('무게', '높이') | 67.2047 |     29.447 |   0 | 2.282* |      0.027 | 500.757 |
| ('무게', '두께') | 67.2603 |     36.011 |   0 | 1.868* |      0.067 | 263.015 |
+------------------+---------+------------+-----+--------+------------+---------+


### 12) 모듈에 추가된 기능 확인하기

In [45]:
ols_result = helper.ml_ols(origin, xnames="길이,높이,두께", yname="무게", degree=1, test_size=0)
helper.prettyPrint(ols_result.table)

+----+------------+------------+---------+------------+-----+----------+------------+---------+
|    | 종속변수   | 독립변수   |       B |   표준오차 |   β |        t |   유의확률 |     VIF |
|----+------------+------------+---------+------------+-----+----------+------------+---------|
|  0 | 무게       | 길이       | 2.90827 |    7.73369 |   0 | 0.376052 |   0.708408 |  338.76 |
|  1 | 무게       | 높이       | 67.2047 |    29.4472 |   0 |  2.28221 |  0.0265977 | 500.757 |
|  2 | 무게       | 두께       | 67.2603 |    36.0112 |   0 |  1.86776 |  0.0674359 | 263.015 |
+----+------------+------------+---------+------------+-----+----------+------------+---------+


## 참고자료(Reference)

1) https://m.blog.naver.com/pmw9440/222482746927

2) https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression

3) https://stats.stackexchange.com/questions/85943/how-to-derive-the-standard-error-of-linear-regression-coefficient

4) https://calcworkshop.com/linear-regression/t-test/

5) https://www.google.com/url?sa=i&url=http%3A%2F%2Fwww.few.vu.nl%2F~wvanwie%2FCourses%2FHighdimensionalDataAnalysis%2FWNvanWieringen_HDDA_Lecture234_RidgeRegression_20182019.pdf&psig=AOvVaw31slQKGfkCNf2PAa3VEIGC&ust=1630070470635000&source=images&cd=vfe&ved=0CAsQjRxqFwoTCIjJworkzvICFQAAAAAdAAAAABAR