# 회귀분석의 결과 보고

## #01. 작업준비

### 패키지 참조

In [1]:
from pandas import read_excel, DataFrame, MultiIndex
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

### 데이터 가져오기

In [2]:
df = read_excel("https://data.hossam.kr/E04/cars.xlsx")
df.head()

Unnamed: 0,speed,dist
0,4,2
1,4,10
2,7,4
3,7,22
4,8,16


### 회귀분석 수행

In [3]:
model = ols("dist ~ speed", data=df)
fit = model.fit()
tbl = fit.summary()
tbl

0,1,2,3
Dep. Variable:,dist,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,89.57
Date:,"Tue, 25 Jul 2023",Prob (F-statistic):,1.49e-12
Time:,14:33:03,Log-Likelihood:,-206.58
No. Observations:,50,AIC:,417.2
Df Residuals:,48,BIC:,421.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.5791,6.758,-2.601,0.012,-31.168,-3.990
speed,3.9324,0.416,9.464,0.000,3.097,4.768

0,1,2,3
Omnibus:,8.975,Durbin-Watson:,1.676
Prob(Omnibus):,0.011,Jarque-Bera (JB):,8.189
Skew:,0.885,Prob(JB):,0.0167
Kurtosis:,3.893,Cond. No.,50.7


### 회귀분석 결과 활용

#### 모델을 활용한 결과값 얻기

In [4]:
speed = [10, 15, 20, 25, 30, 35, 40]
pred = fit.predict({"speed": speed})
pred

0     21.744993
1     41.407036
2     61.069080
3     80.731124
4    100.393168
5    120.055212
6    139.717255
dtype: float64

## #02. 회귀분석 결과 다루기

### 결과표의 크기

In [5]:
len(tbl.tables)

3

### 결과표 확인

In [6]:
tbl.tables[0]

0,1,2,3
Dep. Variable:,dist,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,89.57
Date:,"Tue, 25 Jul 2023",Prob (F-statistic):,1.49e-12
Time:,14:33:03,Log-Likelihood:,-206.58
No. Observations:,50,AIC:,417.2
Df Residuals:,48,BIC:,421.0
Df Model:,1,,
Covariance Type:,nonrobust,,


In [7]:
tbl.tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.5791,6.758,-2.601,0.012,-31.168,-3.990
speed,3.9324,0.416,9.464,0.000,3.097,4.768


In [8]:
tbl.tables[2]

0,1,2,3
Omnibus:,8.975,Durbin-Watson:,1.676
Prob(Omnibus):,0.011,Jarque-Bera (JB):,8.189
Skew:,0.885,Prob(JB):,0.0167
Kurtosis:,3.893,Cond. No.,50.7


### 첫 번째, 세번째 표의 내용 

In [9]:
import re

my = {}

for k in range(0, 3, 2):
    items = tbl.tables[k].data
    #print(items)

    for item in items:
        #print(item)
        n = len(item)

        for i in range(0, n, 2):
            key = item[i].strip()[:-1]
            value = item[i+1].strip()

            if key and value:
                my[key] = value

my

{'Dep. Variable': 'dist',
 'R-squared': '0.651',
 'Model': 'OLS',
 'Adj. R-squared': '0.644',
 'Method': 'Least Squares',
 'F-statistic': '89.57',
 'Date': 'Tue, 25 Jul 2023',
 'Prob (F-statistic)': '1.49e-12',
 'Time': '14:33:03',
 'Log-Likelihood': '-206.58',
 'No. Observations': '50',
 'AIC': '417.2',
 'Df Residuals': '48',
 'BIC': '421.0',
 'Df Model': '1',
 'Covariance Type': 'nonrobust',
 'Omnibus': '8.975',
 'Durbin-Watson': '1.676',
 'Prob(Omnibus)': '0.011',
 'Jarque-Bera (JB)': '8.189',
 'Skew': '0.885',
 'Prob(JB)': '0.0167',
 'Kurtosis': '3.893',
 'Cond. No': '50.7'}

### VIF값 생성

In [10]:
for i in range(1, len(model.exog_names)):
    vif = variance_inflation_factor(model.exog, i)

    if vif < 10:
        print("%s의 VIF: %f (good)" % (model.exog_names[i], vif))
    else:
        print("%s의 VIF: %f (bad)" % (model.exog_names[i], vif))

speed의 VIF: 1.000000 (good)


In [11]:
my['variables'] = []

for i, v in enumerate(tbl.tables[1].data):
    if i == 0:
        continue

    # 변수의 이름
    name = v[0].strip()
    # 변수의 이름 목록
    name_list = list(model.exog_names)
    # 변수의 이름 목록에서 현재 변수가 몇 번째 항목인지 찾기 
    j = name_list.index(name)

    vif = 0

    # 0번째인 Intercept는 제외
    if j > 0:
        vif = variance_inflation_factor(model.exog, j)

    my['variables'].append({
        "name": name,
        "coef": v[1].strip(),
        "std err": v[2].strip(),
        "t": v[3].strip(),
        "P-value": v[4].strip(),
        "Beta": 0,
        "VIF": vif,
    })

my

{'Dep. Variable': 'dist',
 'R-squared': '0.651',
 'Model': 'OLS',
 'Adj. R-squared': '0.644',
 'Method': 'Least Squares',
 'F-statistic': '89.57',
 'Date': 'Tue, 25 Jul 2023',
 'Prob (F-statistic)': '1.49e-12',
 'Time': '14:33:03',
 'Log-Likelihood': '-206.58',
 'No. Observations': '50',
 'AIC': '417.2',
 'Df Residuals': '48',
 'BIC': '421.0',
 'Df Model': '1',
 'Covariance Type': 'nonrobust',
 'Omnibus': '8.975',
 'Durbin-Watson': '1.676',
 'Prob(Omnibus)': '0.011',
 'Jarque-Bera (JB)': '8.189',
 'Skew': '0.885',
 'Prob(JB)': '0.0167',
 'Kurtosis': '3.893',
 'Cond. No': '50.7',
 'variables': [{'name': 'Intercept',
   'coef': '-17.5791',
   'std err': '6.758',
   't': '-2.601',
   'P-value': '0.012',
   'Beta': 0,
   'VIF': 0},
  {'name': 'speed',
   'coef': '3.9324',
   'std err': '0.416',
   't': '9.464',
   'P-value': '0.000',
   'Beta': 0,
   'VIF': 1.0}]}

In [12]:
mylist = []
yname_list = []
xname_list = []

for i in my['variables']:
    if i['name'] == 'Intercept':
        continue

    yname_list.append('dist')
    xname_list.append(i['name'])

    item = {
        "B": i['coef'],
        "표준오차": i['std err'],
        "β": i['Beta'],
        "t": "%s*" % i['t'],
        "유의확률" : i['P-value'],
        "VIF": i["VIF"]
    }

    mylist.append(item)

df = DataFrame(mylist, 
               index=MultiIndex.from_arrays([yname_list, xname_list], names=['종속변수', '독립변수']))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,B,표준오차,β,t,유의확률,VIF
종속변수,독립변수,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dist,speed,3.9324,0.416,0,9.464*,0.0,1.0


| 별표 개수 | p값 범위 |
|---|---|
| `*` | $p < 0.05$|
| `**` | $p < 0.01$|
| `***` | $p < 0.001$|

### 분석결과 문자열

In [13]:
"𝑅(%s), 𝑅^2(%s), 𝐹(%s), 유의확률(%s), Durbin-Watson(%s)" % (my['R-squared'], my['Adj. R-squared'], my['F-statistic'], my['Prob (F-statistic)'], my['Durbin-Watson'])

'𝑅(0.651), 𝑅^2(0.644), 𝐹(89.57), 유의확률(1.49e-12), Durbin-Watson(1.676)'

### 모형 적합도 보고

In [14]:
"%s에 대하여 %s로 예측하는 회귀분석을 실시한 결과, 이 회귀모형은 통계적으로 %s(F(%s,%s) = %s, p < 0.05)." % (
    "dist", 
    ",".join(xname_list), 
    "유의하다" if float(my['Prob (F-statistic)']) < 0.05 else "유의하지 않다", 
    my['Df Model'], 
    my['Df Residuals'], 
    my['F-statistic'])

'dist에 대하여 speed로 예측하는 회귀분석을 실시한 결과, 이 회귀모형은 통계적으로 유의하다(F(1,48) = 89.57, p < 0.05).'

### 독립변수 보고

In [15]:
varstr = []

for i, v in enumerate(my['variables']):
    if i == 0:
        continue

    print(v)
    
    s = "%s의 회귀계수는 %s(p%s0.05)로, %s에 대하여 %s."
    k = s % (v['name'], 
             v['coef'], 
             "<" if float(v['P-value']) < 0.05 else '>', 
             'dist', 
             '유의미한 예측변인인 것으로 나타났다' if float(v['P-value']) < 0.05 else '유의하지 않은 예측변인인 것으로 나타났다'
    )

    varstr.append(k)

varstr

{'name': 'speed', 'coef': '3.9324', 'std err': '0.416', 't': '9.464', 'P-value': '0.000', 'Beta': 0, 'VIF': 1.0}


['speed의 회귀계수는 3.9324(p<0.05)로, dist에 대하여 유의미한 예측변인인 것으로 나타났다.']