In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


sns.set()

#-------------------- 차트 관련 속성 (한글처리, 그리드) -----------
plt.rcParams['font.family']= 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

#-------------------- 주피터 , 출력결과 넓이 늘리기 ---------------
# from IPython.core.display import display, HTML
from IPython.display import display, HTML

display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings(action='ignore')

In [23]:
# ----------------- 학습 ----------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression



# ----------------- 평가 ----------------------
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve

from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

from pycaret.classification import *

In [8]:
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS


# lec14_OLS회귀분석
class statsmodels.regression.linear_model.OLS(endog, exog=None, missing='none', hasconst=None, **kwargs)

* OLS(Ordinary Least Squares) : 최소제곱(자승)법
* RSS, 잔차제곱합(Y-Y^)^2 을 최소화하는 <b>가중치 벡터</b>를 구하는 방법
* y = wX + b
    * w : 회귀계수(coef)
    * b : 편향(bias)

* X >> 무조건 bias 추가해서 사용 >> X.sm.add.constant

# 실습

In [3]:
from sklearn.datasets import load_boston
dataset = load_boston()

In [10]:
df = pd.DataFrame(dataset['data'], columns = dataset['feature_names'])
df['MEDV'] = dataset['target']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [20]:
y = df["MEDV"] 
X= df.drop("MEDV", axis=1)

In [21]:
y.shape, X.shape

((506,), (506, 13))

In [22]:
# X= sm.add_constant(x)  # 상수항 결합 / bias 값 추가

model=OLS(y,X)
fit_res=model.fit()
fit_res.summary()


# R2 : 0.959, AIC : 3074.

0,1,2,3
Dep. Variable:,MEDV,R-squared (uncentered):,0.959
Model:,OLS,Adj. R-squared (uncentered):,0.958
Method:,Least Squares,F-statistic:,891.3
Date:,"Wed, 05 Apr 2023",Prob (F-statistic):,0.0
Time:,11:07:26,Log-Likelihood:,-1523.8
No. Observations:,506,AIC:,3074.0
Df Residuals:,493,BIC:,3128.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.0929,0.034,-2.699,0.007,-0.161,-0.025
ZN,0.0487,0.014,3.382,0.001,0.020,0.077
INDUS,-0.0041,0.064,-0.063,0.950,-0.131,0.123
CHAS,2.8540,0.904,3.157,0.002,1.078,4.630
NOX,-2.8684,3.359,-0.854,0.394,-9.468,3.731
RM,5.9281,0.309,19.178,0.000,5.321,6.535
AGE,-0.0073,0.014,-0.526,0.599,-0.034,0.020
DIS,-0.9685,0.196,-4.951,0.000,-1.353,-0.584
RAD,0.1712,0.067,2.564,0.011,0.040,0.302

0,1,2,3
Omnibus:,204.082,Durbin-Watson:,0.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1374.225
Skew:,1.609,Prob(JB):,3.9e-299
Kurtosis:,10.404,Cond. No.,8500.0


In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size = 0.2, random_state=1)

In [26]:
lr = LinearRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
sc = mean_squared_error(y_test,pred)
sc

23.380836480270247

In [29]:
coef_val = pd.DataFrame(lr.coef_)
coef_val

Unnamed: 0,0
0,-0.112387
1,0.058059
2,0.018359
3,2.129978
4,-19.581101
5,3.095462
6,0.004453
7,-1.500476
8,0.305359
9,-0.011123


# OLS

In [5]:
x = [1,3,2,5,6,9,12,23,35,60]
y = [10,21,32,44,56,65,76,89,90,100]

# model = sm.OLS(y,x)

x= sm.add_constant(x)  # 상수항 결합 / bias 값 추가
model=OLS(y,x)
fit_res=model.fit()
fit_res.summary()




0,1,2,3
Dep. Variable:,y,R-squared:,0.66
Model:,OLS,Adj. R-squared:,0.617
Method:,Least Squares,F-statistic:,15.52
Date:,"Wed, 05 Apr 2023",Prob (F-statistic):,0.0043
Time:,10:52:25,Log-Likelihood:,-42.623
No. Observations:,10,AIC:,89.25
Df Residuals:,8,BIC:,89.85
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,37.5081,8.045,4.662,0.002,18.956,56.061
x1,1.3328,0.338,3.939,0.004,0.553,2.113

0,1,2,3
Omnibus:,1.535,Durbin-Watson:,0.422
Prob(Omnibus):,0.464,Jarque-Bera (JB):,0.81
Skew:,-0.256,Prob(JB):,0.667
Kurtosis:,1.704,Cond. No.,31.5


classmethod OLS.from_formula(formula, data, subset=None, drop_cols=None, *args, **kwargs)¶

In [6]:
x = [1,3,2,5,6,9,12,23,35,60]
y = [10,21,32,44,56,65,76,89,90,100]
df = pd.DataFrame({"x":x, "y":y})
model = OLS.from_formula("y ~ x", data=df)
fit_res=model.fit()
fit_res.summary()



0,1,2,3
Dep. Variable:,y,R-squared:,0.66
Model:,OLS,Adj. R-squared:,0.617
Method:,Least Squares,F-statistic:,15.52
Date:,"Wed, 05 Apr 2023",Prob (F-statistic):,0.0043
Time:,10:52:25,Log-Likelihood:,-42.623
No. Observations:,10,AIC:,89.25
Df Residuals:,8,BIC:,89.85
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,37.5081,8.045,4.662,0.002,18.956,56.061
x,1.3328,0.338,3.939,0.004,0.553,2.113

0,1,2,3
Omnibus:,1.535,Durbin-Watson:,0.422
Prob(Omnibus):,0.464,Jarque-Bera (JB):,0.81
Skew:,-0.256,Prob(JB):,0.667
Kurtosis:,1.704,Cond. No.,31.5


<pre>
R-squared(R2) : 1이면 좋다
F-statistic   : 크다 >> 분산의 분포에 차이가 있다 >> 두 피쳐가 차이가 있다
AIC           : 낮을 수록 좋음
coef          : 클수록 y에 영향도가 크다 
P>|t|         : 0.05보다 낮아야 유의미하다
Skew          : 정규분포의 왜도 0  
Kurtosis      : 정규분포의 첨도 0

* 회귀방정식 >>>  y = 1.3328x + 37.5081

# F-statistic (F통계량)
<pre>
- SSError = SSResidual    : Sum of Squares for Errors (or Residuals) 잔차의 제곱합
- SSReg. SSTr = SSExplain : Sum of Squares for Regression (or Treated) 회귀(예측값) 제곱합
- SSTotal                 : Sum of Squares Total (SSE + SSTr)

- MSE = Mean of Squared Error      = SSError / Df Residuals(8) (총샘플수-독립변수갯수-종속변수갯수)
- MSR = Mean of Sauqred Regression = SSReg   / Df Model(1)     (독립변수의 개수)
- F-statistic (F통계량)            = SSReg.의 평균(MSR) / SSError의 평균(MSE)

In [7]:
x = [1,3,2,5,6,9,12,23,35,60]
y = [10,21,32,44,56,65,76,89,90,100]
#y = 1.3328x + 37.5081

y_mean = np.mean(y)                          # y평균
y_pred = np.array(x) * 1.3328  + 37.5081     # 회귀식으로 구한 예측값
y_true = np.array(y)                         # 실제값

SSError  = ((y_true - y_pred)**2).sum()      #잔차(실제값-예측값) 제곱 총합
SSReg    = ((y_mean - y_pred)**2).sum()      #편차(평균 - 예측값) 제곱 총합
SST      = SSError + SSReg

R2 = SSReg / SST


MSE = SSError / 8                            # SSError / Df Residuals(8)  >> n - k - 1
MSR = SSReg / 1                              # SSReg   / Df Model(1)      >> k(독립변수의 개수)
F   = MSR  / MSE                             # SSReg.의 평균(MSR) / SSError의 평균(MSE)

print(R2, F )                                # 결정계수, F통계량

0.6598154386835811 15.51664628471745


<pre>

Dep. Variable    : 타겟변수명
Model            : OLS
Method           : Least Squares(최소제곱)
No. Observations : 샘플갯수(10건)
Df Residuals     : 잔차자유도(샘플갯수-종속변수갯수(y)-독립변수갯수(x))
Df Model         : 독립변수 갯수(x)
Covariance Type  : nonrobust(non-constant variance)

---------------------------------------------------------

<font color=red><b>R-squared(R2)</b></font>    : 결정계수(회귀식의 설명력) =  SSReg / SST
                   전체 데이터에서 회귀모델이 설명할 수 있는 데이터 비율
                   회귀모델 y = 1.3x + 37은 전체 데이터의 66%를 설명할 수 있다
Adj. R-squared    : 보정된 R2

<font color=red><b>F-statistic</b></font>       : F분포(통계량)
                    F통계량으로 회귀모델 y = 1.3x + 37의 적절성 평가
Prob (F-statistic): F분포(통계량) 유의수준

<font color=red><b>AIC</b></font> BIC           : 손실 가중치 계산 (낮을 수록 좋음)
                  : X피쳐를 이용해 Y를 예측할 수 있는 정도
                    AIC (Akaikie’s Information Criteria)
                    BIC (Bayesian Information Criterion)
                    
---------------------------------------------------------

<font color=red><b>coef</b></font>              : 회귀계수(클수록 y에 영향도가 크다)
std err           : 표준오차 (오차합 / 표준편차) (낮을 수록 좋음)
t                 : t테스트 (평균값이용 : x피쳐가 y에 영향을 주는 정도 : 상관도)
<font color=red><b>P>|t|</b></font>             : 유의수준 (p-value)
[0.025 0.975]     : 신뢰구간

---------------------------------------------------------

Omnibus           : 비대칭도(왜도) 정규성 테스트 값 (크다:정규분포를 따른다)
Prob(Omnibus)     : Omnibus 유의수준

<font color=red><b>Skew</b></font>              : 왜도 (좌우 비대칭도)
<font color=red><b>Kurtosis</b></font>          : 첨도(뾰족)
Durbin-Watson     : (DW검정)잔차의 독립성을 확인할 수 있는 수치
                     1.5 ~ 2.5 사이이면 독립으로 판단(회귀 모형이 적합하다)
                     0 : 잔차들이 양의 자기 상관
                     2 : 독립성(자기 상관이 없다)
                     4 : 잔차들이 음의 자기 상관