In [1]:
# 다중회귀분석 
# 연속형 종속변수와 두 개 이상의 독립변수 간의 선형관계 및 설명력(F-통계량, R-squared)을 확인하는 기법
# 필요시 모델 성능 향상을 위해 파생변수 생성 및 성능 비교 필요
# 명목형 독립변수인 경우 가변수 변환 후 모델 적합 -> pd.get_dummies 사용 (원핫 인코딩)

# 다중 공선성 문제
# 독립변수 간 강한 상관관계가 나타나는 문제 (상관분석으로도 확인 가능하지만, 더 강력한 방법이 필요)
# 상관계수를 확인하여 그 값이 높은것을 사전에 제거
# 회귀모델 생성 이후 분산 팽창계수 (VIF) 확인 (10 이상) 하여 관련 변수 처리 
# 10 미만이면 남김

In [2]:
#patsy - dmatrices()
# 수식 기반 데이터 행렬 생성
# 분산 팽창계수 확인을 위해 입력데이터를 전처리 함

# statsmodels - variance_inflation_factor()
# 분산팽창계수를 연산하기위한 함수
# 반복문 or list comprehension 사용

In [4]:
import pandas as pd
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [6]:
# Load bike.csv into df and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="bike.csv")
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [13]:
# Keep the columns from "season" through "casual"; label-based slices
# include both endpoints.
df_sub = df.loc[:, slice("season", "casual")]
df_sub.head(n=2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual
0,1,0,0,1,9.84,14.395,81,0.0,3
1,1,0,0,1,9.02,13.635,80,0.0,8


In [15]:
# Build the model formula: "casual" is the response and every column of
# df_sub except the last one is a predictor.
formular = f"casual ~ {' + '.join(df_sub.columns[:-1])}"
formular

'casual ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed'

In [17]:
# Expand the formula into the response frame (y) and the design matrix (X),
# both returned as DataFrames.
y, X = dmatrices(
    formular,
    data=df_sub,
    return_type="dataframe",
)

In [19]:
y.head(n=2)  # dependent variable

Unnamed: 0,casual
0,3.0
1,8.0


In [20]:
X.head(n=2)  # independent variables

Unnamed: 0,Intercept,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1.0,1.0,0.0,0.0,1.0,9.84,14.395,81.0,0.0
1,1.0,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0


In [22]:
# Compute the VIF of every design-matrix column (Intercept included) and
# collect the results in a two-column table.
df_vif = pd.DataFrame(
    {
        "colname": X.columns,
        "VIF": [vif(X.values, col_idx) for col_idx in range(X.shape[1])],
    }
)
df_vif

Unnamed: 0,colname,VIF
0,Intercept,34.029472
1,season,1.137211
2,holiday,1.069731
3,workingday,1.071196
4,weather,1.23615
5,temp,35.516012
6,atemp,35.550831
7,humidity,1.425034
8,windspeed,1.195704


In [None]:
# temp, atemp 의 VIF 값이 크게 나오므로 둘 중 하나는 빼줘야 VIF 가 낮게 나옴. (독립변수끼리 강하게 연관되어 있으면 안 됨!)

In [46]:

# Rebuild the modelling frame without atemp: take the "season".."casual"
# column range and drop the one column that is collinear with temp.
df_sub = df.loc[:, "season":"casual"].drop(columns="atemp")
df_sub

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual
0,1,0,0,1,9.84,81,0.0000,3
1,1,0,0,1,9.02,80,0.0000,8
2,1,0,0,1,9.02,80,0.0000,5
3,1,0,0,1,9.84,75,0.0000,3
4,1,0,0,1,9.84,75,0.0000,0
...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,50,26.0027,7
10882,4,0,1,1,14.76,57,15.0013,10
10883,4,0,1,1,13.94,61,15.0013,4
10884,4,0,1,1,13.94,61,6.0032,12


In [47]:
# Rebuild the formula from the reduced column set ("casual" is the response,
# every other column a predictor) and expand it into y / X DataFrames.
formular = f"casual ~ {' + '.join(df_sub.columns[:-1])}"
y, X = dmatrices(
    formular,
    data=df_sub,
    return_type="dataframe",
)
y.head(n=2)

Unnamed: 0,casual
0,3.0
1,8.0


In [49]:
# Recompute the VIF table for the current design matrix X.
df_vif = pd.DataFrame(
    {
        "colname": X.columns,
        "VIF": [vif(X.values, col_idx) for col_idx in range(X.shape[1])],
    }
)
df_vif

Unnamed: 0,colname,VIF
0,Intercept,31.375118
1,season,1.136866
2,holiday,1.068094
3,workingday,1.070025
4,weather,1.235251
5,temp,1.089028
6,humidity,1.421256
7,windspeed,1.14965


In [51]:
# Pairwise correlation matrix, rounded to two decimals.
# Restrict to numeric columns first: df still carries the string `datetime`
# column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# instead of silently dropping it.
df.select_dtypes(include="number").corr().round(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.03,-0.01,0.01,0.26,0.26,0.19,-0.15,0.1,0.16,0.16
holiday,0.03,1.0,-0.25,-0.01,0.0,-0.01,0.0,0.01,0.04,-0.02,-0.01
workingday,-0.01,-0.25,1.0,0.03,0.03,0.02,-0.01,0.01,-0.32,0.12,0.01
weather,0.01,-0.01,0.03,1.0,-0.06,-0.06,0.41,0.01,-0.14,-0.11,-0.13
temp,0.26,0.0,0.03,-0.06,1.0,0.98,-0.06,-0.02,0.47,0.32,0.39
atemp,0.26,-0.01,0.02,-0.06,0.98,1.0,-0.04,-0.06,0.46,0.31,0.39
humidity,0.19,0.0,-0.01,0.41,-0.06,-0.04,1.0,-0.32,-0.35,-0.27,-0.32
windspeed,-0.15,0.01,0.01,0.01,-0.02,-0.06,-0.32,1.0,0.09,0.09,0.1
casual,0.1,0.04,-0.32,-0.14,0.47,0.46,-0.35,0.09,1.0,0.5,0.69
registered,0.16,-0.02,0.12,-0.11,0.32,0.31,-0.27,0.09,0.5,1.0,0.97


In [52]:
# temp, atemp 상관관계가 큼.

In [56]:
# One-hot encode the season column; drop_first=True omits the first level,
# leaving season_2 .. season_4 as the dummy columns.
df_dum = pd.get_dummies(data=df, columns=["season"], drop_first=True)
df_dum.head(n=2)

Unnamed: 0,datetime,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_2,season_3,season_4
0,2011-01-01 00:00:00,0,0,1,9.84,14.395,81,0.0,3,13,16,0,0,0
1,2011-01-01 01:00:00,0,0,1,9.02,13.635,80,0.0,8,32,40,0,0,0


In [None]:
# 문제 1  다중공선성 개수

In [57]:
# NOTE: this rebinds df -- from here on it holds the contents of
# diamonds.csv, not bike.csv.
df = pd.read_csv(filepath_or_buffer="diamonds.csv")
df.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [58]:
# Select the response (price) followed by the numeric predictors.
# Columns are picked by label rather than by position so the selection is
# self-describing and does not silently change if the CSV column order does.
df_sub = df.loc[:, ["price", "carat", "depth", "table", "x", "y", "z"]]
df_sub.head(2)

Unnamed: 0,price,carat,depth,table,x,y,z
0,326,0.23,61.5,55.0,3.95,3.98,2.43
1,326,0.21,59.8,61.0,3.89,3.84,2.31


In [59]:
# price is the first column of df_sub, so every remaining column is used
# as a predictor in the formula.
y, X = dmatrices(
    f"price ~ {' + '.join(df_sub.columns[1:])}",
    data=df_sub,
    return_type="dataframe",
)

# VIF of every design-matrix column (Intercept included).
df_vif = pd.DataFrame(
    {
        "vars": X.columns,
        "VIF": [vif(X.values, col_idx) for col_idx in range(X.shape[1])],
    }
)
df_vif

Unnamed: 0,vars,VIF
0,Intercept,4821.69635
1,carat,21.602712
2,depth,1.49659
3,table,1.143225
4,x,56.187704
5,y,20.454295
6,z,23.530049


In [60]:
# 문제 2

In [61]:
from statsmodels.formula.api import ols

In [62]:
# Fit an OLS regression of price on carat and depth through the formula
# interface, then display the fit summary.
model = ols("price ~ carat + depth", data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.851
Model:,OLS,Adj. R-squared:,0.851
Method:,Least Squares,F-statistic:,153600.0
Date:,"Thu, 26 Jan 2023",Prob (F-statistic):,0.0
Time:,19:12:28,Log-Likelihood:,-472490.0
No. Observations:,53940,AIC:,945000.0
Df Residuals:,53937,BIC:,945000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4045.3332,286.205,14.134,0.000,3484.368,4606.298
carat,7765.1407,14.009,554.282,0.000,7737.682,7792.599
depth,-102.1653,4.635,-22.041,0.000,-111.251,-93.080

0,1,2,3
Omnibus:,14148.858,Durbin-Watson:,0.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148236.675
Skew:,0.962,Prob(JB):,0.0
Kurtosis:,10.89,Cond. No.,2660.0


In [63]:
# Single observation to score with the fitted model, built from one record.
df_test = pd.DataFrame([{"carat": 1, "depth": 60, "table": 55}])
df_test

Unnamed: 0,carat,depth,table
0,1,60,55


In [64]:
# Predict price for the observation in df_test with the fitted model.
model.predict(exog=df_test)

0    5680.554517
dtype: float64

In [65]:
# 문제 3

In [67]:
# Keep the response (price) plus the carat, color and depth predictors.
df_sub = df[["price", "carat", "color", "depth"]]
df_sub.head(n=2)

Unnamed: 0,price,carat,color,depth
0,326,0.23,E,61.5
1,326,0.21,E,59.8


In [68]:
# One-hot encode color; drop_first=True omits the first level so it acts as
# the baseline. dtype=int keeps the dummies as 0/1 integers on every pandas
# version (pandas >= 2.0 defaults to bool), so they stay plain numeric
# columns when used in a model formula.
df_dum = pd.get_dummies(df_sub, columns=["color"], drop_first=True, dtype=int)
df_dum.head(2)

Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,0.23,61.5,1,0,0,0,0,0
1,326,0.21,59.8,1,0,0,0,0,0


In [None]:
# Regress price on every other column of the dummy-encoded frame.
# The data must be df_dum: the formula is built from df_dum's columns, and
# the color_* dummy columns it names do not exist in df.
model = ols(formula = "price ~" + " + ".join(df_dum.columns[1:]), data = df_dum).fit()
model.summary()