## 다중 회귀

## 다중 회귀 (Multiple regression)

오늘 수업에서는 여러 개의 변수로 하나의 종속 변수를 설명하는 방법에 대해 실습해보겠습니다.

In [1]:
import numpy as np
import pandas as pd

In [25]:
np.set_printoptions(precision=2, floatmode= 'fixed')

### 사용할 데이터    
보스턴 데이터


### 데이터 설명
타겟 데이터   
1978 보스턴 주택 가격   
506개 타운의 주택 가격 중앙값 (단위 1,000 달러)   

|특징 데이터||
|------|---|
|CRIM|범죄율|
|INDUS|비소매상업지역 면적 비율|
|NOX|일산화질소 농도|
|RM|주택당 방 수|
|LSTAT|인구 중 하위 계층 비율|
|B|인구 중 흑인 비율|
|PTRAT|학생/교사 비율|
|ZN|25,000 평방피트를 초과 거주지역 비율|
|CHAS|찰스강의 경계에 위치한 경우는 1, 아니면 0|
|AGE|1940년 이전에 건축된 주택의 비율|
|RAD|방사형 고속도로까지의 거리|
|DIS|직업센터의 거리|
|TAX|재산세율|
|Target|집값|

In [2]:
# 데이터 불러오기
boston = pd.read_csv("./Boston_house.csv")
boston.head()

Unnamed: 0,AGE,B,RM,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,ZN,TAX,CHAS,Target
0,65.2,396.9,6.575,0.00632,4.09,2.31,4.98,0.538,15.3,1,18.0,296,0,24.0
1,78.9,396.9,6.421,0.02731,4.9671,7.07,9.14,0.469,17.8,2,0.0,242,0,21.6
2,61.1,392.83,7.185,0.02729,4.9671,7.07,4.03,0.469,17.8,2,0.0,242,0,34.7
3,45.8,394.63,6.998,0.03237,6.0622,2.18,2.94,0.458,18.7,3,0.0,222,0,33.4
4,54.2,396.9,7.147,0.06905,6.0622,2.18,5.33,0.458,18.7,3,0.0,222,0,36.2


In [3]:
boston.columns

Index(['AGE', 'B', 'RM', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO',
       'RAD', 'ZN', 'TAX', 'CHAS', 'Target'],
      dtype='object')

AGE ~ CHAS 까지 총 13개의 독립 변수로 Boston house price (Target) 을 설명하는 모델을 만들어 보도록 하겠습니다.

In [4]:
features_to_use = boston.columns.drop('Target')
features_to_use

Index(['AGE', 'B', 'RM', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO',
       'RAD', 'ZN', 'TAX', 'CHAS'],
      dtype='object')

In [6]:
# Dependent variable 및 independent variable 셋팅
y, X = boston[['Target']], boston[features_to_use]

In [7]:
X.head()

Unnamed: 0,AGE,B,RM,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,ZN,TAX,CHAS
0,65.2,396.9,6.575,0.00632,4.09,2.31,4.98,0.538,15.3,1,18.0,296,0
1,78.9,396.9,6.421,0.02731,4.9671,7.07,9.14,0.469,17.8,2,0.0,242,0
2,61.1,392.83,7.185,0.02729,4.9671,7.07,4.03,0.469,17.8,2,0.0,242,0
3,45.8,394.63,6.998,0.03237,6.0622,2.18,2.94,0.458,18.7,3,0.0,222,0
4,54.2,396.9,7.147,0.06905,6.0622,2.18,5.33,0.458,18.7,3,0.0,222,0


In [8]:
y.head()

Unnamed: 0,Target
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


### Linear regression using statsmodel.api

In [28]:
import statsmodels.api as sm

In [29]:
# ols 사용
# ordinary least squares

# stats 모델에서 bias를 사용하려면 constant를 추가해서 모델링을 해줘야 함.
X1 = sm.add_constant(X, has_constant='add')
X1.head(5)

Unnamed: 0,const,AGE,B,RM,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,ZN,TAX,CHAS
0,1.0,65.2,396.9,6.575,0.00632,4.09,2.31,4.98,0.538,15.3,1,18.0,296,0
1,1.0,78.9,396.9,6.421,0.02731,4.9671,7.07,9.14,0.469,17.8,2,0.0,242,0
2,1.0,61.1,392.83,7.185,0.02729,4.9671,7.07,4.03,0.469,17.8,2,0.0,242,0
3,1.0,45.8,394.63,6.998,0.03237,6.0622,2.18,2.94,0.458,18.7,3,0.0,222,0
4,1.0,54.2,396.9,7.147,0.06905,6.0622,2.18,5.33,0.458,18.7,3,0.0,222,0


In [30]:
lm = sm.OLS(y, X1)
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,Target,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Mon, 06 Dec 2021",Prob (F-statistic):,6.72e-135
Time:,12:19:11,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4595,5.103,7.144,0.000,26.432,46.487
AGE,0.0007,0.013,0.052,0.958,-0.025,0.027
B,0.0093,0.003,3.467,0.001,0.004,0.015
RM,3.8099,0.418,9.116,0.000,2.989,4.631
CRIM,-0.1080,0.033,-3.287,0.001,-0.173,-0.043
DIS,-1.4756,0.199,-7.398,0.000,-1.867,-1.084
INDUS,0.0206,0.061,0.334,0.738,-0.100,0.141
LSTAT,-0.5248,0.051,-10.347,0.000,-0.624,-0.425
NOX,-17.7666,3.820,-4.651,0.000,-25.272,-10.262

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0


### Linear regression using scikit-learn 

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [10]:
rl = LinearRegression()

In [11]:
rl.fit(X, y)

LinearRegression()

In [27]:
# 기울기 및 절편 확인
print(f'기울기: {rl.coef_[0]}')
print(f'절편: {rl.intercept_[0]:.4f}' )

# R^2 및 MSE 확인
y_hat = rl.predict(X)
print(f'R^2: {rl.score(X, y):.4f}')
print(f'MSE: {mean_squared_error(y, y_hat):.4f}')

기울기: [ 6.92e-04  9.31e-03  3.81e+00 -1.08e-01 -1.48e+00  2.06e-02 -5.25e-01
 -1.78e+01 -9.53e-01  3.06e-01  4.64e-02 -1.23e-02  2.69e+00]
절편: 36.4595
R^2: 0.7406
MSE: 21.8948


## 실습: Wine-aroma 데이터 셋에 대해 다중 회귀 모델링하기

In [36]:
# 데이터 불러오기
wine = pd.read_excel('./wine_aroma.xlsx')
wine.head()

Unnamed: 0,Mo,Ba,Cr,Sr,Pb,B,Mg,Ca,K,Aroma
0,0.044,0.387,0.029,1.23,0.561,2.63,128.0,80.5,1130,3.3
1,0.16,0.312,0.038,0.975,0.697,6.21,193.0,75.0,1010,4.4
2,0.146,0.308,0.035,1.14,0.73,3.05,127.0,91.0,1160,3.9
3,0.191,0.165,0.036,0.927,0.796,2.57,112.0,93.6,924,3.9
4,0.363,0.38,0.059,1.13,1.73,3.07,138.0,84.6,1090,5.6


이번 실습에서는 위 wine 데이터를 이용해 다중 회귀를 모델링해보겠습니다.     
(1) 독립 변수는 Mo, Ba, Cr, Sr, Pb, B, Mg, Ca, K 로 설정하시고 종속 변수는 Aroma 를 사용해주세요.    
(2) train-test 비율을 7:3 (test_size =0.3)으로 해서 모델 학습 및 평가를 해주세요.    
(3) 모델 평가 지표는 R^2 와 MSE를 사용해 주시면 됩니다.

In [41]:
# 필요한 라이브러리 import
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### 실습 가이드 라인

In [39]:
# (1) 독립 변수는 Mo, Ba, Cr, Sr, Pb, B, Mg, Ca, K 로 설정하시고 종속 변수는 Aroma 를 사용해주세요.
features = wine.columns
features_to_use = features.drop('Aroma')

X, y = wine[features_to_use], wine[['Aroma']]

In [42]:
# (2) train-test 비율을 7:3 (test_size =0.3)으로 해서 모델 학습 및 평가를 해주세요.

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)

In [43]:
# 모델 선언
lr = LinearRegression()

# 모델 학습
lr.fit(X_train, y_train)

LinearRegression()

In [45]:
# 모델 평가
y_hat_train, y_hat_test = lr.predict(X_train), lr.predict(X_test)
mse_train, mse_test = mean_squared_error(y_train, y_hat_train), mean_squared_error(y_test, y_hat_test)
r2_train, r2_test = lr.score(X_train, y_train), lr.score(X_test, y_test)

In [47]:
print(f'MSE for the training set: {mse_train:.4f}')
print(f'MSE for the testset: {mse_train:.4f}')
print(f'R^2 for the training set: {r2_train:.4f}')
print(f'R^2 for the testset: {r2_test:.4f}')

MSE for the training set: 0.1816
MSE for the testset: 0.1816
R^2 for the training set: 0.8452
R^2 for the testset: 0.6898


### 참고 사항: Cross validation (K fold) 를 사용한 결과는?

In [48]:
# 참고 사항
# K-fold를 사용한다면?
from sklearn.model_selection import KFold

In [49]:
# 직접 cross validation 활용하기
def my_cv(model, X, y, n_splits:int, shuffle = True):
    
    # performance 저장용
    perf = {'MSE_training':[], 'MSE_test':[], 'R2_train':[],'R2_test':[]}
    idx = X.index
    folds = KFold(n_splits=n_splits, shuffle= shuffle)
    for idx_train, idx_test in folds.split(idx):
        X_train = X.loc[idx_train,:]
        X_test = X.loc[idx_test, :]
        y_train = y.loc[idx_train]
        y_test = y.loc[idx_test]
        
        model.fit(X_train, y_train)
        # performance 계산
        mse_train = mean_squared_error(y_train, model.predict(X_train))
        mse_test = mean_squared_error(y_test, model.predict(X_test))
        r2_train = model.score(X_train, y_train)
        r2_test = model.score(X_test, y_test)
        
        # performance list에 저장
        perf['MSE_training'].append(mse_train)
        perf['MSE_test'].append(mse_test)
        perf['R2_train'].append(r2_train)
        perf['R2_test'].append(r2_test)
        
    return perf

In [50]:
perf = my_cv(lr, X, y, n_splits= 5)

In [52]:
df_perf = pd.DataFrame(perf)

In [53]:
df_perf

Unnamed: 0,MSE_training,MSE_test,R2_train,R2_test
0,0.150389,2.055252,0.865757,-0.714997
1,0.176405,0.583342,0.867691,-4.928274
2,0.142162,0.513782,0.869274,0.544519
3,0.188261,0.446464,0.861111,-1.066965
4,0.120452,1.475537,0.861326,-0.005134


In [55]:
df_perf.mean(axis=0)

MSE_training    0.155534
MSE_test        1.014876
R2_train        0.865032
R2_test        -1.234170
dtype: float64

In [57]:
df_perf.std(axis=0)

MSE_training    0.027123
MSE_test        0.716763
R2_train        0.003698
R2_test         2.157439
dtype: float64

### 결과 해석     
* Training data 에 대해 잘 설명하지만 Test data 에 대해서 잘 설명하지 못함.   
* Generalization performance 가 좋지 않음.   

---

## 명목형 데이터 처리 방법

In [58]:
# 데이터 불러오기
df = pd.read_csv('./abalone.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Class_number_of_rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### 데이터 설명
타겟 데이터   
전복 (Abalone) 고리 수 예측  (Class_number_of_rings) 

|특징 데이터||
|------|---|
|Sex|성별 (M:Male, I:Infant, F:Female)|
|Length|전복 길이|
|Diameter|전복 지름|
|Height|전복 높이|
|Whole_weight|전복 무게|
|Shucked_weight|전복 껍데기를 제외한 무게|
|Viscera_weight|전복 내장 무게|
|Shell_weight|전복 껍데기 무게|
|Class_number_of_rings|전복 고리 수 (Target)|


이 때 전복의 성별이 수치형 변수가 아니므로 수치형 변수로 변경해줄 필요가 있다. 
(One hot encoding)

### One hot encoding

In [59]:
# 전복의 성별 --> one-hot encoding
sex = df.pop('Sex') # df 에서 'Sex' column 을 제거하고 이를 리턴함.
sex

0       M
1       M
2       F
3       M
4       I
       ..
4172    F
4173    M
4174    M
4175    F
4176    M
Name: Sex, Length: 4177, dtype: object

In [60]:
df.columns # 성별이 없는 것 확인.

Index(['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
       'Viscera_weight', 'Shell_weight', 'Class_number_of_rings'],
      dtype='object')

In [80]:
# 전복의 성별 one-hot-encoding
sex_one_hot = pd.get_dummies(sex)
sex_one_hot.head(4)

Unnamed: 0,F,I,M
0,0,0,1
1,0,0,1
2,1,0,0
3,0,0,1


In [72]:
# 위 encoding을 그대로 사용하면, multi-collinearity 문제가 발생하므로 (why?)
# 한 개 컬럼을 탈락시킨 encoding을 사용하기로 함.
# sex_one_hot에서 원하는 컬럼 하나를 탈락시켜도 되고 
# 처음부터 pd.get_dummies(sex, drop_first=True)를 이용해서 첫 번째 열을 자동 탈락 시켜도 됨. 

sex_one_hot_drop_first = pd.get_dummies(sex,drop_first=True)
sex_one_hot_drop_first.head(2)

Unnamed: 0,I,M
0,0,1
1,0,1


In [71]:
sex_one_hot_I = sex_one_hot[['F','M']]
sex_one_hot_I.head(2)

Unnamed: 0,F,M
0,0,1
1,0,1


In [74]:
# 분석에 용이한 encoding을 선택하면 됨. 
# 본 실습 시간에서는 F, M 컬럼을 사용하겠음.
# 원래 데이터와 concat 해줌.
df_concat = pd.concat([sex_one_hot_I, df], axis=1)
df_concat.head()

Unnamed: 0,F,M,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Class_number_of_rings
0,0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### 다중 회귀 모델링

In [75]:
# 변수 선언
X, y = df_concat[df_concat.columns.drop('Class_number_of_rings')], df_concat[['Class_number_of_rings']]

In [76]:
# 모델 선언
rl = LinearRegression()

In [77]:
# 모델 학습
rl.fit(X, y)

LinearRegression()

In [78]:
# 모델 평가
rl.score(X,y)

0.5378844030211949

## 실습: One hot encoding

In [81]:
data = pd.read_csv('./student-mat.csv')

In [82]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [83]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [84]:
# categorical data 와 numeric data 분리
idx_cat = data.dtypes=='object' 
# 모든 카테고리 데이터가 타입이 object인 것은 아니지만 오늘 실습에서는 편의상 
# object 타입의 데이터가 모두 categorical 데이터라고 가정하겠습니다.

data_cat, data_num = data.loc[:, idx_cat], data.loc[:, ~idx_cat]

### 위에서 분리한 categorical data를 one-hot-encoding해서 학습용 데이터를 만들어 주세요.

(1) categorical data one hot encoding (   
Hint: pd.get_dummies(data_cat, drop_first =True) )     
(2) numeric data 와 one hot encoding 된 데이터 concat.

### 실습: 가이드 라인

In [87]:
# categorical data one hot encoding
data_cat_one_hot =  pd.get_dummies(data_cat, drop_first =True)
data_cat_one_hot.head()

Unnamed: 0,school_MS,sex_M,address_U,famsize_LE3,Pstatus_T,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_health,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,1,0,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
2,0,0,1,1,1,0,0,0,0,0,...,1,0,1,0,1,0,1,1,1,0
3,0,0,1,0,1,1,0,0,0,0,...,1,0,0,1,1,1,1,1,1,1
4,0,0,1,0,1,0,1,0,0,0,...,0,0,0,1,1,0,1,1,0,0


In [88]:
df_concat = pd.concat([data_cat_one_hot,data_num ], axis= 1)

In [89]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   school_MS          395 non-null    uint8
 1   sex_M              395 non-null    uint8
 2   address_U          395 non-null    uint8
 3   famsize_LE3        395 non-null    uint8
 4   Pstatus_T          395 non-null    uint8
 5   Mjob_health        395 non-null    uint8
 6   Mjob_other         395 non-null    uint8
 7   Mjob_services      395 non-null    uint8
 8   Mjob_teacher       395 non-null    uint8
 9   Fjob_health        395 non-null    uint8
 10  Fjob_other         395 non-null    uint8
 11  Fjob_services      395 non-null    uint8
 12  Fjob_teacher       395 non-null    uint8
 13  reason_home        395 non-null    uint8
 14  reason_other       395 non-null    uint8
 15  reason_reputation  395 non-null    uint8
 16  guardian_mother    395 non-null    uint8
 17  guardian_other  

## 생각해 볼만한 것

* 주어진 독립 변수를 모두 사용하는 것이 항상 옳을까?    
* 아니라면 어떤 방법이 있을까?   