# Linear Regression
다음의 항목들에 대해서 학습합니다.

* 선형회귀(Linear Regression)
* 회귀모델 평가


## **1. 환경 준비**

* 라이브러리 로딩

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split  # 데이터 분할
from sklearn.metrics import *          # 모델 평가 함수들 전체(*)
from sklearn.linear_model import LinearRegression

## **2. 데이터 준비**

### (1) data loading

In [None]:
# URL of the Carseats CSV file.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'

# Read the CSV into a DataFrame, then show its first rows as the cell output.
data = pd.read_csv(path)
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


|	변수명	|	설명	|	구분	|
|----|----|----|
|	Sales 	|	 각 지역 판매량(단위 : 1000개)	|	Target	|
|	CompPrice 	|	지역별 경쟁사 판매가격(달러)	|	feature	|
|	Income 	|	가구당 평균 소득액(1000달러)	|	feature	|
|	Advertising 	|	 각 지역, 회사의 광고 예산(1000달러)	|	feature	|
|	Population 	|	 지역 인구수(단위 : 1000명)	|	feature	|
|	Price 	|	 자사 지역별 판매가격(달러)	|	feature	|
|	ShelveLoc 	|	 진열상태(범주 : Bad, Medium, Good)	|	feature	|
|	Age 	|	 지역 인구의 평균 연령	|	feature	|
|	Education 	|	 교육수준(범주 : 10~18)	|	feature	|
|	Urban 	|	 매장이 도심에 있는지 여부(범주 : Yes, No)	|	feature	|
|	US 	|	 매장이 미국에 있는지 여부(범주 : Yes, No)	|	feature	|


* 둘러보기

In [None]:
# Preview the first rows of the loaded DataFrame.
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


### (2) 데이터분할1 : x, y 나누기

In [None]:
# Separate the target column (y) from the remaining feature columns (x).
target = 'Sales'
y = data[target]
x = data.drop(columns=target)

### (3) 가변수화

In [None]:
# Categorical columns that will be converted to dummy variables.
cat_cols = [
    'ShelveLoc',
    'US',
    'Urban',
]

In [None]:
# One-hot encode the categorical columns, dropping the first dummy of each.
# drop_first=True removes the first level's column; the remaining dummy
# columns are still enough to identify every category.
x = pd.get_dummies(data=x, columns=cat_cols, drop_first=True)
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,US_Yes,Urban_Yes
0,138,73,11,276,120,42,17,False,False,True,True
1,111,48,16,260,83,65,10,True,False,True,True
2,113,35,10,269,80,59,12,False,True,True,True
3,117,100,4,466,97,55,14,False,True,True,True
4,141,64,3,340,128,38,13,False,False,False,True


### (4) 데이터분할2 : train : validation 나누기

In [None]:
# Hold out 30% of the rows for validation; random_state is fixed so the
# same split is produced on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=20,
)

## **3. 모델링 : 다중회귀**

* 이번에는 다음의 변수로 예측하는 모델을 각각 만들어 봅시다.
    * model1 : Price + Age
    * model2 : Price + Age + ShelveLoc
    * model3 : 전체 변수

### (1) 모델1
* Price + Age

In [None]:
# Model 1 is trained on the Price and Age columns only.
features = ['Price', 'Age']
x_train1, x_val1 = x_train[features], x_val[features]

In [None]:
# Preview the first rows of the model-1 training features.
x_train1.head()

Unnamed: 0,Price,Age
268,105,39
108,103,65
21,109,62
231,122,27
33,128,50


In [None]:
# Declare the linear regression model and fit it on the model-1 features.
model1 = LinearRegression()
model1.fit(X=x_train1, y=y_train)

In [None]:
# Show the training features, their column names, and the fitted
# coefficients (same order as the columns) followed by the intercept.
print(x_train1)
print(x_train1.columns.tolist())
print(model1.coef_, model1.intercept_)

     Price  Age
268    105   39
108    103   65
21     109   62
231    122   27
33     128   50
..     ...  ...
393    120   30
218    120   25
223    125   62
271    110   62
355    146   42

[280 rows x 2 columns]
['Price', 'Age']
[-0.05668523 -0.05057859] 16.87498339774416


In [None]:
# Predict the target for the validation features.
pred1 = model1.predict(x_val1)
pred1    # predicted y values

array([ 9.89141676,  6.67788339,  7.80893224,  5.69418907,  7.97115588,
        6.79470467,  8.6949202 ,  9.16061537,  7.4382876 ,  7.10255743,
        6.8636032 ,  9.3193882 ,  8.45424057,  7.97633211,  8.08890759,
        8.7708558 ,  7.49497283,  6.81913126,  9.58449443,  6.59584114,
        6.00204182,  6.07877241,  6.3272841 ,  8.32255016,  6.16771629,
        6.68399004,  8.87373838,  7.85512958,  9.83393655,  6.87409109,
        2.3925068 ,  7.67458599,  7.57263384,  8.53017617,  8.73859716,
        8.96268226,  6.95082167,  8.12461705,  7.83774007,  5.39589377,
        5.50143218,  9.32894566,  8.0784197 ,  8.00513993, 10.47579406,
        8.05571852,  6.99794944,  8.064481  ,  6.88537396,  7.45488213,
        7.5560393 ,  8.14731823,  8.64182123,  9.3455402 ,  5.9610207 ,
        8.13855575,  6.47995028,  9.49462013, 11.2405795 ,  8.12023581,
        3.93694682,  6.69620333,  6.47291321,  6.76337645,  7.78623105,
       10.52716763, 11.57803508,  5.02007291,  4.39042869,  7.94

In [None]:
# Validation metrics for model 1 (actual values first, predictions second).
print('RMSE :', root_mean_squared_error(y_true=y_val, y_pred=pred1))
print('MAE  :', mean_absolute_error(y_true=y_val, y_pred=pred1))
print('r2 :', r2_score(y_true=y_val, y_pred=pred1))

RMSE : 2.5176666485070442
MAE  : 2.053592574723096
r2 : 0.23901833455873955


### (2) 모델2
* Price + Age + ShelveLoc

In [None]:
# Preview the training features to check the dummy column names.
x_train.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,US_Yes,Urban_Yes
268,123,57,0,66,105,39,11,False,True,False,True
108,107,79,2,488,103,65,16,False,False,False,True
21,134,29,12,239,109,62,18,True,False,True,False
231,132,69,0,123,122,27,11,False,True,False,False
33,114,38,13,317,128,50,16,True,False,True,True


In [None]:
# Model 2 adds the two ShelveLoc dummy columns to Price and Age.
features = [
    'Price',
    'Age',
    'ShelveLoc_Good',
    'ShelveLoc_Medium',
]
x_train2, x_val2 = x_train[features], x_val[features]

In [None]:
# Declare the linear regression model and fit it on the model-2 features.
model2 = LinearRegression()
model2.fit(X=x_train2, y=y_train)

In [None]:
# Column names, then the fitted coefficients (same order) and the intercept.
print(x_train2.columns.tolist())
print(model2.coef_, model2.intercept_)

['Price', 'Age', 'ShelveLoc_Good', 'ShelveLoc_Medium']
[-0.05952678 -0.04824324  5.00353761  1.99168349] 14.855540275889222


In [None]:
# Predict the target for the model-2 validation features.
pred2 = model2.predict(X=x_val2)

In [None]:
# Validation metrics for model 2 (actual values first, predictions second).
print('RMSE :', root_mean_squared_error(y_true=y_val, y_pred=pred2))
print('MAE  :', mean_absolute_error(y_true=y_val, y_pred=pred2))
print('r2 :', r2_score(y_true=y_val, y_pred=pred2))

RMSE : 1.8789210897060296
MAE  : 1.5399212833651519
r2 : 0.5761669476428513


### (3) 모델3
* 전체 변수

In [None]:
# Model 3 is fitted on every feature column of the training set.
model3 = LinearRegression()
model3.fit(X=x_train, y=y_train)

In [None]:
# Column names, then the fitted coefficients (same order) and the intercept.
print(x_train.columns.tolist())
print(model3.coef_, model3.intercept_)

['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'Age', 'Education', 'ShelveLoc_Good', 'ShelveLoc_Medium', 'US_Yes', 'Urban_Yes']
[ 9.14550155e-02  1.52896750e-02  1.11545812e-01  4.10916196e-04
 -9.49346042e-02 -4.32955626e-02 -2.39647930e-02  4.75921210e+00
  1.82116013e+00 -1.00667072e-01  9.23749941e-02] 5.7885182934110055


In [None]:
# Predict the target for the full validation feature set.
pred3 = model3.predict(X=x_val)

In [None]:
# Validation metrics for model 3 (actual values first, predictions second).
print('RMSE :', root_mean_squared_error(y_true=y_val, y_pred=pred3))
print('MAE  :', mean_absolute_error(y_true=y_val, y_pred=pred3))
print('r2 :', r2_score(y_true=y_val, y_pred=pred3))

RMSE : 1.1271151463198543
MAE  : 0.9415085484647864
r2 : 0.8474843845917948


### (4) 참조 : 릿지, 라쏘 모델링

In [None]:
from sklearn.linear_model import Ridge, Lasso

#### 1) 릿지 모델링

In [None]:
# Declare the ridge model; alpha sets the regularization strength.
ridge_model = Ridge(alpha=1)

# Fit on the full training feature set.
ridge_model.fit(X=x_train, y=y_train)

# Predict on the validation set and report the metrics.
pred_r = ridge_model.predict(X=x_val)

print('RMSE :', root_mean_squared_error(y_true=y_val, y_pred=pred_r))
print('MAE  :', mean_absolute_error(y_true=y_val, y_pred=pred_r))
print('r2 :', r2_score(y_true=y_val, y_pred=pred_r))

RMSE : 1.1372643509781297
MAE  : 0.95445098078036
r2 : 0.8447253384495139


#### 2) 라쏘 모델링

In [None]:
# Declare the lasso model; alpha sets the regularization strength.
lasso_model = Lasso(alpha=1)

# Fit on the full training feature set.
lasso_model.fit(X=x_train, y=y_train)

# Predict on the validation set and report the metrics.
pred_l = lasso_model.predict(X=x_val)

print('RMSE :', root_mean_squared_error(y_true=y_val, y_pred=pred_l))
print('MAE  :', mean_absolute_error(y_true=y_val, y_pred=pred_l))
print('r2 :', r2_score(y_true=y_val, y_pred=pred_l))

RMSE : 2.076195993279619
MAE  : 1.6934439395436158
r2 : 0.48249513511692343
