# Regression 종합실습 : Car seat sales
유아용 카시트 판매량을 예측해 봅시다.

* 카시트에 대해서 지역 매장별 판매량(Sales, 단위 : 1000개)을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
# Load the libraries used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # used to suppress warning messages
# NOTE: this silences every warning for the whole session, including
# deprecation warnings emitted by the libraries above.
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [2]:
# Remote CSV file that holds the Carseats data set.
data_path = (
    'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
)
# Download the file and parse it into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_path)

**변수설명**
* Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [3]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 3.데이터 준비

### (1) 데이터 정리

In [4]:
# Derive the gap between the competitor's price and our own price, then
# discard the raw competitor price column now that it is folded into the gap.
data['Price_Diff'] = data['CompPrice'].sub(data['Price'])
data.drop(columns='CompPrice', inplace=True)

### (2) 데이터분할1 : x, y 나누기

In [5]:
# Separate the label column (y) from the predictor columns (x).
target = 'Sales'
y = data[target]
x = data.drop(columns=[target])

### (3) NA 조치

### (4) 가변수화

In [6]:
# Categorical columns that are turned into dummy (indicator) variables.
dumm_var = [
    'ShelveLoc',
    'Education',
    'Urban',
    'US',
]

# Encode all listed columns in one call; drop_first removes the first level
# of each column so the remaining indicators are not redundant.
x = pd.get_dummies(data=x, columns=dumm_var, drop_first=True)
x.head()

Unnamed: 0,Income,Advertising,Population,Price,Age,Price_Diff,ShelveLoc_Good,ShelveLoc_Medium,Education_11,Education_12,Education_13,Education_14,Education_15,Education_16,Education_17,Education_18,Urban_Yes,US_Yes
0,73,11,276,120,42,18,0,0,0,0,0,0,0,0,1,0,1,1
1,48,16,260,83,65,28,1,0,0,0,0,0,0,0,0,0,1,1
2,35,10,269,80,59,33,0,1,0,1,0,0,0,0,0,0,1,1
3,100,4,466,97,55,20,0,1,0,0,0,1,0,0,0,0,1,1
4,64,3,340,128,38,13,0,0,0,0,1,0,0,0,0,0,1,0


In [12]:
# Display the first rows of the encoded feature frame again.
x.head()

Unnamed: 0,Income,Advertising,Population,Price,Age,Price_Diff,ShelveLoc_Good,ShelveLoc_Medium,Education_11,Education_12,Education_13,Education_14,Education_15,Education_16,Education_17,Education_18,Urban_Yes,US_Yes
0,73,11,276,120,42,18,0,0,0,0,0,0,0,0,1,0,1,1
1,48,16,260,83,65,28,1,0,0,0,0,0,0,0,0,0,1,1
2,35,10,269,80,59,33,0,1,0,1,0,0,0,0,0,0,1,1
3,100,4,466,97,55,20,0,1,0,0,0,1,0,0,0,0,1,1
4,64,3,340,128,38,13,0,0,0,0,1,0,0,0,0,0,1,0


### (5) 데이터분할2 : train : validation 나누기

In [7]:
# Hold out 30% of the rows for validation; the seed makes the split repeatable.
# (In practice random_state is rarely pinned.)
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

### (6) Scaling (NaN이 남아 있거나 범주형 변수를 가변수화하지 않으면 오류 발생)
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the training split only, then apply that
# same scaling to both the training and the validation split.
scaler = MinMaxScaler()
x_train_s1 = scaler.fit(x_train).transform(x_train)
x_val_s1 = scaler.transform(x_val)

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardize with statistics learned from the training split only.
# The scaler is bound to its own name (scaler2) so that the calls below
# resolve, and so the MinMaxScaler kept in `scaler` is not overwritten.
scaler2 = StandardScaler()
x_train_s2 = scaler2.fit_transform(x_train)
x_val_s2 = scaler2.transform(x_val)

In [11]:
# The scalers return plain NumPy arrays, so the column names have to be
# attached again explicitly.
scaled_cols = list(x)

x_train_s1 = pd.DataFrame(x_train_s1, columns=scaled_cols)
x_train_s2 = pd.DataFrame(x_train_s2, columns=scaled_cols)

# Wrap the validation arrays the same way: a model fitted on a DataFrame
# with feature names should also predict on one, otherwise scikit-learn
# reports a feature-name mismatch.
x_val_s1 = pd.DataFrame(x_val_s1, columns=scaled_cols)
x_val_s2 = pd.DataFrame(x_val_s2, columns=scaled_cols)

## 4.모델링 : 선형회귀

* 변수를 조절하며 2개 이상의 모델을 생성하고, 각각 예측하고 평가해 봅시다.

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *

In [13]:
# Look at the data again after the Price_Diff column replaced CompPrice.
data.head()

Unnamed: 0,Sales,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,Price_Diff
0,9.5,73,11,276,120,Bad,42,17,Yes,Yes,18
1,11.22,48,16,260,83,Good,65,10,Yes,Yes,28
2,10.06,35,10,269,80,Medium,59,12,Yes,Yes,33
3,7.4,100,4,466,97,Medium,55,14,Yes,Yes,20
4,4.15,64,3,340,128,Bad,38,13,Yes,No,13


In [14]:
# Hand-picked predictors for the first linear model.
features = [
    'Advertising', 'Population', 'Price', 'Age',
    'ShelveLoc_Good', 'ShelveLoc_Medium',
]
x_train1, x_val1 = x_train[features], x_val[features]

In [15]:
# Fit a linear regression on the selected features and predict the
# validation split.
m1 = LinearRegression()
m1.fit(x_train1, y_train)
p1 = m1.predict(x_val1)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
print('RMSE : ', np.sqrt(mean_squared_error(y_val, p1)))
print('MAE  : ', mean_absolute_error(y_val, p1))
print('MAPE : ', mean_absolute_percentage_error(y_val, p1))

RMSE :  1.7227784485989415
MAE  :  1.404102046469267
MAPE :  0.5049345553687143


* 모델1

In [16]:
# Model 1: simple regression that uses Price as the only predictor.
features = ['Price']
x_train1, x_val1 = x_train[features], x_val[features]

# fit() returns the estimator itself, so construction and training chain.
model1 = LinearRegression().fit(x_train1, y_train)
pred1 = model1.predict(x_val1)

In [17]:
model1.coef_, model1.intercept_  # fitted line: Sales = -0.046 * Price + 12.96 (rounded)

(array([-0.04635436]), 12.964838899123844)

In [18]:
# Validation metrics for model 1.
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE', np.sqrt(mean_squared_error(y_val, pred1)))
print('MAE', mean_absolute_error(y_val, pred1))
print('MAPE', mean_absolute_percentage_error(y_val, pred1))

RMSE 2.588984159746482
MAE 2.1228736535124924
MAPE 0.7488971847312499


* 모델2

In [19]:
# Model 2: linear regression on every available predictor.
# fit() returns the estimator itself, so construction and training chain.
model2 = LinearRegression().fit(x_train, y_train)
pred2 = model2.predict(x_val)

In [20]:
model2.coef_, model2.intercept_  # Sales = weighted sum of all features + 5.66 (rounded intercept)

(array([ 1.49099531e-02,  1.31016340e-01, -2.42557969e-04, -8.02574826e-04,
        -4.49425965e-02,  9.09406764e-02,  4.77251512e+00,  1.88225236e+00,
        -3.42125036e-01, -1.23913347e-01, -5.57250842e-02, -4.25888459e-01,
         1.33067906e-01, -4.26207086e-01, -9.31346130e-02, -2.49240303e-01,
         5.84815845e-02, -2.82999161e-01]),
 5.663851202199168)

## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 3가지 이상 생성하시오.

In [21]:
# 모델링용
from sklearn.neighbors import KNeighborsRegressor    

# 회귀모델 평가용
from sklearn.metrics import * 

* 모델3

In [22]:
# Model 3: k = 3 neighbours, Euclidean distance, min-max scaled inputs.
# fit() returns the estimator itself, so construction and training chain.
model3 = KNeighborsRegressor(metric='euclidean', n_neighbors=3).fit(x_train_s1, y_train)
p3 = model3.predict(x_val_s1)

* 모델4

In [23]:
# Model 4: k = 10 neighbours, Euclidean distance, min-max scaled inputs.
# fit() returns the estimator itself, so construction and training chain.
model4 = KNeighborsRegressor(metric='euclidean', n_neighbors=10).fit(x_train_s1, y_train)
p4 = model4.predict(x_val_s1)

* 모델5

In [24]:
# Model 5: k = 3 neighbours, Manhattan distance, min-max scaled inputs.
# fit() returns the estimator itself, so construction and training chain.
model5 = KNeighborsRegressor(metric='manhattan', n_neighbors=3).fit(x_train_s1, y_train)
p5 = model5.predict(x_val_s1)

In [25]:
# Model 6: k = 10 neighbours, Manhattan distance, min-max scaled inputs.
# fit() returns the estimator itself, so construction and training chain.
model6 = KNeighborsRegressor(metric='manhattan', n_neighbors=10).fit(x_train_s1, y_train)
p6 = model6.predict(x_val_s1)

## 6.성능비교

In [26]:
# Score the validation predictions of every model with the same three
# metrics and collect them in one table. The labels in model_desc and the
# predictions in pred are listed in the same order.
model_desc = ['lr_1', 'lr_2', 'knn_1', 'knn_2', 'knn_3', 'knn_4']
pred = [pred1, pred2, p3, p4, p5, p6]

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = [np.sqrt(mean_squared_error(y_val, p)) for p in pred]
MAE = [mean_absolute_error(y_val, p) for p in pred]
MAPE = [mean_absolute_percentage_error(y_val, p) for p in pred]

result = pd.DataFrame({'model_desc': model_desc, 'RMSE': RMSE, 'MAE': MAE, 'MAPE': MAPE})
result

Unnamed: 0,model_desc,RMSE,MAE,MAPE
0,lr_1,2.588984,2.122874,0.748897
1,lr_2,1.041721,0.82917,0.217355
2,knn_1,2.387443,1.884222,0.664397
3,knn_2,2.323664,1.918317,0.741459
4,knn_3,2.188326,1.773472,0.646474
5,knn_4,2.246946,1.843633,0.676131


In [None]:
# Whether linear regression or KNN is the better choice depends on the situation.