# 과제1 : 회귀 모델링

킹 카운티(King County, 미국 워싱턴주 시애틀 일대) 집값 데이터를 예측해 봅시다.

출처 : https://www.kaggle.com/datasets/astronautelvis/kc-house-data?resource=download

![image.png](https://danibeyer.com/wp-content/uploads/2021/03/Banner-Image_Average-Home-Price_Dani-Beyer-Real-Estate.jpg)

## 1.환경준비

### (1) Import

In [None]:
#라이브러리들을 불러옵니다.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *

### (2) Data Loading
* 함께 제공된 csv 파일을 로딩합니다.
* 'kc_final.csv'

In [None]:
# Load the house-sales CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/var/kc_final.csv")

**Column Names**
* id - Unique ID for each home sold
* date - Date of the home sale
* **price - Price of each home sold(target)**
* bedrooms - Number of bedrooms
* bathrooms - Number of bathrooms, where .5 accounts for a room with a toilet but no shower
* sqft_living - Square footage of the apartment interior living space
* sqft_lot - Square footage of the land space
* floors - Number of floors
* waterfront - A dummy variable for whether the apartment was overlooking the waterfront or not
* view - An index from 0 to 4 of how good the view of the property was
* condition - An index from 1 to 5 on the condition of the apartment
* grade - An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high-quality level of construction and design.
* sqft_above - The square footage of the interior housing space that is above ground level
* sqft_basement - The square footage of the interior housing space that is below ground level
* yr_built - The year the house was initially built
* yr_renovated - The year of the house’s last renovation
* sqft_living15 - The square footage of interior housing living space for the nearest 15 neighbors
* sqft_lot15 - The square footage of the land lots of the nearest 15 neighbors

### (3) 데이터 둘러보기
* 상위 5개
* 칼럼 정보
* 데이터 크기 : 행, 열

In [None]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,1340,5650
1,6414100192,2014-12-09,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,1690,7639
2,5631500400,2015-02-25,180000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,2720,8062
3,2487200875,2014-12-09,604000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,1360,5000
4,1954400510,2015-02-18,510000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,1800,7503


In [None]:
# Print each column's name, non-null count, and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  int64  
 1   bedrooms       21613 non-null  int64  
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  int64  
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  float64
 6   waterfront     21613 non-null  int64  
 7   view           21613 non-null  int64  
 8   condition      21613 non-null  int64  
 9   grade          21613 non-null  int64  
 10  sqft_above     21613 non-null  int64  
 11  sqft_basement  21613 non-null  int64  
 12  yr_built       21613 non-null  int64  
 13  yr_renovated   21613 non-null  int64  
 14  sqft_living15  21613 non-null  int64  
 15  sqft_lot15     21613 non-null  int64  
dtypes: float64(2), int64(14)
memory usage: 2.6 MB


In [None]:
# (rows, columns) of the DataFrame.
data.shape

(21613, 16)

## id, date 칼럼은 삭제하고 모델 실행 (date는 문자열이라 그대로 두면 학습 시 error가 나고, id는 예측에 의미 없는 식별자)

In [None]:
# Drop the identifier and sale-date columns before modeling.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns have already been removed.
data.drop(columns=['id', 'date'], inplace=True, errors='ignore')

In [None]:
# Preview the first five rows after removing the unused columns.
data.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,1340,5650
1,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,1690,7639
2,180000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,2720,8062
3,604000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,1360,5000
4,510000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,1800,7503


## 2.데이터 준비

### (1) x, y 분할

In [None]:
# Separate the target column (y) from the feature columns (x).
target = 'price'
y = data[target]
x = data.drop(columns=[target])

### (2) 가변수화

In [None]:
# One-hot encode the categorical columns; drop_first omits the first level of
# each column so the dummies are not perfectly collinear.
cat_cols = [
    'waterfront',
    'view',
    'condition',
    'grade',
]
x = pd.get_dummies(data=x, columns=cat_cols, drop_first=True)

In [None]:
# Preview the first five rows of the encoded feature matrix.
x.head(n=5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,...,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9,grade_10,grade_11,grade_12,grade_13
0,3,1.0,1180,5650,1.0,1180,0,1955,0,1340,...,False,False,False,True,False,False,False,False,False,False
1,3,2.25,2570,7242,2.0,2170,400,1951,1991,1690,...,False,False,False,True,False,False,False,False,False,False
2,2,1.0,770,10000,1.0,770,0,1933,0,2720,...,False,False,True,False,False,False,False,False,False,False
3,4,3.0,1960,5000,1.0,1050,910,1965,0,1360,...,False,False,False,True,False,False,False,False,False,False
4,3,2.0,1680,8080,1.0,1680,0,1987,0,1800,...,False,False,False,False,True,False,False,False,False,False


### (3) train, val 데이터분할

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split
# reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=20,
)

## 3.모델링
* 선형회귀 모델링
* (옵션) 릿지, 라쏘 모델도 생성해서 성능 비교를 해 봅시다.

### (1) 모델 선언

In [None]:
# Linear regression model with scikit-learn's default settings.
model = LinearRegression()

### (2) 학습

In [None]:
# Fit on the training split only; the validation split is kept for evaluation.
model.fit(x_train, y_train)

* 회귀 계수 열어보기

In [None]:
# Fitted coefficients, one per feature (presumably in x_train column order).
print("회귀 계수:", model.coef_)

회귀 계수: [-2.77091966e+04  5.21893348e+04  8.77250623e+01  3.68746913e-02
  4.35488959e+04  2.84328540e+01  5.92921832e+01 -3.14163317e+03
  1.25830399e+01  3.16099669e+01 -5.44336025e-01  4.52579395e+05
  1.17685358e+05  4.35930965e+04  1.01964517e+05  2.55877703e+05
  8.27314779e+04  1.03795022e+05  1.23291757e+05  1.60664405e+05
 -9.23984163e+04 -1.62576193e+05 -1.78332920e+05 -1.30275134e+05
 -5.74412917e+04  2.43023841e+04  1.57835170e+05  3.42855566e+05
  5.98698005e+05  1.04886059e+06  2.52220508e+06]


In [None]:
# Fitted intercept (constant term) of the regression.
print("y 절편:", model.intercept_)

y 절편: 6192251.391374273


### (3) 예측

In [None]:
# Predict the target for the validation features.
pred = model.predict(x_val)

### (4) 검증 평가
* R2, RMSE, MAE, MAPE 로 평가해 봅시다.

In [None]:
# Evaluate the validation predictions with all four requested metrics
# (RMSE, MAE, MAPE, R2). MAPE is returned as a fraction, not a percentage.
print('RMSE :', root_mean_squared_error(y_val, pred))
print('MAE  :', mean_absolute_error(y_val, pred))
print('MAPE :', mean_absolute_percentage_error(y_val, pred))
print('r2 :', r2_score(y_val, pred))

RMSE : 209042.84109383565
MAE  : 132822.64754450877
r2 : 0.6733356879912097


### (옵션) 릿지, 라쏘 모델링

In [None]:
from sklearn.linear_model import Ridge, Lasso

#### 1) 릿지 모델링

In [None]:
# Declare and fit the model (alpha: regularization strength)
ridge_model = Ridge(alpha=1).fit(x_train, y_train)

# Predict on the validation set
pred_r = ridge_model.predict(x_val)

# Evaluate with the same labels and order used for the other models
for label, metric in (
    ('RMSE :', root_mean_squared_error),
    ('MAE  :', mean_absolute_error),
    ('r2 :', r2_score),
):
    print(label, metric(y_val, pred_r))

RMSE : 208736.5225997961
MAE  : 132980.03556443317
r2 : 0.6742923340669813


#### 2) 라쏘 모델링

In [None]:
# Declare the model
# alpha: regularization strength.
# max_iter is raised above the default (1000) because coordinate descent
# emitted a convergence warning with default settings on these unscaled
# features. If the warning persists, standardize the features before fitting.
lasso_model = Lasso(alpha=1, max_iter=10000)

# Fit on the training split
lasso_model.fit(x_train, y_train)

# Predict on the validation set
pred_l = lasso_model.predict(x_val)

# Evaluate
print('RMSE :', root_mean_squared_error(y_val, pred_l))
print('MAE  :', mean_absolute_error(y_val, pred_l))
print('r2 :', r2_score(y_val, pred_l))

RMSE : 209042.96420379615
MAE  : 132828.03815202453
r2 : 0.6733353032313951


  model = cd_fast.enet_coordinate_descent(
