#### 목표
- sklearn 사용법 숙지
- 집 특징 데이터를 학습해서 집 가격을 예측해보자
- 올바른 모델 평가방법을 알아보자

## 모델정의

In [93]:
# The value being predicted is not categorical -> use DecisionTreeRegressor.
# cf. for a categorical target, use a classifier such as KNeighborsClassifier.
from sklearn.tree import DecisionTreeRegressor

In [94]:
# Decision-tree regressor used for the rest of the notebook.
# random_state pins the estimator's random seed so results are reproducible across runs.
house_model = DecisionTreeRegressor(random_state=203)

## 학습

In [95]:
import pandas as pd

In [96]:
# Load the housing data (melb_data.csv) into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
model_data = pd.read_csv("./data/melb_data.csv")

In [97]:
# Preview the first rows to get a feel for the columns and their values.
model_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [98]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes.
# Useful here for spotting columns with missing values.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [99]:
# Feature matrix: the five numeric columns the model learns from.
# The names (including their spelling) must match the CSV header exactly.
X = model_data[
    [
        'Rooms',
        'Bathroom',
        'Landsize',
        'Lattitude',
        'Longtitude',
    ]
]
X

# Target vector: the Price column the model is asked to predict.
y = model_data['Price']
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [100]:
# Confirm that the feature matrix and the target have the same number of rows.
X.shape, y.shape

((13580, 5), (13580,))

In [101]:
## Descriptive statistics (describe) for each feature column
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [102]:
# Descriptive statistics for the target.
# The output uses scientific notation: 1.358000e+04 means "move the decimal
# point 4 places to the right" -> 13580.00
y.describe()

count    1.358000e+04
mean     1.075684e+06
std      6.393107e+05
min      8.500000e+04
25%      6.500000e+05
50%      9.030000e+05
75%      1.330000e+06
max      9.000000e+06
Name: Price, dtype: float64

In [103]:
# Train on the ENTIRE dataset.
# NOTE: the model sees every row here, so scoring it afterwards on rows taken
# from X measures how well it memorised the data, not how well it generalises.
house_model.fit(X,y)

DecisionTreeRegressor(random_state=203)

In [104]:
# Take the first 5,000 rows (all columns) of the features and the matching
# first 5,000 target values.
X_samples = X.head(5000)
y_samples = y.head(5000)

In [105]:
# Predict prices for the sampled rows.
pre = house_model.predict(X_samples)
# Bare name as the last expression so the notebook displays the array.
pre

array([1480000., 1035000., 1465000., ...,  790000., 1000000.,  860000.])

### 평가 
- 수치형 데이터는 오차 기반의 평가를 사용
- 평균 절대 오차(Mean Absolute Error, MAE): 예측값과 실제값의 차이에 절댓값을 취한 뒤 평균을 낸 값 (작을수록 좋다)

In [106]:
from sklearn.metrics import mean_absolute_error

In [107]:
# Mean absolute error between the true and predicted prices.
# NOTE: y_samples/pre come from rows the model was fitted on, so this is a
# training error and is overly optimistic as an estimate of real-world accuracy.
mean_absolute_error(y_samples, pre)

1380.7166666666667

In [108]:
## Usage: predict prices for two hypothetical houses.
# The model was fitted on a DataFrame, so it remembers the feature names.
# Passing a bare nested list makes recent scikit-learn versions warn that
# "X does not have valid feature names"; wrapping the rows in a DataFrame with
# the training columns keeps the column order explicit and avoids the warning.
# Row layout: Rooms, Bathroom, Landsize, Lattitude, Longtitude.
new_houses = pd.DataFrame(
    [[5, 3, 600, -37, 145], [10, 4, 1200, -37, 145]],
    columns=X.columns,
)
house_model.predict(new_houses)

array([675000., 925000.])

## 올바른 평가를 위한 훈련, 평가 데이터 나누기(train:75%, test:25% — train_test_split 기본값)

In [109]:
from sklearn.model_selection import train_test_split  # random sampling into train/test subsets

In [110]:
# Randomly split the data into training and test subsets.
# No test_size is passed, so scikit-learn's default applies: 25% of the rows
# are held out for testing and 75% are used for training.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=203)

In [111]:
# Check the size of the held-out test split (features and target must match).
X_test.shape, y_test.shape

((3395, 5), (3395,))

In [112]:
# Train: refit the model using only the training split, so the test rows stay unseen.
house_model.fit(X_train, y_train)

DecisionTreeRegressor(random_state=203)

In [113]:
# Predict: estimate prices for the test rows the model has not been trained on.
pre = house_model.predict(X_test)

In [114]:
# Evaluate: mean absolute error on the held-out test split.
mean_absolute_error(y_test, pre)

247448.44177712323