# 결정트리를 이용한 멜버른 집값 예측

## mount Google Drive and unzip zip file

In [5]:
# List the course folder on Google Drive to confirm the data archive is there.
!ls drive/MyDrive/21_ls

N
'[러닝스푼즈]머신러닝_가우스킴_1회차_강의자료_v2.pdf'	 input
'[러닝스푼즈]머신러닝_가우스킴_1회차_강의자료_v2.pptx'	 melbourne_data.zip
'[러닝스푼즈]머신러닝_가우스킴_2회차_강의자료.pptx'	 Unit_1


In [6]:
# Extract the Melbourne housing archive into the input/ folder.
# NOTE: unzip asks interactively before replacing melb_data.csv when it already
# exists (see the recorded prompt below), so a re-run waits for an answer.
!unzip drive/MyDrive/21_ls/melbourne_data.zip -d drive/MyDrive/21_ls/input/

Archive:  drive/MyDrive/21_ls/melbourne_data.zip
replace drive/MyDrive/21_ls/input/melb_data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


## 데이터 불러오기, 모델 학습 및 예측

In [7]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Read the extracted Melbourne housing CSV into a DataFrame and show which
# columns are available for modelling.
melbourne_file_path = 'drive/MyDrive/21_ls/input/melb_data.csv'
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [9]:
# Prediction target: the sale price column.
y = melbourne_data['Price']
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [10]:
# Columns used as model inputs. 'Lattitude' and 'Longtitude' are spelled the
# way the dataset header spells them (i.e. latitude / longitude).
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [11]:
# Feature matrix: every row, restricted to the selected feature columns.
X = melbourne_data.loc[:, melbourne_features]
X

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.79960,144.99840
1,2,1.0,156.0,-37.80790,144.99340
2,3,2.0,134.0,-37.80930,144.99440
3,3,2.0,94.0,-37.79690,144.99690
4,4,1.0,120.0,-37.80720,144.99410
...,...,...,...,...,...
13575,4,2.0,652.0,-37.90562,145.16761
13576,3,2.0,333.0,-37.85927,144.87904
13577,3,2.0,436.0,-37.85274,144.88738
13578,4,1.0,866.0,-37.85908,144.89299


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [14]:
# Peek at the first ten feature rows.
X.iloc[:10]

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,2,1.0,202.0,-37.7996,144.9984
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
3,3,2.0,94.0,-37.7969,144.9969
4,4,1.0,120.0,-37.8072,144.9941
5,2,1.0,181.0,-37.8041,144.9953
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954
8,1,1.0,0.0,-37.8008,144.9973
9,2,1.0,220.0,-37.801,144.9989


In [17]:
# Regression tree with a fixed seed so repeated runs build the same tree.
melbourne_model = DecisionTreeRegressor(random_state=2021)

# Fit on the complete dataset (no hold-out yet); the cell displays the
# fitted estimator.
melbourne_model.fit(X=X, y=y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2021, splitter='best')

In [18]:
# Predict prices for the first five rows the model was trained on.
melbourne_model.predict(X.iloc[:5])

array([1480000., 1035000., 1465000.,  850000., 1600000.])

In [19]:
melbourne_model.score(X, y) # coefficient of determination (R^2), computed on the same data the model was fitted on

0.9997390882943573

In [20]:
# Residuals (prediction minus actual price) for the first five rows.
melbourne_model.predict(X.head()) - y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Price, dtype: float64

## 모델 '제대로' 평가하기

In [21]:
# Hold out the last 10% of rows as a test set; shuffle=False keeps the file
# order, so the split is a contiguous head / tail cut.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=False,
)

# Report the shapes of both splits as a sanity check.
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(12222, 5) (12222,)
(1358, 5) (1358,)


In [22]:
# Refit the same estimator, this time on the training split only.
melbourne_model.fit(X=X_train, y=y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=2021, splitter='best')

In [23]:
# Predictions for the first five training rows.
melbourne_model.predict(X_train.iloc[:5])

array([1480000., 1035000., 1465000.,  850000., 1600000.])

In [24]:
# Residuals on rows the model has seen during training.
melbourne_model.predict(X_train.iloc[:5]) - y_train.iloc[:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Price, dtype: float64

In [25]:
# Residuals on the first five held-out rows, which the model never saw.
melbourne_model.predict(X_test)[:5] - y_test.iloc[:5]

12222     115000.0
12223      25000.0
12224     -36000.0
12225    -291000.0
12226    1150000.0
Name: Price, dtype: float64

In [26]:
# R^2 on the training split, i.e. on data the model was fitted on.
melbourne_model.score(X_train, y_train)

0.9997104417677788

In [27]:
# R^2 of the decision tree on the held-out split; kept in dt_r2 for the
# comparison with LightGBM further down.
dt_r2 = r2_score(y_test, melbourne_model.predict(X_test))
dt_r2

0.405412915944668

### 바꿔보기
1. LightGBM 모델로 변경
2. 사용하는 feature 변경
3. 모델 파라미터 조정
4. Dataset split 비율 조정
5. 여러분들이 배우셨던 통계 지식 활용

## LightGBM

In [None]:
import lightgbm

In [None]:
# Wrap the train / test splits in LightGBM Dataset objects.
# categorical_feature is deliberately not passed (it was left commented out).
train_data = lightgbm.Dataset(data=X_train, label=y_train)
test_data = lightgbm.Dataset(data=X_test, label=y_test)

In [None]:
# LightGBM training parameters for a gradient-boosted regression model.
# Only real LightGBM parameters belong in this dict: 'verbose_eval' was a
# keyword argument of lightgbm.train(), not a parameter, so listing it here
# had no effect other than an "Unknown parameter" warning. It is removed.
parameters = {
    'objective': 'regression',
    'metric': 'mse',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'metric_freq': 10,
}

In [None]:
# Train the booster for at most 5000 rounds, stopping early once the
# validation metric has failed to improve for 10 rounds in a row.
# Early stopping is requested through the callback because the
# `early_stopping_rounds` keyword of lightgbm.train() was removed in
# LightGBM 4.0; the callback form is accepted by older releases as well.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so scores computed on it afterwards are optimistically biased. Consider
# carving a separate validation split out of the training data.
lgbm_model = lightgbm.train(
    parameters,
    train_data,
    num_boost_round=5000,
    valid_sets=[test_data],
    callbacks=[lightgbm.early_stopping(stopping_rounds=10)],
)

In [None]:
# LightGBM predictions for the first five held-out rows.
lgbm_model.predict(X_test.head())

In [None]:
# LightGBM residuals (prediction minus actual price) on the first five
# held-out rows.
lgbm_model.predict(X_test)[:5] - y_test.iloc[:5]

In [None]:
# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth must come first for the value to be comparable with
# the decision tree's score.
lgbm_r2 = r2_score(y_test, lgbm_model.predict(X_test))
print(dt_r2, lgbm_r2)

In [None]:
# Root-mean-squared error of both models on the held-out split.
# Arguments follow scikit-learn's (y_true, y_pred) convention, and the square
# root is taken explicitly because the `squared=False` flag is deprecated in
# scikit-learn 1.4 and removed in 1.6.
# NOTE: despite the *_mse names (kept unchanged so any later cells still
# work), these values are RMSE, not MSE.
dt_mse = mean_squared_error(y_test, melbourne_model.predict(X_test)) ** 0.5
lgbm_mse = mean_squared_error(y_test, lgbm_model.predict(X_test)) ** 0.5
print(dt_mse, lgbm_mse)