In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE

## 解題步驟：

1. 讀取 x_train.npy, y_train.npy, x_test.npy, y_test.npy
2. 將 training dataset 再切分為 training set, validation set (圖一)
3. 先以上課的知識調整出一個不會 over-fitting 太多的決策樹模型
4. 以 validation set 作為調整參數的基準，陸續調整其他參數 (請同學測試看看 validation 要佔多少比例，後續的調整會比較客觀)
5. 將最終調整結果與一開始的決策樹做比較，誤差是否有降低
6. 同學若也懂其他模型的知識也可以試試看一樣的做法比較看看

#### 圖一 (Train, Validation and Test)
<img src="./train_val_test.png" style="zoom:30%;" />

#### 圖二 (是我前後調整的結果)
<img src="./report.png" style="zoom:30%;" />

In [135]:
# Read data
""" Your code here """
x_train = np.load('x_train.npy')
y_train = np.load('y_train.npy')
x_test = np.load('x_test.npy')
y_test = np.load('y_test.npy')

# split data
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, train_size=0.7)

print(x_train.shape, y_train.shape,x_validation.shape, y_validation.shape, x_test.shape, y_test.shape)

(766, 89) (766,) (329, 89) (329,) (365, 89) (365,)


## 請同學先依照上課的知識，調整一個沒有 overfitting 的決策樹，請以 mape 作為參考誤差指標，較容易看出關係

In [136]:
""" Your code here """
DT = DecisionTreeRegressor(criterion='mae', 
                           splitter='best', 
                           max_depth=None, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features=None, 
                           random_state=None, 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           min_impurity_split=None, 
                           ccp_alpha=200)
DT.fit(x_train, y_train)

DecisionTreeRegressor(ccp_alpha=200, criterion='mae')

In [137]:
print('Training report')
pred = DT.predict(x_train)
print('MSE:',MSE(y_train, pred))
print('MAE:',MAE(y_train, pred))
print('MAPE:',MAPE(y_train, pred))
print("------------------------------------")

print('Validation report')
pred = DT.predict(x_validation)
print('MSE:',MSE(y_validation, pred))
print('MAE:',MAE(y_validation, pred))
print('MAPE:',MAPE(y_validation, pred))
print("------------------------------------")

print('Testing report')
pred = DT.predict(x_test)
print('MSE:',MSE(y_test, pred))
print('MAE:',MAE(y_test, pred))
print('MAPE:',MAPE(y_test, pred))

Training report
MSE: 671711678.9582245
MAE: 17128.879895561357
MAPE: 0.10645954275494504
------------------------------------
Validation report
MSE: 1884432638.6291792
MAE: 24981.0547112462
MAPE: 0.14940136051918398
------------------------------------
Testing report
MSE: 1617854749.3835616
MAE: 25432.939726027398
MAPE: 0.13914276529190037


## 調整 決策樹參數
### 請同學從 criterion 的設定中，判斷這組資料集裡的 outlier 多嗎？

In [138]:
""" Your code here """
criterion_all = ['mse', 'friedman_mse', 'mae', 'poisson']
for i in criterion_all:
    DT = DecisionTreeRegressor(criterion=i,
                               ccp_alpha=200)
    DT.fit(x_train, y_train)
    print('#### criterion:',i)
    print('Training report')
    pred = DT.predict(x_train)
    print('MSE:',MSE(y_train, pred))
    print('MAE:',MAE(y_train, pred))
    print('MAPE:',MAPE(y_train, pred))
    print("------------------------------------")
    
    print('Validation report')
    pred = DT.predict(x_validation)
    print('MSE:',MSE(y_validation, pred))
    print('MAE:',MAE(y_validation, pred))
    print('MAPE:',MAPE(y_validation, pred))
    print("------------------------------------")
    
    print('Testing report')
    pred = DT.predict(x_test)
    print('MSE:',MSE(y_test, pred))
    print('MAE:',MAE(y_test, pred))
    print('MAPE:',MAPE(y_test, pred))
    print("==============================================")
        


#### criterion: mse
Training report
MSE: 5646.249129677982
MAE: 25.011749347258487
MAPE: 0.00017911449804430634
------------------------------------
Validation report
MSE: 2577672552.435157
MAE: 27154.072948328267
MAPE: 0.155170979692411
------------------------------------
Testing report
MSE: 2381279718.171918
MAE: 28525.672602739727
MAPE: 0.1506483850202162
#### criterion: friedman_mse
Training report
MSE: 5646.249129677982
MAE: 25.011749347258487
MAPE: 0.00017911449804430634
------------------------------------
Validation report
MSE: 2699661669.7028875
MAE: 27218.338905775076
MAPE: 0.15356298012435599
------------------------------------
Testing report
MSE: 1598914692.1404872
MAE: 26381.543378995437
MAPE: 0.14850667753256072
#### criterion: mae
Training report
MSE: 671711678.9582245
MAE: 17128.879895561357
MAPE: 0.10645954275494504
------------------------------------
Validation report
MSE: 1884432638.6291792
MAE: 24981.0547112462
MAPE: 0.14940136051918398
--------------------------

##### ANS: MAE比MSE更加不受outlier影響，看MSE在不同criterion中的浮動較大，因此推測資料集裡有較多的outlier