In [None]:
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.tree import DecisionTree
import math 

In [39]:
#----------------------3.定義字典----------------
SeasonDict = { 1 : "春",  2 : "夏",  3 :"秋",  4 : "冬"   }
HoildayDict={  0 : "非假日", 1 : "假日"  }  
WeekDict = {0:"一",1:"二",2:"三",3:"四",4 :"五",5:"六",6:"日"}
WorkDayDict={ 1 : "工作日",  0 : "非工作日"  }
WeatherDict={ 1 : "晴",  2 : "陰",  3 : "小雨", 4 : "大雨" }

In [None]:
global Path  
Path="file:/home/spark/ntcu_workshop/"

In [None]:
def convert_float(x):
    return (0 if x=="?" else float(x))

In [None]:
def extract_label(record):
    label=(record[-1])
    return float(label)

In [None]:
def extract_features(record,featureEnd):
    featureSeason=[convert_float(field)  for  field in record[2]] 
    features=[convert_float(field)  for  field in record[4: featureEnd-2]]
    return  np.concatenate( (featureSeason, features))

In [None]:
def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels=score.zip(validationData.map(lambda p: p.label))
    metrics = RegressionMetrics(scoreAndLabels)
    RMSE=metrics.rootMeanSquaredError
    return( RMSE)

In [None]:
#----------------------1.匯入並轉換資料-------------
print("開始匯入資料...")
rawDataWithHeader = sc.textFile(Path+"data/hour.csv")
header = rawDataWithHeader.first() 
rawData = rawDataWithHeader.filter(lambda x:x !=header)    
lines = rawData.map(lambda x: x.split(","))
print (lines.first())
print("共計：" + str(lines.count()) + "筆")
#----------------------2.建立訓練評估所需資料 RDD[LabeledPoint]-------------
labelpointRDD = lines.map(lambda r:LabeledPoint(
                                        extract_label(r), 
                                        extract_features(r,len(r) - 1)))

print labelpointRDD.first()
#----------------------3.以隨機方式將資料分為3部份並且回傳-------------
(trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
trainData.persist()
validationData.persist()
testData.persist()

In [35]:
model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={}, impurity="variance", maxDepth=10, maxBins=100)
RMSE = evaluateModel(model, validationData)

In [36]:
RMSE

79.29740294363832

In [38]:
for lp in validationData.take(100):
        predict = int(model.predict(lp.features))
        label=lp.label
        features=lp.features
        result = ("正確" if  (label == predict) else "錯誤")
        error = math.fabs(label - predict)
        dataDesc = "==> 預測結果:" + str(predict ) + " , 實際: " + str(label) + result +", 誤差:" + str(error)
        print dataDesc

==> 預測結果:12  , 實際:13.0錯誤,  誤差:1.0
==> 預測結果:85  , 實際:28.0錯誤,  誤差:57.0
==> 預測結果:39  , 實際:6.0錯誤,  誤差:33.0
==> 預測結果:202  , 實際:182.0錯誤,  誤差:20.0
==> 預測結果:125  , 實際:62.0錯誤,  誤差:63.0
==> 預測結果:67  , 實際:67.0正確,  誤差:0.0
==> 預測結果:59  , 實際:39.0錯誤,  誤差:20.0
==> 預測結果:34  , 實際:25.0錯誤,  誤差:9.0
==> 預測結果:3  , 實際:1.0錯誤,  誤差:2.0
==> 預測結果:103  , 實際:75.0錯誤,  誤差:28.0
==> 預測結果:31  , 實際:38.0錯誤,  誤差:7.0
==> 預測結果:9  , 實際:3.0錯誤,  誤差:6.0
==> 預測結果:258  , 實際:188.0錯誤,  誤差:70.0
==> 預測結果:67  , 實際:74.0錯誤,  誤差:7.0
==> 預測結果:127  , 實際:99.0錯誤,  誤差:28.0
==> 預測結果:87  , 實際:32.0錯誤,  誤差:55.0
==> 預測結果:22  , 實際:19.0錯誤,  誤差:3.0
==> 預測結果:2  , 實際:2.0正確,  誤差:0.0
==> 預測結果:258  , 實際:202.0錯誤,  誤差:56.0
==> 預測結果:67  , 實際:42.0錯誤,  誤差:25.0
==> 預測結果:148  , 實際:117.0錯誤,  誤差:31.0
==> 預測結果:67  , 實際:61.0錯誤,  誤差:6.0
==> 預測結果:13  , 實際:10.0錯誤,  誤差:3.0
==> 預測結果:53  , 實際:33.0錯誤,  誤差:20.0
==> 預測結果:89  , 實際:41.0錯誤,  誤差:48.0
==> 預測結果:2  , 實際:1.0錯誤,  誤差:1.0
==> 預測結果:98  , 實際:32.0錯誤,  誤差:66.0
==> 預測結果:127  , 實際:99.0錯誤,  誤差:28.0
==> 預測結果:80  , 實際:61.0錯誤,  誤差