# 預測腳踏車每小時的租借量


In [None]:
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.tree import DecisionTree
import math 

In [None]:
# ---------------------- 3. Define lookup dictionaries ----------------------
# Each dictionary maps a numeric code to a Chinese display label.

# spring, summer, autumn, winter
SeasonDict = {
    1: "春",
    2: "夏",
    3: "秋",
    4: "冬",
}

# non-holiday / holiday (identifier spelling kept as-is for existing references)
HoildayDict = {0: "非假日", 1: "假日"}

# day-of-week labels
WeekDict = {
    0: "一",
    1: "二",
    2: "三",
    3: "四",
    4: "五",
    5: "六",
    6: "日",
}

# working day / non-working day
WorkDayDict = {1: "工作日", 0: "非工作日"}

# clear, cloudy, light rain, heavy rain
WeatherDict = {
    1: "晴",
    2: "陰",
    3: "小雨",
    4: "大雨",
}

In [None]:
# Base URI of the workshop directory; data file locations are formed by
# appending a relative path to it. (A `global` declaration is unnecessary at
# module level, so the name is simply assigned.)
Path = "file:/home/spark/ntcu_workshop/"

## Note: we need some utility functions to handle RDDs

In [None]:
def convert_float(x):
    """Parse a raw field as a number, treating the "?" placeholder as zero.

    Returns the integer 0 when ``x`` equals "?"; otherwise returns
    ``float(x)``.
    """
    if x == "?":
        return 0
    return float(x)

In [None]:
def extract_label(record):
    """Return the last element of ``record`` converted to a float."""
    return float(record[-1])

In [None]:
def extract_features(record, featureEnd):
    """Build the numeric feature vector for one parsed record.

    The vector consists of the value at index 2 followed by the values at
    indices 4 up to (but not including) ``featureEnd - 2``, each converted
    with ``convert_float``.

    Args:
        record: Sequence of raw field values for one row.
        featureEnd: Index bound; the trailing feature slice stops at
            ``featureEnd - 2``.

    Returns:
        A 1-D numpy array holding the converted values.
    """
    # Convert the field at index 2 as a whole. Iterating over it directly
    # would walk a string one character at a time and emit one feature per
    # character for any multi-character value.
    # NOTE(review): the variable name suggests this is the season column --
    # confirm against the CSV header.
    featureSeason = [convert_float(record[2])]
    features = [convert_float(field) for field in record[4:featureEnd - 2]]
    return np.concatenate((featureSeason, features))

In [None]:
def evaluateModel(model, validationData):
    """Return the root-mean-squared error of ``model`` on ``validationData``.

    Predictions are computed from each point's ``features`` and paired with
    that point's ``label`` before being passed to ``RegressionMetrics``.
    """
    featureRDD = validationData.map(lambda point: point.features)
    labelRDD = validationData.map(lambda point: point.label)
    predictionsAndLabels = model.predict(featureRDD).zip(labelRDD)
    return RegressionMetrics(predictionsAndLabels).rootMeanSquaredError

# 準備資料

In [None]:
# ---------------------- 1. Import and convert the data ----------------------
# NOTE(review): `sc` is not defined in this file; presumably the SparkContext
# supplied by the notebook/shell -- confirm.
print("開始匯入資料...")
rawDataWithHeader = sc.textFile(Path + "data/hour.csv")

# The first line is taken as the header; every line equal to it is dropped.
header = rawDataWithHeader.first()
rawData = rawDataWithHeader.filter(lambda line: line != header)

# Split each remaining line into its comma-separated fields.
lines = rawData.map(lambda line: line.split(","))
print("共計：" + str(lines.count()) + "筆")

In [None]:
# ---------------------- 2. Build the RDD[LabeledPoint] used for training and evaluation ----------------------
# Each parsed row becomes a LabeledPoint: the label comes from extract_label
# and the features from extract_features, called with len(row) - 1 as the
# feature bound.
labelpointRDD = lines.map(
    lambda fields: LabeledPoint(extract_label(fields),
                                extract_features(fields, len(fields) - 1)))

### RDD[list] -> RDD[LabeledPoint]

In [None]:
# Show the first parsed row (a list of field strings).
print(lines.first())

In [None]:
# Show the first LabeledPoint. Written in call form, matching the other
# print calls in this file, so the line also parses under Python 3.
print(labelpointRDD.first())

In [None]:
# ---------------------- 3. Randomly split the data into three parts ----------------------
# Split weights are 8 : 1 : 1 for training, validation and test respectively.
trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])

# Persist each split for reuse by the training and evaluation steps.
trainData.persist()
validationData.persist()
testData.persist()

# 訓練模型


In [None]:
# Fit a decision-tree regressor on the training split. No features are
# declared categorical (empty categoricalFeaturesInfo).
model = DecisionTree.trainRegressor(
    trainData,
    categoricalFeaturesInfo={},
    impurity="variance",
    maxDepth=10,
    maxBins=100)

# 進行預測

In [None]:
# Compare the model's prediction with the actual label for up to 100
# validation points and print one summary line per point.
for lp in validationData.take(100):
    # The prediction is truncated to an integer before it is compared.
    predict = int(model.predict(lp.features))
    label = lp.label
    # Not used in the message below; kept because it is a notebook-level name.
    features = lp.features
    # "正確" (correct) only when the truncated prediction equals the label.
    result = ("正確" if (label == predict) else "錯誤")
    error = math.fabs(label - predict)
    dataDesc = "==> 預測結果:" + str(predict ) + " , 實際: " + str(label) + result +", 誤差:" + str(error)
    # Call form so the line parses on both Python 2 and Python 3.
    print(dataDesc)