# 預測腳踏車每小時的租借量

## 資料準備

In [None]:
# Base URI of the workshop directory. Data file paths are built by appending
# to this string, so the trailing slash is required.
Path = "file:/home/spark/ntcu_workshop/"

In [None]:
# Read the hourly CSV, taking column names from its header row, then report
# how many rows were loaded.
hour_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(Path + "data/hour.csv")
)
hour_df.count()

In [None]:
# Remove the columns that are not used as model inputs or as the label.
hour_df = hour_df.drop("instant", "dteday", "yr", "casual", "registered")

In [None]:
from pyspark.sql.functions import col  

In [None]:
# The CSV is loaded without a schema, so cast every column to double while
# keeping each column's original name.
hour_df = hour_df.select(
    *(col(name).cast("double").alias(name) for name in hour_df.columns)
)

In [None]:
# Preview the first five rows after the cast.
hour_df.show(n=5)

In [None]:
# Split the rows roughly 70/30 into training and test sets. A fixed seed keeps
# the split (and therefore the metrics computed from it) repeatable between
# runs over the same input.
train_df, test_df = hour_df.randomSplit([0.7, 0.3], seed=42)
# Both splits are reused later, so mark them for caching.
train_df.cache()
test_df.cache()

# 建立pipeline所需元件

## 這個範例的前處理需要 VectorAssembler 與 VectorIndexer

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer,  VectorIndexer,VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
# Every column except the label "cnt" is a feature. Excluding the label by
# name avoids relying on it being the last column of the DataFrame.
featuresCols = [name for name in hour_df.columns if name != "cnt"]
# print() with a single argument behaves the same on Python 2 and Python 3.
print(featuresCols)

In [None]:
# Stage 1: pack the feature columns into a single vector column.
vectorAssembler = VectorAssembler(
    inputCols=featuresCols,
    outputCol="aFeatures",
)
# Stage 2: index the assembled vector; features with at most 24 distinct
# values are treated as categorical (maxCategories=24).
vectorIndexer = VectorIndexer(
    inputCol="aFeatures",
    outputCol="features",
    maxCategories=24,
)

## 建立 Decision Tree

In [None]:
# Decision-tree regressor that reads the indexed feature vector and predicts
# the "cnt" column.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="cnt")
# Chain assembling, indexing and the regressor into a single pipeline.
pipeline_stages = [vectorAssembler, vectorIndexer, dt]
dt_pipeline = Pipeline(stages=pipeline_stages)

## 使用 pipeline 進行資料處理與模型訓練

In [None]:
# Fit every pipeline stage on the training split; the result is the fitted
# pipeline model used for prediction below.
dt_pipelineModel = dt_pipeline.fit(train_df)

In [None]:
# Stage index 2 is the fitted decision tree (the third pipeline stage). Print
# only the first 500 characters of its textual description.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(dt_pipelineModel.stages[2].toDebugString[:500])

## 使用pipelineModel 進行預測

In [None]:
# Run the fitted pipeline over the held-out test rows.
predicted_df=dt_pipelineModel.transform(test_df)

In [None]:
# Show the input features next to the actual count ("cnt") and the model's
# "prediction" for the first ten test rows.
predicted_df.select([
    "season", "mnth", "hr", "holiday", "weekday", "workingday",
    "weathersit", "temp", "atemp", "hum", "windspeed",
    "cnt", "prediction",
]).show(n=10)

# 評估模型的準確率

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Evaluator that compares the "prediction" column against the "cnt" label
# using root-mean-square error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="cnt",
    predictionCol="prediction",
)

In [None]:
# Re-run the fitted pipeline on the test split so this cell does not depend on
# an earlier prediction cell having been executed.
predicted_df=dt_pipelineModel.transform(test_df)
# Compute the metric configured on `evaluator` over the predictions.
rmse = evaluator.evaluate(predicted_df)
# Bare expression: presumably here so the notebook displays the value.
rmse