# 預測腳踏車每小時的租借量

## 資料準備

In [1]:
import os

# Base URI of the workshop directory; later cells append relative paths such
# as "data/hour.csv" to it. Set the NTCU_WORKSHOP_PATH environment variable to
# relocate the data without editing the notebook; the default is the original
# workshop location. (A module-level `global` statement has no effect, so it
# is not used here.)
Path = os.environ.get("NTCU_WORKSHOP_PATH", "file:/home/spark/ntcu_workshop/")
# Guarantee exactly one trailing slash so `Path + "data/..."` stays well-formed.
Path = Path.rstrip("/") + "/"

In [2]:
# Read the hourly records; header=True takes the column names from the first
# CSV row. The trailing expression displays the number of rows loaded.
hour_csv_path = Path + "data/hour.csv"
hour_df = spark.read.csv(hour_csv_path, header=True)
hour_df.count()

17379

In [3]:
# Remove the columns that are not used as features or as the label.
unused_columns = ["instant", "dteday", "yr", "casual", "registered"]
for unused_name in unused_columns:
    hour_df = hour_df.drop(unused_name)

In [4]:
from pyspark.sql.functions import col  

In [5]:
# Cast every remaining column to double, keeping each column's original name.
# (The CSV was read without an inferSchema option, so no numeric types were
# requested at load time.)
double_columns = [
    col(name).cast("double").alias(name)
    for name in hour_df.columns
]
hour_df = hour_df.select(*double_columns)

In [6]:
# Preview the first five rows after the cast to double.
hour_df.show(n=5)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|   1.0| 1.0|0.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.81|      0.0|16.0|
|   1.0| 1.0|1.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|40.0|
|   1.0| 1.0|2.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|32.0|
|   1.0| 1.0|3.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0|13.0|
|   1.0| 1.0|4.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0| 1.0|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
only showing top 5 rows



In [7]:
# Split the rows roughly 70% / 30% into training and test sets. The fixed seed
# makes the split, and therefore every metric computed from it, repeatable
# across kernel restarts; without it each run produces a different split.
train_df, test_df = hour_df.randomSplit([0.7, 0.3], seed=42)
# Mark both splits for caching so repeated use does not recompute them.
train_df.cache()
test_df.cache()

DataFrame[season: double, mnth: double, hr: double, holiday: double, weekday: double, workingday: double, weathersit: double, temp: double, atemp: double, hum: double, windspeed: double, cnt: double]

# 建立pipeline所需元件

## 這個範例前處理需要VectorAssembler與VectorIndexer

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer,  VectorIndexer,VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

In [9]:
# Every column except the label "cnt" is a feature. Filtering by name (rather
# than slicing off the last column) stays correct if the column order changes.
featuresCols = [name for name in hour_df.columns if name != "cnt"]
# Function-call form of print works on both Python 2 and Python 3.
print(featuresCols)

['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']


In [10]:
# Pack the individual feature columns into a single vector column "aFeatures".
vectorAssembler = VectorAssembler(
    outputCol="aFeatures",
    inputCols=featuresCols,
)
# Index the assembled vector into "features". Per the Spark ML docs, vector
# slots with at most `maxCategories` distinct values are treated as categorical.
# NOTE(review): 24 was presumably chosen so the hour-of-day feature (0-23)
# qualifies as categorical - confirm.
vectorIndexer = VectorIndexer(
    maxCategories=24,
    inputCol="aFeatures",
    outputCol="features",
)

## 建立 Decision Tree

In [11]:
# Decision-tree regressor that learns "cnt" from the indexed "features" vector.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="cnt")
# Stage order matters: assemble the vector, index it, then fit the tree.
pipeline_stages = [vectorAssembler, vectorIndexer, dt]
dt_pipeline = Pipeline(stages=pipeline_stages)

## 使用pipeline進行資料處理

In [12]:
# Fit every pipeline stage on the training split; the result is a fitted
# pipeline model that can transform new rows.
dt_pipelineModel = dt_pipeline.fit(dataset=train_df)

In [13]:
# The fitted decision tree is the third pipeline stage (index 2). Show only the
# first 500 characters of its textual description to keep the output short.
# Function-call form of print works on both Python 2 and Python 3.
tree_model = dt_pipelineModel.stages[2]
print(tree_model.toDebugString[:500])

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_42d08924194587723241) of depth 5 with 63 nodes
  If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0,6.0,22.0,23.0})
   If (feature 2 in {0.0,1.0,2.0,3.0,4.0,5.0})
    If (feature 2 in {2.0,3.0,4.0,5.0})
     If (feature 4 in {1.0,2.0,3.0,4.0,5.0})
      If (feature 2 in {2.0,3.0,4.0})
       Predict: 6.783380018674136
      Else (feature 2 not in {2.0,3.0,4.0})
       Predict: 23.752186588921283
     Else (feature 4 not in {1.0,2.0,3.0,4.0,5.0})
    


## 使用pipelineModel 進行預測

In [14]:
# Run the fitted pipeline on the held-out test split; the result carries the
# original columns plus the model's "prediction" column.
predicted_df = dt_pipelineModel.transform(dataset=test_df)

In [15]:
# Show the input columns next to the true count and the predicted count,
# leaving out the intermediate vector columns added by the pipeline.
shown_columns = [
    'season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
    'cnt', 'prediction',
]
predicted_df.select(*shown_columns).show(10)

+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|        prediction|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+------------------+
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.16|0.1364|0.47|   0.3284|59.0|57.060810810810814|
|   1.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.16|0.1364| 0.8|   0.2985|52.0|57.060810810810814|
|   1.0| 1.0|0.0|    0.0|    1.0|       1.0|       1.0|0.06|0.0606|0.41|    0.194| 7.0| 37.53443526170799|
|   1.0| 1.0|0.0|    0.0|    2.0|       1.0|       1.0|0.14|0.1667|0.59|   0.1045|12.0| 37.53443526170799|
|   1.0| 1.0|0.0|    0.0|    2.0|       1.0|       1.0|0.16|0.1818|0.55|   0.1045| 5.0| 37.53443526170799|
|   1.0| 1.0|0.0|    0.0|    2.0|       1.0|       1.0| 0.2| 0.197|0.51|   0.2537|13.0| 37.53443526170799|
|   1.0| 1.0|0.0|    0.0|    2.0|    

# 評估模型的預測誤差(RMSE)

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

In [17]:
# Evaluator that compares the "prediction" column with the true "cnt" label
# using root-mean-squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="cnt",
    predictionCol="prediction",
)

In [18]:
# predicted_df already holds the test-set predictions produced by the earlier
# transform cell, so it is scored directly rather than recomputed here.
rmse = evaluator.evaluate(predicted_df)
# Trailing expression displays the RMSE (lower is better).
rmse

95.99263119750364