In [1]:
# Load hour.csv with the "header" option enabled, then print a preview of
# the first rows. No schema is supplied or inferred here; the numeric cast
# happens in a later cell.
df = spark.read.option("header", "True").csv('/FileStore/tables/hour.csv')
df.show()

In [2]:
# Report how many rows were loaded. The parenthesised single-argument form
# runs unchanged on both Python 2 and Python 3, whereas the bare `print`
# statement is a SyntaxError on Python 3.
print("Our dataset has %d rows." % df.count())


In [3]:
# Discard the columns that are not used as model inputs, one at a time,
# then show the remaining schema.
for unusedColumn in ("instant", "dteday", "casual", "registered"):
    df = df.drop(unusedColumn)
df.printSchema()


In [4]:
# `col` turns a column name into a Column expression. It is imported here
# because nothing earlier in the notebook brings it into scope, so the cell
# would otherwise fail with NameError on a fresh kernel.
from pyspark.sql.functions import col

# Cast every remaining column to double, keeping each column's original name.
df = df.select([col(c).cast("double").alias(c) for c in df.columns])

In [5]:
# Print the schema again so the result of the cast to double can be checked.
df.printSchema()

In [6]:
# Split the data roughly 70% / 30% into training and test sets. A fixed seed
# keeps the split, and therefore the reported RMSE, reproducible across
# re-runs; without it every execution draws a different random partition.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [7]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
# Every column except the label "cnt" is used as a feature.
# `index` raises ValueError if "cnt" is absent, just as `remove` would.
featuresCols = list(df.columns)
del featuresCols[featuresCols.index('cnt')]


In [8]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Stage 1: pack all feature columns into a single vector column "rawFeatures".
vectorAssembler = VectorAssembler(outputCol="rawFeatures", inputCols=featuresCols)

# Stage 2: identify categorical features inside "rawFeatures" and index them,
# writing the result to "features". maxCategories=4 is the threshold passed to
# VectorIndexer for deciding which features count as categorical.
vectorIndexer = VectorIndexer(maxCategories=4, inputCol="rawFeatures", outputCol="features")

# Stage 3: gradient-boosted-tree regressor that takes the "features" column
# and learns to predict the "cnt" label.
gbt = GBTRegressor(labelCol="cnt")

# Hyperparameter grid: 2 tree depths x 2 iteration counts = 4 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 5])
    .addGrid(gbt.maxIter, [10, 100])
    .build()
)

In [9]:
from pyspark.ml import Pipeline

# RMSE evaluator wired to the same label and prediction columns the GBT uses.
evaluator = RegressionEvaluator(metricName="rmse", labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol())

# Cross-validation over paramGrid, scoring each candidate with `evaluator`.
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

# Full pipeline: assemble features -> index categoricals -> tune and fit the GBT.
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
pipelineModel = pipeline.fit(train)

# Score the held-out test set.
predictions = pipelineModel.transform(test)

# Show label, prediction and features side by side. The bare `select` built
# a projection and discarded it, so nothing was ever displayed.
predictions.select("cnt", "prediction", *featuresCols).show()

# Final test-set metric. The parenthesised print form runs on both
# Python 2 and Python 3.
rmse = evaluator.evaluate(predictions)
print("RMSE on our test set: %g" % rmse)