In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,SQLContext,Row
from pyspark.streaming import StreamingContext
#from pyspark.streaming.kafka import KafkaUtils
from kafka import KafkaProducer
import pyspark.sql.functions as F
from pyspark.sql.types import *

# machine learning APIs
from pyspark.ml.feature import VectorAssembler, Vector
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression,DecisionTreeRegressor, GBTRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import NGram
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator


In [6]:
spark = SparkSession.builder.master('spark://nasa:7077').appName('bikemodeldevelopment').config("spark.executor.memory", "1g").config('spark.jars', 'mysql-connector-j-8.1.0.jar').config("spark.cores.max", "2").getOrCreate()

In [7]:
spark.sparkContext.setLogLevel('ERROR')

In [8]:
spark

# Data exploration and transformation 

In [5]:
!ls -lh dataset

total 640K
-rw-rw-r-- 1 hadoop hadoop  617 mar  8 00:59 bikestreamdata.csv
-rw-rw-r-- 1 hadoop hadoop 634K dez 11  2019 train.csv


# 1)  Read dataset

In [9]:
train = spark.read.csv('dataset/train.csv', sep=',', header=True, inferSchema=True)
train.show(2)

+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
|           datetime|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|
+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
|2011-01-01 00:00:00|     1|      0|         0|      1|9.84|14.395|      81|      0.0|     3|        13|   16|
|2011-01-01 01:00:00|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     8|        32|   40|
+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
only showing top 2 rows



# 2) summary data types 

In [7]:
train.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- season: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)



In [8]:
train.describe().show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|summary|            season|            holiday|        workingday|           weather|              temp|            atemp|          humidity|         windspeed|           casual|        registered|             count|
+-------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|  count|             10886|              10886|             10886|             10886|             10886|            10886|             10886|             10886|            10886|             10886|             10886|
|   mean|2.5066139996325556|0.02856880396839978|0.6808745177291935| 1.418427337865148|20.230859819952173|23.65508405291192| 61.8

                                                                                

# 3) understanding categorical columns 

In [9]:
train.show(2)

+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
|           datetime|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|
+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
|2011-01-01 00:00:00|     1|      0|         0|      1|9.84|14.395|      81|      0.0|     3|        13|   16|
|2011-01-01 01:00:00|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     8|        32|   40|
+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+
only showing top 2 rows



In [10]:
# check distinct columns values
train.agg(*[F.countDistinct(col).alias(col) for col in train.columns ]).show()

[Stage 10:>                                                         (0 + 1) / 1]

+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
|datetime|season|holiday|workingday|weather|temp|atemp|humidity|windspeed|casual|registered|count|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
|   10885|     4|      2|         2|      4|  49|   60|      89|       28|   309|       731|  822|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+



                                                                                

In [10]:
# define the most suitable categorical columns 
categorical_cols=['season','holiday','workingday','weather']
onehotencoder = [OneHotEncoder(inputCol=col, outputCol=f'{col}_ecd') for col in categorical_cols]

In [11]:
stage = onehotencoder
pipeline = Pipeline().setStages(stage)
traindf = pipeline.fit(train).transform(train)
traindf.show(3)

[Stage 10:>                                                         (0 + 1) / 1]

+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+-------------+-------------+--------------+-------------+
|           datetime|season|holiday|workingday|weather|temp| atemp|humidity|windspeed|casual|registered|count|   season_ecd|  holiday_ecd|workingday_ecd|  weather_ecd|
+-------------------+------+-------+----------+-------+----+------+--------+---------+------+----------+-----+-------------+-------------+--------------+-------------+
|2011-01-01 00:00:00|     1|      0|         0|      1|9.84|14.395|      81|      0.0|     3|        13|   16|(4,[1],[1.0])|(1,[0],[1.0])| (1,[0],[1.0])|(4,[1],[1.0])|
|2011-01-01 01:00:00|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     8|        32|   40|(4,[1],[1.0])|(1,[0],[1.0])| (1,[0],[1.0])|(4,[1],[1.0])|
|2011-01-01 02:00:00|     1|      0|         0|      1|9.02|13.635|      80|      0.0|     5|        27|   32|(4,[1],[1.0])|(1,[0],[1.0])| (1,[0],[1.0])|(4,[1],

                                                                                

# 4) check missed value 

In [13]:
# check null values
traindf.agg(*[F.count(F.when(F.isnull(c), c)).alias(c) for c in traindf.columns]).show()

+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+----------+-----------+--------------+-----------+
|datetime|season|holiday|workingday|weather|temp|atemp|humidity|windspeed|casual|registered|count|season_ecd|holiday_ecd|workingday_ecd|weather_ecd|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+----------+-----------+--------------+-----------+
|       0|     0|      0|         0|      0|   0|    0|       0|        0|     0|         0|    0|         0|          0|             0|          0|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+----------+-----------+--------------+-----------+



# 5-6) explode season and weather columns

In [14]:
traindf.select('season').distinct().show(),traindf.select('weather').distinct().show()

+------+
|season|
+------+
|     1|
|     3|
|     4|
|     2|
+------+

+-------+
|weather|
+-------+
|      1|
|      3|
|      4|
|      2|
+-------+



(None, None)

In [12]:
traindf = traindf.withColumn('season_1',F.when(F.col('season')==1,1).otherwise(0))
traindf = traindf.withColumn('season_2',F.when(F.col('season')==2,2).otherwise(0))
traindf = traindf.withColumn('season_3',F.when(F.col('season')==3,3).otherwise(0))
traindf = traindf.withColumn('season_4',F.when(F.col('season')==4,4).otherwise(0))


In [13]:
traindf = traindf.withColumn('weather_1',F.when(F.col('weather')==1,1).otherwise(0))
traindf = traindf.withColumn('weather_2',F.when(F.col('weather')==2,2).otherwise(0))
traindf = traindf.withColumn('weather_3',F.when(F.col('weather')==3,3).otherwise(0))
traindf = traindf.withColumn('weather_4',F.when(F.col('weather')==4,4).otherwise(0))


In [14]:
df = traindf
# df = traindf.drop('season','weather')

In [12]:
df.show(2)

+-------------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+--------------+-------------+--------+--------+--------+--------+---------+---------+---------+---------+
|           datetime|holiday|workingday|temp| atemp|humidity|windspeed|casual|registered|count|   season_ecd|  holiday_ecd|workingday_ecd|  weather_ecd|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|
+-------------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+--------------+-------------+--------+--------+--------+--------+---------+---------+---------+---------+
|2011-01-01 00:00:00|      0|         0|9.84|14.395|      81|      0.0|     3|        13|   16|(4,[1],[1.0])|(1,[0],[1.0])| (1,[0],[1.0])|(4,[1],[1.0])|       1|       0|       0|       0|        1|        0|        0|        0|
|2011-01-01 01:00:00|      0|         0|9.02|13.635|      80|      0.0|     8|      

# 7) split datetime column in hour, day, month, year 

In [15]:
df = df.withColumn('datetime', F.to_timestamp(F.col('datetime'),'d-M-y H:m'))

In [16]:
dfdate=df.withColumn('year', F.year(F.col('datetime'))) \
        .withColumn('month', F.month(F.col('datetime'))) \
        .withColumn('day', F.dayofmonth(F.col('datetime'))) \
        .withColumn('hour', F.hour(F.col('datetime')))
dfdate.select('datetime','year','month','day','hour').show(10)

+-------------------+----+-----+---+----+
|           datetime|year|month|day|hour|
+-------------------+----+-----+---+----+
|2011-01-01 00:00:00|2011|    1|  1|   0|
|2011-01-01 01:00:00|2011|    1|  1|   1|
|2011-01-01 02:00:00|2011|    1|  1|   2|
|2011-01-01 03:00:00|2011|    1|  1|   3|
|2011-01-01 04:00:00|2011|    1|  1|   4|
|2011-01-01 05:00:00|2011|    1|  1|   5|
|2011-01-01 06:00:00|2011|    1|  1|   6|
|2011-01-01 07:00:00|2011|    1|  1|   7|
|2011-01-01 08:00:00|2011|    1|  1|   8|
|2011-01-01 09:00:00|2011|    1|  1|   9|
+-------------------+----+-----+---+----+
only showing top 10 rows



In [15]:
dfdate.show(1)

+-------------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+--------------+-------------+--------+--------+--------+--------+---------+---------+---------+---------+----+-----+---+----+
|           datetime|holiday|workingday|temp| atemp|humidity|windspeed|casual|registered|count|   season_ecd|  holiday_ecd|workingday_ecd|  weather_ecd|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|year|month|day|hour|
+-------------------+-------+----------+----+------+--------+---------+------+----------+-----+-------------+-------------+--------------+-------------+--------+--------+--------+--------+---------+---------+---------+---------+----+-----+---+----+
|2011-01-01 00:00:00|      0|         0|9.84|14.395|      81|      0.0|     3|        13|   16|(4,[1],[1.0])|(1,[0],[1.0])| (1,[0],[1.0])|(4,[1],[1.0])|       1|       0|       0|       0|        1|        0|        0|        0|2011|    1|  1|   0|
+---

In [16]:
print(dfdate.columns)

['datetime', 'holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'season_ecd', 'holiday_ecd', 'workingday_ecd', 'weather_ecd', 'season_1', 'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2', 'weather_3', 'weather_4', 'year', 'month', 'day', 'hour']


# 8) explore feature count along the time

In [24]:
dfdate.groupby('month').agg(F.count('count').alias('Count')).sort(F.desc('Count')).show()
dfdate.groupby('hour').agg(F.count('count').alias('Count')).sort(F.desc('Count')).show()

                                                                                

+-----+-----+
|month|Count|
+-----+-----+
|   12|  912|
|    8|  912|
|    6|  912|
|    7|  912|
|    5|  912|
|   10|  911|
|   11|  911|
|    9|  909|
|    4|  909|
|    2|  901|
|    3|  901|
|    1|  884|
+-----+-----+

+----+-----+
|hour|Count|
+----+-----+
|  13|  456|
|  22|  456|
|  14|  456|
|  18|  456|
|  19|  456|
|  15|  456|
|  12|  456|
|  17|  456|
|  16|  456|
|  23|  456|
|  21|  456|
|  20|  456|
|   6|  455|
|   7|  455|
|   1|  455|
|   9|  455|
|  10|  455|
|   8|  455|
|  11|  455|
|   0|  454|
+----+-----+
only showing top 20 rows



# Model development 

## Split data 

In [17]:
# split data to test the model
traindata,testdata = dfdate.randomSplit([0.7,0.3],seed=1234)
traindata.count(),testdata.count()

                                                                                

(7630, 3256)

In [18]:
traindata.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- season: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- season_ecd: vector (nullable = true)
 |-- holiday_ecd: vector (nullable = true)
 |-- workingday_ecd: vector (nullable = true)
 |-- weather_ecd: vector (nullable = true)
 |-- season_1: integer (nullable = false)
 |-- season_2: integer (nullable = false)
 |-- season_3: integer (nullable = false)
 |-- season_4: integer (nullable = false)
 |-- weather_1: integer (nullable = false)
 |-- weather_2: integer (nullable = false)
 |-- weather_3: integer (nullable = false)
 |-- weather_4: integer (nullable = false)
 

## Regression algorithms

In [19]:
# traindata = traindata.drop('casual','registered')
print(traindata.columns)
len(traindata.columns)

['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'season_ecd', 'holiday_ecd', 'workingday_ecd', 'weather_ecd', 'season_1', 'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2', 'weather_3', 'weather_4', 'year', 'month', 'day', 'hour']


28

In [20]:
feature = [ 'holiday', 'workingday', 'temp', 'count', 'season_ecd', 'holiday_ecd', 'workingday_ecd', 'weather_ecd', 'year', 'month', 'day', 'hour']

len(feature)

12

In [21]:
vectorassembler = VectorAssembler(inputCols=feature, outputCol='features')

# vectorassembler = VectorAssembler(inputCols=feature, outputCol='features',handleInvalid="skip")

# Linear regression implementation 

In [22]:
# model setting
lr = LinearRegression(featuresCol="features", labelCol="count", maxIter=100, regParam=0.8, elasticNetParam=0.2)

In [23]:
# pipeline
pipeline = Pipeline().setStages([vectorassembler,lr])

In [24]:
# training model
lrmodel = pipeline.fit(traindata)
predictionlr = lrmodel.transform(testdata)

                                                                                

In [25]:
predictionlr.select('features','count','prediction').show(10)

+--------------------+-----+------------------+
|            features|count|        prediction|
+--------------------+-----+------------------+
|(18,[2,3,5,8,9,11...|   16|   16.254671829779|
|(18,[2,3,5,8,9,11...|   40| 40.13419650353228|
|(18,[2,3,5,8,9,11...|    1|1.4568736550925792|
|(18,[2,3,5,8,9,12...|   34|  34.9524258604564|
|(18,[2,3,5,8,9,12...|   17| 17.47186565716089|
|(18,[2,3,5,8,9,12...|   17|17.481800632114698|
|(18,[2,3,5,8,9,12...|    9| 9.535205707468663|
|(18,[2,3,5,8,9,12...|    6| 6.622158585508544|
|(18,[2,3,5,8,9,13...|    2|2.6939374323820857|
|(18,[2,3,5,8,9,13...|    8| 8.701443678801581|
+--------------------+-----+------------------+
only showing top 10 rows



In [26]:
# evaluate the model accuracy
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictionlr)

                                                                                

In [27]:
# Show  RMSE
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 0.8522963740033911


# Random forest 

In [28]:
# model setting
rf = RandomForestRegressor(featuresCol="features", labelCol="count",numTrees=10, maxDepth=25, maxBins=32, seed=1234)

In [29]:
# pipeline
pipeline = Pipeline().setStages([vectorassembler,rf])

In [30]:
# training model
rfmodel = pipeline.fit(traindata)
predictionrf = rfmodel.transform(testdata)

                                                                                

In [31]:
predictionrf.select('features','count','prediction').show(10)

+--------------------+-----+------------------+
|            features|count|        prediction|
+--------------------+-----+------------------+
|(18,[2,3,5,8,9,11...|   16|              15.8|
|(18,[2,3,5,8,9,11...|   40| 33.36666666666666|
|(18,[2,3,5,8,9,11...|    1|               2.1|
|(18,[2,3,5,8,9,12...|   34| 33.77333333333333|
|(18,[2,3,5,8,9,12...|   17|              18.1|
|(18,[2,3,5,8,9,12...|   17|              17.3|
|(18,[2,3,5,8,9,12...|    9|             11.55|
|(18,[2,3,5,8,9,12...|    6|              9.65|
|(18,[2,3,5,8,9,13...|    2|              4.04|
|(18,[2,3,5,8,9,13...|    8|22.669047619047618|
+--------------------+-----+------------------+
only showing top 10 rows



                                                                                

In [32]:
# evaluate the model accuracy
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmserf = evaluator.evaluate(predictionrf)

                                                                                

In [33]:
# Show  RMSE
print(f"Root Mean Squared Error (RMSE): {rmserf}")

Root Mean Squared Error (RMSE): 15.554131865945793


# Decision Tree Regressor 

In [70]:
# model setting
dtr = DecisionTreeRegressor(featuresCol="features", labelCol="count",maxDepth=5, maxBins=32, minInstancesPerNode=2, minInfoGain=0.0, seed=1234)

In [71]:
# pipeline
pipeline = Pipeline().setStages([vectorassembler,dtr])

In [72]:
# training model
dtrmodel = pipeline.fit(traindata)
predictiondtr = dtrmodel.transform(testdata)

In [73]:
predictiondtr.select('features','count','prediction').show(10)

+--------------------+-----+------------------+
|            features|count|        prediction|
+--------------------+-----+------------------+
|(24,[0,1,3,6,11,1...|   16| 71.65573770491804|
|(24,[0,1,3,6,11,1...|   40|61.917073170731705|
|(24,[0,1,3,6,11,1...|    1|14.447019867549669|
|(24,[0,1,3,6,12,1...|   34| 53.04950495049505|
|(24,[0,1,3,6,12,1...|   17| 71.65573770491804|
|(24,[0,1,3,6,12,1...|   17|61.917073170731705|
|(24,[0,1,3,6,12,1...|    9|61.917073170731705|
|(24,[0,1,3,6,12,1...|    6|14.447019867549669|
|(24,[0,1,3,6,13,1...|    2|              20.0|
|(24,[0,1,3,6,13,1...|    8|110.55213675213675|
+--------------------+-----+------------------+
only showing top 10 rows



In [74]:
# evaluate the model accuracy
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmsedtr = evaluator.evaluate(predictiondtr)

In [75]:
# Show  RMSE
print(f"Root Mean Squared Error (RMSE): {rmsedtr}")

Root Mean Squared Error (RMSE): 109.0188618316442


# GBT Regressor  

In [92]:
# model setting
gbt = GBTRegressor(featuresCol="features", labelCol="count",maxIter=10, maxDepth=10,seed=1234)

In [93]:
# pipeline
pipeline = Pipeline().setStages([vectorassembler,gbt])

In [94]:
# training model
gbtmodel = pipeline.fit(traindata)
predictiongbt = gbtmodel.transform(testdata)

In [95]:
predictiongbt.select('features','count','prediction').show(10)

+--------------------+-----+------------------+
|            features|count|        prediction|
+--------------------+-----+------------------+
|(24,[0,3,7,10,14,...|    1| 2.897580222130014|
|(24,[0,3,7,10,14,...|    1|1.5788989034486953|
|(24,[0,3,7,10,14,...|   14|16.989734142350272|
|(24,[0,1,3,7,10,1...|   26|62.095216048495466|
|(24,[0,1,3,9,10,1...|   24|  46.6383101293009|
|(24,[0,3,7,10,14,...|    7| 7.274393456773156|
|(24,[0,3,7,10,14,...|   68| 32.30023170853538|
|(24,[0,3,7,10,14,...|   17| 9.983125827339773|
|(24,[0,3,7,10,14,...|  199| 200.6515169642931|
|(24,[0,1,3,7,10,1...|   12|21.216400386806015|
+--------------------+-----+------------------+
only showing top 10 rows



In [96]:
# evaluate the model accuracy
evaluator = RegressionEvaluator(labelCol="count", predictionCol="prediction", metricName="rmse")
rmsegbt = evaluator.evaluate(predictiongbt)

In [97]:
# Show  RMSE
print(f"Root Mean Squared Error (RMSE): {rmsegbt}")

Root Mean Squared Error (RMSE): 59.221514677453484


# best choose model  

In [53]:
# the best model is linear regression

In [240]:
# persist the model
lrmodel.write().overwrite().save("lrmodel")
print('Model saved')
!ls -ld lr*

Model saved
drwxr-xr-x 4 hadoop hadoop 4096 mar 11 05:37 lrmodel


In [100]:
spark.stop()