In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


import matplotlib.pyplot as plt
#%matplotlib inline

In [2]:
# Start the Spark session for this notebook, or reuse one that is already
# running under the same builder settings.
spark = (
    SparkSession.builder
    .appName('ChicagoTaxiML')
    .getOrCreate()
)

# Optional resource tuning, left disabled. It goes through the context's
# private _conf handle to override executor/driver settings.
#conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '5g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','8g')])

# Cell output: the session's configuration as a list of (key, value) pairs.
spark.sparkContext.getConf().getAll()

In [3]:
# Folder holding the input file.
# NOTE(review): presumably the Databricks DBFS upload location -- confirm.
datadir = "/FileStore/tables/"

# Load the cleaned data set: the first row supplies the column names and
# Spark infers each column's type from the data.
cleaned_df = spark.read.csv(
    path=datadir + 'FInal_Cleaned_All.csv',
    inferSchema=True,
    header=True,
)

# Disabled alternative reader: same file from a different path, with explicit
# quote/escape characters and leading-whitespace trimming.
# cleaned_df = spark.read \
#     .option("quote", "\"")  \
#     .option("escape", "\"") \
#     .option("ignoreLeadingWhiteSpace",True) \
#     .csv("/user/atal/data/FInal_Cleaned_All.csv",inferSchema=True, header=True )

In [4]:
# Print the column names and the types Spark inferred while reading the CSV,
# as a quick check before any modelling.
cleaned_df.printSchema()

In [5]:
# Replace nulls with 0. With a numeric fill value only numeric columns are
# touched; columns of other types keep whatever nulls they have.
cleaned_df = cleaned_df.na.fill(0)

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# The multi-column one-hot estimator is called OneHotEncoderEstimator in
# Spark 2.3/2.4 and was renamed to OneHotEncoder in Spark 3.0, where the old
# name no longer exists. Try the 2.x name first: on 2.x, `OneHotEncoder` is a
# different, deprecated single-column transformer that does not accept
# inputCols/outputCols.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# Map each community name to a numeric category index. With "skip", rows whose
# value cannot be indexed are dropped instead of raising an error.
indexer1 = StringIndexer(
    inputCol="Community_Name",
    outputCol="communityIdx",
).setHandleInvalid("skip")

# Index columns that feed the one-hot encoder (only one at the moment).
inputs = [indexer1.getOutputCol()]

# Turn the category index into a sparse indicator vector.
encoder = OneHotEncoderEstimator(
    inputCols=inputs,
    outputCols=["communityVec"],
)

# Fit both stages on the cleaned data and apply them to that same data, which
# adds the communityIdx and communityVec columns.
pipeline = Pipeline(stages=[indexer1, encoder])
encodedData = pipeline.fit(cleaned_df).transform(cleaned_df)

# Preview the first five encoded rows.
encodedData.show(5)

In [7]:
from pyspark.ml.feature import VectorAssembler

# Pack the three numeric predictors and the one-hot community vector into the
# single 'features' vector column that the regression reads.
assembler = VectorAssembler(
    inputCols=[
        'AvgTemp',
        'Divvy_TotalTrips',
        'L_CTA_Sum_rides',
        'communityVec',
    ],
    outputCol='features',
)

# Append the assembled column to the encoded frame.
# NOTE(review): this rebinds encodedData, so re-running the cell without
# re-running the encoding cell would try to add 'features' a second time.
encodedData = assembler.transform(encodedData)

In [8]:
# Preview the encoded frame (show() with no arguments prints the first 20 rows).
encodedData.show()

In [9]:
# Parse the MM/dd/yyyy text in 'Date' into a DateType column named 'trip_date',
# which the train/test split filters on. to_date() with an explicit pattern
# does this in one step; withColumn() already names the result, so no alias
# is needed.
# NOTE(review): assumes 'Date' was read as a string column -- confirm against
# the printed schema.
encodedData = encodedData.withColumn(
    'trip_date',
    F.to_date(encodedData['Date'], 'MM/dd/yyyy'),
)

In [10]:
# Training partition: every row dated on or before 2016-12-31, with any
# remaining numeric nulls replaced by 0.
train_df = (
    encodedData
    .filter(encodedData['trip_date'] <= '2016-12-31')
    .na.fill(0)
)

# Preview the training rows.
train_df.show()

In [11]:
# Test partition: every row dated after 2016-12-31, with any remaining
# numeric nulls replaced by 0.
test_df = (
    encodedData
    .filter(encodedData['trip_date'] > '2016-12-31')
    .na.fill(0)
)

# Preview the held-out rows.
test_df.show()

In [12]:
%%time
from pyspark.ml.regression import LinearRegression

# Elastic-net regularised linear regression of Taxi_total on the assembled
# feature vector, capped at 10 optimiser iterations.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Taxi_total',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrm = lr.fit(train_df)

# Fitted parameters.
print("Coefficients: " + str(lrm.coefficients))
print("Intercept: " + str(lrm.intercept))

# Fit statistics from the model's training summary, i.e. measured on
# train_df rather than on the held-out rows.
train_summary = lrm.summary
print("RMSE: %f" % train_summary.rootMeanSquaredError)
print("r2: %f" % train_summary.r2)

# p-values are not available from the summary for the solver used here.
#print("pValues: " + str(lrm.summary.pValues))

In [13]:
# Score the held-out rows with the fitted model. transform() returns test_df
# with the model's prediction column appended (default name 'prediction').
predictions = lrm.transform(test_df)

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compares the Taxi_total label with the model's prediction column.
# Named `evaluator` rather than `eval` so that Python's builtin eval() is not
# shadowed for the rest of the kernel session.
evaluator = RegressionEvaluator(
    labelCol="Taxi_total",
    predictionCol="prediction",
    metricName="rmse",
)

# Root Mean Square Error (the evaluator's configured metric)
rmse = evaluator.evaluate(predictions)
print("RMSE: %.3f" % rmse)

# The remaining metrics reuse the same evaluator and override metricName for
# that single call through the param map.

# Mean Square Error
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

In [15]:
# Preview the scored test rows, every column included (first 20 rows).
predictions.show()

In [16]:
# Show the actual label next to the model's prediction for up to 365 test rows.
# The label is spelled 'Taxi_total', the same spelling passed as labelCol to
# the regression and the evaluator, so this select does not depend on Spark's
# case-insensitive column resolution.
predictions.select("Taxi_total", "prediction").show(365)

In [17]:
# Render the label/prediction pair as a table through display().
# NOTE(review): display() is not imported in this notebook; presumably it is
# supplied by the hosting notebook environment -- confirm.
# NOTE(review): 'Taxi_Total' differs in case from the 'Taxi_total' labelCol
# used for fitting and evaluation. This presumably resolves only because Spark
# column lookup is case-insensitive by default; verify before enabling
# spark.sql.caseSensitive.
display(predictions.select("Taxi_Total","prediction"))

Taxi_Total,prediction
104,117.00570900764568
44,106.17707227152688
56,195.8571933166576
59,244.85120953265616
68,250.92485691810043
72,252.8574962256747
64,168.89336051180155
58,150.20745423720518
53,229.28825631373525
52,206.5849316039104
