In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz
!tar -xvf spark-3.0.0-preview2-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-preview2-bin-hadoop2.7"
import findspark
findspark.init()

In [0]:
# Importing the libraries required 
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import sum
from pyspark.sql import SparkSession
# Obtain the notebook-wide Spark session via the builder's getOrCreate()
spark = (
    SparkSession
    .builder
    .getOrCreate()
)


In [71]:
# Load the taxi-trip CSV into a DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
# NOTE(review): the path contains a space before ".csv" -- confirm this matches
# the file name on disk.
df_taxi = spark.read.csv(
    '/content/nyc_taxi .csv',
    header='true',
    inferSchema='true',
)

# Preview the first rows
df_taxi.show()

+-----------+-----------+------------+------------+--------+----+-----+
|pickup_date|pickup_time|dropoff_date|dropoff_time|distance| tip| fare|
+-----------+-----------+------------+------------+--------+----+-----+
|   1/1/2017|       0:00|    1/1/2017|        0:00|    0.02| 0.0| 52.8|
|   1/1/2017|       0:00|    1/1/2017|        0:03|     0.5| 0.0|  5.3|
|   1/1/2017|       0:00|    1/1/2017|        0:39|    7.75|4.66|27.96|
|   1/1/2017|       0:00|    1/1/2017|        0:06|     0.8|1.45| 8.75|
|   1/1/2017|       0:00|    1/1/2017|        0:08|     0.9| 0.0|  8.3|
|   1/1/2017|       0:00|    1/1/2017|        0:05|    1.76| 0.0|  8.3|
|   1/1/2017|       0:00|    1/1/2017|        0:15|    8.47|7.71|38.55|
|   1/1/2017|       0:00|    1/1/2017|        0:11|     2.4| 0.0| 11.8|
|   1/1/2017|       0:00|    1/1/2017|        0:23|    12.6|10.0| 70.3|
|   1/1/2017|       0:00|    1/1/2017|        0:08|     0.9|2.05|10.35|
|   1/1/2017|       0:00|    1/1/2017|        0:09|    2.43| 2.7

In [64]:
# Print df_taxi's schema as a tree: each column's name, inferred type and nullability
df_taxi.printSchema()

root
 |-- pickup_date: string (nullable = true)
 |-- pickup_time: string (nullable = true)
 |-- dropoff_date: string (nullable = true)
 |-- dropoff_time: string (nullable = true)
 |-- distance: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- fare: double (nullable = true)



In [0]:
from pyspark.ml.feature import VectorAssembler

# 'distance' is the only predictor column; the assembler packs the listed
# column(s) into a single vector column named "dist_feature".
feature_col = ['distance']
vect_assembler = VectorAssembler(
    inputCols=feature_col,
    outputCol="dist_feature",
)

In [43]:
# Apply the assembler created earlier: transform() returns the taxi DataFrame
# with the extra 'dist_feature' vector column appended.
feature_data = vect_assembler.transform(df_taxi)
# Preview the result, including the new column
feature_data.show()

+-----------+-----------+------------+------------+--------+----+-----+------------+
|pickup_date|pickup_time|dropoff_date|dropoff_time|distance| tip| fare|dist_feature|
+-----------+-----------+------------+------------+--------+----+-----+------------+
|   1/1/2017|       0:00|    1/1/2017|        0:00|    0.02| 0.0| 52.8|      [0.02]|
|   1/1/2017|       0:00|    1/1/2017|        0:03|     0.5| 0.0|  5.3|       [0.5]|
|   1/1/2017|       0:00|    1/1/2017|        0:39|    7.75|4.66|27.96|      [7.75]|
|   1/1/2017|       0:00|    1/1/2017|        0:06|     0.8|1.45| 8.75|       [0.8]|
|   1/1/2017|       0:00|    1/1/2017|        0:08|     0.9| 0.0|  8.3|       [0.9]|
|   1/1/2017|       0:00|    1/1/2017|        0:05|    1.76| 0.0|  8.3|      [1.76]|
|   1/1/2017|       0:00|    1/1/2017|        0:15|    8.47|7.71|38.55|      [8.47]|
|   1/1/2017|       0:00|    1/1/2017|        0:11|     2.4| 0.0| 11.8|       [2.4]|
|   1/1/2017|       0:00|    1/1/2017|        0:23|    12.6|10.0|

In [44]:
# Keep only what the model needs: the distance feature vector and the fare target
final_data = feature_data.select(
    "dist_feature",
    "fare",
)
final_data.show()

+------------+-----+
|dist_feature| fare|
+------------+-----+
|      [0.02]| 52.8|
|       [0.5]|  5.3|
|      [7.75]|27.96|
|       [0.8]| 8.75|
|       [0.9]|  8.3|
|      [1.76]|  8.3|
|      [8.47]|38.55|
|       [2.4]| 11.8|
|      [12.6]| 70.3|
|       [0.9]|10.35|
|      [2.43]| 13.5|
|       [2.6]|16.56|
|      [4.25]| 17.8|
|      [0.65]|  9.5|
|      [3.42]| 23.8|
|       [6.6]| 24.3|
|       [0.5]|  5.3|
|       [1.2]|10.55|
|       [1.7]| 10.8|
|       [5.3]| 17.3|
+------------+-----+
only showing top 20 rows



In [0]:
# Split the modelling data into training and test sets with 70/30 weights.
# A fixed seed keeps the partition -- and therefore the statistics and the model
# derived from it -- repeatable when the notebook is re-run on the same input.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [67]:
# Summary statistics for each split, printed under a heading: train first, then test
for _heading, _split in (
    ("Train data statistics", train_data),
    ("Test data statistics", test_data),
):
    print(_heading)
    _split.describe().show()

Train data statistics
+-------+------------------+
|summary|              fare|
+-------+------------------+
|  count|            734371|
|   mean|16.328822516104452|
| stddev| 628.7022673516291|
|    min|            -120.3|
|    max|          538580.0|
+-------+------------------+

Test data statistics
+-------+------------------+
|summary|              fare|
+-------+------------------+
|  count|            314199|
|   mean| 15.61728640130228|
| stddev|13.713411273659126|
|    min|             -70.3|
|    max|             450.3|
+-------+------------------+



In [69]:
# Fit a single-feature linear regression (fare from distance) with Spark ML
# and score it on the held-out split.
from pyspark.ml.regression import LinearRegression

# Estimator wired to the assembled feature vector and the target column
lg = LinearRegression(
    featuresCol="dist_feature",
    labelCol="fare",
)

# fit() on the training split produces the trained model
lg_model = lg.fit(train_data)

# evaluate() scores the test split and returns a summary object
lg_model_pred = lg_model.evaluate(test_data)

# Show the summary's predictions frame (features, actual fare, predicted fare)
lg_model_pred.predictions.show()

+------------+-----+-----------------+
|dist_feature| fare|       prediction|
+------------+-----+-----------------+
|       [0.0]|-65.3|6.923642343530351|
|       [0.0]|-52.8|6.923642343530351|
|       [0.0]|-52.8|6.923642343530351|
|       [0.0]|-52.8|6.923642343530351|
|       [0.0]|-40.8|6.923642343530351|
|       [0.0]|-3.96|6.923642343530351|
|       [0.0]| -3.8|6.923642343530351|
|       [0.0]| -3.8|6.923642343530351|
|       [0.0]| -3.8|6.923642343530351|
|       [0.0]| -3.8|6.923642343530351|
|       [0.0]| -3.3|6.923642343530351|
|       [0.0]| -3.3|6.923642343530351|
|       [0.0]| -3.3|6.923642343530351|
|       [0.0]| -3.3|6.923642343530351|
|       [0.0]| -3.3|6.923642343530351|
|       [0.0]| -3.3|6.923642343530351|
|       [0.0]|  0.0|6.923642343530351|
|       [0.0]|  0.0|6.923642343530351|
|       [0.0]|  0.0|6.923642343530351|
|       [0.0]|  0.0|6.923642343530351|
+------------+-----+-----------------+
only showing top 20 rows



In [75]:
# Second pass at the same model, this time using the column names
# 'features' and 'label' for the predictor vector and the target.
data2 = df_taxi.select(df_taxi.distance, df_taxi.fare.alias('label'))
# Fixed seed so the 70/30 split is repeatable when the notebook is re-run.
train, test = data2.randomSplit([0.7, 0.3], seed=42)
# Pack the 'distance' column into a vector column named 'features'.
assembler = VectorAssembler().setInputCols(['distance'])\
.setOutputCol('features')
train01 = assembler.transform(train)
# Only the features and label columns are needed for training.
train02 = train01.select("features", "label")
train02.show(truncate=False)

+--------+------+
|features|label |
+--------+------+
|[0.0]   |-120.3|
|[0.0]   |-80.8 |
|[0.0]   |-80.8 |
|[0.0]   |-52.8 |
|[0.0]   |-52.8 |
|[0.0]   |-52.8 |
|[0.0]   |-52.8 |
|[0.0]   |-45.8 |
|[0.0]   |-36.8 |
|[0.0]   |-10.3 |
|[0.0]   |-8.8  |
|[0.0]   |-4.3  |
|[0.0]   |-4.3  |
|[0.0]   |-3.96 |
|[0.0]   |-3.8  |
|[0.0]   |-3.8  |
|[0.0]   |-3.8  |
|[0.0]   |-3.8  |
|[0.0]   |-3.8  |
|[0.0]   |-3.8  |
+--------+------+
only showing top 20 rows



In [76]:
# Train on the assembled training frame; the column names are spelled out here
# and are the same ones LinearRegression uses by default.
lr = LinearRegression(featuresCol='features', labelCol='label')
model = lr.fit(train02)

# Assemble the held-out rows the same way and keep the two model columns
test01 = assembler.transform(test)
test02 = test01.select('features', 'label')

# transform() appends a 'prediction' column; print it without truncating values
test03 = model.transform(test02)
test03.show(truncate=False)

+--------+------+-----------------+
|features|label |prediction       |
+--------+------+-----------------+
|[0.0]   |-75.3 |6.940570979646689|
|[0.0]   |-65.3 |6.940570979646689|
|[0.0]   |-52.8 |6.940570979646689|
|[0.0]   |-40.8 |6.940570979646689|
|[0.0]   |-40.3 |6.940570979646689|
|[0.0]   |-36.36|6.940570979646689|
|[0.0]   |-35.8 |6.940570979646689|
|[0.0]   |-13.1 |6.940570979646689|
|[0.0]   |-4.3  |6.940570979646689|
|[0.0]   |-4.3  |6.940570979646689|
|[0.0]   |-4.3  |6.940570979646689|
|[0.0]   |-3.8  |6.940570979646689|
|[0.0]   |-3.8  |6.940570979646689|
|[0.0]   |-3.8  |6.940570979646689|
|[0.0]   |-3.8  |6.940570979646689|
|[0.0]   |-3.8  |6.940570979646689|
|[0.0]   |-3.8  |6.940570979646689|
|[0.0]   |-3.3  |6.940570979646689|
|[0.0]   |-3.3  |6.940570979646689|
|[0.0]   |-3.3  |6.940570979646689|
+--------+------+-----------------+
only showing top 20 rows

