In [1]:
import boto3
import os
import re
from tqdm import tqdm
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, IntegerType, BooleanType, FloatType, NumericType
import pyspark.sql.functions as F

In [2]:
# Display the kernel's current working directory.
os.getcwd()

'/home/jovyan/work/notebooks'

In [3]:
# Move one level up so the relative 'data/...' and 'models/...' paths used
# later in the notebook resolve. Only climb when the 'data' directory is not
# already visible from the working directory: an unconditional chdir walks up
# one more level every time this cell is re-run and breaks those paths.
if not os.path.isdir('data'):
    os.chdir('../')

In [4]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('assignment_1')
    .getOrCreate()
)

In [5]:
# Load the combined, cleaned trip records from Parquet (path is relative to the working directory).
df = spark.read.parquet('data/combined_cleaned_data.parquet')

In [6]:
# Print column names and types to see which columns still need casting.
df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- taxi_colour: string (nullable = true)
 |-- pickup_date: timestamp (nullable = true)
 |-- dropoff_date: timestamp (nullable = true)



In [7]:
from pyspark.sql.types import DateType, IntegerType, BooleanType, FloatType
# total_amount, mta_tax and improvement_surcharge are read as strings (see the
# schema printed above); convert them to floats so they can serve as numeric
# model inputs / label.
# NOTE(review): values that cannot be parsed as a number presumably become
# null after the cast — confirm the data contains none.
df = (
    df
    .withColumn('total_amount', F.col('total_amount').cast(FloatType()))
    .withColumn('mta_tax', F.col('mta_tax').cast(FloatType()))
    .withColumn('improvement_surcharge', F.col('improvement_surcharge').cast(FloatType()))
)



### Feature Selection and Encoding

In [8]:
# Categorical columns; each one is string-indexed and one-hot encoded later.
cat_cols = [
    'RatecodeID',
    'payment_type',
    'taxi_colour',
    'VendorID',
]

# Columns retained from the raw frame. Anything not listed here
# (e.g. the pickup/drop-off location ids) is dropped by the select below.
cols = [
    "VendorID",
    "RatecodeID",
    "passenger_count",
    "trip_distance",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "taxi_colour",
    "pickup_date",
    "dropoff_date",
]

In [9]:
# Keep only the columns listed in `cols`.
df = df.select(*cols)

In [10]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [11]:
# Build the categorical part of the pipeline: for every categorical column,
# a StringIndexer (<col> -> <col>_ind) followed by a one-hot encoder
# (<col>_ind -> <col>_ohe).
stages = []
for name in cat_cols:
    indexed_name = f"{name}_ind"
    encoded_name = f"{name}_ohe"
    stages.append(StringIndexer(inputCol=name, outputCol=indexed_name))
    stages.append(OneHotEncoderEstimator(inputCols=[indexed_name], outputCols=[encoded_name]))

In [12]:
# Trip duration in whole seconds: both timestamps are cast to long
# (epoch seconds) and subtracted.
df = df.withColumn(
    'trip_duration',
    F.col("dropoff_date").cast("long") - F.col('pickup_date').cast("long"),
)

In [13]:
# Numeric model inputs. total_amount is deliberately absent: it is the label
# the regressor is trained to predict.
num_cols = [
    "passenger_count",
    "trip_distance",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "trip_duration",
]

In [14]:
# Names of the one-hot encoded output columns, in the same order as cat_cols.
cat_cols_ohe = [
    f"{name}_ohe"
    for name in cat_cols
]

In [15]:
# Combine the one-hot encoded categoricals followed by the numeric columns
# into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=[*cat_cols_ohe, *num_cols],
    outputCol="features",
)

In [16]:
# The assembler runs last, after every indexer/encoder pair.
stages.append(assembler)

In [17]:
from pyspark.ml import Pipeline

In [18]:
# Chain the indexers, encoders and assembler collected in `stages` into one pipeline.
pipeline = Pipeline(stages=stages)

### Split into Training and Test Set

In [19]:
import pyspark.sql.functions as F

In [20]:
# Fit every pipeline stage (string indexers, one-hot encoders, assembler) on df.
# NOTE(review): the fit uses the full frame, before the train/test split further
# down, so the category indexing also sees the test-period rows — confirm this
# is intended.
pipeline_model = pipeline.fit(df)

In [21]:
# Apply the fitted stages; this adds the *_ind, *_ohe and 'features' columns to df.
df = pipeline_model.transform(df)

In [22]:
# Training split: trips picked up strictly before 2018-10-01.
df_train = df.where(F.col("pickup_date") < "2018-10-01")

In [23]:
# Preview the first five training rows, including the encoded and assembled columns.
df_train.show(5)

+--------+----------+---------------+-------------+-----+-------+----------+------------+---------------------+------------+------------+-----------+-------------------+-------------------+-------------+--------------+--------------+----------------+----------------+---------------+---------------+------------+-------------+--------------------+
|VendorID|RatecodeID|passenger_count|trip_distance|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type|taxi_colour|        pickup_date|       dropoff_date|trip_duration|RatecodeID_ind|RatecodeID_ohe|payment_type_ind|payment_type_ohe|taxi_colour_ind|taxi_colour_ohe|VendorID_ind| VendorID_ohe|            features|
+--------+----------+---------------+-------------+-----+-------+----------+------------+---------------------+------------+------------+-----------+-------------------+-------------------+-------------+--------------+--------------+----------------+----------------+---------------+---------------+-----

In [24]:
# Test split: trips picked up on or after 2018-10-01.
df_test = df.where(F.col("pickup_date") >= "2018-10-01")

In [25]:
from pyspark.ml.regression import RandomForestRegressor

In [26]:
# Random forest that predicts total_amount from the assembled feature vector.
# A fixed seed is set because tree building is randomised; without it every
# run would produce a different model.
rf = RandomForestRegressor(
    featuresCol='features',
    labelCol='total_amount',
    numTrees=100,
    maxDepth=15,
    minInstancesPerNode=10,
    seed=42,
)

In [None]:
# Train the random forest on the training split.
rf_model = rf.fit(df_train)

In [None]:
# Persist the trained model. A plain save() raises if the target path already
# exists, which makes the notebook fail on every re-run; overwrite() replaces
# any model previously stored at this path instead.
rf_model.write().overwrite().save("models/rf_basic")