# NYC Taxi Fare Prediction with RayDP and Tensorflow

In [1]:
import ray
import os
import pandas as pd, numpy as np

from pyspark.sql.functions import *

from tensorflow import keras

import raydp
from raydp.tf import TFEstimator
from raydp.utils import random_split

## Initialize or connect to an existing Ray cluster

First, you need to initialize or connect to a Ray cluster. Note that you should set `include_java` to True.

In [None]:
# For more Ray configuration options, see the Ray docs: https://docs.ray.io/en/latest/package-ref.html
# Example of connecting to an already running cluster instead of calling ray.init() bare:
# ray.init(address="auto", redis_password="123")
# Here ray.init() is called with no arguments.
ray.init()

After initializing the Ray cluster, you can use the RayDP API to get a Spark session.  
`init_spark` takes 4 required parameters and 1 optional parameter:  
1. app_name: the application name
2. num_executors: number of executors for spark application
3. cores_per_executor: number of cores for each executor
4. executor_memory: memory size for each executor 
5. config (optional): extra config for spark

In [None]:
# Settings for the Spark application; they are passed positionally to raydp.init_spark below.
app_name = "NYC_Taxi Fare Prediction with RayDP"
num_executors = 4
cores_per_executor = 1
memory_per_executor = "2GB"
# `spark` is the handle the following cells use for reading data and setting Spark config.
spark = raydp.init_spark(app_name, num_executors, cores_per_executor, memory_per_executor)

## Distributed data preprocessing with pyspark

Then you can do distributed data processing with the Spark session.  
You can download the dataset from https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/data

In [6]:
# Here we just use a subset of the training data.
# The CSV location can be overridden with the NYC_TAXI_TRAIN_CSV environment
# variable; when it is unset, the path this notebook was originally run with is used.
train_csv_path = os.environ.get("NYC_TAXI_TRAIN_CSV", "/mnt/DP_disk8/nyc_train_1.csv")

# Set spark timezone for processing datetime. Done before the read so that every
# datetime operation in this notebook runs with the same session timezone.
spark.conf.set("spark.sql.session.timeZone", "UTC")

train = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(train_csv_path)

In [7]:
# Remove outlier rows
def clean_up(data):
    """Drop rows whose coordinates, passenger count or fare fall outside the accepted ranges.

    A row is kept only if all of the following hold:
      * pickup and dropoff longitude are within [-76, -72]
      * pickup and dropoff latitude are within [38, 42]
      * passenger_count is within [1, 6]
      * fare_amount is strictly between 0 and 250
      * dropoff longitude differs from pickup longitude, and dropoff latitude
        differs from pickup latitude

    Returns the filtered DataFrame.
    """
    # Each condition is applied as its own filter, in the order listed.
    conditions = [
        col('pickup_longitude') <= -72,
        col('pickup_longitude') >= -76,
        col('dropoff_longitude') <= -72,
        col('dropoff_longitude') >= -76,
        col('pickup_latitude') <= 42,
        col('pickup_latitude') >= 38,
        col('dropoff_latitude') <= 42,
        col('dropoff_latitude') >= 38,
        col('passenger_count') <= 6,
        col('passenger_count') >= 1,
        col('fare_amount') > 0,
        col('fare_amount') < 250,
        col('dropoff_longitude') != col('pickup_longitude'),
        col('dropoff_latitude') != col('pickup_latitude'),
    ]

    for condition in conditions:
        data = data.filter(condition)

    return data

In [8]:
# Add time related features
def add_time_features(data):
    """Derive calendar and time-of-day features from the ``pickup_datetime`` column.

    Adds, in this order, the columns ``day``, ``hour_of_day``, ``day_of_week``,
    ``week_of_year``, ``month_of_year``, ``quarter_of_year``, ``year``,
    ``night`` and ``late_night``, and returns the resulting DataFrame.

    ``night`` and ``late_night`` are integer 0/1 flags.
    """
    pickup = col("pickup_datetime")

    data = data.withColumn("day", dayofmonth(pickup))
    data = data.withColumn("hour_of_day", hour(pickup))
    # Spark's dayofweek() returns 1 for Sunday through 7 for Saturday, so after
    # subtracting 2 Monday..Saturday become 0..5 and Sunday becomes -1.
    data = data.withColumn("day_of_week", dayofweek(pickup) - 2)
    data = data.withColumn("week_of_year", weekofyear(pickup))
    data = data.withColumn("month_of_year", month(pickup))
    data = data.withColumn("quarter_of_year", quarter(pickup))
    data = data.withColumn("year", year(pickup))

    hour_col = col("hour_of_day")
    weekday_col = col("day_of_week")

    # Evening flag: hour in [16, 20] on Monday..Friday (day_of_week 0..4).
    # The explicit lower bound keeps Sunday (-1) from being counted as a weekday,
    # which a bare "< 5" test would do.
    is_night = hour_col.between(16, 20) & weekday_col.between(0, 4)

    # Late-night flag: hour <= 6 OR hour >= 20. This has to be a disjunction:
    # no hour satisfies both bounds at once, so an AND would always yield 0.
    is_late_night = (hour_col <= 6) | (hour_col >= 20)

    # Native column expressions (``when`` comes from the pyspark.sql.functions
    # star import) replace row-wise Python UDFs; a null hour yields 0 here.
    data = data.withColumn("night", when(is_night, 1).otherwise(0).cast("int"))
    data = data.withColumn("late_night", when(is_late_night, 1).otherwise(0).cast("int"))

    return data

In [9]:
# Add distance related features
def add_distance_features(data):
    """Add coordinate-difference and landmark-distance features.

    Adds, in this order:
      * ``abs_diff_longitude`` / ``abs_diff_latitude``: absolute pickup-to-dropoff
        difference per coordinate
      * ``manhattan``: sum of the two differences above
      * ``pickup_distance_<name>`` / ``dropoff_distance_<name>`` for each landmark
        (jfk, ewr, lgr, downtown): |delta longitude| + |delta latitude| between
        the pickup/dropoff point and the landmark, stored as float

    Distances are in coordinate degrees, not in kilometres or miles.
    Returns the resulting DataFrame.
    """
    # (longitude, latitude) of each reference point, listed in the order in
    # which the corresponding columns are appended.
    landmarks = [
        ("jfk", (-73.7822222222, 40.6441666667)),       # JFK airport
        ("ewr", (-74.175, 40.69)),                      # Newark airport
        ("lgr", (-73.87, 40.77)),                       # LaGuardia airport
        ("downtown", (-74.0063889, 40.7141667)),        # NYC downtown
    ]

    def manhattan(lon_name, lat_name, lon, lat):
        # Degree-space Manhattan distance from the point held in the two named
        # columns to the fixed (lon, lat) point. `abs` resolves to the Spark
        # column function via the pyspark.sql.functions star import.
        # Computed natively instead of through a Python UDF; the cast keeps the
        # float column type the UDF-based version produced.
        distance = abs(lit(lon) - col(lon_name)) + abs(lit(lat) - col(lat_name))
        return distance.cast("float")

    # Pickup-to-dropoff differences per coordinate and their sum.
    data = data.withColumn("abs_diff_longitude", abs(col("dropoff_longitude") - col("pickup_longitude"))) \
            .withColumn("abs_diff_latitude", abs(col("dropoff_latitude") - col("pickup_latitude")))
    data = data.withColumn("manhattan", col("abs_diff_latitude") + col("abs_diff_longitude"))

    # Distance from the pickup and dropoff location to every landmark.
    for name, (lon, lat) in landmarks:
        for endpoint in ("pickup", "dropoff"):
            data = data.withColumn(
                "{}_distance_{}".format(endpoint, name),
                manhattan(endpoint + "_longitude", endpoint + "_latitude", lon, lat),
            )

    return data

In [10]:
# Remove the columns that are not fed to the model
def drop_col(data):
    """Return ``data`` without the raw datetime, coordinate, passenger-count and key columns."""
    unused_columns = (
        "pickup_datetime",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "key",
    )
    # A single variadic drop instead of one chained call per column.
    return data.drop(*unused_columns)

In [11]:
# Preprocessing pipeline, innermost call first:
# clean_up -> add_time_features -> add_distance_features -> drop_col
train_data = drop_col(
    add_distance_features(
        add_time_features(
            clean_up(train)
        )
    )
)

In [None]:
# Preview the first 5 rows of the preprocessed data
train_data.show(5)

## Distributed model training and evaluation

Split the dataset into training and evaluation datasets

In [15]:
# Split with weights 0.9 / 0.1
train_df, test_df = random_split(train_data, [0.9, 0.1])
# Every column except the label is used as a model input feature.
features = [name for name in train_df.columns if name != "fare_amount"]

Define the keras model  
Each feature will be regarded as an input with shape (1,)

In [16]:
# One input of shape (1,) per feature column.
inTensor = [keras.Input((1,)) for _ in features]

# Concatenate the inputs, then stack Dense(relu) + BatchNormalization pairs
# with decreasing widths, and finish with a single-unit Dense output.
hidden = keras.layers.concatenate(inTensor)
for units in (256, 128, 64, 32, 16):
    hidden = keras.layers.Dense(units, activation='relu')(hidden)
    hidden = keras.layers.BatchNormalization()(hidden)
output = keras.layers.Dense(1)(hidden)

model = keras.models.Model(inTensor, output)

Define the optimizer and loss function  
Then create the TensorFlow estimator provided by RayDP

In [17]:
# Single source of truth for the batch size: it is passed to the estimator and
# also used to derive steps_per_epoch, so the two cannot drift apart.
batch_size = 256

# `learning_rate` is the documented keyword for the Adam optimizer; `lr` is a
# deprecated alias that newer Keras releases no longer accept.
adam = keras.optimizers.Adam(learning_rate=0.001)
loss = keras.losses.MeanSquaredError()
estimator = TFEstimator(num_workers=6, model=model, optimizer=adam, loss=loss, metrics=["mae"],
                        feature_columns=features, label_column="fare_amount",
                        batch_size=batch_size, num_epochs=30,
                        # train_df.count() triggers a Spark job to count the training rows.
                        config={"fit_config": {"steps_per_epoch": train_df.count() // batch_size}})

In [None]:
# Fit the estimator on the two DataFrames from the split above (training first, evaluation second).
estimator.fit_on_spark(train_df, test_df)

In [20]:
# Tear everything down in reverse order of creation: estimator, Spark, then Ray.
estimator.shutdown()
raydp.stop_spark()
ray.shutdown()