# NYC_Taxi Fare Prediction with RayDP

In [1]:
import ray
import os
import datetime
import pandas as pd, numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from pyspark.sql.functions import *

from raydp.spark import context
from raydp.spark.torch.estimator import TorchEstimator
from raydp.spark.utils import random_split

## Initialize or connect to an existing Ray cluster

In [None]:
# First, start a Ray cluster or connect to an existing one. Note that include_java should be set to True.
# For more Ray configuration options, see the Ray docs: https://docs.ray.io/en/latest/package-ref.html
# Alternative call kept from the original notebook:
# ray.init(address="auto", redis_password="123")
ray.init(include_java=True)

## Create a Spark session with the RayDP API

In [None]:
# With the Ray cluster initialized, obtain a Spark session through the RayDP API.
app_name = "NYC_Taxi Fare Prediction with RayDP"
memory_per_executor = "5GB"
cores_per_executor = 1
num_executors = 2
spark = context.init_spark(
    app_name,
    num_executors,
    cores_per_executor,
    memory_per_executor,
)

## Distributed data preprocessing with PySpark

In [4]:
# From here on the session is used like any other Spark session.
# The dataset can be downloaded from https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/data
# The CSV location defaults to the original path, but can be overridden with the
# NYC_TAXI_CSV environment variable so the notebook also runs on other machines.
data_path = os.environ.get("NYC_TAXI_CSV", "/mnt/DP_disk8/nyc_taxi.csv")
data = (
    spark.read.format("csv")
    .option("header", "true")       # the first line of the file holds the column names
    .option("inferSchema", "true")  # let Spark infer the column types
    .load(data_path)
)

In [5]:
# Absolute coordinate differences between drop-off and pickup; used both as a
# row filter and as the two additional feature columns.
lon_diff = abs(col("dropoff_longitude") - col("pickup_longitude"))
lat_diff = abs(col("dropoff_latitude") - col("pickup_latitude"))

# Keep only the columns of interest, drop rows outside the accepted value
# ranges (bounds are inclusive), then append the two derived features.
data = (
    data.select(
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "fare_amount",
    )
    .filter(col("pickup_longitude").between(-74.3, -73.0))
    .filter(col("dropoff_longitude").between(-74.3, -73.0))
    .filter(col("pickup_latitude").between(40.6, 41.7))
    .filter(col("dropoff_latitude").between(40.6, 41.7))
    .filter(col("passenger_count") <= 10)
    .filter(col("fare_amount") >= 0.0)
    .filter(lon_diff <= 5)
    .filter(lat_diff <= 5)
    .withColumn("abs_diff_longitude", lon_diff)
    .withColumn("abs_diff_latitude", lat_diff)
)

In [None]:
# Split the data into a training set and a test set using weights 0.7 and 0.3.
train_df, test_df = random_split(data, [0.7, 0.3])
# NOTE(review): the commented-out lines below reference `train_data_select`, which is not
# defined in the visible cells; presumably leftover from sampling a small subset — confirm or remove.
# train_df = train_data_select.sample(False, 0.0005)
# test_df = train_data_select.sample(False, 0.0001)

In [None]:
# Display rows of the training split as a quick sanity check.
train_df.show()

In [None]:
# Display rows of the test split as a quick sanity check.
test_df.show()

## Distributed model training and evaluation

In [7]:
# Define the model, loss function and optimizer
class NYC_Model(nn.Module):
    """Small fully connected regression network for taxi fare prediction.

    The network passes ``num_features`` inputs through two ReLU-activated
    hidden layers (14 and 7 units) and a final linear layer with one output.

    Args:
        num_features: Number of input features per sample. Defaults to 7,
            the input width that was previously hard-coded.
    """

    def __init__(self, num_features=7):
        super().__init__()

        self.fc1 = nn.Linear(num_features, 14)
        self.fc2 = nn.Linear(14, 7)
        self.fc3 = nn.Linear(7, 1)

    def forward(self, x):
        """Run a forward pass.

        Args:
            x: Tensor whose last dimension equals the number of input features.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output layer: the raw linear output is returned.
        return self.fc3(x)

# Instantiate the loss function, the network, and an SGD optimizer over the
# network's parameters.
criterion = nn.MSELoss()
nyc_model = NYC_Model()
optimizer = torch.optim.SGD(
    params=nyc_model.parameters(),
    lr=0.001,
)

In [9]:
# Create the estimator, a scikit-learn-like API for distributed Torch model training.
# Every column except the label column is used as an input feature.
# The feature list is derived from `train_df`; the previous code read the schema
# of `train_data_select`, which is never defined in this notebook (NameError).
features = [name for name in train_df.columns if name != "fare_amount"]
estimator = TorchEstimator(num_workers=4,
                           model=nyc_model,
                           optimizer=optimizer,
                           loss=criterion,
                           feature_columns=features,
                           label_column="fare_amount",
                           batch_size=100,
                           num_epochs=100)

In [None]:
# Run distributed training of the model on the training split.
estimator.fit(train_df)

In [None]:
# Evaluate the model on the test split.
estimator.evaluate(test_df)

In [12]:
# Shut down the estimator, the RayDP Spark session, and Ray, in that order.
# The nested try/finally blocks ensure that a failure in an earlier step does
# not skip the remaining teardown steps; any exception still propagates.
try:
    estimator.shutdown()
finally:
    try:
        context.stop_spark()
    finally:
        ray.shutdown()