In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shapefile

# Suppress every Python warning (all categories, not only deprecation
# warnings) for the rest of the kernel session.
import warnings
warnings.filterwarnings("ignore")

# Spark

In [2]:
from pyspark.sql import SparkSession
import warnings

# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Obtain the Spark session that runs every Spark job in this notebook;
# getOrCreate() reuses an already-active session when one exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

21/08/15 02:22:17 WARN Utils: Your hostname, LIVIA resolves to a loopback address: 127.0.1.1; using 172.22.214.215 instead (on interface eth0)
21/08/15 02:22:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/08/15 02:22:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Import CSV from January 2020 to March 2020

In [3]:
# Enable Arrow-based transfers between Spark and pandas.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)

# The [1-3] glob picks up the January, February and March 2020 files.
# No schema is supplied here, so this first pass is only used to inspect
# the raw columns and the row count.
sdf = (
    spark.read
    .option('header', True)
    .csv('../data/yellow_tripdata_2020-0[1-3].csv')
)

# Display the total number of rows with thousands separators.
f"{sdf.count():,} rows."

                                                                                

'15,711,654 rows.'

In [4]:
# Enable eager evaluation so that a DataFrame left as the last expression
# of a cell is rendered as a table instead of its plain repr.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
# Preview the first five rows of the raw data.
sdf.limit(5)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1,2020-01-01 00:28:15,2020-01-01 00:33:03,1,1.2,1,N,238,239,1,6.0,3.0,0.5,1.47,0,0.3,11.27,2.5
1,2020-01-01 00:35:39,2020-01-01 00:43:04,1,1.2,1,N,239,238,1,7.0,3.0,0.5,1.5,0,0.3,12.3,2.5
1,2020-01-01 00:47:41,2020-01-01 00:53:52,1,0.6,1,N,238,238,1,6.0,3.0,0.5,1.0,0,0.3,10.8,2.5
1,2020-01-01 00:55:23,2020-01-01 01:00:14,1,0.8,1,N,238,151,1,5.5,0.5,0.5,1.36,0,0.3,8.16,0.0
2,2020-01-01 00:01:58,2020-01-01 00:04:16,1,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,0,0.3,4.8,0.0


## Create a Schema

In [5]:
import pyspark.sql.functions as F

from pyspark.sql.types import *
from pyspark.sql.functions import col

In [6]:
# Column names grouped by the Spark type each one should be parsed as.
ints = (
    'VendorID', 'passenger_count', 'RatecodeID',
    'PULocationID', 'DOLocationID', 'payment_type',
)
doubles = (
    'trip_distance', 'fare_amount', 'extra',
    'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount', 'congestion_surcharge',
)
strings = ('store_and_fwd_flag',)
dtimes = ('tpep_pickup_datetime', 'tpep_dropoff_datetime')

# Lookup table: column name -> Spark data type instance.
# Groups are visited in the order ints, doubles, strings, dtimes.
dtypes = {
    name: spark_type()
    for spark_type, names in (
        (IntegerType, ints),
        (DoubleType, doubles),
        (StringType, strings),
        (TimestampType, dtimes),
    )
    for name in names
}

In [7]:
# Build the schema in the same column order as the raw DataFrame, looking
# up each column's type in `dtypes`; every field is declared nullable.
schema = StructType([
    StructField(name, dtypes[name], True)
    for name in sdf.columns
])

In [8]:
# Enable Arrow-based transfers between Spark and pandas.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)

# Re-read the same three monthly files, this time parsing every column
# with the explicit schema built above.
sdf = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv('../data/yellow_tripdata_2020-0[1-3].csv')
)

# Display the total number of rows with thousands separators.
f"{sdf.count():,} rows."

                                                                                

'15,711,654 rows.'

In [9]:
# Print each column's name, type and nullability to confirm the schema was applied.
sdf.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [10]:
# Eager evaluation renders a trailing DataFrame expression as a table.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
# Preview the first five rows of the typed data.
sdf.limit(5)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1,2020-01-01 00:28:15,2020-01-01 00:33:03,1,1.2,1,N,238,239,1,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5
1,2020-01-01 00:35:39,2020-01-01 00:43:04,1,1.2,1,N,239,238,1,7.0,3.0,0.5,1.5,0.0,0.3,12.3,2.5
1,2020-01-01 00:47:41,2020-01-01 00:53:52,1,0.6,1,N,238,238,1,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5
1,2020-01-01 00:55:23,2020-01-01 01:00:14,1,0.8,1,N,238,151,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,0.0
2,2020-01-01 00:01:58,2020-01-01 00:04:16,1,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,0.0


## Cleaning the Data

### Initial Cleaning

Keep only trips that take place in 2020, and ensure that the pickup time is before the dropoff time

In [11]:
# Keep only trips whose pickup and dropoff both fall in calendar year 2020.
# The columns are timestamps, so compare the extracted year directly
# instead of calling the string method startswith('2020') on them, which
# only works through an implicit timestamp-to-string cast.
sdf = sdf.filter((F.year(sdf.tpep_pickup_datetime) == 2020)
                 & (F.year(sdf.tpep_dropoff_datetime) == 2020))

# Discard records whose dropoff is not strictly after the pickup.
sdf = sdf.filter(sdf.tpep_dropoff_datetime > sdf.tpep_pickup_datetime)

Keep only trips that carry at least one passenger and no more than the maximum passenger count (5)

In [12]:
# Keep trips with at least one and fewer than six passengers.
sdf = sdf.where(
    (col("passenger_count") > 0)
    & (col("passenger_count") < 6)
)

Keep only trips whose fare exceeds the minimum fare ($2.50)

In [13]:
# Keep trips whose fare is strictly greater than 2.5.
sdf = sdf.where(col("fare_amount") > 2.5)

Ensure the trip distance is more than 0.1

In [14]:
# Keep trips whose recorded distance is strictly greater than 0.1.
sdf = sdf.where(col("trip_distance") > 0.1)

Keep only trips paid by credit card (1) or with an unknown payment type (5)

In [15]:
# Payment type codes to retain.
payment_id = [1, 5]
# Keep only rows whose payment_type is one of the retained codes.
sdf = sdf.where(sdf.payment_type.isin(payment_id))

Remove trips with an unknown location ID (location IDs 264 and 265 denote unknown locations)

In [16]:
# Keep trips whose pickup and dropoff location IDs are both below 264.
sdf = sdf.filter(
    (col("PULocationID") < 264)
    & (col("DOLocationID") < 264)
)

In [17]:
# Trip duration in hours: both timestamps are cast to long, subtracted,
# and the difference is divided by 3600.
diff_hour_col = (
    col("tpep_dropoff_datetime").cast("long")
    - col("tpep_pickup_datetime").cast("long")
) / 3600
sdf = sdf.withColumn("diff_hour", diff_hour_col)

# Keep trips lasting more than one minute and at most ten hours.
sdf = sdf.where(
    (col("diff_hour") <= 10)
    & (col("diff_hour") > (1/60))
)

In [18]:
# Tip expressed as a percentage of the total amount charged.
# NOTE(review): the sample outputs saved in this notebook show tip_pct as
# a fraction (e.g. 0.13), so they appear to predate the *100 scaling;
# re-run the notebook to refresh them.
tip_pct_col = col("tip_amount") / col("total_amount") * 100
sdf = sdf.withColumn("tip_pct", tip_pct_col)

In [19]:
# Preview one row to check the newly added diff_hour and tip_pct columns.
sdf.limit(1)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,diff_hour,tip_pct
1,2020-01-01 00:28:15,2020-01-01 00:33:03,1,1.2,1,N,238,239,1,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5,0.08,0.1304347826086956


After initial cleaning, we have:

In [35]:
# Display the number of rows in sdf, formatted with thousands separators.
f"{sdf.count():,} rows."

                                                                                

'10,916,614 rows.'

### Adding columns for hour, month, and day_of_week

NOTE: for day_of_week --> 1 - Sunday, 2 - Monday, 3 - Tuesday, etc.

In [None]:
# 50-row subset of sdf.
# NOTE(review): `small` is not referenced anywhere else in the visible
# notebook and this cell has no execution count; confirm whether it is
# still needed.
small = sdf.limit(50)

In [20]:
# Derive month, day-of-week and hour-of-day columns from the pickup
# timestamp and then from the dropoff timestamp, in that column order.
sdf = (
    sdf
    .withColumn('pickup_month', F.month(col('tpep_pickup_datetime')))
    .withColumn('pickup_day_of_week', F.dayofweek(col('tpep_pickup_datetime')))
    .withColumn('pickup_hour', F.hour(col('tpep_pickup_datetime')))
    .withColumn('dropoff_month', F.month(col('tpep_dropoff_datetime')))
    .withColumn('dropoff_day_of_week', F.dayofweek(col('tpep_dropoff_datetime')))
    .withColumn('dropoff_hour', F.hour(col('tpep_dropoff_datetime')))
)

In [21]:
# Preview five rows to check the derived month / day-of-week / hour columns.
sdf.limit(5)

21/08/15 02:24:55 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,diff_hour,tip_pct,pickup_month,pickup_day_of_week,pickup_hour,dropoff_month,dropoff_day_of_week,dropoff_hour
1,2020-01-01 00:28:15,2020-01-01 00:33:03,1,1.2,1,N,238,239,1,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5,0.08,0.1304347826086956,1,4,0,1,4,0
1,2020-01-01 00:35:39,2020-01-01 00:43:04,1,1.2,1,N,239,238,1,7.0,3.0,0.5,1.5,0.0,0.3,12.3,2.5,0.1236111111111111,0.1219512195121951,1,4,0,1,4,0
1,2020-01-01 00:47:41,2020-01-01 00:53:52,1,0.6,1,N,238,238,1,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5,0.1030555555555555,0.0925925925925925,1,4,0,1,4,0
1,2020-01-01 00:55:23,2020-01-01 01:00:14,1,0.8,1,N,238,151,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,0.0,0.0808333333333333,0.1666666666666666,1,4,0,1,4,1
1,2020-01-01 00:29:01,2020-01-01 00:40:28,2,0.7,1,N,246,48,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,0.1908333333333333,0.166077738515901,1,4,0,1,4,0


## Dropping Columns

In [None]:
# Preview one row of the cleaned DataFrame before dropping columns.
# The previous reference, `filtered_sdf`, is not defined anywhere in the
# visible notebook and would raise NameError on a fresh Restart & Run All;
# the cleaned frame is `sdf`.
sdf.limit(1)

In [22]:
# Columns that are not carried forward into the exported dataset.
columns_to_drop = [
    "VendorID",
    "store_and_fwd_flag",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "payment_type",
    "extra",
    "mta_tax",
    "tip_amount",
    "total_amount",
    "improvement_surcharge",
    "congestion_surcharge",
]
final_sdf = sdf.drop(*columns_to_drop)

In [23]:
# Preview three rows of the reduced DataFrame to confirm the remaining columns.
final_sdf.limit(3)

passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,fare_amount,tolls_amount,diff_hour,tip_pct,pickup_month,pickup_day_of_week,pickup_hour,dropoff_month,dropoff_day_of_week,dropoff_hour
1,1.2,1,238,239,6.0,0.0,0.08,0.1304347826086956,1,4,0,1,4,0
1,1.2,1,239,238,7.0,0.0,0.1236111111111111,0.1219512195121951,1,4,0,1,4,0
1,0.6,1,238,238,6.0,0.0,0.1030555555555555,0.0925925925925925,1,4,0,1,4,0


## Export as Pickled Pandas Dataframe

In [24]:
# Collect the first group of columns to the driver as a pandas DataFrame.
final_df_1 = (
    final_sdf
    .select(
        "passenger_count",
        "trip_distance",
        "RatecodeID",
        "PULocationID",
        "DOLocationID",
        "fare_amount",
    )
    .toPandas()
)

                                                                                

In [26]:
# Collect the remaining columns to the driver as a second pandas DataFrame.
final_df_2 = (
    final_sdf
    .select(
        "tolls_amount",
        "diff_hour",
        "tip_pct",
        "pickup_month",
        "pickup_day_of_week",
        "pickup_hour",
        "dropoff_month",
        "dropoff_day_of_week",
        "dropoff_hour",
    )
    .toPandas()
)

                                                                                

In [25]:
# Persist the first pandas DataFrame as a pickle file in the data directory.
final_df_1.to_pickle("../data/final_df_1_test.pkl")

In [28]:
# Persist the second pandas DataFrame as a pickle file in the data directory.
final_df_2.to_pickle("../data/final_df_2_test.pkl")