In [1]:
# define path constants
LANDING_DATA = '../data/landing/'
RAW_DATA = '../data/raw/'
CURATED_DATA = '../data/curated/'
ANALYSIS_DATA = '../data/analysis/'

# max levels for filtering data
MAX_FARE = 500
MAX_DISTANCE = 100
MAX_TIME = 300
MAX_LOCATION_ID = 264

In [2]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
23/08/15 20:49:50 WARN Utils: Your hostname, DESKTOP-VP2PCTV resolves to a loopback address: 127.0.1.1; using 172.21.252.215 instead (on interface eth0)
23/08/15 20:49:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/15 20:49:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [15]:
all_data.select('source').distinct().collect()

                                                                                

[Row(source='fhvhv'), Row(source='green'), Row(source='yellow')]

In [51]:
all_data3.select(F.percentile_approx("fare", 0.9998))

                                                                                

"percentile_approx(fare, 0.9998, 10000)"
180.0


In [3]:
from pyspark.sql import DataFrame
from typing import Callable
from pyspark.sql import functions as F
import datetime

def apply_filter(data: DataFrame, filter_function: Callable) -> DataFrame:
    rows_before_filter = data.count()
    data = data.where(filter_function)
    print(f'applied filter{str(filter_function)}, rows before:{rows_before_filter}, rows after:{data.count()}')
    return data
    

In [4]:
from pyspark.sql import functions as F
import datetime

def prepare_curated_tlc_data(source: str, keep_cols: list[str]) -> None:
    '''
    Retrieves downloaded raw TLC data, combining separate monthly data
    into one dataframe, and renames and filters columnsof interest (). 
    Saves this raw data down accordingly
    Arguments:
        source = TLC data source (yellow, green, or fhvhv)
        rename_cols = dict with keys = original column name, values = new column name
    Ouput: None
    '''
    
    # get raw data
    all_data = spark.read.parquet(f'{RAW_DATA}{source}/*')
    print(f'Number of raw records for {source}: {str(all_data.count())}')
    # select columns of interest
    all_data = all_data.select(*keep_cols)
    
    # add derived columns
    # adding trip time in minutes
    all_data = all_data.withColumn('trip_time', 
        F.round((all_data.dropoff_datetime- all_data.pickup_datetime).cast('long')/60,2))
    # adding temporal variables
    all_data = all_data.select('*',
        F.hour("pickup_datetime").alias('hour'), 
        F.month("pickup_datetime").alias('month'), 
        F.dayofweek("pickup_datetime").alias('day')
    )
    
    # adding an identifier for data source, as curated data will combine all sources
    # into one dataframe
    all_data=all_data.withColumn('source',F.lit(source))
    
    # filter out records with suspect data, do this one by one so we can establish
    # impact of filters
    all_data = apply_filter(all_data, (F.col('fare') > 0) & (F.col('fare') < MAX_FARE))
    all_data = apply_filter(all_data, (F.col('trip_distance') > 0) & (F.col('trip_distance') < MAX_DISTANCE))
    all_data = apply_filter(all_data, (F.col('trip_time') > 0) & (F.col('trip_time') < MAX_TIME))
    all_data = apply_filter(all_data, (F.col('pickup_id') > 0) & (F.col('pickup_id') < MAX_LOCATION_ID))
    all_data = apply_filter(all_data, (F.col("pickup_datetime") >= datetime.datetime(2020, 5, 1, 0, 0, 0)) \
        & (F.col("pickup_datetime") < datetime.datetime(2023, 6, 1, 0, 0, 0)))
    
    # save the curated data
    all_data.write.mode('overwrite').parquet(CURATED_DATA+source)
    print(f'Number of curated records for {source}: {str(all_data.count())}')

In [5]:
keep_cols = ['pickup_datetime', 'dropoff_datetime', 'trip_distance', 'pickup_id', 'fare']
prepare_curated_tlc_data('yellow', keep_cols)
prepare_curated_tlc_data('green', keep_cols)
prepare_curated_tlc_data('fhvhv', keep_cols)

                                                                                

Number of raw records for yellow: 95445749


                                                                                

applied filterColumn<'((fare > 0) AND (fare < 500))'>, rows before:95445749, rows after:94829572


                                                                                

applied filterColumn<'((trip_distance > 0) AND (trip_distance < 100))'>, rows before:94829572, rows after:93551616


                                                                                

applied filterColumn<'((trip_time > 0) AND (trip_time < 300))'>, rows before:93551616, rows after:93340310


                                                                                

applied filterColumn<'((pickup_id > 0) AND (pickup_id < 264))'>, rows before:93340310, rows after:92284232


                                                                                

applied filterColumn<'((pickup_datetime >= TIMESTAMP '2020-04-30 14:00:00') AND (pickup_datetime < TIMESTAMP '2023-05-31 14:00:00'))'>, rows before:92284232, rows after:92215295


                                                                                

Number of curated records for yellow: 92215295
Number of raw records for green: 2877421
applied filterColumn<'((fare > 0) AND (fare < 500))'>, rows before:2877421, rows after:2864204
applied filterColumn<'((trip_distance > 0) AND (trip_distance < 100))'>, rows before:2864204, rows after:2734280
applied filterColumn<'((trip_time > 0) AND (trip_time < 300))'>, rows before:2734280, rows after:2724395
applied filterColumn<'((pickup_id > 0) AND (pickup_id < 264))'>, rows before:2724395, rows after:2719252


                                                                                

applied filterColumn<'((pickup_datetime >= TIMESTAMP '2020-04-30 14:00:00') AND (pickup_datetime < TIMESTAMP '2023-05-31 14:00:00'))'>, rows before:2719252, rows after:2718001


                                                                                

Number of curated records for green: 2718001
Number of raw records for fhvhv: 566168421


                                                                                

applied filterColumn<'((fare > 0) AND (fare < 500))'>, rows before:566168421, rows after:565300188


                                                                                

applied filterColumn<'((trip_distance > 0) AND (trip_distance < 100))'>, rows before:565300188, rows after:565131616


                                                                                

applied filterColumn<'((trip_time > 0) AND (trip_time < 300))'>, rows before:565131616, rows after:565104793


                                                                                

applied filterColumn<'((pickup_id > 0) AND (pickup_id < 264))'>, rows before:565104793, rows after:565071103


                                                                                

applied filterColumn<'((pickup_datetime >= TIMESTAMP '2020-04-30 14:00:00') AND (pickup_datetime < TIMESTAMP '2023-05-31 14:00:00'))'>, rows before:565071103, rows after:564757294




Number of curated records for fhvhv: 564757294


                                                                                

In [6]:
keep_cols = ['pickup_datetime', 'dropoff_datetime', 'trip_distance', 'pickup_id', 'fare']
prepare_curated_tlc_data('fhvhv', keep_cols)

                                                                                

Number of raw records for fhvhv: 353752338


                                                                                

applied filterColumn<'((fare > 0) AND (fare < 500))'>, rows before:353752338, rows after:353166725


                                                                                

applied filterColumn<'((trip_distance > 0) AND (trip_distance < 100))'>, rows before:353166725, rows after:353056432


                                                                                

applied filterColumn<'((trip_time > 0) AND (trip_time < 300))'>, rows before:353056432, rows after:353040008


                                                                                

applied filterColumn<'((pickup_id > 0) AND (pickup_id < 264))'>, rows before:353040008, rows after:353016834


                                                                                

applied filterColumn<'((pickup_datetime >= TIMESTAMP '2020-04-30 14:00:00') AND (pickup_datetime < TIMESTAMP '2023-05-31 14:00:00'))'>, rows before:353016834, rows after:352703025




Number of curated records for fhvhv: 352703025


                                                                                