## Import Libraries

In [2]:
!pip install findspark

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import findspark
# Called before the pyspark imports in the next cell; findspark is used to
# locate the Spark installation so that `pyspark` can be imported
# (see the findspark documentation for the lookup rules).
findspark.init()

In [4]:
from argparse import ArgumentParser
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from datetime import datetime
import sys

## Define Functions

In [5]:
def merge_data(spark: "SparkSession", output_dir: str, start_date: str, end_date: str):
    """Load the monthly trip files of an inclusive month range as one DataFrame.

    One parquet file per month is read from
    ``{output_dir}/fhvhv_tripdata_{YYYY}-{MM}.parquet`` and the monthly
    frames are combined, in chronological order, with ``union``.

    Args:
        spark: Session whose ``read.parquet`` is used to load each file.
        output_dir: Directory (or URI prefix) that holds the monthly files.
        start_date: First month to load, formatted ``"YYYY-MM"``.
        end_date: Last month to load (inclusive), formatted ``"YYYY-MM"``.

    Returns:
        The union of all monthly frames; a single-month range returns that
        month's frame unchanged.

    Raises:
        ValueError: If a date is not of the form ``"YYYY-MM"``, or if the
            range selects no month at all (e.g. ``start_date`` is later
            than ``end_date``).
    """
    # Parse each bound once instead of re-converting inside the loops.
    start_year, start_month = (int(part) for part in start_date.split('-'))
    end_year, end_month = (int(part) for part in end_date.split('-'))

    monthly_frames = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            # Trim the months before the start in the first year ...
            if year == start_year and month < start_month:
                continue
            # ... and the months after the end in the last year.
            if year == end_year and month > end_month:
                continue
            file_path = f'{output_dir}/fhvhv_tripdata_{year}-{month:02d}.parquet'
            monthly_frames.append(spark.read.parquet(file_path))

    # Fail with a clear message instead of an IndexError on an empty list.
    if not monthly_frames:
        raise ValueError(
            f"No months selected: start_date={start_date!r} is after end_date={end_date!r}"
        )

    # NOTE(review): union() is assumed to be positional, so every monthly file
    # is expected to share the same column order — confirm for the data set.
    merged = monthly_frames[0]
    for frame in monthly_frames[1:]:
        merged = merged.union(frame)
    return merged

In [6]:
def clean_data(row):
    """Tell whether a (pickup_datetime, trip_mile, base_passenger_fare) row is usable.

    A row is rejected when any of the three fields is None, or when the trip
    distance or the base fare is less than or equal to zero. Any error raised
    while inspecting the row (wrong arity, incomparable types, ...) is printed
    and the row is rejected as well.

    Returns:
        True if the row passes every check, False otherwise.
    """
    try:
        pickup_datetime, trip_mile, base_passenger_fare = row

        # Reject rows with a missing field before comparing any numbers.
        for field in (pickup_datetime, trip_mile, base_passenger_fare):
            if field is None:
                return False

        # Non-positive distance or fare marks the record as invalid.
        has_non_positive = trip_mile <= 0. or base_passenger_fare <= 0.
        return not has_non_positive
    except Exception as e:
        print(f"Error in row validation: {e}")
        return False

## Run SparkSession

In [7]:
# Spark configuration: application name, standalone master URL and the
# memory requested for each executor.
conf = (
    SparkConf()
    .setAppName("NYC Taxi Data Analysis")
    .setMaster("spark://spark-master:7077")
    .set("spark.executor.memory", "20g")
)

# getOrCreate() reuses a context/session that is already running in this
# kernel, so re-running the cell does not fail with a second-SparkContext error.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/03 23:26:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
# HDFS directory passed to merge_data as the location of the monthly parquet
# files; the per-date results further down are also written beneath it.
output_dir = "hdfs://spark-master:9000/data"
# NOTE(review): result_dir is not referenced by any other cell visible in this
# notebook — confirm whether the results were meant to be saved here.
result_dir = "hdfs://spark-master:9000/result"
# Inclusive month range to analyse, formatted "YYYY-MM".
start_date = "2023-01"
end_date = "2023-01"

In [9]:
# Load the monthly parquet files for start_date..end_date (inclusive) from HDFS
# and combine them into a single DataFrame.
df = merge_data(spark, output_dir, start_date, end_date)

                                                                                

In [10]:
# Select the three columns the analysis uses before switching to the RDD API,
# so the remaining columns are not converted to Python objects for every row.
# Each element is a plain tuple: (pickup_datetime, trip_miles, base_passenger_fare).
rdd_data = (
    df.select("pickup_datetime", "trip_miles", "base_passenger_fare")
    .rdd
    .map(lambda row: tuple(row))
)

In [11]:
# Drop elements that are None (no parsing happens here).
# NOTE(review): rdd_data is built from tuples, so this guard is not expected
# to remove anything — confirm whether it is still needed.
parsed_data = rdd_data.filter(lambda x: x is not None)

In [12]:
# Keep only the rows accepted by clean_data: no missing field and a strictly
# positive trip distance and base fare.
valid_data = parsed_data.filter(clean_data)

In [15]:
# Mark the filtered RDD for caching: the count, sum, mean and per-date
# aggregations in the following cells all reuse it.
valid_data.cache()

PythonRDD[8] at RDD at PythonRDD.scala:53

In [17]:
# Count the total number of valid trips.
# On failure the error is only printed, so total_trips stays undefined.
try:
    total_trips = valid_data.count()
    print(f"Total Trips: {total_trips}")
except Exception as e:
    print(f"Error while counting valid_data: {e}")



Total Trips: 18462090


                                                                                

In [18]:
# Sum the base passenger fare (tuple index 2) over all valid trips.
# On failure the error is only printed, so total_revenue stays undefined.
try:
    total_revenue = valid_data.map(lambda x: x[2]).sum()
    print(f"Total Revenue: ${total_revenue:.2f}")
except Exception as e:
    print(f"Error while calculating total revenue: {e}")



Total Revenue: $398359394.00


                                                                                

In [19]:
# Calculate the average trip distance (tuple index 1) over all valid trips.
# On failure the error is only printed, so avg_trip_distance stays undefined.
try:
    avg_trip_distance = valid_data.map(lambda x: x[1]).mean()
    print(f"Average Trip Distance: {avg_trip_distance:.2f} mile")
except Exception as e:
    print(f"Error while calculating avg trip distance: {e}")



Average Trip Distance: 4.87 mile


                                                                                

In [21]:
# Count trips per pickup date: emit (date, 1) for every trip and add the ones
# up per key, then save the (date, count) pairs as text files.
# NOTE(review): the result is written beneath output_dir, and saveAsTextFile
# is expected to fail if the target directory already exists — confirm before
# re-running this cell.
try:
    trips_by_date = valid_data.map(lambda x: (x[0].date(), 1)).reduceByKey(lambda a, b: a + b)
    trips_by_date.saveAsTextFile(f"{output_dir}/trips_by_date")
except Exception as e:
    # Bind the exception: with a bare `except:` the name `e` was undefined and
    # the handler itself raised NameError, hiding the real error.
    print(f"Error while calculating trips by date: {e}")

                                                                                

In [22]:
# Sum the base passenger fare per pickup date and save the (date, revenue)
# pairs as text files.
# NOTE(review): the result is written beneath output_dir, and saveAsTextFile
# is expected to fail if the target directory already exists — confirm before
# re-running this cell.
try:
    revenue_by_date = valid_data.map(lambda x: (x[0].date(), x[2])).reduceByKey(lambda a, b: a + b)
    revenue_by_date.saveAsTextFile(f"{output_dir}/revenue_by_date")
except Exception as e:
    # Bind the exception: with a bare `except:` the name `e` was undefined and
    # the handler itself raised NameError, hiding the real error.
    print(f"Error while calculating revenue by date: {e}")

                                                                                

In [28]:
# Read back the text files saved for the trips-by-date result to check them.
# The path is derived from output_dir so it cannot drift from the location
# the result was saved to.
file_path = f"{output_dir}/trips_by_date/part*"
trip_res = sc.textFile(file_path)

In [30]:
# Show the first five saved lines; each line is the text form of a
# (pickup date, trip count) pair.
trip_res.take(5)

['(datetime.date(2023, 1, 8), 554835)',
 '(datetime.date(2023, 1, 3), 477381)',
 '(datetime.date(2023, 1, 25), 660974)',
 '(datetime.date(2023, 1, 29), 620561)',
 '(datetime.date(2023, 1, 21), 752711)']

In [35]:
# Read back the text files saved for the revenue-by-date result to check them.
# The path is derived from output_dir so it cannot drift from the location
# the result was saved to.
file_path = f"{output_dir}/revenue_by_date/part*"
renevue_res = sc.textFile(file_path)

In [36]:
# Show the first five saved lines; each line is the text form of a
# (pickup date, summed base fare) pair.
renevue_res.take(5)

['(datetime.date(2023, 1, 8), 11725043.3200018)',
 '(datetime.date(2023, 1, 3), 10433854.660002572)',
 '(datetime.date(2023, 1, 25), 14253913.040001681)',
 '(datetime.date(2023, 1, 29), 13272687.439999817)',
 '(datetime.date(2023, 1, 21), 15295015.960000405)']