In [1]:
# Importing libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, row_number, monotonically_increasing_id
from pyspark.sql.window import Window
import psycopg2 as psy
from pyspark.sql import functions as f


In [2]:
# Set up the Spark session.
# "spark.jars" must name the PostgreSQL JDBC driver jar file. The official
# 42.7.3 artifact is called "postgresql-42.7.3.jar" (hyphenated, with the
# .jar extension); adjust the path if the jar is not in the working directory.

spark = (
    SparkSession.builder
    .appName('NY Trip Data')
    .config("spark.jars", "postgresql-42.7.3.jar")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)


In [3]:
# Load the raw trip records from Parquet.
# The reader "mode" option (e.g. DROPMALFORMED) is a CSV/JSON parser setting
# and is not a Parquet data source option, so it is not set here.
file_path = "yellow_tripdata_2009-01DATASET.parquet"
ny_df = spark.read.parquet(file_path)

In [4]:
# Report how many rows the raw dataset holds.
print(f"Length of dataset is: {ny_df.count()}")

Length of dataset is: 14092413


In [5]:
# Preview the first rows of the raw dataset.
ny_df.show()

+-----------+--------------------+---------------------+---------------+------------------+------------------+---------+---------+-----------------+------------------+---------+------------+-----------------+---------+-------+-------+---------+---------+
|vendor_name|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|     Trip_Distance|         Start_Lon|Start_Lat|Rate_Code|store_and_forward|           End_Lon|  End_Lat|Payment_Type|         Fare_Amt|surcharge|mta_tax|Tip_Amt|Tolls_Amt|Total_Amt|
+-----------+--------------------+---------------------+---------------+------------------+------------------+---------+---------+-----------------+------------------+---------+------------+-----------------+---------+-------+-------+---------+---------+
|        VTS| 2009-01-04 02:52:00|  2009-01-04 03:02:00|              1|              2.63|        -73.991957|40.721567|     NULL|             NULL|        -73.993803|40.695922|        CASH|              8.9|      0.5|   NULL|    0.0| 

In [6]:
# List the column names of the raw dataset.
ny_df.columns

['vendor_name',
 'Trip_Pickup_DateTime',
 'Trip_Dropoff_DateTime',
 'Passenger_Count',
 'Trip_Distance',
 'Start_Lon',
 'Start_Lat',
 'Rate_Code',
 'store_and_forward',
 'End_Lon',
 'End_Lat',
 'Payment_Type',
 'Fare_Amt',
 'surcharge',
 'mta_tax',
 'Tip_Amt',
 'Tolls_Amt',
 'Total_Amt']

In [7]:
# Print the schema of the raw dataset: column names, types and nullability.
ny_df.printSchema()

root
 |-- vendor_name: string (nullable = true)
 |-- Trip_Pickup_DateTime: string (nullable = true)
 |-- Trip_Dropoff_DateTime: string (nullable = true)
 |-- Passenger_Count: long (nullable = true)
 |-- Trip_Distance: double (nullable = true)
 |-- Start_Lon: double (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Rate_Code: double (nullable = true)
 |-- store_and_forward: double (nullable = true)
 |-- End_Lon: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- Payment_Type: string (nullable = true)
 |-- Fare_Amt: double (nullable = true)
 |-- surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- Tip_Amt: double (nullable = true)
 |-- Tolls_Amt: double (nullable = true)
 |-- Total_Amt: double (nullable = true)



In [8]:
# Take a fixed-size subset of the dataset for the remaining steps.
SUBSET_ROW_LIMIT = 20000

ny_df_subset = ny_df.limit(SUBSET_ROW_LIMIT)

In [9]:
# Confirm the row count of the subset.
ny_df_subset.count()

20000

In [10]:
# Preview the first rows of the subset.
ny_df_subset.show()

+-----------+--------------------+---------------------+---------------+------------------+------------------+---------+---------+-----------------+------------------+---------+------------+-----------------+---------+-------+-------+---------+---------+
|vendor_name|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|     Trip_Distance|         Start_Lon|Start_Lat|Rate_Code|store_and_forward|           End_Lon|  End_Lat|Payment_Type|         Fare_Amt|surcharge|mta_tax|Tip_Amt|Tolls_Amt|Total_Amt|
+-----------+--------------------+---------------------+---------------+------------------+------------------+---------+---------+-----------------+------------------+---------+------------+-----------------+---------+-------+-------+---------+---------+
|        VTS| 2009-01-04 02:52:00|  2009-01-04 03:02:00|              1|              2.63|        -73.991957|40.721567|     NULL|             NULL|        -73.993803|40.695922|        CASH|              8.9|      0.5|   NULL|    0.0| 

In [11]:
# Check for duplicate rows: group on every column and keep only the groups
# that occur more than once. The grouping keys are taken from the DataFrame
# itself so the check cannot drift out of sync with the schema.
ny_df_subset.groupBy(*ny_df_subset.columns).count().filter('count > 1').show()

+-----------+--------------------+---------------------+---------------+-------------+---------+---------+---------+-----------------+-------+-------+------------+--------+---------+-------+-------+---------+---------+-----+
|vendor_name|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|Trip_Distance|Start_Lon|Start_Lat|Rate_Code|store_and_forward|End_Lon|End_Lat|Payment_Type|Fare_Amt|surcharge|mta_tax|Tip_Amt|Tolls_Amt|Total_Amt|count|
+-----------+--------------------+---------------------+---------------+-------------+---------+---------+---------+-----------------+-------+-------+------------+--------+---------+-------+-------+---------+---------+-----+
+-----------+--------------------+---------------------+---------------+-------------+---------+---------+---------+-----------------+-------+-------+------------+--------+---------+-------+-------+---------+---------+-----+



In [12]:
# Find the missing values in the dataset.
# A single aggregation counts the nulls of every column in one Spark job,
# instead of launching a separate filter().count() job per column.
# count() only counts non-null inputs, and when() without otherwise() yields
# null for rows that do not match, so each aggregate is that column's null count.
null_count_row = ny_df_subset.select(
    [f.count(f.when(f.col(column).isNull(), 1)).alias(column)
     for column in ny_df_subset.columns]
).first()

for column in ny_df_subset.columns:
    print(column, null_count_row[column])


vendor_name 0
Trip_Pickup_DateTime 0
Trip_Dropoff_DateTime 0
Passenger_Count 0
Trip_Distance 0
Start_Lon 0
Start_Lat 0
Rate_Code 20000
store_and_forward 20000
End_Lon 0
End_Lat 0
Payment_Type 0
Fare_Amt 0
surcharge 0
mta_tax 20000
Tip_Amt 0
Tolls_Amt 0
Total_Amt 0


In [15]:
# Find the missing values per column as a list of (column, null_count) tuples.
# One aggregation pass computes every count (count() skips the nulls that
# when() produces for non-matching rows); the list comprehension then
# reshapes the single result row.
null_counts_row = ny_df_subset.select(
    [f.count(f.when(f.col(column).isNull(), 1)).alias(column)
     for column in ny_df_subset.columns]
).first()
null_counts_2 = [(column, null_counts_row[column]) for column in ny_df_subset.columns]
null_counts_2


[('vendor_name', 0),
 ('Trip_Pickup_DateTime', 0),
 ('Trip_Dropoff_DateTime', 0),
 ('Passenger_Count', 0),
 ('Trip_Distance', 0),
 ('Start_Lon', 0),
 ('Start_Lat', 0),
 ('Rate_Code', 20000),
 ('store_and_forward', 20000),
 ('End_Lon', 0),
 ('End_Lat', 0),
 ('Payment_Type', 0),
 ('Fare_Amt', 0),
 ('surcharge', 0),
 ('mta_tax', 20000),
 ('Tip_Amt', 0),
 ('Tolls_Amt', 0),
 ('Total_Amt', 0)]

In [23]:
# Collect the columns whose share of nulls exceeds the allowed fraction.
# The row count is a Spark action, so it is computed once here rather than
# once per column inside the comprehension.
max_null_fraction = 0.1
total_rows = ny_df_subset.count()
columns_to_drop = [
    column
    for column, null_count in null_counts_2
    if null_count > max_null_fraction * total_rows
]
columns_to_drop

['Rate_Code', 'store_and_forward', 'mta_tax']

In [25]:
# Drop the columns listed in columns_to_drop (those with more than 10% nulls),
# then display the remaining column names.
ny_df_subset= ny_df_subset.drop(*columns_to_drop)

ny_df_subset.columns

['vendor_name',
 'Trip_Pickup_DateTime',
 'Trip_Dropoff_DateTime',
 'Passenger_Count',
 'Trip_Distance',
 'Start_Lon',
 'Start_Lat',
 'End_Lon',
 'End_Lat',
 'Payment_Type',
 'Fare_Amt',
 'surcharge',
 'Tip_Amt',
 'Tolls_Amt',
 'Total_Amt']

In [26]:
# Preview the subset with its current columns.
ny_df_subset.show()

+-----------+--------------------+---------------------+---------------+------------------+------------------+---------+------------------+---------+------------+-----------------+---------+-------+---------+---------+
|vendor_name|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|     Trip_Distance|         Start_Lon|Start_Lat|           End_Lon|  End_Lat|Payment_Type|         Fare_Amt|surcharge|Tip_Amt|Tolls_Amt|Total_Amt|
+-----------+--------------------+---------------------+---------------+------------------+------------------+---------+------------------+---------+------------+-----------------+---------+-------+---------+---------+
|        VTS| 2009-01-04 02:52:00|  2009-01-04 03:02:00|              1|              2.63|        -73.991957|40.721567|        -73.993803|40.695922|        CASH|              8.9|      0.5|    0.0|      0.0|      9.4|
|        VTS| 2009-01-04 03:31:00|  2009-01-04 03:38:00|              3|              4.55|        -73.982102| 40.73629|    

In [27]:
# Data transformation: keep only rows with plausible values.
# Passenger count, distance, fare and total must be strictly positive;
# tip, tolls and surcharge may be zero but not negative.

ny_df_subset = (
    ny_df_subset
    .where(f.col('Passenger_Count') > 0.0)
    .where(f.col('Trip_Distance') > 0.0)
    .where(f.col("Fare_Amt") > 0.0)
    .where(f.col("Total_Amt") > 0.0)
    .where(f.col("Tip_Amt") >= 0.0)
    .where(f.col("Tolls_Amt") >= 0.0)
    .where(f.col("surcharge") >= 0.0)
)

In [28]:
# Count the rows currently in the subset.
ny_df_subset.count()

19835

In [29]:
# Check the data type of every column as (name, type) pairs.
ny_df_subset.dtypes

[('vendor_name', 'string'),
 ('Trip_Pickup_DateTime', 'string'),
 ('Trip_Dropoff_DateTime', 'string'),
 ('Passenger_Count', 'bigint'),
 ('Trip_Distance', 'double'),
 ('Start_Lon', 'double'),
 ('Start_Lat', 'double'),
 ('End_Lon', 'double'),
 ('End_Lat', 'double'),
 ('Payment_Type', 'string'),
 ('Fare_Amt', 'double'),
 ('surcharge', 'double'),
 ('Tip_Amt', 'double'),
 ('Tolls_Amt', 'double'),
 ('Total_Amt', 'double')]

In [31]:
# Target data type for each column that needs to be cast.
columns_to_cast = dict(
    Trip_Pickup_DateTime="timestamp",
    Trip_Dropoff_DateTime="timestamp",
    Passenger_Count="integer",
)

In [32]:
# Apply the casts from columns_to_cast in one projection; columns without an
# entry pass through unchanged and the column order is preserved.
ny_df_subset = ny_df_subset.select([
    f.col(name).cast(columns_to_cast[name]).alias(name)
    if name in columns_to_cast
    else f.col(name)
    for name in ny_df_subset.columns
])

In [33]:
# Re-check the column data types.
ny_df_subset.dtypes

[('vendor_name', 'string'),
 ('Trip_Pickup_DateTime', 'timestamp'),
 ('Trip_Dropoff_DateTime', 'timestamp'),
 ('Passenger_Count', 'int'),
 ('Trip_Distance', 'double'),
 ('Start_Lon', 'double'),
 ('Start_Lat', 'double'),
 ('End_Lon', 'double'),
 ('End_Lat', 'double'),
 ('Payment_Type', 'string'),
 ('Fare_Amt', 'double'),
 ('surcharge', 'double'),
 ('Tip_Amt', 'double'),
 ('Tolls_Amt', 'double'),
 ('Total_Amt', 'double')]

In [34]:
# Collect the names of all categorical (string-typed) columns.
categorical_columns = [
    name for name, dtype in ny_df_subset.dtypes if dtype.startswith('string')
]

categorical_columns

['vendor_name', 'Payment_Type']

In [35]:
# Show the distinct values of the categorical column vendor_name.
ny_df_subset.select('vendor_name').distinct().show()

+-----------+
|vendor_name|
+-----------+
|        VTS|
|        DDS|
|        CMT|
+-----------+



In [36]:
# Show the distinct values of the categorical column Payment_Type.
ny_df_subset.select('Payment_Type').distinct().show()

+------------+
|Payment_Type|
+------------+
|        CASH|
|      Credit|
|      CREDIT|
|        Cash|
|   No Charge|
|     Dispute|
+------------+



In [37]:
# Normalise Payment_Type: map the upper-case spellings 'CASH' and 'CREDIT'
# onto 'Cash' and 'Credit'; every other value is passed through unchanged.
payment_type_col = f.col('Payment_Type')
ny_df_subset = ny_df_subset.withColumn(
    "Payment_Type",
    f.when(payment_type_col == 'CASH', 'Cash')
     .when(payment_type_col == 'CREDIT', 'Credit')
     .otherwise(payment_type_col),
)

In [38]:
# Show the distinct Payment_Type values currently in the subset.
ny_df_subset.select('Payment_Type').distinct().show()

+------------+
|Payment_Type|
+------------+
|        Cash|
|      Credit|
|   No Charge|
|     Dispute|
+------------+



Creating Different Tables

In [39]:
# Vendor table: pair each vendor_name row with a generated vendor_id
# (monotonically_increasing_id() offset by 1).
# NOTE(review): nothing here de-duplicates vendor_name, so the table has one
# row per trip rather than one per distinct vendor — confirm this is intended.

vendors = ny_df_subset.select(
    (monotonically_increasing_id() + 1).alias('vendor_id'),
    'vendor_name',
)

vendors.show()

+---------+-----------+
|vendor_id|vendor_name|
+---------+-----------+
|        1|        VTS|
|        2|        VTS|
|        3|        VTS|
|        4|        DDS|
|        5|        DDS|
|        6|        DDS|
|        7|        DDS|
|        8|        VTS|
|        9|        CMT|
|       10|        CMT|
|       11|        CMT|
|       12|        CMT|
|       13|        DDS|
|       14|        CMT|
|       15|        CMT|
|       16|        CMT|
|       17|        CMT|
|       18|        CMT|
|       19|        DDS|
|       20|        CMT|
+---------+-----------+
only showing top 20 rows



In [40]:
# Count the rows in the vendor table.
vendors.count()

19835

In [41]:
# Payment table: the payment-related columns of every row, keyed by a
# generated payment_id (monotonically_increasing_id() offset by 1).
payments = ny_df_subset.select(
    (monotonically_increasing_id() + 1).alias('payment_id'),
    'Payment_Type',
    'Fare_Amt',
    'surcharge',
    'Tip_Amt',
    'Tolls_Amt',
    'Total_Amt',
)

payments.show(5)

+----------+------------+--------+---------+-------+---------+---------+
|payment_id|Payment_Type|Fare_Amt|surcharge|Tip_Amt|Tolls_Amt|Total_Amt|
+----------+------------+--------+---------+-------+---------+---------+
|         1|        Cash|     8.9|      0.5|    0.0|      0.0|      9.4|
|         2|      Credit|    12.1|      0.5|    2.0|      0.0|     14.6|
|         3|      Credit|    23.7|      0.0|   4.74|      0.0|    28.44|
|         4|      Credit|    14.9|      0.5|   3.05|      0.0|    18.45|
|         5|        Cash|     3.7|      0.0|    0.0|      0.0|      3.7|
+----------+------------+--------+---------+-------+---------+---------+
only showing top 5 rows



In [42]:
# Count the rows in the payment table.
payments.count()

19835

In [43]:
# Locations table: the start/end coordinates of every row, keyed by a
# generated location_id (monotonically_increasing_id() offset by 1).

locations = ny_df_subset.select(
    (monotonically_increasing_id() + 1).alias('location_id'),
    'Start_Lon',
    'Start_Lat',
    'End_Lon',
    'End_Lat',
)

locations.show(3)

+-----------+----------+---------+----------+---------+
|location_id| Start_Lon|Start_Lat|   End_Lon|  End_Lat|
+-----------+----------+---------+----------+---------+
|          1|-73.991957|40.721567|-73.993803|40.695922|
|          2|-73.982102| 40.73629| -73.95585| 40.76803|
|          3|-74.002587|40.739748|-73.869983|40.770225|
+-----------+----------+---------+----------+---------+
only showing top 3 rows



In [44]:
# Count the rows in the locations table.
locations.count()

19835

In [53]:
# Trip table: give every row a generated trip_id, then left-join the vendor,
# payment and location tables to pick up their ids.
# NOTE(review): the joins match rows purely on the generated ids being equal
# across four separately built DataFrames — confirm that alignment holds.

trips = ny_df_subset.withColumn('trip_id', monotonically_increasing_id() + 1)

trips = (
    trips
    .join(vendors, on=trips.trip_id == vendors.vendor_id, how='left')
    .join(payments, on=trips.trip_id == payments.payment_id, how='left')
    .join(locations, on=trips.trip_id == locations.location_id, how='left')
    .select(
        'trip_id', 'vendor_id', 'payment_id', 'location_id',
        'Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime',
        'Passenger_Count', 'Trip_Distance',
    )
)

trips.show()

+-------+---------+----------+-----------+--------------------+---------------------+---------------+------------------+
|trip_id|vendor_id|payment_id|location_id|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|     Trip_Distance|
+-------+---------+----------+-----------+--------------------+---------------------+---------------+------------------+
|      1|        1|         1|          1| 2009-01-04 02:52:00|  2009-01-04 03:02:00|              1|              2.63|
|      2|        2|         2|          2| 2009-01-04 03:31:00|  2009-01-04 03:38:00|              3|              4.55|
|      3|        3|         3|          3| 2009-01-03 15:43:00|  2009-01-03 15:57:00|              5|             10.35|
|      4|        4|         4|          4| 2009-01-01 20:52:58|  2009-01-01 21:14:00|              1|               5.0|
|      5|        5|         5|          5| 2009-01-24 16:18:23|  2009-01-24 16:24:56|              1|               0.4|
|      6|        6|         6|  

In [55]:
# Count the rows in the trip table.
trips.count()

19835

In [33]:
# # Cache the DataFrame
# trips.cache()

# # Perform the count
# trip_count = trips.count()
# print(f"Number of rows in trips: {trip_count}")

In [34]:
# List the subset's current column names.
ny_df_subset.columns

['vendor_name',
 'Trip_Pickup_DateTime',
 'Trip_Dropoff_DateTime',
 'Passenger_Count',
 'Trip_Distance',
 'Start_Lon',
 'Start_Lat',
 'End_Lon',
 'End_Lat',
 'Payment_Type',
 'Fare_Amt',
 'surcharge',
 'Tip_Amt',
 'Tolls_Amt',
 'Total_Amt']