In [2]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


# The core syntax for reading data in Apache Spark

spark.read \
     .format() \ # this is the raw format you are reading from
     .option("key", "value") \
     .schema() \ # this is optional, use when you know the schema
     .load(path)

# Implementation on Databricks

dataPath = "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv"
diamonds = sqlContext.read.format("csv")\
  .option("header","true")\
  .option("inferSchema", "true")\
  .load(dataPath)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import col, avg, count, max, min, sum, when, datediff, to_date, hour

# Initialize Spark session
spark = SparkSession.builder.appName("UberDataAnalysis").getOrCreate()

# Load the CSV files into DataFrames
location_df = spark.read.csv("Location.csv", header=True, inferSchema=True)
payment_df = spark.read.csv("Payment.csv", header=True, inferSchema=True)
payment_method_df = spark.read.csv("PaymentMethod.csv", header=True, inferSchema=True)
payment_status_df = spark.read.csv("PaymentStatus.csv", header=True, inferSchema=True)
request_df = spark.read.csv("Request.csv", header=True, inferSchema=True)
trip_df = spark.read.csv("Trip.csv", header=True, inferSchema=True)
user_df = spark.read.csv("User.csv", header=True, inferSchema=True)
vehicle_df = spark.read.csv("Vehicles.csv", header=True, inferSchema=True)
vehicle_make_df = spark.read.csv("VehicleMakes.csv", header=True, inferSchema=True)

In [4]:
location_df.show(10)

+----------+------------+-----------+
|LocationID|   Longitude|   Latitude|
+----------+------------+-----------+
|     97296|-73.99623871|40.76356125|
|     15450|-74.00886536|40.71373367|
|    372799|-73.97459412|40.79328918|
|    422524|-73.96847534|40.76134491|
|     13283|-74.00952911|40.70638275|
|    140452|-73.99201202|40.72574615|
|    198469|-73.98804474|40.76394272|
|    182912|-73.98912811|40.75777054|
|    497703|-73.95536041|40.77952957|
|    245176|-73.98457336|40.73636627|
+----------+------------+-----------+
only showing top 10 rows



In [5]:
payment_df.show(10)

+---------+---------------+---------------+
|PaymentID|PaymentMethodID|PaymentStatusID|
+---------+---------------+---------------+
|        6|              6|              4|
|        6|              6|              2|
|        3|              3|              1|
|        6|              6|              4|
|        6|              6|              1|
|        3|              3|              2|
|        4|              4|              4|
|        5|              5|              2|
|        6|              6|              1|
|        6|              6|              2|
+---------+---------------+---------------+
only showing top 10 rows



In [6]:
payment_method_df.show(10)

+---------------+-----------+
|PaymentMethodID| MethodName|
+---------------+-----------+
|              5| Google Pay|
|              1|  Apple Pay|
|              6|     PayPal|
|              3|Credit Card|
|              2|       Cash|
|              4| Debit Card|
+---------------+-----------+



In [7]:
payment_status_df.show(10)

+---------------+----------+
|PaymentStatusID|StatusName|
+---------------+----------+
|              2|    Failed|
|              3|   Pending|
|              1| Completed|
|              4|  Refunded|
+---------------+----------+



In [8]:
request_df.show(10)

+---------+-----------+----------------+-----------------+--------------------+--------------------+
|RequestID|PassengerID|PickupLocationID|DropoffLocationID|         RequestTime|          AcceptTime|
+---------+-----------+----------------+-----------------+--------------------+--------------------+
|    49931|      25047|          414675|           300838|1/7/2015 11:11:00 AM|1/7/2015 11:16:00 AM|
|    78920|      39119|          213486|           528405|1/12/2015 8:50:00 PM|1/12/2015 8:56:00 PM|
|    67630|      33474|          229932|           251111|1/13/2015 8:27:00 PM|1/13/2015 8:30:00 PM|
|    99696|      48616|           37162|            48307|1/23/2015 9:57:00 AM|1/23/2015 10:01:0...|
|   373166|     185748|          208583|           342678|1/25/2015 1:05:00 AM|1/25/2015 1:15:00 AM|
|    98670|      48171|          400957|           419446|1/30/2015 6:26:00 PM|1/30/2015 6:35:00 PM|
|   523433|     261901|          544240|           371345| 1/8/2015 5:43:00 PM| 1/8/2015 5:

In [9]:
user_df.show(5)

+------+-------------+--------------------+--------------+----------------+
|UserID|     FullName|               Email|   PhoneNumber|DriverMeanRating|
+------+-------------+--------------------+--------------+----------------+
|     1| Aaron Acosta| vbishop@example.net|(677) 367-9557|            NULL|
|     2|  Aaron Adams| james82@example.com|(332) 224-7965|            NULL|
|     3|  Aaron Adams|nicholas92@exampl...|(351) 943-8670|            NULL|
|     4|  Aaron Adams|zjohnson@example.com|(324) 384-5822|            NULL|
|     5|Aaron Aguilar|anthony78@example...|(867) 762-3031|            NULL|
+------+-------------+--------------------+--------------+----------------+
only showing top 5 rows



# Data Cleaning (Drop NULL Values)

In [22]:
user_df = user_df.drop("DriverMeanRating")
trip_df = trip_df.drop("driver_rating")

In [23]:
user_df.show()

+------+---------------+--------------------+--------------+
|UserID|       FullName|               Email|   PhoneNumber|
+------+---------------+--------------------+--------------+
|     1|   Aaron Acosta| vbishop@example.net|(677) 367-9557|
|     2|    Aaron Adams| james82@example.com|(332) 224-7965|
|     3|    Aaron Adams|nicholas92@exampl...|(351) 943-8670|
|     4|    Aaron Adams|zjohnson@example.com|(324) 384-5822|
|     5|  Aaron Aguilar|anthony78@example...|(867) 762-3031|
|     6|  Aaron Aguilar|wilsonruben@examp...|(376) 707-8408|
|     7|  Aaron Aguirre|samantha45@exampl...|(147) 383-2497|
|     8|Aaron Alexander|   phall@example.org|(761) 557-5601|
|     9|    Aaron Allen| amber80@example.com|(613) 319-4730|
|    10|    Aaron Allen|joseph77@example.net|(104) 903-5333|
|    11|    Aaron Allen|nicolecasey@examp...|(921) 454-7983|
|    12|    Aaron Allen|reyesmonica@examp...|(603) 745-9470|
|    13|  Aaron Allison|sullivanamy@examp...|(288) 221-8294|
|    14|  Aaron Alvarez|

In [25]:
trip_df.show(10)

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:43:00 PM|         1.8|     9.5|      0.5|   0.5|      0.0|  

# Handling Missing Vlaues

In [32]:
trip_df = trip_df.na.fill({"BaseFare": 0, "ExtraFare": 0, "MtaTax": 0, "TipAmount": 0, "TollsAmount": 0, "ImprovementSurcharge": 0})

In [33]:
trip_df.show(5)

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:43:00 PM|         1.8|     9.5|      0.5|   0.5|      0.0|  

In [None]:
Converting to sutable 

In [37]:
# Assuming TripStartTime is the correct column for pickup time
trip_df = trip_df.withColumn("pickup_date", to_timestamp(col("TripStartTime")).cast("date"))
trip_df = trip_df.withColumn("pickup_date", to_timestamp(col("TripEndTime")).cast("date"))
# Show the resulting DataFrame
trip_df.show()

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+-----------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|pickup_date|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+-----------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|       NULL|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|       NULL|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:

# What are the most common pickup and dropoff locations?

In [10]:
# Join request_df with location_df for both PickupLocationID and DropoffLocationID
pickup_location_df = request_df.join(location_df, request_df.PickupLocationID == location_df.LocationID)
dropoff_location_df = request_df.join(location_df, request_df.DropoffLocationID == location_df.LocationID)

# Find the most common pickup and dropoff locations
pickup_location_df.groupBy("Longitude", "Latitude").agg(count("*").alias("pickup_count")).orderBy(col("pickup_count").desc()).show()
dropoff_location_df.groupBy("Longitude", "Latitude").agg(count("*").alias("dropoff_count")).orderBy(col("dropoff_count").desc()).show()

+------------+-----------+------------+
|   Longitude|   Latitude|pickup_count|
+------------+-----------+------------+
|-73.78796387|40.64152527|       11157|
|-73.95028687|40.79550171|          43|
|-74.18402863|40.68948364|          29|
|-74.00323486|40.73168182|          15|
|-73.93746185|40.74890137|          11|
|-73.98692322|40.72085953|          10|
|-73.97848511|40.76370239|          10|
|-73.98863983|40.71881866|           9|
|-74.00162506|40.74097824|           7|
| -73.9397049|40.79433823|           7|
|-73.99417877|40.75109863|           7|
|-73.94628906|40.77302933|           7|
|-73.99411011|40.75112152|           7|
|-73.99414063|40.75106812|           7|
|  -73.994133|40.75118637|           6|
|-73.87104034|40.77378082|           6|
|-73.87440491|40.77396011|           6|
|-73.92472076|40.76171875|           6|
| -73.7883606|40.64736938|           6|
|-73.94959259|40.77273178|           6|
+------------+-----------+------------+
only showing top 20 rows

+------------+

# How many trips does each passenger take, and what is their average trip distance?

In [31]:
# Join trip_df with request_df to get passenger information, then group by passenger
trip_passenger_df = trip_df.join(request_df, trip_df.RequestID == request_df.RequestID)

# Group by PassengerID to calculate the number of trips and average distance
trip_passenger_df.groupBy("PassengerID").agg(
    count("*").alias("trip_count"),
    avg("TripDistance").alias("avg_trip_distance")
).orderBy(col("trip_count").desc()).show()


+-----------+----------+------------------+
|PassengerID|trip_count| avg_trip_distance|
+-----------+----------+------------------+
|     195168|        23|2.7113043478260868|
|     183719|        19|2.6389473684210527|
|     208524|        17|2.9947058823529416|
|       6076|        16|          2.788125|
|     283761|        16|           2.47875|
|     259447|        16|1.9562499999999998|
|     207380|        16|          1.971875|
|      28440|        16|            2.9225|
|     194260|        15|             2.168|
|     208173|        15|1.6353333333333337|
|      56911|        15| 4.360666666666668|
|      81120|        15|2.9406666666666665|
|      86477|        15|3.3400000000000003|
|      77372|        15| 4.496666666666668|
|     125266|        15|2.6446666666666667|
|     219865|        15| 3.131333333333333|
|      32041|        15|3.3273333333333337|
|     258125|        15|2.0073333333333334|
|      51695|        15|1.8993333333333335|
|     269183|        15|2.127999

# What are the most common vehicle makes and models used in trips?

In [18]:
vehicle_full_df = vehicle_df.join(vehicle_make_df, on='MakeID')

# Group by MakeName and Model, and count the occurrences
vehicle_full_df.groupBy('MakeName', 'Model').count().orderBy('count', ascending=False).show(10)

+-------------+------+-----+
|     MakeName| Model|count|
+-------------+------+-----+
|        Honda|Accord| 7587|
|      Hyundai|Tucson| 7578|
|   Volkswagen|Beetle| 7568|
|          Kia| Forte| 7557|
|       Nissan|Sentra| 7540|
|       Nissan|Altima| 7534|
|Mercedes-Benz|   GLC| 7534|
|       Toyota|  RAV4| 7530|
|         Ford|Escape| 7527|
|       Nissan|Maxima| 7527|
+-------------+------+-----+
only showing top 10 rows



In [26]:
trip_df.show(5)

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:43:00 PM|         1.8|     9.5|      0.5|   0.5|      0.0|  