In [2]:
#pip install pyspark

Note: you may need to restart the kernel to use updated packages.


# The core syntax for reading data in Apache Spark

# Generic reader template. The chain is wrapped in parentheses because a
# comment placed after a trailing backslash is a syntax error in Python.
(
    spark.read
         .format()                # this is the raw format you are reading from
         .option("key", "value")
         .schema()                # this is optional, use when you know the schema
         .load(path)
)

# Implementation on Databricks

# Example read of a CSV file located under /databricks-datasets.
# The first line of the file is treated as the header and column types are
# inferred by the reader.
# NOTE(review): `sqlContext` is not created in any cell visible in this
# notebook (only `spark` is) — presumably supplied by the Databricks runtime;
# confirm before running this elsewhere.
dataPath = "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv"
diamonds = sqlContext.read.format("csv")\
  .option("header","true")\
  .option("inferSchema", "true")\
  .load(dataPath)

In [151]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import col, avg, count, max, min, sum, when, datediff, to_date, hour

# Create (or reuse) the Spark session used by every cell below
spark = SparkSession.builder.appName("UberDataAnalysis").getOrCreate()

# Reader options shared by all source files: the first row is the header and
# column types are inferred from the data.
csv_options = dict(header=True, inferSchema=True)

# One DataFrame per source CSV
location_df = spark.read.csv("Location.csv", **csv_options)
payment_df = spark.read.csv("Payment.csv", **csv_options)
payment_method_df = spark.read.csv("PaymentMethod.csv", **csv_options)
payment_status_df = spark.read.csv("PaymentStatus.csv", **csv_options)
request_df = spark.read.csv("Request.csv", **csv_options)
trip_df = spark.read.csv("Trip.csv", **csv_options)
user_df = spark.read.csv("User.csv", **csv_options)
vehicle_df = spark.read.csv("Vehicles.csv", **csv_options)
vehicle_make_df = spark.read.csv("VehicleMakes.csv", **csv_options)

In [None]:

# List of DataFrames to process in bulk (previewing, column renaming).
# The list stores the DataFrame objects these names refer to at the moment the
# cell runs; rebinding a name afterwards (e.g. `trip_df = trip_df.drop(...)`)
# does not change the entry held here, so re-run this cell after such changes.
dataframes = [
    location_df,
    payment_df,
    payment_method_df,
    payment_status_df,
    request_df,
    trip_df,
    user_df,
    vehicle_df,
    vehicle_make_df
]  # Add more DataFrames as needed

In [159]:
# Print the first 5 rows of every DataFrame held in `dataframes`
for df in dataframes:
    df.show(5)

+-----------+------------+-----------+
|location_id|   longitude|   latitude|
+-----------+------------+-----------+
|       NULL|        NULL|       NULL|
|      76173|-73.99945068|40.72192383|
|     567607|-73.77745056|40.64664841|
|     498838|-73.95517731|40.76498795|
|     138392|-73.99216461|40.72513962|
+-----------+------------+-----------+
only showing top 5 rows

+----------+-----------------+-----------------+
|payment_id|payment_method_id|payment_status_id|
+----------+-----------------+-----------------+
|      NULL|             NULL|             NULL|
|         6|                6|                4|
|         6|                6|                2|
|         3|                3|                1|
|         6|                6|                4|
+----------+-----------------+-----------------+
only showing top 5 rows

+-----------------+-----------+
|payment_method_id|method_name|
+-----------------+-----------+
|             NULL| ----------|
|                5| Google Pay

# Data Cleaning (Drop NULL Values)

In [144]:
# Remove the "DriverMeanRating" column from user_df and the "driver_rating"
# column from trip_df. Note that this drops whole columns, not rows
# containing nulls.
user_df = user_df.drop("DriverMeanRating")
trip_df = trip_df.drop("driver_rating")

In [70]:
user_df.show()

+------+---------------+--------------------+--------------+
|UserID|       FullName|               Email|   PhoneNumber|
+------+---------------+--------------------+--------------+
|     1|   Aaron Acosta| vbishop@example.net|(677) 367-9557|
|     2|    Aaron Adams| james82@example.com|(332) 224-7965|
|     3|    Aaron Adams|nicholas92@exampl...|(351) 943-8670|
|     4|    Aaron Adams|zjohnson@example.com|(324) 384-5822|
|     5|  Aaron Aguilar|anthony78@example...|(867) 762-3031|
|     6|  Aaron Aguilar|wilsonruben@examp...|(376) 707-8408|
|     7|  Aaron Aguirre|samantha45@exampl...|(147) 383-2497|
|     8|Aaron Alexander|   phall@example.org|(761) 557-5601|
|     9|    Aaron Allen| amber80@example.com|(613) 319-4730|
|    10|    Aaron Allen|joseph77@example.net|(104) 903-5333|
|    11|    Aaron Allen|nicolecasey@examp...|(921) 454-7983|
|    12|    Aaron Allen|reyesmonica@examp...|(603) 745-9470|
|    13|  Aaron Allison|sullivanamy@examp...|(288) 221-8294|
|    14|  Aaron Alvarez|

In [71]:
trip_df.show(10)

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:43:00 PM|         1.8|     9.5|      0.5|   0.5|      0.0|  

# Handling Missing Values

In [32]:
# Columns of trip_df whose missing values are replaced with 0
fare_columns = ["BaseFare", "ExtraFare", "MtaTax", "TipAmount", "TollsAmount", "ImprovementSurcharge"]
trip_df = trip_df.na.fill({column_name: 0 for column_name in fare_columns})

In [44]:
trip_df.show(5)

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:43:00 PM|         1.8|     9.5|      0.5|   0.5|      0.0|  

In [137]:
trip_df.printSchema()

root
 |-- TripID: integer (nullable = true)
 |-- RequestID: integer (nullable = true)
 |-- DriverID: integer (nullable = true)
 |-- VehicleID: integer (nullable = true)
 |-- PaymentID: integer (nullable = true)
 |-- TripStartTime: string (nullable = true)
 |-- TripEndTime: string (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- BaseFare: double (nullable = true)
 |-- ExtraFare: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)



# Converting Date columns

#### Modify this cell to list the DataFrames and the columns that need conversion

In [145]:

# Define the mapping of DataFrames to their respective columns for timestamp conversion
# Maps a variable name to a (DataFrame, columns-to-convert) pair.
# The key is the name of the notebook variable that holds the DataFrame; the
# tuple captures the DataFrame object bound to that name when this cell runs.
# NOTE(review): the same mapping is defined again, verbatim, in the following
# cell — one of the two definitions is redundant.
dataframes_with_columns = {
    "trip_df": (trip_df, ["TripStartTime", "TripEndTime"]),
    "request_df": (request_df, ["RequestTime", "AcceptTime"]),
    # Add more DataFrames and their columns as needed
}


##### Converting to date and verifying the result

In [146]:
from pyspark.sql.functions import to_timestamp, col

# Define the mapping of DataFrames to their respective columns for timestamp conversion
dataframes_with_columns = dict(
    # variable name = (DataFrame currently bound to it, columns to convert)
    trip_df=(trip_df, ["TripStartTime", "TripEndTime"]),
    request_df=(request_df, ["RequestTime", "AcceptTime"]),
    # Add more DataFrames and their columns as needed
)

# Function to convert timestamp columns
def convert_timestamp_columns(df, columns, date_format="M/d/yyyy h:mm:ss a", target_type="date"):
    """Parse the given columns with ``date_format`` and cast them to ``target_type``.

    Args:
        df: DataFrame containing the columns to convert.
        columns: Iterable of column names; each column is replaced by its
            converted version under the same name.
        date_format: Datetime pattern handed to ``to_timestamp``.
        target_type: Spark SQL type the parsed timestamp is cast to. The
            default ``"date"`` reproduces the original behaviour, which
            discards the time of day. Pass ``"timestamp"`` (or ``None`` to
            skip the cast entirely) when hours and minutes are needed, e.g.
            for trip-duration or hour-of-day analysis.

    Returns:
        A new DataFrame with the listed columns converted; other columns are
        left unchanged.
    """
    for col_name in columns:
        parsed = to_timestamp(col(col_name), date_format)
        if target_type is not None:
            parsed = parsed.cast(target_type)
        df = df.withColumn(col_name, parsed)
    return df

# Loop through each DataFrame, apply the conversion, and replace the old DataFrame directly
# For every entry in the mapping: convert its columns, rebind the notebook
# variable of the same name to the converted DataFrame, then display it.
# Note: only the named variables (trip_df, request_df, ...) are rebound here;
# the `dataframes` list and the tuples inside `dataframes_with_columns` keep
# pointing at the unconverted objects.
for table_name, (df, columns) in dataframes_with_columns.items():
    # Convert timestamp columns
    updated_df = convert_timestamp_columns(df, columns)
    
    # Replace the old DataFrame directly by updating the global variables
    # (globals() is used because the target variable name is only known as a string)
    globals()[table_name] = updated_df
    
    # Automatically show the updated DataFrame and print the schema
    print(f"DataFrame: {table_name}")
    updated_df.show(5)
    updated_df.printSchema()


DataFrame: trip_df
+------+---------+--------+---------+---------+-------------+-----------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|TripStartTime|TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+-------------+-----------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|   2015-01-15| 2015-01-15|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|   2015-01-10| 2015-01-10|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|   2015-01-10| 2015-01-10|         1.8|     9.5|      0.5|   0.5|      0.0|        0.0|                 0.3|
|     4|   289268|  985570|   336402|        

In [147]:
trip_df.printSchema()

root
 |-- TripID: integer (nullable = true)
 |-- RequestID: integer (nullable = true)
 |-- DriverID: integer (nullable = true)
 |-- VehicleID: integer (nullable = true)
 |-- PaymentID: integer (nullable = true)
 |-- TripStartTime: date (nullable = true)
 |-- TripEndTime: date (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- BaseFare: double (nullable = true)
 |-- ExtraFare: double (nullable = true)
 |-- MtaTax: double (nullable = true)
 |-- TipAmount: double (nullable = true)
 |-- TollsAmount: double (nullable = true)
 |-- ImprovementSurcharge: double (nullable = true)



# Renaming all columns in each dataframe to snake_case

In [150]:
import re
from pyspark.sql import DataFrame

# Function to convert a string to snake_case
def to_snake_case(name: str) -> str:
    """Convert a column name to snake_case.

    Examples: ``"TripID"`` -> ``"trip_id"``, ``"HTTPResponse"`` ->
    ``"http_response"``, ``" Trip Distance "`` -> ``"trip_distance"``.

    Args:
        name: Column name in any mix of CamelCase, spaces and hyphens.

    Returns:
        The lower-cased name with words separated by single underscores and
        no leading or trailing underscore.
    """
    # Collapse every run of whitespace and hyphens into one underscore
    name = re.sub(r'[\s-]+', '_', name)

    # Split a lowercase letter from the uppercase letter that follows it
    # (tripID -> trip_ID)
    name = re.sub(r'([a-z])([A-Z])', r'\1_\2', name)

    # Split an acronym from the capitalised word that follows it
    # (HTTPResponse -> HTTP_Response). A single pass is sufficient: the
    # inserted underscore prevents any further match at that position.
    name = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', name)

    # Avoid double underscores by replacing multiple underscores with a single one
    name = re.sub(r'__+', '_', name)

    # Strip underscores on both ends so that surrounding spaces or hyphens in
    # the source name do not leave a dangling "_" (previously only the leading
    # side was stripped), then lower-case the result.
    return name.strip('_').lower()

# Function to rename all columns in a DataFrame to snake_case
def convert_columns_to_snake_case(df: DataFrame) -> DataFrame:
    """Return ``df`` with every column renamed to its snake_case form.

    Args:
        df: DataFrame whose column names should be normalised.

    Returns:
        A DataFrame with the same data and columns renamed via
        ``to_snake_case``.
    """
    renamed = df
    # Iterate over the original column names and rename them one at a time
    for current_name in df.columns:
        renamed = renamed.withColumnRenamed(current_name, to_snake_case(current_name))
    return renamed

# Loop through the list of DataFrames and apply the column name conversion
# Replace each entry of `dataframes` with its snake_case-renamed version.
# Only the list entries are replaced: the individually named variables
# (trip_df, request_df, ...) are not rebound by this loop and keep their
# original column names.
for i in range(len(dataframes)):
    dataframes[i] = convert_columns_to_snake_case(dataframes[i])

# Check results by printing the schema of all updated DataFrames
# (the index printed is the DataFrame's position in the `dataframes` list)
for i, df in enumerate(dataframes):
    print(f"Schema for DataFrame {i}:")
    df.printSchema()  # Display the schema to verify changes


Schema for DataFrame 0:
root
 |-- location_id: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)

Schema for DataFrame 1:
root
 |-- payment_id: integer (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_status_id: integer (nullable = true)

Schema for DataFrame 2:
root
 |-- payment_method_id: integer (nullable = true)
 |-- method_name: string (nullable = true)

Schema for DataFrame 3:
root
 |-- payment_status_id: integer (nullable = true)
 |-- status_name: string (nullable = true)

Schema for DataFrame 4:
root
 |-- request_id: integer (nullable = true)
 |-- passenger_id: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- request_time: date (nullable = true)
 |-- accept_time: date (nullable = true)

Schema for DataFrame 5:
root
 |-- trip_id: integer (nullable = true)
 |-- request_id: integer (nullable = true)
 |-- drive

In [84]:
trip_df.select("TripDistance", "BaseFare").describe().show()


+-------+------------------+------------------+
|summary|      TripDistance|          BaseFare|
+-------+------------------+------------------+
|  count|            584875|            584875|
|   mean|2.9828643214362116| 11.86882723658903|
| stddev|126.60890739487783|10.125811442812736|
|    min|               0.0|            -138.9|
|    max|           92000.9|             900.0|
+-------+------------------+------------------+



# How long do trips take on average, and what is the average distance?

# What are the most common pickup and dropoff locations?

In [10]:
# Attach coordinates to every request twice: once matched on the pickup
# location id and once on the drop-off location id.
pickup_location_df = request_df.join(
    location_df,
    request_df.PickupLocationID == location_df.LocationID,
)
dropoff_location_df = request_df.join(
    location_df,
    request_df.DropoffLocationID == location_df.LocationID,
)

# Count requests per coordinate pair and list the most frequent pairs first
pickup_counts = pickup_location_df.groupBy("Longitude", "Latitude").agg(
    count("*").alias("pickup_count")
)
pickup_counts.orderBy(col("pickup_count").desc()).show()

dropoff_counts = dropoff_location_df.groupBy("Longitude", "Latitude").agg(
    count("*").alias("dropoff_count")
)
dropoff_counts.orderBy(col("dropoff_count").desc()).show()

+------------+-----------+------------+
|   Longitude|   Latitude|pickup_count|
+------------+-----------+------------+
|-73.78796387|40.64152527|       11157|
|-73.95028687|40.79550171|          43|
|-74.18402863|40.68948364|          29|
|-74.00323486|40.73168182|          15|
|-73.93746185|40.74890137|          11|
|-73.98692322|40.72085953|          10|
|-73.97848511|40.76370239|          10|
|-73.98863983|40.71881866|           9|
|-74.00162506|40.74097824|           7|
| -73.9397049|40.79433823|           7|
|-73.99417877|40.75109863|           7|
|-73.94628906|40.77302933|           7|
|-73.99411011|40.75112152|           7|
|-73.99414063|40.75106812|           7|
|  -73.994133|40.75118637|           6|
|-73.87104034|40.77378082|           6|
|-73.87440491|40.77396011|           6|
|-73.92472076|40.76171875|           6|
| -73.7883606|40.64736938|           6|
|-73.94959259|40.77273178|           6|
+------------+-----------+------------+
only showing top 20 rows

+------------+

# How many trips does each passenger take, and what is their average trip distance?

In [31]:
# Join trip_df with request_df to get passenger information, then group by passenger
# Pair every trip with the request that produced it so the passenger is known
trip_passenger_df = trip_df.join(request_df, trip_df.RequestID == request_df.RequestID)

# Per passenger: number of trips and mean trip distance, busiest passengers first
passenger_stats = trip_passenger_df.groupBy("PassengerID").agg(
    count("*").alias("trip_count"),
    avg("TripDistance").alias("avg_trip_distance"),
)
passenger_stats.orderBy(col("trip_count").desc()).show()


+-----------+----------+------------------+
|PassengerID|trip_count| avg_trip_distance|
+-----------+----------+------------------+
|     195168|        23|2.7113043478260868|
|     183719|        19|2.6389473684210527|
|     208524|        17|2.9947058823529416|
|       6076|        16|          2.788125|
|     283761|        16|           2.47875|
|     259447|        16|1.9562499999999998|
|     207380|        16|          1.971875|
|      28440|        16|            2.9225|
|     194260|        15|             2.168|
|     208173|        15|1.6353333333333337|
|      56911|        15| 4.360666666666668|
|      81120|        15|2.9406666666666665|
|      86477|        15|3.3400000000000003|
|      77372|        15| 4.496666666666668|
|     125266|        15|2.6446666666666667|
|     219865|        15| 3.131333333333333|
|      32041|        15|3.3273333333333337|
|     258125|        15|2.0073333333333334|
|      51695|        15|1.8993333333333335|
|     269183|        15|2.127999

# What is the average fare and tip per trip by payment method?

# What percentage of trips have different payment statuses?

# What are the most common vehicle makes and models used in trips?

In [18]:
# Add the make information to each vehicle row by joining on the shared
# 'MakeID' column.
# NOTE(review): this cell never references trip_df, so the counts below are
# rows of the vehicle/make join per (MakeName, Model), not the number of
# trips made with them — confirm this is the intended answer to the question
# in the heading.
vehicle_full_df = vehicle_df.join(vehicle_make_df, on='MakeID')

# Group by MakeName and Model, and count the occurrences
# (sorted by count descending; only the 10 largest groups are printed)
vehicle_full_df.groupBy('MakeName', 'Model').count().orderBy('count', ascending=False).show(10)

+-------------+------+-----+
|     MakeName| Model|count|
+-------------+------+-----+
|        Honda|Accord| 7587|
|      Hyundai|Tucson| 7578|
|   Volkswagen|Beetle| 7568|
|          Kia| Forte| 7557|
|       Nissan|Sentra| 7540|
|       Nissan|Altima| 7534|
|Mercedes-Benz|   GLC| 7534|
|       Toyota|  RAV4| 7530|
|         Ford|Escape| 7527|
|       Nissan|Maxima| 7527|
+-------------+------+-----+
only showing top 10 rows



In [26]:
trip_df.show(5)

+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|       TripStartTime|         TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|
+------+---------+--------+---------+---------+--------------------+--------------------+------------+--------+---------+------+---------+-----------+--------------------+
|     1|   369036|  968141|   363309|        6|1/15/2015 7:05:00 PM|1/15/2015 7:23:00 PM|        1.59|    12.0|      1.0|   0.5|     3.25|        0.0|                 0.3|
|     2|   369031| 1024488|   336402|        6|1/10/2015 8:33:00 PM|1/10/2015 8:53:00 PM|         3.3|    14.5|      0.5|   0.5|      2.0|        0.0|                 0.3|
|     3|   369030|  968141|   287846|        3|1/10/2015 8:33:00 PM|1/10/2015 8:43:00 PM|         1.8|     9.5|      0.5|   0.5|      0.0|  