In [0]:

from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta import *

In [0]:
# Discover every table listed for the bronze schema
tables = spark.sql("SHOW TABLES IN bronze").collect()
# Maps the short table name (without the schema prefix) to its DataFrame
dataframes = {}

# Load today's rows from each bronze table
for row in tables:
    # SHOW TABLES also lists session-scoped temporary views; they are not part
    # of the bronze schema, so reading them as bronze.<name> would fail.
    if row['isTemporary']:
        continue
    table_name = row['tableName']  # Short table name
    full_table_name = f"bronze.{table_name}"  # Schema-qualified table name
    # Keep only the rows whose _processing_date falls on the current date
    df = spark.read.table(full_table_name).filter(to_date("_processing_date") == current_date())
    # Key the DataFrame by its short table name
    dataframes[table_name] = df

# Example: preview one of the loaded tables when it is present
if "user" in dataframes:
    dataframes["user"].show()



+------+-------------------+--------------------+--------------+----------------+-------------------+--------------------+-----------------------------+
|UserID|           FullName|               Email|   PhoneNumber|DriverMeanRating|   _processing_date|     _input_filename|_input_file_modification_date|
+------+-------------------+--------------------+--------------+----------------+-------------------+--------------------+-----------------------------+
|  4583|     Alexander Ruiz|  bbarry@example.com|(687) 331-4684|            NULL|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4584|     Alexander Ruiz|sydneyfleming@exa...|(367) 990-5730|            NULL|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4585|  Alexander Salinas| tyler41@example.net|(678) 623-3713|            NULL|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4586|  Alexander Sanchez| james59@example.com|(740) 652-2668|            NULL|2

# Data Cleaning (Drop NULL-Valued Columns)

In [0]:
# Remove the rating column from the user and trip DataFrames.
for frame_key, rating_column in (("user", "DriverMeanRating"), ("trip", "driver_rating")):
    dataframes[frame_key] = dataframes[frame_key].drop(rating_column)

In [0]:
# Preview the first 5 rows of the user DataFrame.
dataframes["user"].show(5)


+------+-----------------+--------------------+--------------+-------------------+--------------------+-----------------------------+
|UserID|         FullName|               Email|   PhoneNumber|   _processing_date|     _input_filename|_input_file_modification_date|
+------+-----------------+--------------------+--------------+-------------------+--------------------+-----------------------------+
|  4583|   Alexander Ruiz|  bbarry@example.com|(687) 331-4684|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4584|   Alexander Ruiz|sydneyfleming@exa...|(367) 990-5730|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4585|Alexander Salinas| tyler41@example.net|(678) 623-3713|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4586|Alexander Sanchez| james59@example.com|(740) 652-2668|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|  4587|Alexander Santana|qmitchell@example...|(995) 993-2936|

In [0]:
# Preview the first 5 rows of the trip DataFrame.
dataframes["trip"].show(5)

+------+---------+--------+---------+---------+-------------------+-------------------+------------+--------+---------+------+---------+-----------+--------------------+-------------------+--------------------+-----------------------------+
|TripID|RequestID|DriverID|VehicleID|PaymentID|      TripStartTime|        TripEndTime|TripDistance|BaseFare|ExtraFare|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|   _processing_date|     _input_filename|_input_file_modification_date|
+------+---------+--------+---------+---------+-------------------+-------------------+------------+--------+---------+------+---------+-----------+--------------------+-------------------+--------------------+-----------------------------+
|     1|   369036|  784422|   363309|        6|2015-01-15 19:05:00|2015-01-15 19:23:00|        1.59|   12.00|     1.00|  0.50|     3.25|       0.00|                0.30|2024-10-15 00:12:09|dbfs:/mnt/landing...|          2024-10-14 15:49:31|
|     2|   369031|  840769|   336402

# Drop rows whose first column contains a given substring ("---" by default)
def remove_specific_rows(df, substring="---"):
    """
    Filters out rows whose first column contains the given substring.

    Args:
        df: DataFrame to filter; its first column (df.columns[0]) is inspected.
        substring: Text that marks a row for removal. Defaults to "---".

    Returns:
        The result of df.filter(...) keeping rows where the first column
        does not contain substring.
    """
    leading_column = df[df.columns[0]]
    has_substring = leading_column.contains(substring)
    return df.filter(~has_substring)

# Apply remove_specific_rows to every DataFrame in df_list, replacing the list.
# NOTE(review): df_list and df_names are not defined in the visible notebook — confirm where they come from before running this.
df_list = [remove_specific_rows(df) for df in df_list]

# Bind each filtered DataFrame to a module-level variable named df_names[i]
# (nothing is displayed here).
for i, df in enumerate(df_list):
    globals()[df_names[i]] = df
    


In [0]:
# Show the first 35 user rows sorted by FullName.
dataframes["user"].orderBy("FullName").show(35)

+------+---------------+--------------------+--------------+-------------------+--------------------+-----------------------------+
|UserID|       FullName|               Email|   PhoneNumber|   _processing_date|     _input_filename|_input_file_modification_date|
+------+---------------+--------------------+--------------+-------------------+--------------------+-----------------------------+
|700001|   Aaron Abbott|angela67@example.com|(453) 832-3608|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|     1|   Aaron Acosta| vbishop@example.net|(677) 367-9557|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|700002|   Aaron Acosta|vanessa20@example...|(953) 599-6007|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|     2|    Aaron Adams| james82@example.com|(332) 224-7965|2024-10-15 00:12:16|dbfs:/mnt/landing...|          2024-10-14 15:48:30|
|     3|    Aaron Adams|nicholas92@exampl...|(351) 943-8670|2024-10-15 00:12

In [0]:
# Disabled draft, kept as a string literal so none of it executes: it counts rows
# per (Longitude, Latitude) pair in location_df and lists the pairs that occur once.
# As the last expression of the cell, the string itself is what gets displayed.
"""

from pyspark.sql import Window
from pyspark.sql.functions import count, row_number, desc

# Define the window specification for counting
window_spec = Window.partitionBy("Longitude", "Latitude").orderBy(desc("count"))

# Group by Longitude and Latitude, count occurrences, and order
location_with_count = location_df.groupBy("Longitude", "Latitude") \
    .agg(count("*").alias("count"))

# Define a new window to assign row numbers based on count
rownum_window = Window.orderBy(desc("count"))

# Add a row number based on the ordered counts
location_with_rownum = location_with_count.withColumn("rownum", row_number().over(rownum_window))

# Filter to keep only rows with count equal to 1
filtered_df = location_with_rownum.filter(location_with_rownum["count"] == 1)

# Show the results, ordering by count
filtered_df.select("Longitude", "Latitude", "count").orderBy(desc("count")).show()
"""

'\n\nfrom pyspark.sql import Window\nfrom pyspark.sql.functions import count, row_number, desc\n\n# Define the window specification for counting\nwindow_spec = Window.partitionBy("Longitude", "Latitude").orderBy(desc("count"))\n\n# Group by Longitude and Latitude, count occurrences, and order\nlocation_with_count = location_df.groupBy("Longitude", "Latitude")     .agg(count("*").alias("count"))\n\n# Define a new window to assign row numbers based on count\nrownum_window = Window.orderBy(desc("count"))\n\n# Add a row number based on the ordered counts\nlocation_with_rownum = location_with_count.withColumn("rownum", row_number().over(rownum_window))\n\n# Filter to keep only rows with count equal to 1\nfiltered_df = location_with_rownum.filter(location_with_rownum["count"] == 1)\n\n# Show the results, ordering by count\nfiltered_df.select("Longitude", "Latitude", "count").orderBy(desc("count")).show()\n'

In [0]:
# Print the schema of every DataFrame held in the dataframes dictionary.
for df in dataframes.values():
    df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Longitude: decimal(12,8) (nullable = true)
 |-- Latitude: decimal(12,8) (nullable = true)
 |-- _processing_date: timestamp (nullable = true)
 |-- _input_filename: string (nullable = true)
 |-- _input_file_modification_date: timestamp (nullable = true)

root
 |-- PaymentID: integer (nullable = true)
 |-- PaymentMethodID: integer (nullable = true)
 |-- PaymentStatusID: integer (nullable = true)
 |-- _processing_date: timestamp (nullable = true)
 |-- _input_filename: string (nullable = true)
 |-- _input_file_modification_date: timestamp (nullable = true)

root
 |-- PaymentMethodID: integer (nullable = true)
 |-- MethodName: string (nullable = true)
 |-- _processing_date: timestamp (nullable = true)
 |-- _input_filename: string (nullable = true)
 |-- _input_file_modification_date: timestamp (nullable = true)

root
 |-- PaymentStatusID: integer (nullable = true)
 |-- StatusName: string (nullable = true)
 |-- _processing_date: timestamp (n

# Renaming all columns in each dataframe to snake_case

In [0]:
import re
# Function to convert a string to snake_case
def to_snake_case(name: str) -> str:
    """
    Converts a column name to snake_case.

    The rewrites run in order: whitespace/hyphen runs become "_", a "_" is
    inserted at each lowercase-to-uppercase boundary, a "_" is inserted before
    the last capital of an uppercase run that is followed by a lowercase letter
    (e.g. "HTTPResponse" -> "HTTP_Response"), and repeated underscores collapse
    to one. Leading underscores are then stripped and the result is lowercased.

    Args:
        name (str): The original name.

    Returns:
        str: The snake_case form of name.
    """
    # Ordered (pattern, replacement) rules. The acronym rule is listed once:
    # every match it can make is handled in a single pass, so repeating it
    # changes nothing.
    substitutions = (
        (r'[\s-]+', '_'),
        (r'([a-z])([A-Z])', r'\1_\2'),
        (r'([A-Z]+)([A-Z][a-z])', r'\1_\2'),
        (r'__+', '_'),
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)

    # Leading underscores are dropped, so "_processing_date" becomes "processing_date"
    return name.lstrip('_').lower()

# Function to rename all columns in a DataFrame to snake_case
def convert_columns_to_snake_case(df: DataFrame) -> DataFrame:
    """
    Returns df with every column renamed to its snake_case form.

    Args:
        df (DataFrame): The DataFrame whose columns should be renamed.

    Returns:
        DataFrame: The same columns in the same order, each named by
        to_snake_case of its original name.
    """
    snake_case_names = [to_snake_case(column_name) for column_name in df.columns]
    # toDF assigns all the new names by position in one step, instead of
    # stacking one withColumnRenamed call per column.
    return df.toDF(*snake_case_names)



In [0]:
# Rename the columns of every loaded DataFrame to snake_case and preview one row of each
for df_name, df in dataframes.items():
    renamed = convert_columns_to_snake_case(df)
    # Replacing the value of an existing key is safe while iterating the dictionary
    dataframes[df_name] = renamed
    renamed.show(1)


+-----------+------------+-----------+-------------------+--------------------+----------------------------+
|location_id|   longitude|   latitude|    processing_date|      input_filename|input_file_modification_date|
+-----------+------------+-----------+-------------------+--------------------+----------------------------+
|      76173|-73.99945068|40.72192383|2024-10-15 00:11:46|dbfs:/mnt/landing...|         2024-10-14 15:47:38|
+-----------+------------+-----------+-------------------+--------------------+----------------------------+
only showing top 1 row

+----------+-----------------+-----------------+-------------------+--------------------+----------------------------+
|payment_id|payment_method_id|payment_status_id|    processing_date|      input_filename|input_file_modification_date|
+----------+-----------------+-----------------+-------------------+--------------------+----------------------------+
|         6|                6|                4|2024-10-15 00:11:53|dbfs:/

In [0]:
# Column expression for the current timestamp, truncated to whole seconds
processing_date = date_trunc('second', current_timestamp())

for df_name, df in dataframes.items():
    # Overwrite processing_date, rename the file modification column,
    # and drop the source filename column
    stamped = df.withColumn("processing_date", processing_date)
    renamed = stamped.withColumnRenamed("input_file_modification_date", "record_modified_date")
    dataframes[df_name] = renamed.drop("input_filename")



In [0]:
# Preview the first 5 rows of the user DataFrame.
dataframes["user"].show(5)

+-------+-----------------+--------------------+--------------+-------------------+--------------------+
|user_id|        full_name|               email|  phone_number|    processing_date|record_modified_date|
+-------+-----------------+--------------------+--------------+-------------------+--------------------+
|   4583|   Alexander Ruiz|  bbarry@example.com|(687) 331-4684|2024-10-15 00:12:55| 2024-10-14 15:48:30|
|   4584|   Alexander Ruiz|sydneyfleming@exa...|(367) 990-5730|2024-10-15 00:12:55| 2024-10-14 15:48:30|
|   4585|Alexander Salinas| tyler41@example.net|(678) 623-3713|2024-10-15 00:12:55| 2024-10-14 15:48:30|
|   4586|Alexander Sanchez| james59@example.com|(740) 652-2668|2024-10-15 00:12:55| 2024-10-14 15:48:30|
|   4587|Alexander Santana|qmitchell@example...|(995) 993-2936|2024-10-15 00:12:55| 2024-10-14 15:48:30|
+-------+-----------------+--------------------+--------------+-------------------+--------------------+
only showing top 5 rows



In [0]:

# Merge configuration per silver table: the source DataFrame, the Delta location,
# the columns fed into the record hash, and the slowly changing dimension (SCD) flag
_silver_hash_columns = [
    ("user", ["user_id", "full_name"]),
    ("location", ["location_id"]),
    ("payment", ["payment_id"]),
    ("paymentmethod", ["payment_method_id"]),
    ("paymentstatus", ["payment_status_id"]),
    ("request", ["request_id"]),
    ("trip", ["trip_id"]),
    ("vehicles", ["vehicle_id"]),
    ("vehiclemakes", ["make_id"]),
]

tables_info = [
    {
        "df": dataframes[name],
        "location": f"/mnt/silver/{name}",
        "hash_columns": key_columns,
        "is_scd": True,
    }
    for name, key_columns in _silver_hash_columns
]

def deduplicate_source_df(source_df: DataFrame, hash_columns: list) -> DataFrame:
    """
    Removes duplicate rows from the source DataFrame, comparing only the hash columns.

    Args:
        source_df (DataFrame): The DataFrame to deduplicate.
        hash_columns (list): Column names passed to dropDuplicates as the comparison subset.

    Returns:
        DataFrame: The result of source_df.dropDuplicates(hash_columns).
    """
    deduplicated = source_df.dropDuplicates(hash_columns)
    return deduplicated

# Function to add a SHA-256 hash column computed over selected columns
def generate_hash_column(df: DataFrame, columns: list, hash_column_name: str = "record_hash") -> DataFrame:
    """
    Adds a column holding the SHA-256 hash of the given columns joined with "||".

    Args:
        df (DataFrame): The DataFrame to extend.
        columns (list): Columns whose values are concatenated and hashed.
        hash_column_name (str): Name of the added column. Defaults to "record_hash".

    Returns:
        DataFrame: df with the hash column added.
    """
    # NOTE(review): concat_ws is documented to skip NULL inputs, so rows that differ
    # only in which column is NULL may hash alike — confirm the columns are non-null.
    joined_values = concat_ws("||", *columns)
    digest = sha2(joined_values, 256)
    return df.withColumn(hash_column_name, digest)

# Function to build the merge condition that compares the hash column on both sides
def build_merge_condition_with_hash(target_alias: str, source_alias: str, hash_column: str) -> str:
    """
    Builds the equality condition "<target>.<hash> = <source>.<hash>" for a merge.

    Args:
        target_alias (str): Alias of the merge target.
        source_alias (str): Alias of the merge source.
        hash_column (str): Name of the hash column present on both sides.

    Returns:
        str: The merge condition string.
    """
    return "{}.{} = {}.{}".format(target_alias, hash_column, source_alias, hash_column)

# Function to merge Delta tables based on SCD flag
def merge_delta_table(source_df: DataFrame, delta_location: str, hash_columns: list, is_scd: bool):
    """
    Writes source_df to a Delta location, merging for SCD tables and overwriting otherwise.

    A record_hash column is added to the source from hash_columns. For SCD tables
    the source is merged into the existing Delta table on record_hash (matched rows
    are updated, unmatched rows inserted); if no Delta table exists at the location
    yet, the hashed source is written there instead. For non-SCD tables the
    location is simply overwritten.

    Args:
        source_df (DataFrame): The rows to write.
        delta_location (str): Path of the Delta table.
        hash_columns (list): Columns used to build record_hash.
        is_scd (bool): True to merge, False to overwrite.

    Returns:
        None. Progress and errors are reported with print. Exceptions raised on
        the SCD path are caught and printed, not re-raised.
    """
    # Generate hash column for the source DataFrame
    source_df_hashed = generate_hash_column(source_df, hash_columns)

    # For non-SCD tables, just overwrite the data
    if not is_scd:
        print(f"{delta_location} is not an SCD table. Overwriting...")
        source_df_hashed.write.mode("overwrite").format("delta").save(delta_location)
        print(f"DataFrame overwritten at {delta_location}.")
        return

    try:
        # Check if the Delta table exists at the specified location
        if DeltaTable.isDeltaTable(spark, delta_location):
            print(f"Delta table found at {delta_location}. Proceeding with merge...")
            delta_table = DeltaTable.forPath(spark, delta_location)

            # Alias for target and source
            target_alias = "target"
            source_alias = "src"

            # The target is matched on the record_hash column it already stores
            # (every write in this function goes through generate_hash_column),
            # so it does not need to be re-read or re-hashed here.
            merge_condition = build_merge_condition_with_hash(target_alias, source_alias, "record_hash")

            # Perform the merge operation
            delta_table.alias(target_alias).merge(
                source=source_df_hashed.alias(source_alias),
                condition=merge_condition
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

            print(f"Merge completed for {delta_location}.")
        else:
            # If Delta table doesn't exist, write the DataFrame to that location
            print(f"No Delta table found at {delta_location}. Writing new DataFrame...")
            source_df_hashed.write.mode("overwrite").format("delta").save(delta_location)
            print(f"DataFrame written to {delta_location}.")
    except Exception as e:
        # Best effort: report the failure and return, so the caller's loop over
        # the remaining tables keeps going.
        print(f"Error processing table at {delta_location}: {e}")

In [0]:
# Deduplicate every configured source DataFrame on its hash columns, then
# merge (or overwrite) it into its Delta location
for table_info in tables_info:
    source_df = table_info["df"]

    # Show which columns are about to be written
    print("Schema of source DataFrame:")
    source_df.printSchema()

    merge_delta_table(
        deduplicate_source_df(source_df, table_info["hash_columns"]),
        table_info["location"],
        table_info["hash_columns"],
        table_info["is_scd"],  # True -> merge, False -> overwrite
    )

Schema of source DataFrame:
root
 |-- user_id: integer (nullable = true)
 |-- full_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- processing_date: timestamp (nullable = true)
 |-- record_modified_date: timestamp (nullable = true)

Delta table found at /mnt/silver/user. Proceeding with merge...
Merge completed for /mnt/silver/user.
Schema of source DataFrame:
root
 |-- location_id: integer (nullable = true)
 |-- longitude: decimal(12,8) (nullable = true)
 |-- latitude: decimal(12,8) (nullable = true)
 |-- processing_date: timestamp (nullable = true)
 |-- record_modified_date: timestamp (nullable = true)

/mnt/silver/location is not an SCD table. Overwriting...
DataFrame overwritten at /mnt/silver/location.
Schema of source DataFrame:
root
 |-- payment_id: integer (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_status_id: integer (nullable = true)
 |-- processing_date: timestamp (nullab

In [0]:
# Read the user Delta table back from /mnt/silver/user and show rows ordered by full_name.
spark.read.format("delta").load(f"/mnt/silver/user").orderBy("full_name").show()


+-------+---------------+--------------------+--------------+-------------------+--------------------+--------------------+
|user_id|      full_name|               email|  phone_number|    processing_date|record_modified_date|         record_hash|
+-------+---------------+--------------------+--------------+-------------------+--------------------+--------------------+
| 700001|   Aaron Abbott|angela67@example.com|(453) 832-3608|2024-10-15 00:12:56| 2024-10-14 15:48:30|6e1ef69ae8734c534...|
|      1|   Aaron Acosta| vbishop@example.net|(677) 367-9557|2024-10-15 00:12:56| 2024-10-14 15:48:30|21d31195f82942e05...|
| 700002|   Aaron Acosta|vanessa20@example...|(953) 599-6007|2024-10-15 00:12:56| 2024-10-14 15:48:30|62d5bc8c4ecbeaf3d...|
|      4|    Aaron Adams|zjohnson@example.com|(324) 384-5822|2024-10-15 00:12:56| 2024-10-14 15:48:30|60f62b9afaac9bbb8...|
| 700006|    Aaron Adams| emily69@example.net|(217) 156-8782|2024-10-15 00:12:56| 2024-10-14 15:48:30|4866e617e54ca7de7...|
|      2

In [0]:
# Row count of the user Delta table (sorting first has no effect on a count).
spark.read.format("delta").load("/mnt/silver/user").count()


584246

In [0]:
# Count the distinct user_id values in the user Delta table.
spark.read.format("delta").load(f"/mnt/silver/user").select("user_id").distinct().count()


584246

In [0]:
# Show the paymentmethod Delta table stored at /mnt/silver/paymentmethod.
spark.read.format("delta").load(f"/mnt/silver/paymentmethod").show()


+-----------------+-----------+-------------------+--------------------+--------------------+
|payment_method_id|method_name|    processing_date|record_modified_date|         record_hash|
+-----------------+-----------+-------------------+--------------------+--------------------+
|                1|  Apple Pay|2024-10-15 00:13:16| 2024-10-14 15:42:53|6b86b273ff34fce19...|
|                2|       Cash|2024-10-15 00:13:16| 2024-10-14 15:42:53|d4735e3a265e16eee...|
|                3|Credit Card|2024-10-15 00:13:16| 2024-10-14 15:42:53|4e07408562bedb8b6...|
|                4| Debit Card|2024-10-15 00:13:16| 2024-10-14 15:42:53|4b227777d4dd1fc61...|
|                5| Google Pay|2024-10-15 00:13:16| 2024-10-14 15:42:53|ef2d127de37b942ba...|
|                6|     PayPal|2024-10-15 00:13:16| 2024-10-14 15:42:53|e7f6c011776e8db7c...|
+-----------------+-----------+-------------------+--------------------+--------------------+



In [0]:
# Register each silver Delta location as an external table.
# Creating the schema and switching to it do not depend on the table being
# registered, so both run once up front instead of on every iteration.
spark.sql("CREATE SCHEMA IF NOT EXISTS silver")
# Set the current database to 'silver'
spark.sql("USE silver")

for file in dataframes.keys():
    print(file)
    table_name = f"silver.{file}"
    print(f"Creating external table: {file}")
    delta_table_path = f"/mnt/silver/{file}"
    # Point the table at the Delta files already written under /mnt/silver
    spark.sql(f"CREATE EXTERNAL TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION r'{delta_table_path}'")

location
Creating external table: location
payment
Creating external table: payment
paymentmethod
Creating external table: paymentmethod
paymentstatus
Creating external table: paymentstatus
request
Creating external table: request
trip
Creating external table: trip
user
Creating external table: user
vehiclemakes
Creating external table: vehiclemakes
vehicles
Creating external table: vehicles


In [0]:

# Confirm which schema the session is currently using
print("Current Schema:")
spark.sql("SELECT current_database()").show()

# List the tables in the silver schema, then preview one row from each of them
print("Tables in silver schema:")
spark.sql("SHOW TABLES IN silver").show()
for silver_table in (
    "Location",
    "payment",
    "paymentmethod",
    "paymentstatus",
    "user",
    "vehicles",
    "vehiclemakes",
    "trip",
    "request",
):
    spark.sql(f"SELECT * FROM silver.{silver_table} LIMIT 1").show()


Current Schema:
+----------------+
|current_schema()|
+----------------+
|          silver|
+----------------+

Tables in silver schema:
+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
|  silver|     location|      false|
|  silver|      payment|      false|
|  silver|paymentmethod|      false|
|  silver|paymentstatus|      false|
|  silver|      request|      false|
|  silver|         trip|      false|
|  silver|         user|      false|
|  silver| vehiclemakes|      false|
|  silver|     vehicles|      false|
+--------+-------------+-----------+

+-----------+------------+-----------+-------------------+--------------------+--------------------+
|location_id|   longitude|   latitude|    processing_date|record_modified_date|         record_hash|
+-----------+------------+-----------+-------------------+--------------------+--------------------+
|     165829|-73.99032593|40.74445343|2024-10-15 00:13:07| 2024-10-14 15:47:38

In [0]:

# Dictionary to store the DeltaTable objects, keyed by table name
delta_tables = {}

for file in dataframes.keys():
    table_name = f"silver.{file}"  # Fully qualified table name in the silver schema
    print(table_name)
    # Store the DeltaTable object in the dictionary using the file name as the key
    delta_tables[file] = DeltaTable.forName(spark, table_name)

    # Compact and vacuum the table if it has not been vacuumed in the last 30 days.
    # history(n) limits by number of commits, not by days, so the 30-day window
    # is applied on the commit timestamp instead.
    recent_vacuums = delta_tables[file].history().filter(
        "operation = 'VACUUM START' AND timestamp >= current_timestamp() - INTERVAL 30 DAYS"
    )
    if recent_vacuums.count() == 0:
        # optimize() only returns a builder; executeCompaction() runs the compaction
        delta_tables[file].optimize().executeCompaction()
        delta_tables[file].vacuum()  # Default = 7 days



silver.location
silver.payment
silver.paymentmethod
silver.paymentstatus
silver.request
silver.trip
silver.user
silver.vehiclemakes
silver.vehicles


In [0]:
%sql
-- Count the fully distinct rows of silver.user (DISTINCT * compares every column).
SELECT count(distinct *) from silver.user

"count(DISTINCT user_id, full_name, email, phone_number, processing_date, record_modified_date, record_hash)"
584246


# How long do trips take on average, and what is the average distance?

# What are the most common pickup and dropoff locations?

# Join request_df with location_df for both PickupLocationID and DropoffLocationID
# NOTE(review): request_df and location_df are not defined in the visible notebook,
# and the DataFrames loaded earlier use snake_case column names — confirm the inputs.
pickup_location_df = request_df.join(location_df, request_df.PickupLocationID == location_df.LocationID)
dropoff_location_df = request_df.join(location_df, request_df.DropoffLocationID == location_df.LocationID)

# Find the most common pickup and dropoff locations
pickup_location_df.groupBy("Longitude", "Latitude").agg(count("*").alias("pickup_count")).orderBy(col("pickup_count").desc()).show()
dropoff_location_df.groupBy("Longitude", "Latitude").agg(count("*").alias("dropoff_count")).orderBy(col("dropoff_count").desc()).show()

# How many trips does each passenger take, and what is their average trip distance?

# Attach each trip to its request so that PassengerID is available
same_request = trip_df.RequestID == request_df.RequestID
trip_passenger_df = trip_df.join(request_df, same_request)

# Per passenger: number of trips and average trip distance, highest trip count first
passenger_stats = trip_passenger_df.groupBy("PassengerID").agg(
    count("*").alias("trip_count"),
    avg("TripDistance").alias("avg_trip_distance"),
)
passenger_stats.orderBy(col("trip_count").desc()).show()


# What is the average fare and tip per trip by payment method?

# What percentage of trips have different payment statuses?

# What are the most common vehicle makes and models used in trips?

# Combine vehicles with their makes on the shared MakeID column
vehicle_full_df = vehicle_df.join(vehicle_make_df, on='MakeID')

# Count the joined rows per (MakeName, Model) and show the ten largest groups
make_model_counts = vehicle_full_df.groupBy('MakeName', 'Model').count()
make_model_counts.orderBy('count', ascending=False).show(10)