In [0]:

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta import *

In [0]:
# Fully-qualified names of the gold-layer target tables.
# NOTE(review): dim_user, dim_location and fact_request are rebound to DataFrames in
# later cells, and dim_date / dim_vehicle are not referenced again in the cells
# visible here, so these strings appear to be unused — confirm before relying on them.
dim_date = "gold.DIM_DATES"
dim_user = "gold.DIM_USERS"
dim_vehicle = "gold.DIM_VEHICLES"
dim_location = "gold.DIM_LOCATIONS"
fact_request = "gold.FCT_REQUESTS"

In [0]:
# Load today's slice of every table in the silver schema.
tables = spark.sql("SHOW TABLES IN silver").collect()

# Maps the bare table name (e.g. "user") to its DataFrame filtered to today's rows
dataframes = {}

for row in tables:
    # SHOW TABLES also lists session-scoped temporary views; they do not live in
    # the silver schema, so reading them as silver.<name> would fail.
    if row["isTemporary"]:
        continue

    table_name = row["tableName"]
    full_table_name = f"silver.{table_name}"

    # Read the table once and reuse the same DataFrame for the schema print and the filter
    silver_df = spark.read.table(full_table_name)
    silver_df.printSchema()  # Log the schema of each table as it is loaded

    # Keep only the rows whose processing_date falls on the current date
    dataframes[table_name] = silver_df.filter(
        to_date(col("processing_date")) == current_date()
    )

# Example: Show the first few rows of a specific DataFrame (e.g., user)
if "user" in dataframes:
    dataframes["user"].show()



root
 |-- location_id: integer (nullable = true)
 |-- longitude: decimal(12,8) (nullable = true)
 |-- latitude: decimal(12,8) (nullable = true)
 |-- processing_date: timestamp (nullable = true)
 |-- record_modified_date: timestamp (nullable = true)
 |-- record_hash: string (nullable = true)

root
 |-- payment_id: integer (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_status_id: integer (nullable = true)
 |-- processing_date: timestamp (nullable = true)
 |-- record_modified_date: timestamp (nullable = true)
 |-- record_hash: string (nullable = true)

root
 |-- payment_method_id: integer (nullable = true)
 |-- method_name: string (nullable = true)
 |-- processing_date: timestamp (nullable = true)
 |-- record_modified_date: timestamp (nullable = true)
 |-- record_hash: string (nullable = true)

root
 |-- payment_status_id: integer (nullable = true)
 |-- status_name: string (nullable = true)
 |-- processing_date: timestamp (nullable = true)
 |-- record_mod

In [0]:
# Preview today's silver user rows, sorted by full_name.
dataframes["user"].orderBy("full_name").show()

+-------+---------------+--------------------+--------------+-------------------+--------------------+--------------------+
|user_id|      full_name|               email|  phone_number|    processing_date|record_modified_date|         record_hash|
+-------+---------------+--------------------+--------------+-------------------+--------------------+--------------------+
| 700001|   Aaron Abbott|angela67@example.com|(453) 832-3608|2024-10-15 00:12:56| 2024-10-14 15:48:30|6e1ef69ae8734c534...|
|      1|   Aaron Acosta| vbishop@example.net|(677) 367-9557|2024-10-15 00:12:56| 2024-10-14 15:48:30|21d31195f82942e05...|
| 700002|   Aaron Acosta|vanessa20@example...|(953) 599-6007|2024-10-15 00:12:56| 2024-10-14 15:48:30|62d5bc8c4ecbeaf3d...|
|      4|    Aaron Adams|zjohnson@example.com|(324) 384-5822|2024-10-15 00:12:56| 2024-10-14 15:48:30|60f62b9afaac9bbb8...|
| 700006|    Aaron Adams| emily69@example.net|(217) 156-8782|2024-10-15 00:12:56| 2024-10-14 15:48:30|4866e617e54ca7de7...|
|      2

In [0]:
# Short handles for the three silver inputs used in this cell
users_df = dataframes["user"]
vehicles_df = dataframes["vehicles"]
makes_df = dataframes["vehiclemakes"]

# Step 1: left join keeps every user; vehicle columns stay NULL for users whose
# user_id never appears as a driver_id
user_vehicle_df = users_df.join(
    vehicles_df,
    users_df["user_id"] == vehicles_df["driver_id"],
    "left",
)

# Step 2: bring in the make details for each vehicle via make_id
user_vehicle_make_df = user_vehicle_df.join(
    makes_df,
    vehicles_df["make_id"] == makes_df["make_id"],
    "left",
)

In [0]:
# Source vehicle column -> name it carries in the user dimension
vehicle_column_aliases = {
    "vehicle_id": "vehicle_id",
    "make_name": "vehicle_make",
    "model": "vehicle_model",
    "year": "vehicle_year",
    "color": "vehicle_color",
    "license_plate": "vehicle_license_plate",
}

# User dimension: the user attributes plus the vehicle attributes, where a missing
# vehicle (NULL after the left joins) is replaced by the literal 'Passenger'.
# Because the fallback is a string literal, every vehicle column comes out as a
# string, including vehicle_id and vehicle_year.
dim_user = user_vehicle_make_df.select(
    "user_id",
    "full_name",
    "email",
    "phone_number",
    *[
        coalesce(user_vehicle_make_df[source_name], lit("Passenger")).alias(target_name)
        for source_name, target_name in vehicle_column_aliases.items()
    ],
)




In [0]:
# Preview the user dimension, sorted by full_name.
dim_user.orderBy("full_name").show()

+-------+---------------+--------------------+--------------+----------+-------------+-------------+------------+-------------+---------------------+
|user_id|      full_name|               email|  phone_number|vehicle_id| vehicle_make|vehicle_model|vehicle_year|vehicle_color|vehicle_license_plate|
+-------+---------------+--------------------+--------------+----------+-------------+-------------+------------+-------------+---------------------+
| 700001|   Aaron Abbott|angela67@example.com|(453) 832-3608|    168434|      Hyundai|     Santa Fe|        2006|       Silver|              3BW 116|
|      1|   Aaron Acosta| vbishop@example.net|(677) 367-9557| Passenger|    Passenger|    Passenger|   Passenger|    Passenger|            Passenger|
| 700002|   Aaron Acosta|vanessa20@example...|(953) 599-6007|     11618|          BMW|     5 Series|        2017|       Orange|                 389J|
|      4|    Aaron Adams|zjohnson@example.com|(324) 384-5822| Passenger|    Passenger|    Passenger|

In [0]:

# Inclusive bounds of the calendar dimension
start_date = "2015-01-01"
end_date = "2015-01-31"

# One row per day between the two bounds, generated with sequence() + explode()
calendar_df = spark.sql(f"""
    SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)) as full_date
""")

day_col = col("full_date")
weekend_days = [1, 7]  # dayofweek(): 1 = Sunday, 7 = Saturday

# Derive the calendar attributes from the single full_date column
dim_calendar = calendar_df.select(
    date_format(day_col, "yyyyMMdd").alias("date_key"),   # string key, YYYYMMDD
    day_col,
    dayofmonth(day_col).alias("day"),
    date_format(day_col, "EEEE").alias("day_name"),       # full weekday name
    dayofweek(day_col).alias("day_of_week"),
    weekofyear(day_col).alias("week_of_year"),
    month(day_col).alias("month"),
    date_format(day_col, "MMMM").alias("month_name"),     # full month name
    quarter(day_col).alias("quarter"),
    year(day_col).alias("year"),
    when(dayofweek(day_col).isin(weekend_days), 1).otherwise(0).alias("is_weekend"),
    lit(0).alias("is_holiday"),                           # placeholder: no holiday logic yet
    # The fiscal attributes currently just mirror the calendar ones
    month(day_col).alias("fiscal_month"),
    quarter(day_col).alias("fiscal_quarter"),
    year(day_col).alias("fiscal_year"),
)

dim_calendar.printSchema()



root
 |-- date_key: string (nullable = false)
 |-- full_date: date (nullable = false)
 |-- day: integer (nullable = false)
 |-- day_name: string (nullable = false)
 |-- day_of_week: integer (nullable = false)
 |-- week_of_year: integer (nullable = false)
 |-- month: integer (nullable = false)
 |-- month_name: string (nullable = false)
 |-- quarter: integer (nullable = false)
 |-- year: integer (nullable = false)
 |-- is_weekend: integer (nullable = false)
 |-- is_holiday: integer (nullable = false)
 |-- fiscal_month: integer (nullable = false)
 |-- fiscal_quarter: integer (nullable = false)
 |-- fiscal_year: integer (nullable = false)



In [0]:
# Preview the first 8 rows of the calendar dimension.
dim_calendar.show(8)

+--------+----------+---+---------+-----------+------------+-----+----------+-------+----+----------+----------+------------+--------------+-----------+
|date_key| full_date|day| day_name|day_of_week|week_of_year|month|month_name|quarter|year|is_weekend|is_holiday|fiscal_month|fiscal_quarter|fiscal_year|
+--------+----------+---+---------+-----------+------------+-----+----------+-------+----+----------+----------+------------+--------------+-----------+
|20150101|2015-01-01|  1| Thursday|          5|           1|    1|   January|      1|2015|         0|         0|           1|             1|       2015|
|20150102|2015-01-02|  2|   Friday|          6|           1|    1|   January|      1|2015|         0|         0|           1|             1|       2015|
|20150103|2015-01-03|  3| Saturday|          7|           1|    1|   January|      1|2015|         1|         0|           1|             1|       2015|
|20150104|2015-01-04|  4|   Sunday|          1|           1|    1|   January|     

In [0]:
# Preview the three silver payment tables that feed the payment dimension and the fact table.
dataframes["payment"].show(10)
dataframes["paymentmethod"].show()
dataframes["paymentstatus"].show()

+----------+-----------------+-----------------+-------------------+--------------------+--------------------+
|payment_id|payment_method_id|payment_status_id|    processing_date|record_modified_date|         record_hash|
+----------+-----------------+-----------------+-------------------+--------------------+--------------------+
|         1|                1|                1|2024-10-15 00:13:13| 2024-10-14 15:44:24|6b86b273ff34fce19...|
|         6|                6|                4|2024-10-15 00:13:13| 2024-10-14 15:44:24|e7f6c011776e8db7c...|
|         3|                3|                1|2024-10-15 00:13:13| 2024-10-14 15:44:24|4e07408562bedb8b6...|
|         5|                5|                2|2024-10-15 00:13:13| 2024-10-14 15:44:24|ef2d127de37b942ba...|
|         4|                4|                4|2024-10-15 00:13:13| 2024-10-14 15:44:24|4b227777d4dd1fc61...|
|         2|                2|                3|2024-10-15 00:13:13| 2024-10-14 15:44:24|d4735e3a265e16eee...|
+

In [0]:
# Perform a cross join to get all combinations of PaymentMethod and PaymentStatus
combined_df = dataframes["paymentmethod"].crossJoin(dataframes["paymentstatus"])

# Surrogate key: number the combinations in (payment_method_id, payment_status_id)
# order, starting at 0. monotonically_increasing_id() was used here before, but its
# values depend on how the data happens to be partitioned, so the same combination
# could receive a different key on another run, which breaks the fact-table
# reference and the merge key. An ordered row_number() is deterministic for a given
# set of methods and statuses. The un-partitioned window is fine for a table this small.
payment_key_order = Window.orderBy("payment_method_id", "payment_status_id")
final_df = combined_df.withColumn(
    "s_payment",
    # Cast keeps the column type (long) that monotonically_increasing_id() produced
    (row_number().over(payment_key_order) - 1).cast("long"),
)

# Select only the relevant columns (surrogate key plus both natural keys and names)
dim_payment = final_df.select("s_payment", "payment_method_id", "method_name", "payment_status_id", "status_name")

# Show the resulting DataFrame
dim_payment.show(30)

+---------+-----------------+-----------+-----------------+-----------+
|s_payment|payment_method_id|method_name|payment_status_id|status_name|
+---------+-----------------+-----------+-----------------+-----------+
|        0|                1|  Apple Pay|                1|  Completed|
|        1|                1|  Apple Pay|                2|     Failed|
|        2|                1|  Apple Pay|                3|    Pending|
|        3|                1|  Apple Pay|                4|   Refunded|
|        4|                2|       Cash|                1|  Completed|
|        5|                2|       Cash|                2|     Failed|
|        6|                2|       Cash|                3|    Pending|
|        7|                2|       Cash|                4|   Refunded|
|        8|                3|Credit Card|                1|  Completed|
|        9|                3|Credit Card|                2|     Failed|
|       10|                3|Credit Card|                3|    P

In [0]:
# Re-display the first 10 rows of the payment dimension (a subset of the show(30) in the cell that builds it).
dim_payment.show(10)

+---------+-----------------+-----------+-----------------+-----------+
|s_payment|payment_method_id|method_name|payment_status_id|status_name|
+---------+-----------------+-----------+-----------------+-----------+
|        0|                1|  Apple Pay|                1|  Completed|
|        1|                1|  Apple Pay|                2|     Failed|
|        2|                1|  Apple Pay|                3|    Pending|
|        3|                1|  Apple Pay|                4|   Refunded|
|        4|                2|       Cash|                1|  Completed|
|        5|                2|       Cash|                2|     Failed|
|        6|                2|       Cash|                3|    Pending|
|        7|                2|       Cash|                4|   Refunded|
|        8|                3|Credit Card|                1|  Completed|
|        9|                3|Credit Card|                2|     Failed|
+---------+-----------------+-----------+-----------------+-----

In [0]:
# Location dimension: the id and coordinates only, without the silver bookkeeping columns
location_columns = ["location_id", "latitude", "longitude"]
dim_location = dataframes["location"].select(*location_columns)
dim_location.show()

+-----------+-----------+------------+
|location_id|   latitude|   longitude|
+-----------+-----------+------------+
|     534332|40.80208206|-73.94550323|
|       8922|40.71480942|-74.01123047|
|     245539|40.75943756|-73.98454285|
|     272874|40.76562119|-73.98242950|
|     173273|40.76070404|-73.98984528|
|     320069|40.78929901|-73.97923279|
|      74627|40.71456528|-73.99970245|
|     422957|40.77065659|-73.96841431|
|     442324|40.80597305|-73.96557617|
|     360541|40.75173569|-73.97579193|
|     354277|40.72886276|-73.97641754|
|     198480|40.73231506|-73.98804474|
|     420681|40.79624939|-73.96874237|
|     427568|40.75556946|-73.96782684|
|     118623|40.71798325|-73.99394989|
|     173854|40.76736832|-73.98980713|
|     539166|40.75203323|-73.93933868|
|     421466|40.76403046|-73.96862030|
|     361052|40.76347351|-73.97573853|
|     527211|40.77400589|-73.94878387|
+-----------+-----------+------------+
only showing top 20 rows



In [0]:
# Preview the two silver tables that are joined into the request fact table.
dataframes["request"].show(5)
dataframes["trip"].show(5)

+----------+------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|request_id|passenger_id|pickup_location_id|dropoff_location_id|       request_time|        accept_time|    processing_date|record_modified_date|         record_hash|
+----------+------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|    237680|      116990|            325968|             282621|2015-01-13 21:04:00|2015-01-13 21:13:00|2024-10-15 00:13:20| 2024-10-14 15:47:33|39c6325cdb5e629fa...|
|    273431|      136300|             68934|             233428|2015-01-08 01:34:00|2015-01-08 01:41:00|2024-10-15 00:13:20| 2024-10-14 15:47:33|28c653b4e7e206ae4...|
|     34051|       16972|            375948|              98347|2015-01-26 13:56:00|2015-01-26 13:59:00|2024-10-15 00:13:20| 2024-10-14 15:47:33|71c70b5555bbb8ac4...

In [0]:

# Short handles for the silver inputs of the fact table
requests_df = dataframes["request"]
trips_df = dataframes["trip"]
payments_df = dataframes["payment"]

# A payment maps to the dim_payment row with the same (method, status) pair
payment_dim_match = (
    (payments_df["payment_method_id"] == dim_payment["payment_method_id"])
    & (payments_df["payment_status_id"] == dim_payment["payment_status_id"])
)

# Numeric measures taken as-is from the trip table
trip_measures = [
    "trip_distance",
    "base_fare",
    "extra_fare",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
]

# Every request is kept (left joins); trip, payment and payment-dimension columns
# are NULL when the corresponding row is missing.
# NOTE(review): request_id is not carried into the fact table, so rows cannot be
# traced back to a single request from the selected columns alone.
fact_request = (
    requests_df
    .join(trips_df, requests_df["request_id"] == trips_df["request_id"], "left")
    .join(payments_df, trips_df["payment_id"] == payments_df["payment_id"], "left")
    .join(dim_payment, payment_dim_match, "left")
    .select(
        trips_df["driver_id"],
        requests_df["passenger_id"],
        requests_df["pickup_location_id"],
        requests_df["dropoff_location_id"],
        # Timestamps become yyyyMMdd strings, the same format as dim_calendar.date_key
        date_format(requests_df["request_time"], 'yyyyMMdd').alias("request_datekey"),
        date_format(requests_df["accept_time"], 'yyyyMMdd').alias("accept_datekey"),
        date_format(trips_df["trip_start_time"], 'yyyyMMdd').alias("trip_start_datekey"),
        date_format(trips_df["trip_end_time"], 'yyyyMMdd').alias("trip_end_datekey"),
        dim_payment["s_payment"],
        *[trips_df[measure] for measure in trip_measures],
    )
)

fact_request.show(24)

+---------+------------+------------------+-------------------+---------------+--------------+------------------+----------------+---------+-------------+---------+----------+-------+----------+------------+---------------------+
|driver_id|passenger_id|pickup_location_id|dropoff_location_id|request_datekey|accept_datekey|trip_start_datekey|trip_end_datekey|s_payment|trip_distance|base_fare|extra_fare|mta_tax|tip_amount|tolls_amount|improvement_surcharge|
+---------+------------+------------------+-------------------+---------------+--------------+------------------+----------------+---------+-------------+---------+----------+-------+----------+------------+---------------------+
|   786475|       17820|            440218|             406815|       20150103|      20150103|          20150103|        20150103|       17|         0.50|     4.50|      0.00|   0.50|      0.00|        0.00|                 0.00|
|   927190|       17899|            351563|             555384|       20150130| 

In [0]:
# The gold DataFrames to be stamped and persisted, in the same order as dataframe_names.
# NOTE(review): this rebinds `dataframes` from the dict of silver tables to a list,
# so earlier cells that index it by table name cannot be re-run after this cell.
dataframes = [dim_calendar, dim_location, dim_payment, dim_user, fact_request]
# Names paired positionally with the list above; later cells also use them to build
# the /mnt/gold/<name> paths and the gold.<name> table names.
# NOTE(review): the last name is "fct_request" while the variable is `fact_request`.
dataframe_names = ["dim_calendar", "dim_location", "dim_payment", "dim_user", "fct_request"]

In [0]:
# Load timestamp for this run, truncated to whole seconds
processing_date = date_trunc('second', current_timestamp())

# Add the processing_date column to every gold DataFrame and publish the result as a
# notebook global under the matching entry of dataframe_names. The two lists are
# paired by position. Note that the fact table is published as `fct_request`.
for i in range(len(dataframes)):
    df = dataframes[i].withColumn("processing_date", processing_date)
    globals()[dataframe_names[i]] = df


In [0]:

# Write plan for the gold layer, one entry per table:
#   df           - DataFrame to persist (the processing_date-stamped notebook globals)
#   location     - Delta path of the table under /mnt/gold
#   hash_columns - columns used for de-duplication and to build the record_hash merge key
#   is_scd       - True: merge into the Delta table if it exists; False: overwrite it
#
# The fact entry uses `fct_request`, the name under which the stamping loop publishes
# the fact DataFrame with its processing_date column. `fact_request` is the unstamped
# original. If the existing Delta table was written without that column, the merge
# ignores it unless schema evolution is enabled.
#
# NOTE(review): rows are de-duplicated on hash_columns before the write.
# ["driver_id", "trip_start_datekey"] keeps a single fact row per driver per day,
# and ["user_id", "full_name"] keeps a single row per user even when the user has
# several vehicles — confirm that this is intended.
tables_info = [
    {"df": dim_user, "location": "/mnt/gold/dim_user", "hash_columns": ["user_id", "full_name"], "is_scd": True},
    {"df": dim_calendar, "location": "/mnt/gold/dim_calendar", "hash_columns": ["date_key"], "is_scd": True},
    {"df": dim_location, "location": "/mnt/gold/dim_location", "hash_columns": ["location_id"], "is_scd": True},
    {"df": dim_payment, "location": "/mnt/gold/dim_payment", "hash_columns": ["s_payment"], "is_scd": True},
    {"df": fct_request, "location": "/mnt/gold/fct_request", "hash_columns": ["driver_id", "trip_start_datekey"], "is_scd": True}
]

def deduplicate_source_df(source_df: DataFrame, hash_columns: list) -> DataFrame:
    """
    Keep a single row for each distinct combination of the hash columns.

    Args:
        source_df (DataFrame): DataFrame that may contain repeated keys.
        hash_columns (list): Column names whose combined values identify a row.

    Returns:
        DataFrame: source_df with rows that repeat an already-seen combination
        of hash_columns removed.
    """
    unique_rows = source_df.dropDuplicates(hash_columns)
    return unique_rows

# Adds a SHA-256 fingerprint of the selected columns to a DataFrame
def generate_hash_column(df: DataFrame, columns: list, hash_column_name: str = "record_hash") -> DataFrame:
    """
    Append a column holding the SHA-256 hash of the given columns.

    The column values are joined with "||" and the joined string is hashed.

    Args:
        df (DataFrame): DataFrame to extend.
        columns (list): Names of the columns that feed the hash, in order.
        hash_column_name (str): Name of the added column (default "record_hash").

    Returns:
        DataFrame: df with the extra hash column.
    """
    joined_values = concat_ws("||", *columns)
    digest = sha2(joined_values, 256)
    return df.withColumn(hash_column_name, digest)

# Builds the ON clause used by the Delta merge
def build_merge_condition_with_hash(target_alias: str, source_alias: str, hash_column: str) -> str:
    """
    Return the equality condition that matches target and source rows on the hash column.

    Args:
        target_alias (str): Alias given to the target table in the merge.
        source_alias (str): Alias given to the source DataFrame in the merge.
        hash_column (str): Name of the hash column present on both sides.

    Returns:
        str: "<target_alias>.<hash_column> = <source_alias>.<hash_column>"
    """
    return "{}.{} = {}.{}".format(target_alias, hash_column, source_alias, hash_column)

# Function to merge Delta tables based on SCD flag
def merge_delta_table(source_df: DataFrame, delta_location: str, hash_columns: list, is_scd: bool):
    """
    Persist source_df to the Delta table at delta_location.

    A record_hash column built from hash_columns is added to the source first.
    Then:
      * is_scd is False: the location is overwritten with the hashed source.
      * is_scd is True and a Delta table exists: the source is merged into it on
        record_hash. Matched rows are updated in place with all source columns,
        unmatched rows are inserted. No row history is kept.
      * is_scd is True and no Delta table exists: the hashed source is written
        as the initial table.

    Args:
        source_df (DataFrame): Rows to persist.
        delta_location (str): Path of the target Delta table.
        hash_columns (list): Columns that feed record_hash, i.e. the merge key.
        is_scd (bool): Merge (True) or overwrite (False).

    Raises:
        Exception: Any error raised while merging or writing is logged and then
            re-raised, so a failed load is not mistaken for a successful one by
            the cells that run afterwards.
    """
    # Generate hash column for the source DataFrame
    source_df_hashed = generate_hash_column(source_df, hash_columns)

    # For non-SCD tables, just overwrite the data
    if not is_scd:
        print(f"{delta_location} is not an SCD table. Overwriting...")
        source_df_hashed.write.mode("overwrite").format("delta").save(delta_location)
        print(f"DataFrame overwritten at {delta_location}.")
        return

    try:
        if DeltaTable.isDeltaTable(spark, delta_location):
            print(f"Delta table found at {delta_location}. Proceeding with merge...")
            delta_table = DeltaTable.forPath(spark, delta_location)

            # Alias for target and source
            target_alias = "target"
            source_alias = "src"

            # The target already stores record_hash from the write that created it,
            # so it is matched directly against the freshly hashed source.
            merge_condition = build_merge_condition_with_hash(target_alias, source_alias, "record_hash")

            delta_table.alias(target_alias).merge(
                source=source_df_hashed.alias(source_alias),
                condition=merge_condition
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

            print(f"Merge completed for {delta_location}.")
        else:
            # If Delta table doesn't exist, write the DataFrame to that location
            print(f"No Delta table found at {delta_location}. Writing new DataFrame...")
            source_df_hashed.write.mode("overwrite").format("delta").save(delta_location)
            print(f"DataFrame written to {delta_location}.")
    except Exception as e:
        print(f"Error processing table at {delta_location}: {e}")
        # Surface the failure instead of silently continuing with a stale or missing table
        raise

In [0]:
# Persist every table described in tables_info
for table_info in tables_info:
    # Unpack the plan entry
    delta_location = table_info["location"]
    hash_columns = table_info["hash_columns"]
    is_scd = table_info["is_scd"]

    # Log the schema of the DataFrame about to be written
    print("Schema of source DataFrame:")
    table_info["df"].printSchema()

    # One row per hash-column combination, so the merge never sees two source rows
    # with the same record_hash
    df = deduplicate_source_df(table_info["df"], hash_columns)

    # Merge into the existing Delta table or overwrite it, depending on is_scd
    merge_delta_table(df, delta_location, hash_columns, is_scd)

Schema of source DataFrame:
root
 |-- user_id: integer (nullable = true)
 |-- full_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- vehicle_id: string (nullable = false)
 |-- vehicle_make: string (nullable = false)
 |-- vehicle_model: string (nullable = false)
 |-- vehicle_year: string (nullable = false)
 |-- vehicle_color: string (nullable = false)
 |-- vehicle_license_plate: string (nullable = false)
 |-- processing_date: timestamp (nullable = true)

Delta table found at /mnt/gold/dim_user. Proceeding with merge...
Merge completed for /mnt/gold/dim_user.
Schema of source DataFrame:
root
 |-- date_key: string (nullable = false)
 |-- full_date: date (nullable = false)
 |-- day: integer (nullable = false)
 |-- day_name: string (nullable = false)
 |-- day_of_week: integer (nullable = false)
 |-- week_of_year: integer (nullable = false)
 |-- month: integer (nullable = false)
 |-- month_name: string (nullable = false)
 |--

In [0]:
# Read the dim_user Delta table back from its gold location and preview it sorted by full_name.
spark.read.format("delta").load("/mnt/gold/dim_user").orderBy("full_name").show()

+-------+---------------+--------------------+--------------+----------+-------------+-------------+------------+-------------+---------------------+-------------------+--------------------+
|user_id|      full_name|               email|  phone_number|vehicle_id| vehicle_make|vehicle_model|vehicle_year|vehicle_color|vehicle_license_plate|    processing_date|         record_hash|
+-------+---------------+--------------------+--------------+----------+-------------+-------------+------------+-------------+---------------------+-------------------+--------------------+
| 700001|   Aaron Abbott|angela67@example.com|(453) 832-3608|    168434|      Hyundai|     Santa Fe|        2006|       Silver|              3BW 116|2024-10-15 00:15:01|6e1ef69ae8734c534...|
|      1|   Aaron Acosta| vbishop@example.net|(677) 367-9557| Passenger|    Passenger|    Passenger|   Passenger|    Passenger|            Passenger|2024-10-15 00:15:01|21d31195f82942e05...|
| 700002|   Aaron Acosta|vanessa20@example...

In [0]:
# Register every gold Delta location as an external table in the gold schema.

# Schema creation and the switch of the current database do not depend on the
# table being registered, so both run once, before the loop.
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")
spark.sql("USE gold")

for name in dataframe_names:
    print(name)
    table_name = f"gold.{name}"
    print(f"Creating external table: {table_name}")

    # Each table lives in the directory of the same name under /mnt/gold
    delta_table_path = f"/mnt/gold/{name}"
    print(f"Delta table path: {delta_table_path}")

    # IF NOT EXISTS keeps the cell re-runnable: an already registered table is left untouched
    spark.sql(f"""
        CREATE EXTERNAL TABLE IF NOT EXISTS {table_name}
        USING DELTA 
        LOCATION '{delta_table_path}'
    """)

# Show the tables after the loop to inspect the result
spark.sql("SHOW TABLES IN gold").show()


dim_calendar
Creating external table: gold.dim_calendar
Delta table path: /mnt/gold/dim_calendar
dim_location
Creating external table: gold.dim_location
Delta table path: /mnt/gold/dim_location
dim_payment
Creating external table: gold.dim_payment
Delta table path: /mnt/gold/dim_payment
dim_user
Creating external table: gold.dim_user
Delta table path: /mnt/gold/dim_user
fct_request
Creating external table: gold.fct_request
Delta table path: /mnt/gold/fct_request
+--------+------------+-----------+
|database|   tableName|isTemporary|
+--------+------------+-----------+
|    gold|dim_calendar|      false|
|    gold|dim_location|      false|
|    gold| dim_payment|      false|
|    gold|    dim_user|      false|
|    gold| fct_request|      false|
+--------+------------+-----------+



In [0]:
from delta.tables import DeltaTable

# Maintenance runs only when no vacuum was started within this many days
VACUUM_LOOKBACK_DAYS = 30

# Dictionary to store the DeltaTable objects, keyed by bare table name
delta_tables = {}

for table in dataframe_names:
    table_name = f"gold.{table}"
    print(table_name)
    print(table)
    # Look the table up by its metastore name (gold.<table>), not by path
    delta_tables[table] = DeltaTable.forName(spark, table_name)

    # history(n) limits the result to the latest n commits, not n days, so the
    # time window is applied on the commit timestamp instead.
    recent_vacuum_count = (
        delta_tables[table]
        .history()
        .filter(
            "operation = 'VACUUM START' "
            f"AND timestamp >= current_timestamp() - INTERVAL {VACUUM_LOOKBACK_DAYS} DAYS"
        )
        .count()
    )

    if recent_vacuum_count == 0:
        # optimize() only returns a builder; executeCompaction() is what actually
        # rewrites the small files.
        delta_tables[table].optimize().executeCompaction()
        # Perform vacuum operation with the table's default retention period
        delta_tables[table].vacuum()


gold.dim_calendar
dim_calendar
gold.dim_location
dim_location
gold.dim_payment
dim_payment
gold.dim_user
dim_user
gold.fct_request
fct_request


In [0]:
# Row count of the fact table, read straight from its gold Delta location
spark.read.format("delta").load("/mnt/gold/fct_request").count()


292201

In [0]:
# Join the DataFrames with aliases for clarity.
# NOTE(review): the whole statement below is wrapped in a bare triple-quoted string,
# so this cell is a no-op expression and none of it executes.
# NOTE(review): the string references dp.method / dp.status, whereas the dim_payment
# built earlier in this notebook exposes method_name / status_name — confirm and fix
# the column names before re-enabling this cell.
"""
fact_request_with_user_payment = fact_request.alias("fr") \
    .join(dim_user.alias("du"), col("fr.driver_id") == col("du.user_id"), "left") \
    .join(dim_user.alias("pu"), col("fr.passenger_id") == col("pu.user_id"), "left") \
    .join(dim_payment.alias("dp"), col("fr.s_payment") == col("dp.s_payment"), "left") \
    .select(
        col("fr.driver_id"),
        col("fr.passenger_id"),
        col("fr.pickup_location_id"),
        col("fr.dropoff_location_id"),
        col("fr.request_datekey"),
        col("fr.accept_datekey"),
        col("fr.trip_start_datekey"),
        col("fr.trip_end_datekey"),
        col("fr.s_payment"),
        col("fr.trip_distance"),
        col("fr.base_fare"),
        col("fr.extra_fare"),
        col("fr.mta_tax"),
        col("fr.tip_amount"),
        col("fr.tolls_amount"),
        col("fr.improvement_surcharge"),
        col("du.full_name").alias("driver_name"),
        col("du.email").alias("driver_email"),
        col("du.phone_number").alias("driver_phone"),
        col("du.vehicle_id").alias("driver_vehicle_id"),
        col("du.vehicle_make").alias("driver_vehicle_make"),
        col("du.vehicle_model").alias("driver_vehicle_model"),
        col("du.vehicle_year").alias("driver_vehicle_year"),
        col("du.vehicle_color").alias("driver_vehicle_color"),
        col("du.vehicle_license_plate").alias("driver_vehicle_license_plate"),
        col("pu.full_name").alias("passenger_name"),
        col("pu.email").alias("passenger_email"),
        col("pu.phone_number").alias("passenger_phone"),
        col("dp.method").alias("payment_method"),
        col("dp.status").alias("payment_status")
    )

# Show the resulting DataFrame
fact_request_with_user_payment.orderBy("driver_name").show(10)
"""


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3799327554425638>, line 12[0m
[1;32m     10[0m [38;5;28mprint[39m(table)
[1;32m     11[0m [38;5;66;03m# Store the DeltaTable object in the dictionary using the file name as the key[39;00m
[0;32m---> 12[0m delta_tables[table] [38;5;241m=[39m DeltaTable[38;5;241m.[39mforPath(spark, table_name)  [38;5;66;03m# Load table from path[39;00m
[1;32m     14[0m [38;5;66;03m# Check if the table has been vacuumed in the last 30 days[39;00m
[1;32m     15[0m [38;5;28;01mif[39;00m delta_tables[table][38;5;241m.[39mhistory([38;5;241m30[39m)[38;5;241m.[39mfilter([38;5;124m"[39m[38;5;124moperation = [39m[38;5;124m'[39m[38;5;124mVACUUM START[39m[38;5;124m'[39m[38;5;124m"[39m)[38;5;241m.[39mcount() [38;5;241m==[39m [38;5;241m0[39m:
[1;32m     16[0m     [38;5;66;03