# Landing to bronze NOTEBOOK

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark
from delta import *
import os
import IPython


### - SPARK SESSION

In [2]:

# Assemble the session builder: Delta SQL extension, Delta catalog, native Hadoop
# libraries and a Hive jar coordinate.
# NOTE(review): configure_spark_with_delta_pip() appears to set "spark.jars.packages"
# itself, so the Hive coordinate below may be replaced by the Delta one — confirm, and
# pass it through the helper's extra_packages argument if the jar is really required.
builder = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.io.native.lib.available", "true")
    .config("spark.jars.packages", "org.apache.hive:hive-exec:2.3.9")  # Hive dependency
)

# Switch on Hive support, let the Delta helper finish the configuration,
# then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder.enableHiveSupport()).getOrCreate()

# Smoke test: make sure a Hive-format table can be created and queried.
spark.sql("CREATE TABLE IF NOT EXISTS test_hive_table (name STRING, age INT) USING hive")
spark.sql("SELECT * FROM test_hive_table").show()


+----+---+
|name|age|
+----+---+
+----+---+



In [3]:
# The notebook's working directory is the landing zone; the bronze layer lives in
# a "bronze" sub-folder of it.
landing_location = os.getcwd()
bronze_location = os.path.join(landing_location, "bronze")
print(landing_location, bronze_location, sep="\n")

e:\DEPIfINALpROJECT\DATABASEcSV
e:\DEPIfINALpROJECT\DATABASEcSV\bronze


In [4]:
# CSV files expected in the landing folder, in the order they are processed
file_names = [
    "Location.csv",
    "Payment.csv",
    "PaymentMethod.csv",
    "PaymentStatus.csv",
    "Request.csv",
    "Trip.csv",
    "User.csv",
    "Vehicles.csv",
    "VehicleMakes.csv",
]

In [5]:
# Function to load CSV files into DataFrames
def load_csv_files(file_names, landing_location):
    """
    Read every listed CSV file from the landing folder into a Spark DataFrame.

    Each file is read through the notebook-level ``spark`` session with the first
    line treated as a header. No schema is supplied, and the schemas printed later
    in this notebook show every column as a string.

    Args:
        file_names (list): CSV file names (for example "Trip.csv") to load.
        landing_location (str): Folder that contains the CSV files.

    Returns:
        dict: Maps "<lower-cased text before '.csv'>_df_raw" to the DataFrame
            read from the corresponding file, in the order of ``file_names``.
    """
    loaded = {}
    for csv_name in file_names:
        # Key = everything before the first ".csv", lower-cased, plus a "_df_raw" suffix
        key = csv_name.partition(".csv")[0].lower() + "_df_raw"
        print(key)

        # Spark accepts "/" as separator regardless of the host operating system
        source_path = f"{landing_location}/{csv_name}"
        header_reader = spark.read.format('csv').option("header", "true")
        loaded[key] = header_reader.load(source_path)

    return loaded




In [6]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Read every landing CSV into its own raw DataFrame, keyed "<name>_df_raw"
dataframes = load_csv_files(file_names, landing_location)

# Publish each DataFrame as a notebook-level variable named after its key,
# e.g. location_df_raw, so later cells can reference it directly
for df_name in dataframes:
    df = dataframes[df_name]
    globals()[df_name] = df

# Quick overview of the loaded keys and the columns of each DataFrame
print(dataframes.items())

location_df_raw
payment_df_raw
paymentmethod_df_raw
paymentstatus_df_raw
request_df_raw
trip_df_raw
user_df_raw
vehicles_df_raw
vehiclemakes_df_raw
dict_items([('location_df_raw', DataFrame[LocationID: string, Longitude: string, Latitude: string]), ('payment_df_raw', DataFrame[PaymentID: string, PaymentMethodID: string, PaymentStatusID: string]), ('paymentmethod_df_raw', DataFrame[PaymentMethodID: string, MethodName: string]), ('paymentstatus_df_raw', DataFrame[PaymentStatusID: string, StatusName: string]), ('request_df_raw', DataFrame[RequestID: string, PassengerID: string, PickupLocationID: string, DropoffLocationID: string, RequestTime: string, AcceptTime: string]), ('trip_df_raw', DataFrame[TripID: string, RequestID: string, DriverID: string, VehicleID: string, PaymentID: string, TripStartTime: string, TripEndTime: string, TripDistance: string, driver_rating: string, BaseFare: string, ExtraFare: string, MtaTax: string, TipAmount: string, TollsAmount: string, ImprovementSurcharge: s

In [7]:
# Preview every raw DataFrame: a five-row sample followed by its schema
for df_name in dataframes:
    df = dataframes[df_name]
    print(f"Showing first 5 rows of {df_name}:")
    df.show(5)
    df.printSchema()

Showing first 5 rows of location_df_raw:
+----------+------------+-----------+
|LocationID|   Longitude|   Latitude|
+----------+------------+-----------+
|----------|   ---------|   --------|
|     76173|-73.99945068|40.72192383|
|    567607|-73.77745056|40.64664841|
|    498838|-73.95517731|40.76498795|
|    138392|-73.99216461|40.72513962|
+----------+------------+-----------+
only showing top 5 rows

root
 |-- LocationID: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Latitude: string (nullable = true)

Showing first 5 rows of payment_df_raw:
+---------+---------------+---------------+
|PaymentID|PaymentMethodID|PaymentStatusID|
+---------+---------------+---------------+
|---------|---------------|---------------|
|        6|              6|              4|
|        6|              6|              2|
|        3|              3|              1|
|        6|              6|              4|
+---------+---------------+---------------+
only showing top 5 rows

ro

In [8]:

def add_date_columns(dataframes):
    """
    Add load-metadata columns to every DataFrame in ``dataframes``, in place.

    Three columns are appended to each DataFrame:
        _processing_date: current timestamp truncated to whole seconds.
        _input_filename: value of input_file_name() for the row.
        _input_file_modification_date: the "_metadata.file_modification_time" field.

    The dictionary itself is updated (each key is rebound to the extended
    DataFrame); nothing is returned.

    NOTE(review): the timestamp is a Spark column expression, not a fixed value.
    The outputs in this notebook show a different second when the data is displayed
    than when it is written, so it looks like it is evaluated per query — confirm
    whether one fixed value per run is wanted.

    Args:
        dataframes (dict): Maps a DataFrame name to a DataFrame read from a file.
    """
    run_timestamp = date_trunc('second', current_timestamp())

    # A "_pipeline_run_id" column (taken from a dbutils widget) is not added here.
    for key in list(dataframes):
        enriched = dataframes[key].withColumn("_processing_date", run_timestamp)
        enriched = enriched.withColumn("_input_filename", input_file_name())
        enriched = enriched.withColumn(
            "_input_file_modification_date",
            col("_metadata.file_modification_time"),
        )
        # Rebind the key so callers holding the dictionary see the new columns
        dataframes[key] = enriched



In [9]:

# Extend every DataFrame in the dictionary with the load-metadata columns
add_date_columns(dataframes)

# The dictionary now holds new DataFrame objects, so rebind the notebook-level
# variables (location_df_raw, ...) to them
for df_name in dataframes:
    df = dataframes[df_name]
    globals()[df_name] = df

location_df_raw.show(5)


+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|----------|   ---------|   --------|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|     76173|-73.99945068|40.72192383|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    567607|-73.77745056|40.64664841|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    498838|-73.95517731|40.76498795|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    138392|-73.99216461|40.72513962|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
only showing top 5 

In [10]:

# Show a five-row sample of each DataFrame to check the added metadata columns
for df_name in dataframes:
    df = dataframes[df_name]
    print(f"Updated DataFrame: {df_name}")
    df.show(5)

Updated DataFrame: location_df_raw
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|----------|   ---------|   --------|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|     76173|-73.99945068|40.72192383|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    567607|-73.77745056|40.64664841|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    498838|-73.95517731|40.76498795|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    138392|-73.99216461|40.72513962|2024-10-13 22:29:56|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
+----------+------------+-----------+-------------------+--------------------+---------------

In [11]:
from delta.tables import DeltaTable

def write_to_delta(dataframes, bronze_location):
    """
    Persist every DataFrame as a Delta table below ``bronze_location``.

    Each DataFrame is saved to ``<bronze_location>/<df_name>``. If that path already
    holds a Delta table the rows are appended, otherwise the table is created
    (written in "overwrite" mode). Because existing tables are appended to, calling
    this again with the same DataFrames adds the same rows a second time.

    Args:
        dataframes (dict): Maps a table folder name to the DataFrame to write.
        bronze_location (str): The base path where Delta tables will be stored.
    """
    for df_name, df in dataframes.items():
        # Build the path with os.path.join rather than a hard-coded "\\" so the
        # table lands in a real sub-folder on non-Windows hosts as well.
        delta_table_path = os.path.join(bronze_location, df_name)

        # Only the save mode differs between the two cases
        if DeltaTable.isDeltaTable(spark, delta_table_path):
            print(f"Table {df_name} already exists, appending data.")
            write_mode = "append"
        else:
            print(f"Creating new table for {df_name}.")
            write_mode = "overwrite"

        df.write.mode(write_mode).format("delta").save(delta_table_path)


In [12]:
# Persist every DataFrame in `dataframes` as a Delta table under the bronze folder
# (appends when the table already exists, otherwise creates it).
write_to_delta(dataframes, bronze_location)


Creating new table for location_df_raw.
Creating new table for payment_df_raw.
Creating new table for paymentmethod_df_raw.
Creating new table for paymentstatus_df_raw.
Creating new table for request_df_raw.
Creating new table for trip_df_raw.
Creating new table for user_df_raw.
Creating new table for vehicles_df_raw.
Creating new table for vehiclemakes_df_raw.


In [13]:
# Register each bronze Delta folder written by write_to_delta as an external
# table in the `bronze` schema.

# Creating the schema and selecting it do not depend on the file being processed,
# so they run once instead of once per file.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
# Set the current database to 'bronze' (it stays selected for the following cells)
spark.sql("USE bronze")

for file in file_names:
    # Same derivation as the DataFrame keys: text before ".csv", lower-cased
    base_name = file.split('.csv')[0].lower()
    table_name = f"bronze.{base_name}"
    print(f"Creating external table: {table_name}")
    # Echo of the qualified name, kept so the cell output is unchanged
    print(table_name)

    # Folder written by write_to_delta for this file; os.path.join avoids the
    # hard-coded Windows separator.
    delta_table_path = os.path.join(bronze_location, f"{base_name}_df_raw")
    spark.sql(f"CREATE EXTERNAL TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION r'{delta_table_path}'")


Creating external table: bronze.location
bronze.location
Creating external table: bronze.payment
bronze.payment
Creating external table: bronze.paymentmethod
bronze.paymentmethod
Creating external table: bronze.paymentstatus
bronze.paymentstatus
Creating external table: bronze.request
bronze.request
Creating external table: bronze.trip
bronze.trip
Creating external table: bronze.user
bronze.user
Creating external table: bronze.vehicles
bronze.vehicles
Creating external table: bronze.vehiclemakes
bronze.vehiclemakes


In [14]:
# Show all tables in the 'bronze' schema to check that the external tables were registered
spark.sql("SHOW TABLES IN bronze").show()


+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|   bronze|       location|      false|
|   bronze|        payment|      false|
|   bronze| payment_method|      false|
|   bronze| payment_status|      false|
|   bronze|  paymentmethod|      false|
|   bronze|  paymentstatus|      false|
|   bronze|        request|      false|
|   bronze|test_hive_table|      false|
|   bronze|           trip|      false|
|   bronze|           user|      false|
|   bronze|        vehicle|      false|
|   bronze|   vehicle_make|      false|
|   bronze|   vehiclemakes|      false|
|   bronze|       vehicles|      false|
+---------+---------------+-----------+



In [15]:

# Verify the current schema (database) of the session
print("Current Schema:")
spark.sql("SELECT current_database()").show()

# List all tables in the bronze schema, then read one row from two of them
# to confirm the external tables resolve to readable data
print("Tables in bronze schema:")
spark.sql("SHOW TABLES IN bronze").show()
spark.sql("SELECT * FROM bronze.location LIMIT 1").show()
spark.sql("SELECT * FROM bronze.payment LIMIT 1").show()


# Heading for the Delta table history.
# NOTE(review): this cell only prints the heading; no history query follows here —
# the history is displayed by the next cell via DeltaTable.history().
print("Describing history for bronze.location:")





Current Schema:
+------------------+
|current_database()|
+------------------+
|            bronze|
+------------------+

Tables in bronze schema:
+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|   bronze|       location|      false|
|   bronze|        payment|      false|
|   bronze| payment_method|      false|
|   bronze| payment_status|      false|
|   bronze|  paymentmethod|      false|
|   bronze|  paymentstatus|      false|
|   bronze|        request|      false|
|   bronze|test_hive_table|      false|
|   bronze|           trip|      false|
|   bronze|           user|      false|
|   bronze|        vehicle|      false|
|   bronze|   vehicle_make|      false|
|   bronze|   vehiclemakes|      false|
|   bronze|       vehicles|      false|
+---------+---------------+-----------+

+----------+---------+--------+-------------------+--------------------+-----------------------------+
|LocationID|Longitude|Latitud

In [16]:
from delta.tables import *

# Open the bronze `location` Delta table by path and display its full commit history.
# The path is derived from bronze_location (the folder the tables were written to)
# instead of a relative, backslash-separated literal, so it does not depend on the
# current working directory or on the Windows path separator.
deltaTable = DeltaTable.forPath(spark, os.path.join(bronze_location, "location_df_raw"))
fullHistoryDF = deltaTable.history()
fullHistoryDF.show(truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                      |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------------------+------------+-----------------------------------+
|0      |2024-10-13 22:30:04.049|NULL  |NULL    |WRITE    |{mode -> Overwrite, partitionBy -> []}|NULL|NULL    |NULL     |NULL       |Serializable  

In [17]:
# Count the rows of bronze.location per _processing_date value (one output row per distinct value)
spark.sql("SELECT _processing_date, count(*) cnt FROM bronze.location GROUP BY _processing_date").show()

+-------------------+-------+
|   _processing_date|    cnt|
+-------------------+-------+
|2024-10-13 22:29:58|1148685|
+-------------------+-------+



In [18]:
# Preview rows of bronze.location, including the added metadata columns
spark.sql("select * from bronze.location").show()

+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|----------|   ---------|   --------|2024-10-13 22:29:58|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|     76173|-73.99945068|40.72192383|2024-10-13 22:29:58|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    567607|-73.77745056|40.64664841|2024-10-13 22:29:58|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    498838|-73.95517731|40.76498795|2024-10-13 22:29:58|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    138392|-73.99216461|40.72513962|2024-10-13 22:29:58|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    532266|-73.94660187|40.77573395|2024-10-13 22:29:58|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    144884|-73.991

In [19]:
# Load the bronze.user table as a DataFrame.
# NOTE(review): `user` is not referenced again in the cells visible here.
user = spark.read.table("bronze.user")


In [20]:
from delta.tables import DeltaTable

# Dictionary to store the DeltaTable objects, keyed by lower-cased file name
delta_tables = {}

# history(n) limits the result to the latest n commits, not to a number of days,
# so the 30-day window is expressed on the commit timestamp instead.
recent_vacuum_filter = (
    "operation = 'VACUUM START' "
    "AND timestamp >= current_timestamp() - INTERVAL 30 DAYS"
)

for file in file_names:
    file_name = file.split('.csv')[0].lower()  # Extract the file name (without .csv extension)
    table_name = f"bronze.{file_name}"  # Define the table name in the bronze schema

    print(file_name)
    print(table_name)

    # Store the DeltaTable object in the dictionary using the file name as the key
    delta_table = DeltaTable.forName(spark, table_name)
    delta_tables[file_name] = delta_table

    # Compact and vacuum the table if no vacuum was recorded in the last 30 days.
    if delta_table.history().filter(recent_vacuum_filter).count() == 0:
        # optimize() only returns a builder; executeCompaction() actually runs it
        delta_table.optimize().executeCompaction()
        delta_table.vacuum()  # Default = 7 days



location
bronze.location
payment
bronze.payment
paymentmethod
bronze.paymentmethod
paymentstatus
bronze.paymentstatus
request
bronze.request
trip
bronze.trip
user
bronze.user
vehicles
bronze.vehicles
vehiclemakes
bronze.vehiclemakes


In [21]:
# Total number of rows in bronze.location
spark.sql("SELECT COUNT(*) FROM bronze.location").show()


+--------+
|count(1)|
+--------+
| 1148685|
+--------+



In [22]:

# Stop the Spark session, then clear the notebook namespace with the IPython
# %reset magic so no variables from this run remain in the kernel
spark.stop()
%reset -f

