In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark
from delta import *
import os


In [2]:

# Build a Spark session with the Delta Lake extensions and Hive support.
# NOTE: configure_spark_with_delta_pip() sets "spark.jars.packages" itself, so any
# value placed on the builder beforehand is overwritten (the hive-exec coordinate
# that used to be configured here never reached the session). If extra Maven
# packages are ever needed, pass them through its `extra_packages` argument.
builder = (
    SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.io.native.lib.available", "true")
)

# Enable Hive support explicitly and get (or reuse) the Spark session
spark = configure_spark_with_delta_pip(builder.enableHiveSupport()).getOrCreate()

# Smoke-test Hive support. The table is qualified with `default` so that re-running
# this cell after a later `USE <schema>` does not create a stray copy in that schema.
spark.sql("CREATE TABLE IF NOT EXISTS default.test_hive_table (name STRING, age INT) USING hive")
spark.sql("SELECT * FROM default.test_hive_table").show()


+----+---+
|name|age|
+----+---+
+----+---+



In [3]:
from delta.tables import *
#spark.conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

In [4]:
# Show the value of the "spark.jars" setting on the running session
resolved_jars = spark.sparkContext.getConf().get("spark.jars")
print(resolved_jars)


file:///C:/Users/mouda/.ivy2/jars/io.delta_delta-spark_2.12-3.2.0.jar,file:///C:/Users/mouda/.ivy2/jars/io.delta_delta-storage-3.2.0.jar,file:///C:/Users/mouda/.ivy2/jars/org.antlr_antlr4-runtime-4.9.3.jar


In [5]:
# List the tables registered in the current database.
# show() prints the result itself and returns None, so it must not be wrapped
# in print() (that only appended a stray "None" to the output).
spark.sql("SHOW TABLES").show()

+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|  default|test_hive_table|      false|
+---------+---------------+-----------+

None


In [6]:
# Landing zone = the notebook's working directory; bronze = its "bronze" sub-folder
landing_location = os.getcwd()
bronze_location = os.path.join(landing_location, "bronze")
print(landing_location, bronze_location, sep="\n")

e:\DEPIfINALpROJECT\DATABASEcSV
e:\DEPIfINALpROJECT\DATABASEcSV\bronze


In [7]:
# Source CSV files to ingest from the landing directory
file_names = [
    "Location.csv",
    "Payment.csv",
    "PaymentMethod.csv",
    "PaymentStatus.csv",
    "Request.csv",
    "Trip.csv",
    "User.csv",
    "Vehicles.csv",
    "VehicleMakes.csv",
]

In [52]:
# Function to load CSV files into DataFrames
def load_csv_files(file_names, landing_location, spark_session=None):
    """
    Load each CSV file into a Spark DataFrame, reading the first line as the header.

    Args:
        file_names (list): List of CSV filenames to be loaded.
        landing_location (str): Path where the CSV files are located.
        spark_session (optional): Session used for reading. Defaults to the
            notebook-level ``spark`` session, so existing two-argument calls
            behave exactly as before.

    Returns:
        dict: DataFrames keyed by ``<lower-cased file name without ".csv">_df_raw``
        (e.g. ``"Location.csv"`` -> ``"location_df_raw"``), in input order.
    """
    # Only fall back to the global session when none was injected; this keeps the
    # function usable (and testable) without relying on hidden notebook state.
    session = spark if spark_session is None else spark_session

    dataframes = {}
    for file_name in file_names:
        # Drop the ".csv" extension and use the lower-cased stem as the DataFrame key
        df_name = file_name.split(".csv")[0].lower() + "_df_raw"
        print(df_name)
        # "/" is kept as the separator: it is what the reader was already given
        df_path = f"{landing_location}/{file_name}"
        dataframes[df_name] = (
            session.read.format('csv').option("header", "true").load(df_path)
        )

    return dataframes




In [53]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Read every landing CSV into its own raw DataFrame
dataframes = load_csv_files(file_names, landing_location)

# Publish each DataFrame as a notebook-level variable named after its dictionary key
globals().update(dataframes)

#location_df_raw.show(5)
print(dataframes.items())

location_df_raw
payment_df_raw
paymentmethod_df_raw
paymentstatus_df_raw
request_df_raw
trip_df_raw
user_df_raw
vehicles_df_raw
vehiclemakes_df_raw
dict_items([('location_df_raw', DataFrame[LocationID: string, Longitude: string, Latitude: string]), ('payment_df_raw', DataFrame[PaymentID: string, PaymentMethodID: string, PaymentStatusID: string]), ('paymentmethod_df_raw', DataFrame[PaymentMethodID: string, MethodName: string]), ('paymentstatus_df_raw', DataFrame[PaymentStatusID: string, StatusName: string]), ('request_df_raw', DataFrame[RequestID: string, PassengerID: string, PickupLocationID: string, DropoffLocationID: string, RequestTime: string, AcceptTime: string]), ('trip_df_raw', DataFrame[TripID: string, RequestID: string, DriverID: string, VehicleID: string, PaymentID: string, TripStartTime: string, TripEndTime: string, TripDistance: string, driver_rating: string, BaseFare: string, ExtraFare: string, MtaTax: string, TipAmount: string, TollsAmount: string, ImprovementSurcharge: s

In [54]:
# Preview the first rows and the schema of every loaded DataFrame
for name, frame in dataframes.items():
    print(f"Showing first 5 rows of {name}:")
    frame.show(5)
    frame.printSchema()

Showing first 5 rows of location_df_raw:
+----------+------------+-----------+
|LocationID|   Longitude|   Latitude|
+----------+------------+-----------+
|----------|   ---------|   --------|
|     76173|-73.99945068|40.72192383|
|    567607|-73.77745056|40.64664841|
|    498838|-73.95517731|40.76498795|
|    138392|-73.99216461|40.72513962|
+----------+------------+-----------+
only showing top 5 rows

root
 |-- LocationID: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Latitude: string (nullable = true)

Showing first 5 rows of payment_df_raw:
+---------+---------------+---------------+
|PaymentID|PaymentMethodID|PaymentStatusID|
+---------+---------------+---------------+
|---------|---------------|---------------|
|        6|              6|              4|
|        6|              6|              2|
|        3|              3|              1|
|        6|              6|              4|
+---------+---------------+---------------+
only showing top 5 rows

ro

In [12]:

def add_date_columns(dataframes):
    """
    Add ingestion-metadata columns to every DataFrame in ``dataframes``.

    Columns added to each DataFrame:
        _processing_date: ``current_timestamp()`` truncated to the second.
        _input_filename: result of ``input_file_name()``.
        _input_file_modification_date: the ``_metadata.file_modification_time`` field.

    The dictionary is updated in place (each value is replaced by its enriched
    DataFrame); nothing is returned.

    Args:
        dataframes (dict): DataFrames keyed by name. Presumably they must come
            from a file-based read so that ``_metadata`` resolves -- TODO confirm.
    """
    # One shared column expression, reused for every DataFrame
    run_timestamp = date_trunc('second', current_timestamp())

    # TODO: an earlier draft also planned a "_pipeline_run_id" column read from dbutils widgets
    for key in list(dataframes):
        enriched = dataframes[key].withColumn("_processing_date", run_timestamp)
        enriched = enriched.withColumn("_input_filename", input_file_name())
        enriched = enriched.withColumn(
            "_input_file_modification_date", col("_metadata.file_modification_time")
        )
        # Replace the original entry with the enriched DataFrame
        dataframes[key] = enriched



In [13]:

# Enrich every DataFrame (in place, inside the dictionary) with the metadata columns
add_date_columns(dataframes)

# Re-publish the enriched DataFrames under their notebook-level variable names
globals().update(dataframes)

location_df_raw.show(5)


+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|----------|   ---------|   --------|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|     76173|-73.99945068|40.72192383|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    567607|-73.77745056|40.64664841|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    498838|-73.95517731|40.76498795|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    138392|-73.99216461|40.72513962|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
only showing top 5 

In [14]:

# Show a sample of each enriched DataFrame to confirm the new columns are present
for name, frame in dataframes.items():
    print(f"Updated DataFrame: {name}")
    frame.show(5)

Updated DataFrame: location_df_raw
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|----------|   ---------|   --------|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|     76173|-73.99945068|40.72192383|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    567607|-73.77745056|40.64664841|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    498838|-73.95517731|40.76498795|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    138392|-73.99216461|40.72513962|2024-10-11 08:21:43|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
+----------+------------+-----------+-------------------+--------------------+---------------

In [15]:
from delta.tables import DeltaTable

def write_to_delta(dataframes, bronze_location):
    """
    Write every DataFrame to its own Delta table directory under ``bronze_location``.

    For each entry the target is ``<bronze_location>/<df_name>``. If that path
    already holds a Delta table the rows are appended as-is (no de-duplication
    is performed, so repeated runs add the same rows again); otherwise the table
    is created with an overwrite-mode write.

    Args:
        dataframes (dict): Dictionary of DataFrames keyed by table directory name.
        bronze_location (str): The base path where Delta tables will be stored.
    """
    for df_name, df in dataframes.items():
        # os.path.join instead of a hard-coded "\\" keeps the path valid on every OS
        # and matches how bronze_location itself is built.
        delta_table_path = os.path.join(bronze_location, df_name)

        # Pick the write mode from whether a Delta table already exists at the path
        if DeltaTable.isDeltaTable(spark, delta_table_path):
            print(f"Table {df_name} already exists, appending data.")
            write_mode = "append"
        else:
            print(f"Creating new table for {df_name}.")
            write_mode = "overwrite"

        df.write.mode(write_mode).format("delta").save(delta_table_path)


In [16]:
# Persist every enriched DataFrame as a Delta table under the bronze directory
write_to_delta(dataframes=dataframes, bronze_location=bronze_location)


Table location_df_raw already exists, appending data.
Table payment_df_raw already exists, appending data.
Table paymentmethod_df_raw already exists, appending data.
Table paymentstatus_df_raw already exists, appending data.
Table request_df_raw already exists, appending data.
Table trip_df_raw already exists, appending data.
Table user_df_raw already exists, appending data.
Table vehicles_df_raw already exists, appending data.
Table vehiclemakes_df_raw already exists, appending data.


In [17]:
# Register each bronze Delta directory as an external table in the metastore.
# The schema setup does not depend on the file being processed, so it runs once
# instead of once per loop iteration.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
# Set the current database to 'bronze'
spark.sql("USE bronze")

for file in file_names:
    base_name = file.split('.csv')[0].lower()
    table_name = f"bronze.{base_name}"
    print(f"Creating external table: {table_name}")

    # Must point at the directory written earlier: <bronze_location>/<base_name>_df_raw.
    # os.path.join replaces the hard-coded "\\" so the path is valid on every OS.
    delta_table_path = os.path.join(bronze_location, f"{base_name}_df_raw")

    # r'...' is a raw SQL string literal, so backslashes in a Windows path are not
    # interpreted as escape sequences.
    spark.sql(f"CREATE EXTERNAL TABLE IF NOT EXISTS {table_name} USING DELTA LOCATION r'{delta_table_path}'")


Creating external table: bronze.location
bronze.location
Creating external table: bronze.payment
bronze.payment
Creating external table: bronze.paymentmethod
bronze.paymentmethod
Creating external table: bronze.paymentstatus
bronze.paymentstatus
Creating external table: bronze.request
bronze.request
Creating external table: bronze.trip
bronze.trip
Creating external table: bronze.user
bronze.user
Creating external table: bronze.vehicles
bronze.vehicles
Creating external table: bronze.vehiclemakes
bronze.vehiclemakes


In [18]:
# List every table registered under the 'bronze' schema
bronze_tables = spark.sql("SHOW TABLES IN bronze")
bronze_tables.show()


+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|   bronze|       location|      false|
|   bronze|        payment|      false|
|   bronze|  paymentmethod|      false|
|   bronze|  paymentstatus|      false|
|   bronze|        request|      false|
|   bronze|test_hive_table|      false|
|   bronze|           trip|      false|
|   bronze|           user|      false|
|   bronze|   vehiclemakes|      false|
|   bronze|       vehicles|      false|
+---------+---------------+-----------+



In [19]:

# Verify the current schema
print("Current Schema:")
spark.sql("SELECT current_database()").show()

# List all tables in the bronze schema
print("Tables in bronze schema:")
spark.sql("SHOW TABLES IN bronze").show()

# Spot-check one row from two of the registered tables
spark.sql("SELECT * FROM bronze.location LIMIT 1").show()
spark.sql("SELECT * FROM bronze.payment LIMIT 1").show()

# Describe the history of the Delta table.
# The header used to be printed without the query that produces the history.
print("Describing history for bronze.location:")
spark.sql("DESCRIBE HISTORY bronze.location").show(truncate=False)





Current Schema:
+------------------+
|current_database()|
+------------------+
|            bronze|
+------------------+

Tables in bronze schema:
+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|   bronze|       location|      false|
|   bronze|        payment|      false|
|   bronze|  paymentmethod|      false|
|   bronze|  paymentstatus|      false|
|   bronze|        request|      false|
|   bronze|test_hive_table|      false|
|   bronze|           trip|      false|
|   bronze|           user|      false|
|   bronze|   vehiclemakes|      false|
|   bronze|       vehicles|      false|
+---------+---------------+-----------+

+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+-----------------

In [48]:
from delta.tables import *

# Show the full commit history of the location Delta table.
# The path is built from bronze_location rather than a relative "bronze\\..." literal,
# so the cell does not depend on the working directory or on Windows separators.
deltaTable = DeltaTable.forPath(spark, os.path.join(bronze_location, "location_df_raw"))
fullHistoryDF = deltaTable.history()
fullHistoryDF.show(truncate=False)

+-------+-----------------------+------+--------+------------+--------------------------------------------------------------------+----+--------+---------+-----------+-----------------+-------------+----------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation   |operationParameters                                                 |job |notebook|clusterId|readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                      |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+------------+--------------------------------------------------------------------+----+--------+---------+-----------+-----------------+-------------+----------------------------------------------------------------------+------------+-----------------------------------+
|16     |2024-10-11 08:43:31.155|NULL  |

In [1]:
# Count the rows of bronze.location per _processing_date value
rows_per_run_sql = (
    "SELECT _processing_date, count(*) cnt "
    "FROM bronze.location GROUP BY _processing_date"
)
spark.sql(rows_per_run_sql).show()

NameError: name 'spark' is not defined

In [22]:
# Preview the contents of the bronze location table
location_preview_sql = "select * from bronze.location"
spark.sql(location_preview_sql).show()

+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|LocationID|   Longitude|   Latitude|   _processing_date|     _input_filename|_input_file_modification_date|
+----------+------------+-----------+-------------------+--------------------+-----------------------------+
|    568573|-73.77675629|40.64520645|2024-10-10 17:44:46|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    108784|-73.99476624|40.73990250|2024-10-10 17:44:46|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    507969|-73.95380402|40.76547623|2024-10-10 17:44:46|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    565124|-73.78221893|40.64458084|2024-10-10 17:44:46|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    467139|-73.96097565|40.77510834|2024-10-10 17:44:46|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|    314617|-73.97971344|40.76609421|2024-10-10 17:44:46|file:///e:/DEPIfI...|         2024-10-08 09:46:...|
|     55692|-74.002

In [23]:
# Load the registered bronze user table as a DataFrame
user = spark.read.table(tableName="bronze.user")


In [58]:
from delta.tables import DeltaTable

# Dictionary to store the DeltaTable objects, keyed by lower-cased file name
delta_tables = {}

for file in file_names:
    file_name = file.split('.csv')[0].lower()  # Extract the file name (without .csv extension)
    table_name = f"bronze.{file_name}"  # Define the table name in the bronze schema

    print(file_name)
    print(table_name)

    # Store the DeltaTable object in the dictionary using the file name as the key
    delta_table = DeltaTable.forName(spark, table_name)
    delta_tables[file_name] = delta_table

    # Compact and vacuum the table if it has not been vacuumed in the last 30 days.
    # history(30) would only return the latest 30 commits (a count, not a time
    # window), so the check filters on the commit timestamp instead.
    recent_vacuums = delta_table.history().filter(
        "operation = 'VACUUM START' AND timestamp >= current_timestamp() - INTERVAL 30 DAYS"
    )
    if recent_vacuums.count() == 0:
        # optimize() only returns a builder; executeCompaction() actually runs the compaction
        delta_table.optimize().executeCompaction()
        delta_table.vacuum()  # Default retention = 7 days



location
bronze.location
payment
bronze.payment
paymentmethod
bronze.paymentmethod
paymentstatus
bronze.paymentstatus
request
bronze.request
trip
bronze.trip
user
bronze.user
vehicles
bronze.vehicles
vehiclemakes
bronze.vehiclemakes


In [4]:
# Total number of rows currently in the bronze location table
row_count_sql = "SELECT COUNT(*) FROM bronze.location"
spark.sql(row_count_sql).show()



Py4JJavaError: An error occurred while calling o37.sql.
: org.sparkproject.guava.util.concurrent.UncheckedExecutionException: java.lang.NullPointerException
	at org.sparkproject.guava.cache.LocalCache$Segment.get(LocalCache.java:2263)
	at org.sparkproject.guava.cache.LocalCache.get(LocalCache.java:4000)
	at org.sparkproject.guava.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.getCachedPlan(SessionCatalog.scala:210)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.org$apache$spark$sql$execution$datasources$FindDataSourceTable$$readDataSourceTable(DataSourceStrategy.scala:248)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable$$anonfun$apply$2.applyOrElse(DataSourceStrategy.scala:296)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable$$anonfun$apply$2.applyOrElse(DataSourceStrategy.scala:278)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$2(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$4(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1216)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1215)
	at org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias.mapChildren(basicLogicalOperators.scala:1676)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$4(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1216)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1215)
	at org.apache.spark.sql.catalyst.plans.logical.Aggregate.mapChildren(basicLogicalOperators.scala:1151)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning(AnalysisHelper.scala:99)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning$(AnalysisHelper.scala:96)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperators(AnalysisHelper.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperators$(AnalysisHelper.scala:75)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:32)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.apply(DataSourceStrategy.scala:278)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.apply(DataSourceStrategy.scala:241)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:240)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:236)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:187)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:236)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:202)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:223)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:222)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:77)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:77)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.NullPointerException
	at org.apache.spark.sql.delta.storage.DelegatingLogStore.<init>(DelegatingLogStore.scala:38)
	at org.apache.spark.sql.delta.storage.LogStore$.createLogStoreWithClassName(LogStore.scala:288)
	at org.apache.spark.sql.delta.storage.LogStoreProvider.createLogStore(LogStore.scala:385)
	at org.apache.spark.sql.delta.storage.LogStoreProvider.createLogStore$(LogStore.scala:380)
	at org.apache.spark.sql.delta.storage.LogStore$.createLogStore(LogStore.scala:266)
	at org.apache.spark.sql.delta.storage.LogStore$.apply(LogStore.scala:279)
	at org.apache.spark.sql.delta.storage.LogStore$.apply(LogStore.scala:274)
	at org.apache.spark.sql.delta.storage.LogStoreProvider.createLogStore(LogStore.scala:322)
	at org.apache.spark.sql.delta.storage.LogStoreProvider.createLogStore$(LogStore.scala:321)
	at org.apache.spark.sql.delta.DeltaLog.createLogStore(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.DeltaLog.store$lzycompute(DeltaLog.scala:122)
	at org.apache.spark.sql.delta.DeltaLog.store(DeltaLog.scala:122)
	at org.apache.spark.sql.delta.Checkpoints.findLastCompleteCheckpointBefore(Checkpoints.scala:441)
	at org.apache.spark.sql.delta.Checkpoints.findLastCompleteCheckpointBefore$(Checkpoints.scala:431)
	at org.apache.spark.sql.delta.DeltaLog.findLastCompleteCheckpointBefore(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.Checkpoints.$anonfun$loadMetadataFromFile$1(Checkpoints.scala:398)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:136)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:128)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:117)
	at org.apache.spark.sql.delta.DeltaLog.recordOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:135)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:125)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:115)
	at org.apache.spark.sql.delta.DeltaLog.recordDeltaOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.Checkpoints.loadMetadataFromFile(Checkpoints.scala:375)
	at org.apache.spark.sql.delta.Checkpoints.$anonfun$loadMetadataFromFile$1(Checkpoints.scala:386)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:136)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:128)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:117)
	at org.apache.spark.sql.delta.DeltaLog.recordOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:135)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:125)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:115)
	at org.apache.spark.sql.delta.DeltaLog.recordDeltaOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.Checkpoints.loadMetadataFromFile(Checkpoints.scala:375)
	at org.apache.spark.sql.delta.Checkpoints.$anonfun$loadMetadataFromFile$1(Checkpoints.scala:386)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:136)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:128)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:117)
	at org.apache.spark.sql.delta.DeltaLog.recordOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:135)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:125)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:115)
	at org.apache.spark.sql.delta.DeltaLog.recordDeltaOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.Checkpoints.loadMetadataFromFile(Checkpoints.scala:375)
	at org.apache.spark.sql.delta.Checkpoints.$anonfun$loadMetadataFromFile$1(Checkpoints.scala:386)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:136)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:128)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:117)
	at org.apache.spark.sql.delta.DeltaLog.recordOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:135)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:125)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:115)
	at org.apache.spark.sql.delta.DeltaLog.recordDeltaOperation(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.Checkpoints.loadMetadataFromFile(Checkpoints.scala:375)
	at org.apache.spark.sql.delta.Checkpoints.readLastCheckpointFile(Checkpoints.scala:369)
	at org.apache.spark.sql.delta.Checkpoints.readLastCheckpointFile$(Checkpoints.scala:368)
	at org.apache.spark.sql.delta.DeltaLog.readLastCheckpointFile(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.SnapshotManagement.$anonfun$getSnapshotAtInit$2(SnapshotManagement.scala:575)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.SnapshotManagement.$anonfun$getSnapshotAtInit$1(SnapshotManagement.scala:573)
	at org.apache.spark.sql.delta.SnapshotManagement.withSnapshotLockInterruptibly(SnapshotManagement.scala:78)
	at org.apache.spark.sql.delta.SnapshotManagement.withSnapshotLockInterruptibly$(SnapshotManagement.scala:75)
	at org.apache.spark.sql.delta.DeltaLog.withSnapshotLockInterruptibly(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.SnapshotManagement.getSnapshotAtInit(SnapshotManagement.scala:573)
	at org.apache.spark.sql.delta.SnapshotManagement.getSnapshotAtInit$(SnapshotManagement.scala:572)
	at org.apache.spark.sql.delta.DeltaLog.getSnapshotAtInit(DeltaLog.scala:74)
	at org.apache.spark.sql.delta.SnapshotManagement.$init$(SnapshotManagement.scala:69)
	at org.apache.spark.sql.delta.DeltaLog.<init>(DeltaLog.scala:80)
	at org.apache.spark.sql.delta.DeltaLog$.$anonfun$apply$4(DeltaLog.scala:853)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.delta.DeltaLog$.$anonfun$apply$3(DeltaLog.scala:848)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.DeltaLog$.recordFrameProfile(DeltaLog.scala:651)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:136)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:128)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:117)
	at org.apache.spark.sql.delta.DeltaLog$.recordOperation(DeltaLog.scala:651)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:135)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:125)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:115)
	at org.apache.spark.sql.delta.DeltaLog$.recordDeltaOperation(DeltaLog.scala:651)
	at org.apache.spark.sql.delta.DeltaLog$.createDeltaLog$1(DeltaLog.scala:847)
	at org.apache.spark.sql.delta.DeltaLog$.$anonfun$apply$5(DeltaLog.scala:866)
	at com.google.common.cache.LocalCache$LocalManualCache$1.load(LocalCache.java:4792)
	at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
	at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
	at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
	at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2257)
	at com.google.common.cache.LocalCache.get(LocalCache.java:4000)
	at com.google.common.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
	at org.apache.spark.sql.delta.DeltaLog$.getDeltaLogFromCache$1(DeltaLog.scala:865)
	at org.apache.spark.sql.delta.DeltaLog$.apply(DeltaLog.scala:875)
	at org.apache.spark.sql.delta.DeltaLog$.forTable(DeltaLog.scala:751)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.$anonfun$deltaLog$1(DeltaTableV2.scala:92)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2$.withEnrichedUnsupportedTableException(DeltaTableV2.scala:367)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.deltaLog$lzycompute(DeltaTableV2.scala:92)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.deltaLog(DeltaTableV2.scala:90)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.$anonfun$initialSnapshot$4(DeltaTableV2.scala:145)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.$anonfun$initialSnapshot$1(DeltaTableV2.scala:145)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2$.withEnrichedUnsupportedTableException(DeltaTableV2.scala:367)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.initialSnapshot$lzycompute(DeltaTableV2.scala:144)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.initialSnapshot(DeltaTableV2.scala:124)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.toBaseRelation$lzycompute(DeltaTableV2.scala:236)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.toBaseRelation(DeltaTableV2.scala:234)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.$anonfun$createRelation$5(DeltaDataSource.scala:250)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.recordFrameProfile(DeltaDataSource.scala:49)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.createRelation(DeltaDataSource.scala:209)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:346)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.$anonfun$readDataSourceTable$1(DataSourceStrategy.scala:260)
	at org.sparkproject.guava.cache.LocalCache$LocalManualCache$1.load(LocalCache.java:4792)
	at org.sparkproject.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
	at org.sparkproject.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
	at org.sparkproject.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
	at org.sparkproject.guava.cache.LocalCache$Segment.get(LocalCache.java:2257)
	... 87 more


In [3]:
# Shut down the SparkSession (`spark`) that was built with getOrCreate() at the top of the notebook.
# NOTE(review): any cell run after this one that uses `spark` would need a new session first — confirm this is meant to be the final cell.
spark.stop()
