In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from delta.pip_utils import configure_spark_with_delta_pip
from delta.tables import DeltaTable

In [2]:
SparkDeltalakeConfig = SparkSession.builder\
    .master("local[*]")\
    .appName("sparkdev-deltalake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

In [3]:
spark = configure_spark_with_delta_pip(SparkDeltalakeConfig).getOrCreate()

:: loading settings :: url = jar:file:/Users/pavanmantha/venv-metal/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/pavanmantha/.ivy2/cache
The jars for the packages stored in: /Users/pavanmantha/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d0237427-3663-466e-a240-dc7588833b23;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/3.0.0/delta-spark_2.12-3.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-spark_2.12;3.0.0!delta-spark_2.12.jar (1293ms)
downloading https://repo1.maven.org/maven2/io/delta/delta-storage/3.0.0/delta-storage-3.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-storage;3.0.0!delta-storage.jar (307ms)
downloading https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/4.9.3/antlr4-runtime-4.9.3.jar ...
	[SUCCESSFUL ] org.antlr#antlr4-runtime;4.9.3!antlr4-runtime.jar (312ms)
:: resolution report :: reso

In [4]:
schema = StructType([
    StructField('Id', IntegerType(), True), # ID
    StructField('Date', StringType(), True), #date
    StructField('Location', StringType(), True),
    StructField('MinTemp', StringType(), True),
    StructField('MaxTemp', StringType(), True),
    StructField('Rainfall', StringType(), True)
    # AND OTHER FIELDS OMITTED TO MAKE THIS CODE BLOCK SMALL
])

In [5]:
df_weather = (
    spark
    .read.format("csv")
    .option("delimiter", ",")
    .option("header", "true")
    .option("encoding", "ISO-8859-1")
    .option('overwriteSchema', True)
    .schema(schema=schema)
    .load("./data/weather.csv")
)

df_weather.show(5)

23/10/19 15:48:41 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 25, schema size: 6
CSV file: file:///Users/pavanmantha/Pavans/PracticeExamples/DataScience_Practice/spark-handson/deltalake-pyspark/data/weather.csv


+---+----------+--------+-------+-------+--------+
| Id|      Date|Location|MinTemp|MaxTemp|Rainfall|
+---+----------+--------+-------+-------+--------+
|  0|2008-12-01|  Albury|   13.4|   22.9|     0.6|
|  1|2008-12-02|  Albury|    7.4|   25.1|     0.0|
|  2|2008-12-03|  Albury|   12.9|   25.7|     0.0|
|  3|2008-12-04|  Albury|    9.2|   28.0|     0.0|
|  4|2008-12-05|  Albury|   17.5|   32.3|     1.0|
+---+----------+--------+-------+-------+--------+
only showing top 5 rows



In [6]:
df_weather.write.format("delta")\
    .mode("overwrite")\
    .option('mergeSchema', True)\
    .saveAsTable("weather_delta_master")
    #.save("./delta")

23/10/19 15:48:44 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 25, schema size: 6
CSV file: file:///Users/pavanmantha/Pavans/PracticeExamples/DataScience_Practice/spark-handson/deltalake-pyspark/data/weather.csv
23/10/19 15:48:46 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
23/10/19 15:48:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
#df_weather_delta = spark.read.format("delta").load("./delta")

In [8]:
#df_weather_delta.select(["Date","MaxTemp"]).show(5, truncate=False)

In [9]:
#df_weather_delta.count()

## Read data from the delta lake using SQL

In [7]:
spark.sql("select * from weather_delta_master").limit(10).show(truncate=False)

+---+----------+--------+-------+-------+--------+
|Id |Date      |Location|MinTemp|MaxTemp|Rainfall|
+---+----------+--------+-------+-------+--------+
|0  |2008-12-01|Albury  |13.4   |22.9   |0.6     |
|1  |2008-12-02|Albury  |7.4    |25.1   |0.0     |
|2  |2008-12-03|Albury  |12.9   |25.7   |0.0     |
|3  |2008-12-04|Albury  |9.2    |28.0   |0.0     |
|4  |2008-12-05|Albury  |17.5   |32.3   |1.0     |
|5  |2008-12-06|Albury  |14.6   |29.7   |0.2     |
|6  |2008-12-07|Albury  |14.3   |25.0   |0.0     |
|7  |2008-12-08|Albury  |7.7    |26.7   |0.0     |
|8  |2008-12-09|Albury  |9.7    |31.9   |0.0     |
|9  |2008-12-10|Albury  |13.1   |30.1   |1.4     |
+---+----------+--------+-------+-------+--------+



In [8]:
spark.sql('select location, count(*) as _count from weather_delta_master '
          'group by location order by _count desc').show(truncate=False)

+-------------+------+
|location     |_count|
+-------------+------+
|Albury       |3011  |
|Cobar        |2988  |
|NorfolkIsland|2964  |
|Newcastle    |2955  |
|CoffsHarbour |2953  |
|NorahHead    |2929  |
|BadgerysCreek|2928  |
|Moree        |2854  |
|Penrith      |1418  |
+-------------+------+



In [32]:
spark.sql(
    'SELECT location, max(MaxTemp) AS MAX_TEMP FROM weather_delta_master GROUP BY location ORDER BY CAST(MAX_TEMP AS FLOAT) DESC'
).show(truncate=False)

+-------------+--------+
|location     |MAX_TEMP|
+-------------+--------+
|Penrith      |46.5    |
|BadgerysCreek|46.4    |
|Newcastle    |44.1    |
|NorahHead    |44.0    |
|CoffsHarbour |39.2    |
|NorfolkIsland|28.4    |
|Albury       |9.9     |
|Cobar        |9.7     |
|Moree        |9.6     |
+-------------+--------+



In [33]:
spark.sql("DESCRIBE TABLE weather_delta_master").limit(10).show(truncate=False)

+---------------+---------+-------+
|col_name       |data_type|comment|
+---------------+---------+-------+
|Id             |int      |       |
|Date           |string   |       |
|Location       |string   |       |
|MinTemp        |string   |       |
|MaxTemp        |string   |       |
|Rainfall       |string   |       |
|               |         |       |
|# Partitioning |         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



## Check delta table versions

In [37]:
dt = DeltaTable.forName(spark, "weather_delta_master")

In [38]:
dt.history().select("version","timestamp").show(truncate=False)

                                                                                

+-------+-----------------------+
|version|timestamp              |
+-------+-----------------------+
|2      |2023-05-26 07:19:01.864|
|1      |2023-05-25 20:43:06.7  |
|0      |2023-05-25 20:41:50.456|
+-------+-----------------------+



In [44]:
spark.sql('SELECT * FROM weather_delta_master VERSION as of 1').show(truncate=False)

                                                                                

+---+----------+--------+-------+-------+--------+
|Id |Date      |Location|MinTemp|MaxTemp|Rainfall|
+---+----------+--------+-------+-------+--------+
|0  |2008-12-01|Albury  |13.4   |22.9   |0.6     |
|1  |2008-12-02|Albury  |7.4    |25.1   |0.0     |
|2  |2008-12-03|Albury  |12.9   |25.7   |0.0     |
|3  |2008-12-04|Albury  |9.2    |28.0   |0.0     |
|4  |2008-12-05|Albury  |17.5   |32.3   |1.0     |
|5  |2008-12-06|Albury  |14.6   |29.7   |0.2     |
|6  |2008-12-07|Albury  |14.3   |25.0   |0.0     |
|7  |2008-12-08|Albury  |7.7    |26.7   |0.0     |
|8  |2008-12-09|Albury  |9.7    |31.9   |0.0     |
|9  |2008-12-10|Albury  |13.1   |30.1   |1.4     |
|10 |2008-12-11|Albury  |13.4   |30.4   |0.0     |
|11 |2008-12-12|Albury  |15.9   |21.7   |2.2     |
|12 |2008-12-13|Albury  |15.9   |18.6   |15.6    |
|13 |2008-12-14|Albury  |12.6   |21.0   |3.6     |
|14 |2008-12-16|Albury  |9.8    |27.7   |null    |
|15 |2008-12-17|Albury  |14.1   |20.9   |0.0     |
|16 |2008-12-18|Albury  |13.5  