In [1]:
"""
Author: Matt Martin
Date: 10/24/2023
Desc: Simple demo using Spark: builds two small DataFrames, joins them with Spark SQL, and saves the results
"""

# Start (or reuse) the Spark session that every later cell relies on.
# The warehouse directory is pointed at the local ./test_dw folder.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("test")
    .config("spark.sql.warehouse.dir", "./test_dw")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/10 06:34:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Build two tiny "persons" datasets and persist each one as Parquet.
# 'Matt' appears in both lists (with different ages), so the join in a later
# cell has exactly one overlapping name.
data1 = [
    {'name': 'Matt', 'age': 12},
    {'name': 'Alex', 'age': 14},
]
data2 = [
    {'name': 'Tom', 'age': 19},
    {'name': 'Sandy', 'age': 31},
    {'name': 'Matt', 'age': 15},
]

df1 = spark.createDataFrame(data1)
df2 = spark.createDataFrame(data2)

# mode("overwrite") replaces any existing output, so this cell can be re-run.
df1.write.mode("overwrite").parquet('./persons1.parquet')
df2.write.mode("overwrite").parquet('./persons2.parquet')

                                                                                

In [3]:
# Register both DataFrames as temp views so they can be referenced by name in SQL.
for view_name, frame in (("persons1", df1), ("persons2", df2)):
    frame.createOrReplaceTempView(view_name)

In [4]:
# Full outer join of the two views on name, written out as Parquet.
#   age1      -> persons1's age when present, otherwise persons2's age
#   other_age -> persons2's age (NULL when the name exists only in persons1)
sql = """
SELECT
     COALESCE(p1.name, p2.name) as name
    ,COALESCE(p1.age, p2.age) as age1
    ,p2.age as other_age
FROM persons1 as p1
    FULL OUTER JOIN persons2 as p2
        ON p1.name = p2.name
"""
combined = spark.sql(sql)
combined.write.mode("overwrite").parquet('./persons_combined.parquet')

In [5]:
# Register the same full-outer-join query as a temp view named "test" so that
# later cells can select from it by name.
# Note: this reassigns the `sql` variable defined in the previous cell.
sql = """
create or replace temp view test
as
SELECT
     COALESCE(p1.name, p2.name) as name
    ,COALESCE(p1.age, p2.age) as age1
    ,p2.age as other_age
FROM persons1 as p1
    FULL OUTER JOIN persons2 as p2
        ON p1.name = p2.name

"""
# Bare last expression: the notebook echoes the empty DataFrame the statement returns.
spark.sql(sql)

DataFrame[]

In [6]:
# Display every row of the "test" temp view.
spark.table("test").show()

+-----+----+---------+
| name|age1|other_age|
+-----+----+---------+
| Alex|  14|     null|
| Matt|  12|       15|
|Sandy|  31|       31|
|  Tom|  19|       19|
+-----+----+---------+



In [9]:
# Persist df1 as the table "test_tbl3".
# With no explicit mode the writer raises an error if the table already exists,
# so re-running this cell would fail. mode("overwrite") matches every other
# write in this notebook and keeps the cell re-runnable.
df1.write.mode("overwrite").saveAsTable("test_tbl3")

In [10]:
# Read the saved table back by name and display its rows.
spark.table("test_tbl3").show()

+---+----+
|age|name|
+---+----+
| 12|Matt|
| 14|Alex|
+---+----+



In [5]:
# Sanity check: load the joined Parquet output back from disk and display it.
combined_from_disk = spark.read.parquet('./persons_combined.parquet')
combined_from_disk.show()

+-----+----+---------+
| name|age1|other_age|
+-----+----+---------+
| Alex|  14|     null|
| Matt|  12|       15|
|Sandy|  31|       31|
|  Tom|  19|       19|
+-----+----+---------+

