In [None]:
"""
Author: Matt Martin
Date: 2/19/24
Desc: Testing Spark UI monitor
    -- webui launches at http://localhost:4040
"""

import os
## Spark SQL warehouse directory, located under the current user's home folder.
## os.path.join is used instead of string concatenation so the separators stay
## consistent on every platform.
dw_path = os.path.join(os.path.expanduser("~"), "test_dummy_data", "spark", "test_dw")

## create the spark connection/instance
from pyspark.sql import SparkSession

## Session settings, applied to the builder in the order listed:
##   - warehouse dir points at dw_path
##   - driver / executor memory and executor count
##   - the dataflint package plus its plugin class
## NOTE(review): no master URL is set on this builder; if the session ends up in
## local mode the executor memory/instances settings probably have no effect -- confirm.
spark_settings = {
    "spark.sql.warehouse.dir": dw_path,
    "spark.driver.memory": "2g",
    "spark.executor.memory": "8g",
    "spark.executor.instances": 10,
    "spark.jars.packages": "io.dataflint:spark_2.12:0.1.4",
    "spark.plugins": "io.dataflint.spark.SparkDataflintPlugin",
}

session_builder = SparkSession.builder.appName("test")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

## reuse an already-running session if there is one, otherwise start a new one
spark = session_builder.getOrCreate()

#### Semantics on standard spark

1. You can create a "database" in Spark, which is more or less analogous to a schema in other databases
2. Standard Spark does not support "create or replace table"; delta and iceberg do
3. Make sure to calibrate max memory and workers/executors before kicking off a job that uses a lot of RAM
4. 

In [None]:
## Sample records: one dict per row, with ids numbered from 1 in the order the names are listed
data = [
    {'id': row_id, 'name': person}
    for row_id, person in enumerate(('Dale', 'Fred'), start=1)
]

## Build the DataFrame with an explicit schema instead of relying on inference
## from the dict rows, so column order and types are stated here rather than guessed.
df = spark.createDataFrame(data, schema="id long, name string")
df.show()

In [3]:
## Expose df to SQL under the temp-view name 'test' (replaces any existing view with that name)
df.createOrReplaceTempView('test')

In [5]:
## "if not exists" keeps this cell re-runnable: a plain "create database" raises
## once testdb has already been created in the session.
spark.sql("create database if not exists testdb")

DataFrame[]

In [6]:
## Switch the session's current database to testdb for the cells that follow
spark.catalog.setCurrentDatabase("testdb")

In [8]:
## Save df as table 'test3'; overwrite mode replaces the table contents if it already exists
overwrite_writer = df.write.mode('overwrite')
overwrite_writer.saveAsTable('test3')

In [None]:
## Demonstrates the "create or replace table" limitation noted in the semantics list:
## on standard Spark (no delta/iceberg) this statement is expected to be rejected.
## The error is caught and printed so the cell shows the limitation without
## halting a Restart & Run All.
from pyspark.sql.utils import AnalysisException

try:
    spark.sql('create or replace table testdb.test3 as select 1 as x')
except AnalysisException as err:
    print(f"create or replace table was rejected: {err}")

In [9]:
## Read testdb.test3 back and print its rows without truncating column values
test3_rows = spark.sql('select * from testdb.test3')
test3_rows.show(truncate=False)

+---+----+
|id |name|
+---+----+
|2  |Fred|
|1  |Dale|
+---+----+



In [35]:
## Large single-column DataFrame: a range starting at 0 with an end bound of one billion,
## with the column named row_id
row_total = 1_000_000_000
int_range = spark.range(0, row_total)
df2 = int_range.toDF("row_id")

In [None]:
## Write df2 out as table "ints"; overwrite mode replaces the table contents if it already exists
ints_writer = df2.write.mode('overwrite')
ints_writer.saveAsTable("ints")

In [None]:
## Stop the Spark session created at the top of the notebook
spark.stop()