In [None]:
from poor_man_lakehouse.config import settings
from poor_man_lakehouse.spark_connector.builder import retrieve_current_spark_session

# Session handle used by every Spark cell below. Nothing about the session is
# configured in this notebook; that is left to the project's builder helper.
# NOTE(review): presumably this returns an already-active session when one exists — confirm in builder.
spark = retrieve_current_spark_session()

In [2]:
# Inspect what the session can see, in order: all catalogs, the schemas of the
# configured catalog, the schemas of the built-in spark_catalog, and finally the
# tables in the configured catalog's `default` schema.
catalog_overview_queries = (
    "SHOW CATALOGS",
    f"show schemas in {settings.CATALOG}",
    "show schemas in spark_catalog",
    f"SHOW  tables in {settings.CATALOG}.default",
)
for overview_query in catalog_overview_queries:
    spark.sql(overview_query).show()

+-------------+
|      catalog|
+-------------+
|     postgres|
|spark_catalog|
+-------------+

+----------+
| namespace|
+----------+
|   default|
|pg_catalog|
+----------+

+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|    prova|      false|
+---------+---------+-----------+



In [None]:
from datetime import UTC, datetime

import polars as pl
from pyspark.sql import DataFrame

# Three sample quote rows; the timestamp and details columns are assembled on
# their own before the frame is built.
quote_times = [datetime(2023, 1, day, 12, 0, tzinfo=UTC) for day in (1, 2, 3)]
quote_details = [{"created_by": author} for author in ("user1", "user2", None)]

data = pl.DataFrame(
    {
        "datetime": quote_times,
        "symbol": ["AAPL", "GOOGL", "MSFT"],
        "bid": [150.0, 2800.0, 300.0],
        "ask": [151.0, 2805.0, 305.0],
        # One single-key dict per row; the last row carries a None value.
        "details": quote_details,
    },
)

# Spark receives the rows through a pandas conversion of the Polars frame.
pandas_frame = data.to_pandas()
spark_df: DataFrame = spark.createDataFrame(pandas_frame)


In [4]:
# Persist the sample rows as an Iceberg table, replacing any previous contents.
# The save mode is specified once through .mode(); passing `mode=` to
# saveAsTable as well would only repeat the same setting.
spark_df.write.format("iceberg").mode("overwrite").saveAsTable(
    f"{settings.CATALOG}.default.prova"
)

26/01/13 23:31:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

In [5]:
# Read the freshly written table back through SQL and print its rows.
select_prova = f"SELECT * FROM {settings.CATALOG}.default.prova"
prova_rows = spark.sql(select_prova)
prova_rows.show()

+-------------------+------+------+------+--------------------+
|           datetime|symbol|   bid|   ask|             details|
+-------------------+------+------+------+--------------------+
|2023-01-01 12:00:00|  AAPL| 150.0| 151.0|{created_by -> us...|
|2023-01-02 12:00:00| GOOGL|2800.0|2805.0|{created_by -> us...|
|2023-01-03 12:00:00|  MSFT| 300.0| 305.0|{created_by -> NULL}|
+-------------------+------+------+------+--------------------+



## Testing LakeSail

To test LakeSail, restart the kernel first: a SparkSession has already been created by the cells above, and this section needs a clean one.

In [None]:
from pysail.spark import SparkConnectServer
from pyspark.sql import SparkSession

# Start a Spark Connect server from pysail and attach a client session to it.
server = SparkConnectServer()
server.start()

# A missing listening address is treated as a failed start; otherwise the
# address unpacks into two items, of which only the port is used.
if (address := server.listening_address) is None:
    raise RuntimeError("Failed to start Spark Connect server")
_, port = address

remote_url = f"sc://localhost:{port}"
spark = SparkSession.builder.remote(remote_url).getOrCreate()


In [None]:
# Round-trip a tiny frame through Parquet at the test path.
path = "s3a://warehouse/test_parquet"
people_df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], schema="id INT, name STRING")

# Overwrite instead of the default error-if-exists mode, so the cell can be
# re-run (e.g. Restart Kernel -> Run All) after the path has been written once.
people_df.write.parquet(path, mode="overwrite")

# `df` ends up bound to the frame read back from storage, as before.
df = spark.read.parquet(path)
