Initializing the Spark session

In [1]:
import os

from pyspark.sql import SparkSession


# OpenLineage Spark agent jar(s) handed to Spark through `spark.jars`.
jars = ["/home/jovyan/openlineage/libs/openlineage-spark-1.5.0.jar"]

# Marquez endpoint and lineage namespace. Both can be overridden from the
# environment (MARQUEZ_URL / MARQUEZ_NAMESPACE) so the notebook does not need
# to be edited per deployment; when a variable is unset or empty, the value
# this notebook was originally hard-coded with is used.
marquez_url = os.environ.get("MARQUEZ_URL") or "http://host.docker.internal:5000/api/v1"
marquez_namespace = os.environ.get("MARQUEZ_NAMESPACE") or "notebook_experiments"


spark = (SparkSession.builder.master('local')
            .appName('complex_data_types_experiment')
            .config('spark.jars', ",".join(jars))
            .config('spark.jars.packages', 'io.openlineage:openlineage-spark:1.5.0')
            # register the OpenLineage listener class with Spark
            .config('spark.extraListeners', 'io.openlineage.spark.agent.OpenLineageSparkListener')
            .config('spark.openlineage.appName', 'complex_data_type') # overwriting Spark app name in events

            # using HTTP transport, sending events directly to Marquez endpoint
            .config('spark.openlineage.transport.type', 'http')
            .config('spark.openlineage.transport.url', marquez_url)

            # namespace comes from MARQUEZ_NAMESPACE when set (see above)
            .config('spark.openlineage.namespace', marquez_namespace)
            .getOrCreate())

# INFO is verbose; it is kept here so the listener's log lines are visible
# while experimenting.
spark.sparkContext.setLogLevel("INFO")

Generic DataFrame for initial experiment

In [9]:
# Sample records mixing a scalar (`a`), a list (`b`) and a nested dict (`c`).
# No explicit schema is passed to createDataFrame, so the column types are
# derived from these values.
sample_rows = [
    {'a': 1, 'b': [1,2,3], 'c': {'id': 'abc', 'name': {'first': 'luis', 'last': 'yamada'}}},
    {'a': 2, 'b': [3,4,5], 'c': {'id': 'bcd', 'name': {'first': 'marcos', 'last': 'andrade'}}},
    {'a': 3, 'b': [6,7,8], 'c': {'id': 'efg', 'name': {'first': 'walace', 'last': 'morais'}}},
]
generic_df = spark.createDataFrame(sample_rows)

# Show up to three rows without truncating long cell values.
generic_df.show(n=3, truncate=False)

+---+---------+-------------------------------------------------+
|a  |b        |c                                                |
+---+---------+-------------------------------------------------+
|1  |[1, 2, 3]|{name -> {last=yamada, first=luis}, id -> abc}   |
|2  |[3, 4, 5]|{name -> {last=andrade, first=marcos}, id -> bcd}|
|3  |[6, 7, 8]|{name -> {last=morais, first=walace}, id -> efg} |
+---+---------+-------------------------------------------------+



In [11]:
# Print the schema of generic_df as a tree to see which types the
# list column `b` and the nested dict column `c` ended up with.
generic_df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- c: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



Time to write the DataFrame and check Marquez

In [13]:
# Persist the DataFrame as parquet under ./complex_data_type/, replacing any
# previous output at that path. NOTE(review): the resulting lineage is then
# expected to be visible in Marquez — confirm in the Marquez UI.
generic_df.write.parquet("./complex_data_type/", mode="overwrite")