Initializing the Spark session with the OpenLineage listener

In [1]:
import os

from pyspark.sql import SparkSession


# Local OpenLineage Spark agent jar(s), passed to Spark through `spark.jars`.
jars = ["/home/jovyan/openlineage/libs/openlineage-spark-1.5.0.jar"]
# URL the OpenLineage HTTP transport sends events to (the Marquez API).
marquez_url = "http://host.docker.internal:5000/api/v1"
# Namespace the lineage events are reported under. The OPENLINEAGE_NAMESPACE
# environment variable overrides it; when unset or empty, the previous
# hard-coded value is used, so existing runs behave exactly as before.
marquez_namespace = os.environ.get('OPENLINEAGE_NAMESPACE') or 'notebook_experiments'


# Build (or reuse) a local Spark session with the OpenLineage listener attached.
# NOTE(review): the agent is supplied both as a local jar (`spark.jars`) and as
# Maven coordinates (`spark.jars.packages`); one of the two is probably
# sufficient — confirm before removing either.
spark = (SparkSession.builder.master('local')
            .appName('spark_agent_exploration')
            .config('spark.jars', ",".join(jars))
            .config('spark.jars.packages', 'io.openlineage:openlineage-spark:1.5.0')
            .config('spark.extraListeners', 'io.openlineage.spark.agent.OpenLineageSparkListener')
            .config('spark.openlineage.appName', 'anime_job') # overwriting Spark app name in events 
            
            # using HTTP transport, sending events directly to Marquez endpoint
            .config('spark.openlineage.transport.type', 'http')
            .config('spark.openlineage.transport.url', marquez_url)
            
            # namespace is taken from OPENLINEAGE_NAMESPACE when set (see above)
            .config('spark.openlineage.namespace', marquez_namespace)
            .config('spark.openlineage.jobName.appendDatasetName', False)
            # .config('spark.openlineage.dataset.removePath.pattern', '(.*)(?<remove>\/.*\/.*)')
            .getOrCreate())

# Raise driver log verbosity to INFO for this session.
spark.sparkContext.setLogLevel("INFO")

Reading anime genre data for initial experiment

In [2]:
# Load the anime genre table from Parquet and preview its first 10 rows.
anime_genre_df = spark.read.parquet(
    './source_data/animes_genres.parquet'
)
anime_genre_df.show(n=10)

+----+--------------------+------+-------+------+----+----------+------+--------+------+-------------+------+-----+-------+------+------+----------+-----+-----+--------+----+----+-------+-----+-----+-----+------------+-------------+---------+-------+------+-----------+------+------------+------+---------+-------+-----+--------+----+-------+----+------+-----+------+
|  id|      details.Genres|Parody|Vampire|Shoujo|Kids|Historical|Horror|Dementia|School|Psychological|Demons|Drama|Mystery|Police|Action|Shounen Ai|Ecchi|Music|Military|Game|Yaoi|Samurai|Space|Harem|Mecha|Martial Arts|Slice of Life|Adventure|Fantasy|Sports|Super Power|Sci-Fi|Supernatural|Hentai|Shoujo Ai|Romance|Josei|Thriller|Cars|Shounen|Yuri|Seinen|Magic|Comedy|
+----+--------------------+------+-------+------+----+----------+------+--------+------+-------------+------+-----+-------+------+------+----------+-----+-----+--------+----+----+-------+-----+-----+-----+------------+-------------+---------+-------+------+-------

In [3]:
# Print the column names, types and nullability of the genre DataFrame.
anime_genre_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- details.Genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Parody: long (nullable = true)
 |-- Vampire: long (nullable = true)
 |-- Shoujo: long (nullable = true)
 |-- Kids: long (nullable = true)
 |-- Historical: long (nullable = true)
 |-- Horror: long (nullable = true)
 |-- Dementia: long (nullable = true)
 |-- School: long (nullable = true)
 |-- Psychological: long (nullable = true)
 |-- Demons: long (nullable = true)
 |-- Drama: long (nullable = true)
 |-- Mystery: long (nullable = true)
 |-- Police: long (nullable = true)
 |-- Action: long (nullable = true)
 |-- Shounen Ai: long (nullable = true)
 |-- Ecchi: long (nullable = true)
 |-- Music: long (nullable = true)
 |-- Military: long (nullable = true)
 |-- Game: long (nullable = true)
 |-- Yaoi: long (nullable = true)
 |-- Samurai: long (nullable = true)
 |-- Space: long (nullable = true)
 |-- Harem: long (nullable = true)
 |-- Mecha: long (nullable = tr

In [4]:
# Register the genre DataFrame as the temp view `animes_genres` so it can be
# referenced by name from spark.sql queries.
anime_genre_df.createOrReplaceTempView('animes_genres')

In [5]:
# Take the 10 rows with the lowest ids from the `animes_genres` view and
# print them without truncating column values.
top10_df = spark.sql(
    'select * from animes_genres order by id asc limit 10'
)
top10_df.show(n=10, truncate=False)

+---+-----------------------------------------------------------------+------+-------+------+----+----------+------+--------+------+-------------+------+-----+-------+------+------+----------+-----+-----+--------+----+----+-------+-----+-----+-----+------------+-------------+---------+-------+------+-----------+------+------------+------+---------+-------+-----+--------+----+-------+----+------+-----+------+
|id |details.Genres                                                   |Parody|Vampire|Shoujo|Kids|Historical|Horror|Dementia|School|Psychological|Demons|Drama|Mystery|Police|Action|Shounen Ai|Ecchi|Music|Military|Game|Yaoi|Samurai|Space|Harem|Mecha|Martial Arts|Slice of Life|Adventure|Fantasy|Sports|Super Power|Sci-Fi|Supernatural|Hentai|Shoujo Ai|Romance|Josei|Thriller|Cars|Shounen|Yuri|Seinen|Magic|Comedy|
+---+-----------------------------------------------------------------+------+-------+------+----+----------+------+--------+------+-------------+------+-----+-------+------+--

Time to write the DataFrame to Parquet and check the result in Marquez

In [7]:
# Collapse to a single partition and write the result as Parquet, replacing
# any previous contents of the target directory.
(
    top10_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("./top_10_animes/")
)

In [9]:
# Load the prepared animes table, expose it to SQL as the `animes` view,
# and print a single untruncated row as a sanity check.
animes_df = spark.read.parquet(
    './source_data/prepared_animes.parquet'
)
animes_df.createOrReplaceTempView('animes')
animes_df.show(n=1, truncate=False)

+----+--------+----------------------------------------------------+--------------------------------------------------------+-------------------------+-------------------+---------------------------------------------------+-------------------------------------------------+------------------------------------------------------+----------------+----------------+------------+----------------+---------------+---------------------------+-----------------+------------------------+---------------------------------------------------------------------+-----------------+---------------------------------+--------------+-----------------------+----------------+--------------+-------------+----------------+--------------+------------------+---------------+-----------------+---------------+-----+-------------------+-------------------+-----------+-----------------+-------------------+-------------------+-------------------------+--------------+
|id  |title   |photo                                   

In [11]:
# Count the rows of the `animes` view that have a non-null id.
(
    spark.sql('select count(id) as total_animes from animes')
    .show(n=10, truncate=False)
)

+------------+
|total_animes|
+------------+
|21755       |
+------------+



In [11]:
# Count the rows of the `animes` view that have a non-null id.
# NOTE(review): this cell repeats the count query from the preceding cell
# (same execution number); it looks like a duplicate — confirm and remove.
(
    spark.sql('select count(id) as total_animes from animes')
    .show(n=10, truncate=False)
)

+------------+
|total_animes|
+------------+
|21755       |
+------------+

