In [3]:
import os
import sys
import socket

import sys,uuid,datetime
from pyspark.sql import SparkSession, Row
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import explode


In [4]:
# MinIO credentials are read from the environment so that they are never
# written into the notebook; a missing variable raises KeyError here.
key, secret = (
    os.environ[var_name]
    for var_name in ("MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
)

# Address of the MinIO service that the S3A connector is pointed at below.
endpoint = "http://minio:9000"
print(endpoint)

http://minio:9000


In [5]:
# No need to worry about size and speed, as everything is scalable with Spark,
# even the metadata.
#
# Build (or reuse) the SparkSession used by every cell below. The chain is
# wrapped in parentheses instead of backslash continuations so that each group
# of settings can carry its own comment.
spark = (
    SparkSession.builder
    .master("local")
    .appName("nino")
    # --- S3A access to MinIO -------------------------------------------------
    # Credentials and endpoint come from the variables defined in the previous
    # cell. Path-style access is enabled and SSL is disabled because the
    # endpoint is a plain-http host:port address.
    .config("spark.hadoop.fs.s3a.access.key", key)
    .config("spark.hadoop.fs.s3a.secret.key", secret)
    .config("spark.hadoop.fs.s3a.endpoint", endpoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # --- Delta Lake ----------------------------------------------------------
    # Pull the Delta and S3A jars at session start and register the Delta SQL
    # extension, catalog and S3 log store.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0,org.apache.hadoop:hadoop-aws:3.2.0,com.amazonaws:aws-java-sdk-bundle:1.11.375")
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # --- Deployment ----------------------------------------------------------
    # NOTE(review): the master above is "local", so the Kubernetes settings
    # below presumably have no effect in this session -- confirm whether they
    # are leftovers from a cluster deployment.
    .config("spark.submit.deployMode", "client")
    .config("spark.kubernetes.container.image", "spark:spark-docker")
    .config("spark.kubernetes.pyspark.pythonVersion", "3")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "default")
    .config("spark.executor.instances", "1")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.kubernetes.executor.request.cores", "0.5")
    .config("spark.kubernetes.executor.limit.cores", "1")
    # NOTE(review): the next two keys have no "spark." prefix and look like
    # deployment-chart values rather than Spark properties -- TODO confirm and
    # remove if unused.
    .config("jupyterService.jupyterPort_create_prop", "30888")
    .config("serviceAccount", "spark")
    .getOrCreate()
)

In [6]:
# Short alias for the SparkContext that belongs to the session created above.
sc = spark.sparkContext

# Create Delta Lake tables

In [7]:
# Catalog names under which the Delta table is registered.
database = "reddit"
delta_table_name = "wallstreetbets"

# Storage locations: the glob matches the raw JSON dumps (stored as .txt),
# and the Delta table is written to the root of the same bucket.
json_path = "s3a://wallstreetbets/*.txt"
delta_path = "s3a://wallstreetbets/"

In [8]:
# Load the raw JSON dumps, then flatten them in three steps:
#   1. promote the fields of the top-level `data` struct to columns,
#   2. explode `children` into one row per element (the generated column is
#      named "col", which the final select relies on),
#   3. promote every field of each element's `data` struct to a column.
data = spark.read.json(json_path)
children = data.select("data.*")
exploded_children = explode("children")
df = children.select(exploded_children).select("col.data.*")

# Print the resulting schema as a sanity check of the flattening.
df.printSchema()

root
 |-- all_awardings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- award_sub_type: string (nullable = true)
 |    |    |-- award_type: string (nullable = true)
 |    |    |-- awardings_required_to_grant_benefits: long (nullable = true)
 |    |    |-- coin_price: long (nullable = true)
 |    |    |-- coin_reward: long (nullable = true)
 |    |    |-- count: long (nullable = true)
 |    |    |-- days_of_drip_extension: long (nullable = true)
 |    |    |-- days_of_premium: long (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- end_date: string (nullable = true)
 |    |    |-- giver_coin_reward: long (nullable = true)
 |    |    |-- icon_format: string (nullable = true)
 |    |    |-- icon_height: long (nullable = true)
 |    |    |-- icon_url: string (nullable = true)
 |    |    |-- icon_width: long (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- is_enabled: boolean (nullable = t

In [9]:
# Preview rows of the flattened DataFrame; every column is printed, so the
# rendered table is very wide.
df.show()


+--------------------+-------------------+---------------+-----------+--------+--------------------+-----------------------------+----------------------+---------------------+------------------------+--------------------+-----------------------+-----------------+---------------+--------------------+--------------+--------+-------------+---------+--------+------------+--------+-------+------------------+------------+-------------+-------------+---------------+-------------+-------------------+-----+-------------+------+------------------+------+----------+------+----------------------+----------------+-------+-------------------+----------------------+------------------+-------+--------+-----+---------------------------+--------------------+--------------------+----------------------+------------------+---------------------+---------------+------+--------------------+--------------------+----------+--------+-------------+----------------+-----------+---------+---------+------------+----

In [10]:
# Persist the flattened posts as a Delta table at `delta_path`, replacing any
# existing table contents and allowing the table schema to be merged with the
# DataFrame's schema.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", True)
    .save(delta_path)
)

In [11]:
# Make sure the target database exists before registering the table in it.
create_database_sql = "CREATE DATABASE IF NOT EXISTS {}".format(database)
spark.sql(create_database_sql)

# Register the Delta files written above as a catalog table that points at
# their storage location; nothing happens if the table is already registered.
create_table_template = """
        CREATE TABLE IF NOT EXISTS {}.{}
        USING DELTA
        LOCATION "{}"
        """
create_table_sql = create_table_template.format(
    database, delta_table_name, delta_path
)
spark.sql(create_table_sql)

DataFrame[]

# Query the Delta Lake table

In [12]:
# Show the column names and types of the Delta table, addressed directly by
# its storage path rather than by its catalog name.
describe_query = "DESCRIBE FORMATTED delta.`{}`".format(delta_path)
spark.sql(describe_query).show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|                data|struct<all_awardi...|       |
|       all_awardings|array<struct<awar...|       |
| allow_live_comments|             boolean|       |
|     approved_at_utc|              string|       |
|         approved_by|              string|       |
|            archived|             boolean|       |
|              author|              string|       |
|author_flair_back...|              string|       |
|author_flair_css_...|              string|       |
|author_flair_rich...|array<struct<e:st...|       |
|author_flair_temp...|              string|       |
|   author_flair_text|              string|       |
|author_flair_text...|              string|       |
|   author_flair_type|              string|       |
|     author_fullname|              string|       |
|author_patreon_flair|             boolean|       |
|      autho

# Reading data from the Delta table

In [13]:
# Project only the title, body text and link of each post from the Delta table.
posts_query = "select title, selftext, url from delta.`{}`".format(delta_path)
spark.sql(posts_query).show()

+--------------------+--------------------+--------------------+
|               title|            selftext|                 url|
+--------------------+--------------------+--------------------+
|Weekend Discussio...|Your weekend disc...|https://www.reddi...|
|Most Anticipated ...|                    |https://i.redd.it...|
|TD Ameritrade has...|                    |https://www.tdame...|
|All on board the ...|Below is a very i...|https://www.reddi...|
|Back in 2019 I dr...|                    |https://v.redd.it...|
|I built a program...|This post is in c...|https://www.reddi...|
|6000 shares of BB...|                    |https://i.redd.it...|
|First time Buyer ...|                    |https://i.redd.it...|
|     Bulls and Bears|                    |https://v.redd.it...|
|Disclaimer* I am ...|                    |https://v.redd.it...|
|House deposit mon...|                    |https://i.redd.it...|
|AMC has been havi...|                    |https://v.redd.it...|
|I analyzed all th...|**P

In [14]:
# Count the rows stored in the Delta table.
row_count_query = "select count(*) from delta.`{}`".format(delta_path)
spark.sql(row_count_query).show()

+--------+
|count(1)|
+--------+
|    2808|
+--------+

