In [None]:
"""
Author: Matt Martin
Date: 1/27/24
Desc: Various techniques to generate test data in spark
"""

## create the spark connection/instance
from pyspark.sql import SparkSession
# Describe the session first, then materialise it. Parentheses are used for
# the method chain instead of backslash line continuations.
session_builder = (
    SparkSession.builder
    .appName("sequence")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.instances", 10)
)
# getOrCreate() returns the already-active session if one exists.
spark = session_builder.getOrCreate()

In [2]:
# Row sequence: a single-column DataFrame named row_id holding 1..1,000,000
# (the end bound of range() is exclusive).
df = spark.range(start=1, end=1_000_001).toDF('row_id')

# Inclusive calendar bounds for the generated date list.
start_dt = "2001-01-01"
end_dt = "2024-05-10"

# sequence() builds an array of daily dates between the bounds and explode()
# turns that array into one row per date.
date_query = f"SELECT explode(sequence(to_date('{start_dt}'), to_date('{end_dt}'), interval 1 day)) as date"
dates = spark.sql(date_query)

In [3]:
# Expose both DataFrames to Spark SQL under the names the queries refer to.
for view_name, frame in (('numbers', df), ('dates', dates)):
    frame.createOrReplaceTempView(view_name)

In [6]:
# Pair every row_id with every date and tag each resulting row with a random
# UUID. The cartesian product is intentional, so it is spelled as an explicit
# CROSS JOIN: the comma form reads like a forgotten join condition, and older
# Spark versions reject implicit cartesian products unless
# spark.sql.crossJoin.enabled is set.
# LIMIT caps the result at 10,000 rows; with no ORDER BY, which rows are kept
# is not guaranteed.
sql = """
select n.row_id, d.date, uuid() as test_id
from numbers as n
cross join dates as d
limit 10000
"""
# Preview only the first 10 rows of the limited result.
spark.sql(sql).show(10)

+------+----------+--------------------+
|row_id|      date|             test_id|
+------+----------+--------------------+
|     1|2001-01-01|e31821d6-819c-4f0...|
|     1|2001-01-02|cd9b626a-5360-40a...|
|     1|2001-01-03|6c924bb9-2e8e-416...|
|     1|2001-01-04|5bd58446-43bf-4d9...|
|     1|2001-01-05|ae865f71-c83a-4fd...|
|     1|2001-01-06|ddc28548-c5fe-49d...|
|     1|2001-01-07|153713a0-ddfc-49c...|
|     1|2001-01-08|ec6ff8dc-d566-464...|
|     1|2001-01-09|297d8cf2-dc3d-4c1...|
|     1|2001-01-10|38e55e0c-a159-404...|
+------+----------+--------------------+
only showing top 10 rows



In [None]:
# Shut down the Spark session and release its resources. The parentheses are
# required: a bare `spark.stop` only evaluates the bound method and leaves the
# session running.
spark.stop()