In [14]:
"""
Author: Matt Martin
Date: 1/27/24
Desc: Various techniques to generate data in spark
"""

# Build the Spark session shared by the rest of the notebook.
# getOrCreate() hands back an already-running session when one exists.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("sequence")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.instances", 10)
    .getOrCreate()
)

# Row count used when generating the numbers dataset.
tot_rows = 1_000_000


In [2]:
import os

# Output directory for the generated files, rooted at the current user's home directory.
# The trailing "/" is part of the value.
home_dir = os.path.expanduser("~")
f_path = f"{home_dir}/test_dummy_data/python/"

In [15]:
# Built-in Spark range iterator: one row per integer from 1 through tot_rows
# (the end bound of spark.range is exclusive, hence the +1), in a column named row_id.
df = spark.range(1, tot_rows + 1).toDF('row_id')

# f_path already ends with "/", so build the target with os.path.join rather than
# inserting another separator; this avoids a doubled "//" in the output path.
df.write.mode('overwrite').parquet(os.path.join(f_path, 'numbers.parquet'))

In [16]:
# Generate a list of dates: both bounds are passed to sequence() with a one-day step.
start_dt = "2001-01-01"
end_dt = "2024-05-10"

# Generate a DataFrame with a column containing the exploded dates
# (sequence() builds the array of dates, explode() turns it into one row per date).
dates = spark.sql(f"SELECT explode(sequence(to_date('{start_dt}'), to_date('{end_dt}'), interval 1 day)) as date")

# f_path already ends with "/", so build the target with os.path.join rather than
# inserting another separator; this avoids a doubled "//" in the output path.
dates.write.mode('overwrite').parquet(os.path.join(f_path, 'dates.parquet'))

In [17]:
# Expose both DataFrames to Spark SQL under the table names used by later queries.
dates.createOrReplaceTempView("dates")
df.createOrReplaceTempView("numbers")

In [18]:
# Pair every row id with every date and preview the result.
# The Cartesian product is written as an explicit CROSS JOIN instead of a
# comma-separated FROM list: the rows produced are the same, but the intent is
# visible and the query is not rejected by Spark versions that refuse implicit
# cross joins by default.
# NOTE: there is no ORDER BY, so which 10000 rows the LIMIT keeps is not guaranteed.
sql = """
select n.row_id, d.date
from numbers as n
cross join dates as d
limit 10000
"""
spark.sql(sql).show(5)

+------+----------+
|row_id|      date|
+------+----------+
|     1|2001-01-01|
|     1|2001-01-02|
|     1|2001-01-03|
|     1|2001-01-04|
|     1|2001-01-05|
+------+----------+
only showing top 5 rows



In [None]:
# Shut down the Spark session. The parentheses are required: a bare `spark.stop`
# only references the method and leaves the session running.
spark.stop()