In [None]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Spark Data Generation")
    .config("spark.driver.memory", "4g")      # driver memory setting
    .config("spark.executor.memory", "8g")    # executor memory setting
    .config("spark.executor.cores", "8")      # executor cores setting
    .getOrCreate()
)

In [28]:
import os
import time
import uuid
from pyspark.sql.functions import current_date, rand, floor, udf
from pyspark.sql.types import StringType
# Number of rows produced by spark.range() in the data-generation cells.
row_cnt = 500_000_000
# Parquet output directory under the current user's home; the generation
# cells write to it with mode('overwrite').
home_dir = os.path.expanduser("~")
# Separate path components let os.path.join pick the platform separator
# (the previous placeholder-free f-string hard-coded "/").
f_path = os.path.join(home_dir, "test_dummy_data", "spark")

In [23]:
# Baseline: a single row_id column only (about 6 seconds).
start_ts = time.time()
# spark.range() yields one column named 'id'; expose it as 'row_id'.
df = spark.range(0, row_cnt).withColumnRenamed('id', 'row_id')
(
    df.write
    .mode('overwrite')
    .parquet(f_path)
)
end_ts = time.time()
print(f"Total time to create dataset: {end_ts - start_ts:.2f} seconds")



Total time to create dataset: 6.61 seconds


                                                                                

In [24]:
# row_id plus the current date: about 8 seconds with the date column added.
start_ts = time.time()
# Name each column explicitly instead of adding the date under the name
# 'row_id' and relying on a positional toDF() rename to correct it.
df = (
    spark.range(0, row_cnt)
    .withColumnRenamed('id', 'row_id')
    .withColumn('rpt_dt', current_date())
)
# Write in a separate statement: DataFrameWriter.parquet() returns None, so
# chaining it onto the assignment would leave df bound to None.
df.write.mode('overwrite').parquet(f_path)
end_ts = time.time()
print(f"Total time to create dataset: {end_ts - start_ts:.2f} seconds")



Total time to create dataset: 8.25 seconds


                                                                                

In [25]:
# Add a random integer column: about 12 seconds.

start_ts = time.time()

# Rename 'id' first so every later column is appended under its final name;
# no positional toDF() rename is needed afterwards.
df = (
    spark.range(0, row_cnt)
    .withColumnRenamed('id', 'row_id')
    .withColumn('rpt_dt', current_date())
    .withColumn('some_val', floor(rand() * 100))  # unseeded rand() scaled and floored
)

df.show(5)  # preview is part of the timed region
(
    df.write
    .mode('overwrite')
    .parquet(f_path)
)
end_ts = time.time()
print(f"Total time to create dataset: {end_ts - start_ts:.2f} seconds")

+------+----------+--------+
|row_id|    rpt_dt|some_val|
+------+----------+--------+
|     0|2024-06-06|      16|
|     1|2024-06-06|       0|
|     2|2024-06-06|      59|
|     3|2024-06-06|      12|
|     4|2024-06-06|      70|
+------+----------+--------+
only showing top 5 rows





Total time to create dataset: 11.91 seconds


                                                                                

In [29]:
# Add a guid via a Python UDF: adds a whopping extra ~240 seconds -- Python UDFs in Spark are very slow.
def _new_uuid_str():
    """Return a freshly generated random (version 4) UUID as a string."""
    return str(uuid.uuid4())


# uuid4() yields a different value on every call, so flag the UDF as
# non-deterministic. Spark treats UDFs as deterministic by default and may
# otherwise eliminate or duplicate invocations during optimization.
generate_uuid = udf(_new_uuid_str, StringType()).asNondeterministic()

start_ts = time.time()

# Rename 'id' up front, then append the date, random value and UDF-generated
# key under their final names (no trailing positional toDF() rename).
df = (
    spark.range(0, row_cnt)
    .withColumnRenamed('id', 'row_id')
    .withColumn('rpt_dt', current_date())
    .withColumn('some_val', floor(rand() * 100))
    .withColumn("txn_key", generate_uuid())  # one Python UDF call per row
)

df.show(5)  # preview is part of the timed region
(
    df.write
    .mode('overwrite')
    .parquet(f_path)
)
end_ts = time.time()
print(f"Total time to create dataset: {end_ts - start_ts:.2f} seconds")

+------+----------+--------+--------------------+
|row_id|    rpt_dt|some_val|             txn_key|
+------+----------+--------+--------------------+
|     0|2024-06-06|      77|4964a611-5984-433...|
|     1|2024-06-06|      37|6bce99e6-879d-424...|
|     2|2024-06-06|      22|3e53e424-ae2d-4c5...|
|     3|2024-06-06|      25|690e8d82-be27-4d6...|
|     4|2024-06-06|      26|8260cdc6-f1e2-4c6...|
+------+----------+--------+--------------------+
only showing top 5 rows





Total time to create dataset: 247.84 seconds


                                                                                

In [30]:
# Add a guid via the built-in SQL uuid() expression instead of a Python UDF: 26.22 seconds.
from pyspark.sql.functions import expr

start_ts = time.time()

# Rename 'id' up front, then append the remaining columns under their final
# names (no trailing positional toDF() rename).
df = (
    spark.range(0, row_cnt)
    .withColumnRenamed('id', 'row_id')
    .withColumn('rpt_dt', current_date())
    .withColumn('some_val', floor(rand() * 100))
    .withColumn("txn_key", expr("uuid()"))  # Spark SQL uuid() expression, no Python UDF
)

df.show(5)  # preview is part of the timed region
(
    df.write
    .mode('overwrite')
    .parquet(f_path)
)
end_ts = time.time()
print(f"Total time to create dataset: {end_ts - start_ts:.2f} seconds")

+------+----------+--------+--------------------+
|row_id|    rpt_dt|some_val|             txn_key|
+------+----------+--------+--------------------+
|     0|2024-06-06|      45|a6263713-a1b6-4b6...|
|     1|2024-06-06|      67|69ea866b-9cdf-474...|
|     2|2024-06-06|      47|12e7a895-997c-438...|
|     3|2024-06-06|      74|adb80dc3-17c4-4f5...|
|     4|2024-06-06|       3|17a326c2-942a-4ea...|
+------+----------+--------+--------------------+
only showing top 5 rows





Total time to create dataset: 26.22 seconds


                                                                                