In [0]:
# Notebook parameter: a text widget named "wNumberOfFiles" (default "10")
# that controls how many batches the generation loop writes.
dbutils.widgets.text(
    "wNumberOfFiles",
    "10",
    "Number of new files to generate",
)

In [0]:
%pip install dbldatagen==0.4.0

In [0]:
def generate_random_trip_data():
    """Build one batch of synthetic taxi trip data.

    Configures a ``dbldatagen`` generator named ``random_taxi_trip_dataset``
    (10,000 rows, 4 partitions) on the notebook's ``spark`` session and
    returns the result of ``build()``.

    Columns, in order:
        trip_id          integer, 1,000,000 to 2,000,000
        taxi_number      integer, 10,000 unique values, random
        passenger_count  integer, 1 to 4
        trip_amount      float, 10.0 to 1000.0
        trip_distance    float, 0.1 to 1000.0
        trip_date        date, 300 unique values, random
        op_type          string, one of "I", "U", "D", random
                         (presumably insert/update/delete markers - confirm)
        op_date          date, 300 unique values, random
        sequence_num     integer, 1,000,000 to 2,000,000
    """
    import dbldatagen as dg
    import pyspark.sql.types as T

    # (column name, Spark type, generator options), listed in output order.
    column_specs = [
        ("trip_id", T.IntegerType(), {"minValue": 1000000, "maxValue": 2000000}),
        ("taxi_number", T.IntegerType(), {"uniqueValues": 10000, "random": True}),
        ("passenger_count", T.IntegerType(), {"minValue": 1, "maxValue": 4}),
        ("trip_amount", T.FloatType(), {"minValue": 10.0, "maxValue": 1000.0}),
        ("trip_distance", T.FloatType(), {"minValue": 0.1, "maxValue": 1000.0}),
        ("trip_date", T.DateType(), {"uniqueValues": 300, "random": True}),
        ("op_type", T.StringType(), {"values": ["I", "U", "D"], "random": True}),
        ("op_date", T.DateType(), {"uniqueValues": 300, "random": True}),
        ("sequence_num", T.IntegerType(), {"minValue": 1000000, "maxValue": 2000000}),
    ]

    generator = dg.DataGenerator(
        spark, name="random_taxi_trip_dataset", rows=10000, partitions=4
    )
    # Register the columns one at a time, in the order listed above.
    for column_name, column_type, options in column_specs:
        generator = generator.withColumn(column_name, column_type, **options)

    return generator.build()

In [0]:
# Exploratory check: list the entries at the DBFS root.
dbutils.fs.ls('dbfs:/')

In [0]:
# Exploratory check: list the entries under dbfs:/tmp/.
dbutils.fs.ls("dbfs:/tmp/")

In [0]:
# Explicit creation of the data and checkpoint directories is left disabled.
# NOTE(review): presumably the JSON writer creates the output path on first
# write, making these calls unnecessary - confirm before deleting them.
# dbutils.fs.mkdirs("dbfs:/tmp/chp_02/taxi_data")
# dbutils.fs.mkdirs("dbfs:/tmp/chp_02/taxi_data_chkpnt")
# Create the chp_02 schema in the hive_metastore catalog unless it already exists.
spark.sql("CREATE SCHEMA IF NOT EXISTS hive_metastore.chp_02")

In [0]:
import random

# The widget value is converted with int() once, before the loop starts.
num_files = int(dbutils.widgets.get("wNumberOfFiles"))

for _ in range(num_files):
    trip_df = generate_random_trip_data()
    # A random numeric suffix distinguishes the output paths; a repeated
    # suffix is not an error because the write uses append mode.
    suffix = random.randint(1, 1000000)
    output_path = f"/tmp/chp_02/taxi_data/taxi_data_{suffix}.json"
    trip_df.write.mode("append").json(output_path)
    print(f"Wrote trip data to '{output_path}'")

In [0]:
# Read back every generated batch that matches the taxi_data_*.json pattern
# and render the combined result in the notebook.
taxi_data_glob = "/tmp/chp_02/taxi_data/taxi_data_*.json"
df = spark.read.json(taxi_data_glob)
df.display()

In [0]:
# Optional - Cleanup random generated data
# Recursively removing the chapter root also removes the taxi_data and
# taxi_data_chkpnt directories beneath it, so a single call is sufficient.
dbutils.fs.rm("/tmp/chp_02/", recurse=True)
# Drop the schema that the setup cell creates (hive_metastore.chp_02);
# CASCADE also drops any objects that were created inside it.
spark.sql("DROP SCHEMA IF EXISTS hive_metastore.chp_02 CASCADE")
