In [0]:

%pip install dbldatagen

In [0]:
import dbldatagen as dg

# Build dbldatagen's "basic/user" dataset with one million rows.
user_dataset = dg.Datasets(spark, "basic/user")
user_spec = user_dataset.get(rows=1_000_000)
df = user_spec.build()

# Count the rows once here so the next cell can report the figure.
num_rows = df.count()

In [0]:
# Report how many rows the generated user DataFrame contains
# (num_rows is computed in the previous cell via df.count()).
print(num_rows)

In [0]:
%sql
-- Make sure the volume exists before the later cells write their source files
-- under /Volumes/magnusp_catalog/training/source; a no-op when it is already there.
CREATE VOLUME IF NOT EXISTS magnusp_catalog.training.source

In [0]:
# Write the generated user data as CSV with a header row, replacing any
# output left at this path by a previous run.
user_csv_path = "/Volumes/magnusp_catalog/training/source/user.csv"
(
    df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save(user_csv_path)
)

In [0]:
import dbldatagen as dg
import pyspark.sql.functions as F

# Clear cache so that if we run multiple times to check performance,
# we're not relying on results cached by a previous run.
spark.catalog.clearCache()

UNIQUE_PLANS = 20
PLAN_MIN_VALUE = 100

shuffle_partitions_requested = 8
partitions_requested = 1
data_rows = UNIQUE_PLANS  # we'll generate one row for each plan

spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000)


# Specification of the plans table: one row per plan, with per-unit prices.
plan_dataspec = (
    dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
    .withColumn("plan_id", "int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS)
    # use plan_id as root value
    .withColumn("plan_name", prefix="plan", baseColumn="plan_id")

    # note: the default step is 1, so you must specify a step for small number ranges
    .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050,
                step=0.005, random=True)
    .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02,
                step=0.001, random=True)
    .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01,
                step=0.001, random=True)

    # we're modelling long distance and international prices simplistically -
    # each is a multiplier that's applied to the base rate.
    # The multiplier columns are declared with omit=True; they only feed the
    # derived *_cost_per_minute expressions below.
    .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05,
                random=True, distribution="normal", omit=True)
    .withColumn("ld_cost_per_minute", "decimal(5,3)",
                expr="cost_per_minute * ld_multiplier",
                baseColumns=['cost_per_minute', 'ld_multiplier'])
    .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05,
                random=True, distribution="normal", omit=True)
    .withColumn("intl_cost_per_minute", "decimal(5,3)",
                expr="cost_per_minute * intl_multiplier",
                baseColumns=['cost_per_minute', 'intl_multiplier'])
)

# Cache the built plans because later cells both display and write df_plans.
df_plans = plan_dataspec.build().cache()

display(df_plans)

In [0]:
# Write the plans table as JSON, replacing any output from a previous run.
plans_json_path = "/Volumes/magnusp_catalog/training/source/plans.json"
(
    df_plans.write
    .format("json")
    .mode("overwrite")
    .save(plans_json_path)
)




In [0]:
# Read the plans JSON back from the volume and show it, as a check on
# what the write in the previous cell produced.
plans_json_reader = spark.read.format("json")
x = plans_json_reader.load("/Volumes/magnusp_catalog/training/source/plans.json")
display(x)

In [0]:
import dbldatagen as dg
import pyspark.sql.functions as F

# Sizing and key ranges for the customers dataset.
UNIQUE_CUSTOMERS = 50000
CUSTOMER_MIN_VALUE = 1000
DEVICE_MIN_VALUE = 1000000000
SUBSCRIBER_NUM_MIN_VALUE = 1000000000

# These settings are assigned *before* they are passed to spark.conf.set so
# that the cell does not depend on a value left behind by an earlier cell.
shuffle_partitions_requested = 8
partitions_requested = 8
data_rows = UNIQUE_CUSTOMERS

spark.catalog.clearCache()  # clear cache so that if we run multiple times to check
                            # performance, we're not relying on cache

spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000)

# Specification of the customers table.
# NOTE: PLAN_MIN_VALUE and UNIQUE_PLANS come from the plans cell, which must
# have been run first.
customer_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
            .withColumn("customer_id","decimal(10)", minValue=CUSTOMER_MIN_VALUE,
                        uniqueValues=UNIQUE_CUSTOMERS)
            # text template with two alternatives separated by "|"
            # (placeholder syntax is defined by dbldatagen - see its template docs)
            .withColumn("customer_name", template=r"\\w \\w|\\w a. \\w")

            # use the following for a simple sequence
            #.withColumn("device_id","decimal(10)", minValue=DEVICE_MIN_VALUE,
            #              uniqueValues=UNIQUE_CUSTOMERS)

            # device_id is derived from a hash of customer_id
            .withColumn("device_id","decimal(10)",  minValue=DEVICE_MIN_VALUE,
                        baseColumn="customer_id", baseColumnType="hash")

            # phone_number is derived from a hash of customer_id and customer_name
            .withColumn("phone_number","decimal(10)",  minValue=SUBSCRIBER_NUM_MIN_VALUE,
                        baseColumn=["customer_id", "customer_name"], baseColumnType="hash")

            # for email, we'll just use the formatted phone number
            .withColumn("email","string",  format="subscriber_%s@myoperator.com",
                        baseColumn="phone_number")
            # random plan id drawn from the same id range as the plans table
            .withColumn("plan", "int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS,
                        random=True)
            )

# Hash-derived device ids and phone numbers can collide, so keep only one row
# per device_id and per phone_number before ordering and caching.
df_customers = (customer_dataspec.build()
                .dropDuplicates(["device_id"])
                .dropDuplicates(["phone_number"])
                .orderBy("customer_id")
                .cache()
               )

# Number of customers left after de-duplication; may be lower than
# UNIQUE_CUSTOMERS.
effective_customers = df_customers.count()


     


In [0]:
# Show the de-duplicated, ordered customers DataFrame built in the previous cell.
display(df_customers)

In [0]:
# Write the customers table as CSV with a header row, replacing any output
# left at this path by a previous run.
customers_csv_path = "/Volumes/magnusp_catalog/training/source/customers.csv"
(
    df_customers.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save(customers_csv_path)
)

In [0]:
# Create the schemas/plans directory inside the source volume.
# NOTE(review): presumably used as a schema location by a later ingestion
# step that is not part of this notebook - confirm.
plans_schema_dir = "/Volumes/magnusp_catalog/training/source/schemas/plans"
dbutils.fs.mkdirs(plans_schema_dir)

In [0]:
import dbldatagen as dg
import pyspark.sql.functions as F

# Start from an empty cache so repeated runs are not served from cached data.
spark.catalog.clearCache()

# Sizing of the events dataset.
# NOTE: UNIQUE_CUSTOMERS, CUSTOMER_MIN_VALUE and DEVICE_MIN_VALUE come from
# the customers cell, which must have been run first.
AVG_EVENTS_PER_CUSTOMER = 50
NUM_DAYS = 31
K_1 = 1000
MB_100 = 100 * 1000 * 1000
data_rows = AVG_EVENTS_PER_CUSTOMER * UNIQUE_CUSTOMERS * NUM_DAYS

shuffle_partitions_requested = 8
partitions_requested = 8

spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000)

# Gamma distributions used to skew towards short calls / small transfers.
call_length_distribution = dg.distributions.Gamma(shape=1.5, scale=2.0)
transfer_size_distribution = dg.distributions.Gamma(shape=0.75, scale=2.0)

# Event timestamps are drawn at one-second granularity from this range.
# NOTE(review): the range ends at 11:59:59 on 31 July, so the second half of
# the last day receives no events although NUM_DAYS is 31 - confirm whether
# 23:59:59 was intended.
event_ts_range = dg.DateRange("2020-07-01 00:00:00",
                              "2020-07-31 11:59:59",
                              "seconds=1")

# use random seed method of 'hash_fieldname' for better spread - default in later builds
events_generator = dg.DataGenerator(
    spark,
    rows=data_rows,
    partitions=partitions_requested,
    randomSeed=42,
    randomSeedMethod="hash_fieldname",
)

events_dataspec = (
    events_generator
    # Same derivation as in the customers dataset (hash of an id taken from the
    # customer id range) so that device ids match, but with the base id random.
    .withColumn("device_id_base", "decimal(10)",
                minValue=CUSTOMER_MIN_VALUE, uniqueValues=UNIQUE_CUSTOMERS,
                random=True, omit=True)
    .withColumn("device_id", "decimal(10)",
                minValue=DEVICE_MIN_VALUE,
                baseColumn="device_id_base", baseColumnType="hash")

    # Event type is picked at random according to the given weights.
    .withColumn("event_type", "string",
                values=[ "sms", "internet", "local call", "ld call", "intl call" ],
                weights=[50, 50, 20, 10, 5 ],
                random=True)

    # Helper columns (omit=True) holding the raw call length and transfer size.
    .withColumn("base_minutes", "decimal(7,2)",
                minValue=1.0, maxValue=100.0, step=0.1,
                distribution=call_length_distribution,
                random=True, omit=True)
    .withColumn("base_bytes_transferred", "decimal(12)",
                minValue=K_1, maxValue=MB_100,
                distribution=transfer_size_distribution,
                random=True, omit=True)

    # Minutes are only charged for call events; every other event type gets 0.
    .withColumn("minutes", "decimal(7,2)",
                baseColumn=["event_type", "base_minutes"],
                expr= """
                              case when event_type in ("local call", "ld call", "intl call")
                                  then base_minutes
                                  else 0
                              end
                               """)
    # Bytes are only recorded for internet events; every other event type gets 0.
    .withColumn("bytes_transferred", "decimal(12)",
                baseColumn=["event_type", "base_bytes_transferred"],
                expr= """
                              case when event_type = "internet"
                                   then base_bytes_transferred
                                   else 0
                              end
                               """)

    .withColumn("event_ts", "timestamp",
                data_range=event_ts_range,
                random=True)
)

df_events = events_dataspec.build()


In [0]:
# Save the generated events as a table, replacing its contents if it already exists.
events_table_name = "magnusp_catalog.training.events"
(
    df_events.write
    .mode("overwrite")
    .saveAsTable(events_table_name)
)