#### Generate Synthetic Data

This notebook generates synthetic Superstore sales data using the original dataset
as a baseline. It supports:
- **Parameterized backfills** using Databricks jobs
- **Monthly** file generation
- **Partitioned** folder structure (one `year=YYYY` folder per year; the month is encoded in the file name)
- **rows_multiplier** widget for controlled data volume scaling
- **Realistic metric randomization**

##### **Output Path** :
`/raw_data/year=YYYY/superstore-sales-mon-yyyy.csv`


In [0]:
# Import modules
from pyspark.sql.functions import (
    col, rand, expr, add_months, trunc, lit, explode, sequence, monotonically_increasing_id, concat, to_date, coalesce
)

from datetime import date
from dateutil.relativedelta import relativedelta

#### Run Config Notebook

In [0]:
%run ../../configs/config_notebook

#### Widgets for Job Parameters

In [0]:
# Register the job parameters as notebook widgets.
# Free-text parameters as (widget name, default value):
#   start_year_month - first month to generate (YYYY-MM)
#   end_year_month   - last month to generate (YYYY-MM)
#   rows_multiplier  - sampling fraction that controls the output volume
for _widget_name, _widget_default in (
    ("start_year_month", "2018-01"),
    ("end_year_month", "2018-02"),
    ("rows_multiplier", "1.0"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

# Idempotency switch: keep an already generated month or regenerate it.
dbutils.widgets.dropdown(
    "run_mode",
    "skip_if_exists",
    ["skip_if_exists", "overwrite"],
)


#### Read Widget Values

In [0]:
# Read and validate widget values
def _read_year_month_widget(widget_name):
    """Return the first day of the month given by a YYYY-MM widget value.

    Raises:
        ValueError: if the widget value is not a valid YYYY-MM string.
    """
    raw_value = dbutils.widgets.get(widget_name).strip()
    try:
        return date.fromisoformat(f"{raw_value}-01")
    except ValueError as err:
        raise ValueError(
            f"Widget '{widget_name}' must be in YYYY-MM format, got {raw_value!r}"
        ) from err


start_year_month = _read_year_month_widget("start_year_month")
end_year_month = _read_year_month_widget("end_year_month")

# An inverted range would otherwise make the generation loop a silent no-op.
if start_year_month > end_year_month:
    raise ValueError(
        f"start_year_month ({start_year_month:%Y-%m}) must not be after "
        f"end_year_month ({end_year_month:%Y-%m})"
    )

# rows_multiplier is used as the sampling fraction, so it must be a positive,
# finite number (the chained comparison also rejects NaN).
rows_multiplier = float(dbutils.widgets.get("rows_multiplier"))
if not 0 < rows_multiplier < float("inf"):
    raise ValueError(
        f"rows_multiplier must be a positive finite number, got {rows_multiplier}"
    )

run_mode = dbutils.widgets.get("run_mode")

#### Paths

In [0]:
# The baseline CSV and the generated files both live under the raw-data volume
# taken from the shared configuration.
TARGET_PATH = VOLUMES['raw_data']
SOURCE_FILE = f"{TARGET_PATH}/Sample - Superstore.csv"

#### Load base dataset

In [0]:
# Read the baseline CSV. The options are applied one by one to the reader:
# header row, schema inference, double-quote quoting/escaping, multi-line
# fields, and PERMISSIVE handling of malformed records.
_csv_reader = spark.read
for _option_name, _option_value in (
    ("header", True),
    ("inferSchema", True),
    ("quote", '"'),
    ("escape", '"'),
    ("multiLine", True),
    ("mode", "PERMISSIVE"),
):
    _csv_reader = _csv_reader.option(_option_name, _option_value)

df_base_file = _csv_reader.csv(SOURCE_FILE)


#### Normalize Columns Data Type

In [0]:
# Normalize column types so downstream logic does not depend on what
# inferSchema happened to pick.
#
# Dates are parsed with two candidate patterns and the first match wins:
#   * "yyyy-MM-dd" - ISO dates
#   * "M/d/yyyy"   - US-style dates; single-letter M/d accepts both padded
#                    ("01/05/2016") and unpadded ("1/5/2016") values, whereas
#                    "MM/dd/yyyy" only accepts the padded form under the
#                    Spark 3 datetime parser.
df_base_file = (
    df_base_file
    .withColumn("Row ID", col("Row ID").cast("int"))
    .withColumn(
        "Order Date",
        coalesce(
            to_date(col("Order Date"), "yyyy-MM-dd"),
            to_date(col("Order Date"), "M/d/yyyy"),
        ),
    )
    .withColumn(
        "Ship Date",
        coalesce(
            to_date(col("Ship Date"), "yyyy-MM-dd"),
            to_date(col("Ship Date"), "M/d/yyyy"),
        ),
    )
    # Metrics are kept as doubles so the randomization math works uniformly.
    .withColumn("Quantity", col("Quantity").cast("double"))
    .withColumn("Sales", col("Sales").cast("double"))
    .withColumn("Profit", col("Profit").cast("double"))
    .withColumn("Discount", col("Discount").cast("double"))
)


#### Helper function for Monthly Data Generator

In [0]:
def generate_monthly_data(df, year, month):
    """Build one month of synthetic order-line rows from the baseline data.

    The function samples customers from ``df``, creates one synthetic order
    per distinct customer/location combination, expands every order into 1-5
    line items, attaches one randomly chosen baseline product row to each
    line item and finally perturbs the metrics.

    The sampling fraction is read from the notebook-level ``rows_multiplier``
    variable.

    Args:
        df: Baseline DataFrame containing the customer columns, the product
            columns and the numeric metrics selected below.
        year: Calendar year of the month to generate (used in the Order ID
            and the order dates).
        month: Calendar month (1-12) to generate.

    Returns:
        A DataFrame with the 21 columns listed in the final ``select``.
    """
    # Function-scope imports: only this function needs them.
    from pyspark.sql import Window
    from pyspark.sql.functions import floor, row_number

    base_date = f"{year}-{month:02d}-01"

    #===============================================================
    #         Step 1: Generate unique orders
    #===============================================================
    orders = (
        df
        .sample(withReplacement=True, fraction=rows_multiplier)
        .select("Customer ID", "Customer Name", "Segment", "Country", "State", "City", "Postal Code", "Region")
        # distinct() collapses repeated draws, so the number of orders is
        # capped by the number of distinct customer/location rows in df.
        .distinct()

        # Generate Order ID prefix (80% 'CA', 20% 'US')
        .withColumn(
            "country_prefix",
            expr("CASE WHEN rand() < 0.8 THEN 'CA' ELSE 'US' END")
        )

        # Generate Order ID format (CA|US-YYYY-######)
        .withColumn(
            "Order ID",
            concat(
                col("country_prefix"),
                lit(f"-{year}-"),
                expr("lpad(cast(cast(rand()*1000000 as int) as string), 6, '0')")
            )
        )

        # Dates logic: order on day 1-27 of the month, ship 0-4 days later
        .withColumn(
            "Order Date",
            expr(f"date_add(trunc(date('{base_date}'), 'MM'), cast(rand()*27 as int))")
        )
        .withColumn(
            "Ship Date",
            expr("date_add(`Order Date`, cast(rand()*5 as int))")
        )

        # Ship Mode logic.
        # Every rand() call in a SQL expression is an independent draw, so the
        # cumulative thresholds must all be compared against ONE draw per row;
        # otherwise the branches do not get 25% each.
        .withColumn("ship_mode_draw", rand())
        .withColumn(
            "Ship Mode",
            expr("""
              CASE
                WHEN ship_mode_draw < 0.25 THEN 'Second Class'
                WHEN ship_mode_draw < 0.50 THEN 'Standard Class'
                WHEN ship_mode_draw < 0.75 THEN 'First Class'
                ELSE 'Same Day'
              END
            """)
        )

        # Order line items: 60% / 25% / 10% / 4% / 1% for 1..5 lines,
        # again driven by a single draw per order.
        .withColumn("order_line_draw", rand())
        .withColumn(
            "order_line_count",
            expr("""
              CASE
                WHEN order_line_draw < 0.60 THEN 1
                WHEN order_line_draw < 0.85 THEN 2
                WHEN order_line_draw < 0.95 THEN 3
                WHEN order_line_draw < 0.99 THEN 4
                ELSE 5
              END
            """)
        )
    )

    #==================================================================
    #         Step 2: Explode orders into order line items
    #==================================================================
    order_lines = (
        orders
        .withColumn("order_line", explode(sequence(lit(1), col("order_line_count"))))
        .drop("order_line_count", "country_prefix", "ship_mode_draw", "order_line_draw")
    )

    #==================================================================
    #         Step 3: Join order lines with base file
    #==================================================================
    products_df = df.select("Product ID", "Category", "Sub-Category", "Product Name", "Sales", "Quantity", "Discount", "Profit")

    # Give every baseline product row a contiguous 0-based index so that each
    # order line can pick exactly one product. A join without a key would pair
    # every order line with every product row (Cartesian product).
    # The un-partitioned window pulls the product rows into one partition.
    product_count = products_df.count()
    indexed_products = products_df.withColumn(
        "product_idx",
        (row_number().over(Window.orderBy(monotonically_increasing_id())) - 1).cast("long")
    )

    final_df = (
        order_lines
        # rand() is in [0, 1), so the index is always in [0, product_count - 1]
        .withColumn("product_idx", floor(rand() * product_count).cast("long"))
        .join(indexed_products, on="product_idx", how="left")

        # Unique (not consecutive) increasing Row ID
        .withColumn("Row ID", monotonically_increasing_id())

        # Metrics logic; Profit is derived from the already perturbed
        # Sales and Discount values.
        .withColumn("Quantity", (col("Quantity") + rand() * 2).cast("int"))
        .withColumn("Discount", expr("least(0.8, greatest(0, Discount + rand()*0.1))"))
        .withColumn("Sales", col("Sales") * (rand()*0.2 + 0.9))
        .withColumn("Profit", col("Sales") * (1 - col("Discount")) * (rand()*0.3 - 0.05))
    )

    return final_df.select(
        "Row ID",
        "Order ID",
        "Order Date",
        "Ship Date",
        "Ship Mode",
        "Customer ID",
        "Customer Name",
        "Segment",
        "Country",
        "City",
        "State",
        "Postal Code",
        "Region",
        "Product ID",
        "Category",
        "Sub-Category",
        "Product Name",
        "Sales",
        "Quantity",
        "Discount",
        "Profit"
    )
    

#### Generate monthly files

In [0]:
# Generate one CSV file per month in [start_year_month, end_year_month].
current = start_year_month
while current <= end_year_month:
    run_date = current
    # Advance the cursor up front so that every path through the loop body
    # (skip or generate) moves on to the next month.
    current += relativedelta(months=1)

    year = run_date.year
    month = run_date.month
    month_name = run_date.strftime("%b").lower()

    TARGET_DIR = f"{TARGET_PATH}/year={year}"
    TEMP_DIR = f"{TARGET_PATH}/_temp/year={year}/month={month:02d}"
    FINAL_FILE = f"{TARGET_DIR}/superstore-sales-{month_name}-{year}.csv"

    #====================================================================
    #       Idempotency check
    #====================================================================
    # A failing listing is treated as "file not generated yet". Only regular
    # exceptions are caught so that interrupts still stop the job.
    try:
        dbutils.fs.ls(FINAL_FILE)
    except Exception:
        file_exists = False
    else:
        file_exists = True

    if file_exists and run_mode == "skip_if_exists":
        print(f"Skipping {year}-{month:02d} as file already exists")
        continue

    #====================================================================
    #       Generate data
    #====================================================================
    df_year_month = generate_monthly_data(df_base_file, year, month)
    (
        df_year_month
        .coalesce(1)  # single partition -> single part file
        .write
        .mode("overwrite")
        .option("header", True)
        .csv(TEMP_DIR)
    )

    #=====================================================================
    #       Move data to final location
    #=====================================================================
    files = dbutils.fs.ls(TEMP_DIR)
    part_files = [f.path for f in files if f.name.startswith("part-")]
    if not part_files:
        raise RuntimeError(f"No part file found in {TEMP_DIR} for {year}-{month:02d}")
    tmp_file = part_files[0]

    dbutils.fs.mkdirs(TARGET_DIR)
    if file_exists:
        # Overwrite mode: drop the stale file first so the move does not
        # depend on how mv treats an existing destination.
        dbutils.fs.rm(FINAL_FILE)
    dbutils.fs.mv(tmp_file, FINAL_FILE)
    dbutils.fs.rm(TEMP_DIR, recurse=True)

    print(f"Generated {FINAL_FILE}")
