# Synthetic data for dbt

* Creates synthetic "orders" data which describe 10 customers, 2 suppliers, 40 orders over the last 10 days. The first 5 orders have multiple suppliers in their supplier history.
* Also creates synthetic "items" data which relates to these orders. Every fifth order has multiple items involved.
* Just creates deduped data for now.

In [None]:
import datetime
import random
import pandas as pd

# Timestamp layout used for every time field written by this notebook.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def random_datetime(start, end):
    """
    Return a random timestamp string between two datetime objects.

    A whole number of seconds is picked with ``random.randrange`` from the
    span ``end - start`` and added to ``start``, so the result lies in the
    half-open interval [start, end). Sub-second parts of the span are
    ignored when sizing the interval.

    Args:
        start: ``datetime.datetime`` lower bound (inclusive).
        end: ``datetime.datetime`` upper bound (exclusive).

    Returns:
        str: the chosen moment formatted with ``DATETIME_FORMAT``. When the
        span is shorter than one second, ``start`` itself is returned.

    Raises:
        ValueError: if ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError("end must not be earlier than start")

    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds

    # randrange(0) raises "empty range"; a zero-length span has exactly one
    # valid answer, so return it instead of failing.
    if int_delta == 0:
        return start.strftime(DATETIME_FORMAT)

    random_second = random.randrange(int_delta)
    dt = start + datetime.timedelta(seconds=random_second)
    return dt.strftime(DATETIME_FORMAT)

In [68]:
# Size of the synthetic data set.
n_customers = 10
n_suppliers = 2
n_orders = 40
n_days = 10

# Identifier pools that the row-building cell samples from.
custs = [f"cust_{i}" for i in range(n_customers)]
suppliers = [f"sup_{i}" for i in range(n_suppliers)]
items = ["fish", "bread", "milk", "yoghurt", "burger", "pizza", "chips"]

# `now` must be bound explicitly: the start of the window and the partition
# timestamp below are both derived from it.
now = datetime.datetime.now()
end = now
start = now - datetime.timedelta(days=n_days)

# One random order timestamp (already formatted as a string) per order.
order_times = [random_datetime(start, end) for _ in range(n_orders)]

# Partition timestamp is midnight on the first day of the current month.
# Zero-pad via the format spec; concatenating "0" with the integer month
# raises TypeError for January-September.
month = f"{now.month:02d}"
partition_time = "-".join([str(now.year), month, "01"]) + " 00:00:00"

In [82]:
# Build one "orders" row and at least one "items" row per synthetic order.
order_rows = []
item_rows = []

for i in range(n_orders):
    t = order_times[i]
    sup = random.choice(suppliers)
    cust = random.choice(custs)
    item = random.choice(items)

    supplier_history = [{"supplierid": sup, "time": t}]

    # For the first few orders, add supplier history related to another
    # supplier a second later.
    if i < 5:
        # Pick directly from the alternatives rather than re-drawing until
        # the value differs: re-drawing never terminates when there is only
        # one supplier. With no alternative, the extra entry is skipped.
        other_suppliers = [s for s in suppliers if s != sup]
        if other_suppliers:
            new_sup = random.choice(other_suppliers)
            new_time = (
                datetime.datetime.strptime(t, DATETIME_FORMAT)
                + datetime.timedelta(seconds=1)
            )
            supplier_history.append(
                {"supplierid": new_sup, "time": new_time.strftime(DATETIME_FORMAT)}
            )

    order_row = {
        "orderid": f"order_{i}",
        "clientid": "ravelintest",
        "customerid": cust,
        "eventtime": t,
        "createdat": t,
        "updatedat": t,
        "partitiontime": partition_time,
        "deleted": False,
        "supplierhistory": supplier_history,
        "orderstatushistory": [],
    }

    order_rows.append(order_row)

    item_row = {
        "orderid": f"order_{i}",
        "clientid": "ravelintest",
        "customerid": cust,
        "eventtime": t,
        "createdat": t,
        "updatedat": t,
        "itemSet": f"order_{i}_{item}",  # primary key
        "partitiontime": partition_time,
        "deleted": False,
        "name": item,
        "quantity": 1,
    }

    item_rows.append(item_row)

    # For every fifth order (0, 5, 10, ...), pretend they ordered a second,
    # different item.
    if i % 5 == 0:
        # Same reasoning as for suppliers: choose among the other items so a
        # one-element `items` list cannot cause an endless loop.
        other_items = [name for name in items if name != item]
        if other_items:
            new_item = random.choice(other_items)
            # Copy the first item row and override only the item-specific
            # fields; key order is preserved.
            row_copy = item_row.copy()
            row_copy["itemSet"] = f"order_{i}_{new_item}"
            row_copy["name"] = new_item

            item_rows.append(row_copy)

print(len(order_rows))
print(len(item_rows))

40
48


In [83]:
# Preview the first 10 order rows as a DataFrame (shown as the cell output).
pd.DataFrame(order_rows).head(10)

Unnamed: 0,orderid,clientid,customerid,eventtime,createdat,updatedat,partitiontime,deleted,supplierhistory,orderstatushistory
0,order_0,ravelintest,cust_1,2020-11-06 15:58:35,2020-11-06 15:58:35,2020-11-06 15:58:35,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_1', 'time': '2020-11-06 1...",[]
1,order_1,ravelintest,cust_7,2020-11-10 08:09:19,2020-11-10 08:09:19,2020-11-10 08:09:19,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_1', 'time': '2020-11-10 0...",[]
2,order_2,ravelintest,cust_5,2020-11-08 10:37:57,2020-11-08 10:37:57,2020-11-08 10:37:57,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_1', 'time': '2020-11-08 1...",[]
3,order_3,ravelintest,cust_1,2020-11-05 13:46:24,2020-11-05 13:46:24,2020-11-05 13:46:24,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_1', 'time': '2020-11-05 1...",[]
4,order_4,ravelintest,cust_2,2020-11-08 22:29:03,2020-11-08 22:29:03,2020-11-08 22:29:03,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_0', 'time': '2020-11-08 2...",[]
5,order_5,ravelintest,cust_1,2020-11-09 23:56:13,2020-11-09 23:56:13,2020-11-09 23:56:13,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_1', 'time': '2020-11-09 2...",[]
6,order_6,ravelintest,cust_5,2020-11-08 13:25:21,2020-11-08 13:25:21,2020-11-08 13:25:21,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_0', 'time': '2020-11-08 1...",[]
7,order_7,ravelintest,cust_5,2020-11-04 21:52:32,2020-11-04 21:52:32,2020-11-04 21:52:32,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_0', 'time': '2020-11-04 2...",[]
8,order_8,ravelintest,cust_4,2020-11-11 02:53:25,2020-11-11 02:53:25,2020-11-11 02:53:25,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_0', 'time': '2020-11-11 0...",[]
9,order_9,ravelintest,cust_7,2020-11-08 21:11:20,2020-11-08 21:11:20,2020-11-08 21:11:20,2020-11-01 00:00:00,False,"[{'supplierid': 'sup_1', 'time': '2020-11-08 2...",[]


In [84]:
# Preview the first 10 item rows as a DataFrame (shown as the cell output).
pd.DataFrame(item_rows).head(10)

Unnamed: 0,orderid,clientid,customerid,eventtime,createdat,updatedat,itemSet,partitiontime,deleted,name,quantity
0,order_0,ravelintest,cust_1,2020-11-06 15:58:35,2020-11-06 15:58:35,2020-11-06 15:58:35,order_0_burger,2020-11-01 00:00:00,False,burger,1
1,order_0,ravelintest,cust_1,2020-11-06 15:58:35,2020-11-06 15:58:35,2020-11-06 15:58:35,order_0_chips,2020-11-01 00:00:00,False,chips,1
2,order_1,ravelintest,cust_7,2020-11-10 08:09:19,2020-11-10 08:09:19,2020-11-10 08:09:19,order_1_pizza,2020-11-01 00:00:00,False,pizza,1
3,order_2,ravelintest,cust_5,2020-11-08 10:37:57,2020-11-08 10:37:57,2020-11-08 10:37:57,order_2_bread,2020-11-01 00:00:00,False,bread,1
4,order_3,ravelintest,cust_1,2020-11-05 13:46:24,2020-11-05 13:46:24,2020-11-05 13:46:24,order_3_burger,2020-11-01 00:00:00,False,burger,1
5,order_4,ravelintest,cust_2,2020-11-08 22:29:03,2020-11-08 22:29:03,2020-11-08 22:29:03,order_4_pizza,2020-11-01 00:00:00,False,pizza,1
6,order_5,ravelintest,cust_1,2020-11-09 23:56:13,2020-11-09 23:56:13,2020-11-09 23:56:13,order_5_milk,2020-11-01 00:00:00,False,milk,1
7,order_5,ravelintest,cust_1,2020-11-09 23:56:13,2020-11-09 23:56:13,2020-11-09 23:56:13,order_5_chips,2020-11-01 00:00:00,False,chips,1
8,order_6,ravelintest,cust_5,2020-11-08 13:25:21,2020-11-08 13:25:21,2020-11-08 13:25:21,order_6_yoghurt,2020-11-01 00:00:00,False,yoghurt,1
9,order_7,ravelintest,cust_5,2020-11-04 21:52:32,2020-11-04 21:52:32,2020-11-04 21:52:32,order_7_milk,2020-11-01 00:00:00,False,milk,1


# Save in JSONL format, then upload to a BigQuery project

In [86]:
import json

# Dump each table as JSON Lines: one JSON-encoded row per line, orders first
# and items second.
for path, rows in (("orders.jsonl", order_rows), ("items.jsonl", item_rows)):
    with open(path, 'w') as f:
        f.writelines(json.dumps(row) + "\n" for row in rows)