In [64]:
import pandas as pd
import json

In [65]:
# Load the three CSV tables used by the rest of the notebook. Paths are
# relative, so the files are read from the current working directory.
orders, order_items, customers = map(
    pd.read_csv,
    (
        "olist_orders_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_customers_dataset.csv",
    ),
)

In [66]:
# Show the first rows of each table; the cell's value is a 3-tuple of DataFrames.
tuple(frame.head() for frame in (orders, order_items, customers))

(                           order_id                       customer_id  \
 0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
 1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
 2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
 3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
 4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   
 
   order_status order_purchase_timestamp    order_approved_at  \
 0    delivered      2017-10-02 10:56:33  2017-10-02 11:07:15   
 1    delivered      2018-07-24 20:41:37  2018-07-26 03:24:27   
 2    delivered      2018-08-08 08:38:49  2018-08-08 08:55:23   
 3    delivered      2017-11-18 19:28:06  2017-11-18 19:45:59   
 4    delivered      2018-02-13 21:18:39  2018-02-13 22:20:29   
 
   order_delivered_carrier_date order_delivered_customer_date  \
 0          2017-10-04 19:55:00           2017-10-10 21:25:13   
 1          2018-07-26 14:31:00 

In [67]:
def make_customer(order_id):
    """Build the customer struct for a single order.

    Looks up the order's ``customer_id`` in the module-level ``orders`` frame,
    then the matching row in the module-level ``customers`` frame.

    Args:
        order_id: Value to match against ``orders["order_id"]``.

    Returns:
        dict with keys ``customer_id``, ``city`` and ``state``. ``city`` and
        ``state`` are ``None`` when the customer id has no row in
        ``customers``.

    Raises:
        IndexError: if ``order_id`` does not appear in ``orders``.
    """
    matching_ids = orders.loc[orders["order_id"] == order_id, "customer_id"]
    if matching_ids.empty:
        # Same exception type as indexing an empty selection, but the message
        # names the offending order instead of "index 0 is out of bounds".
        raise IndexError(f"order_id {order_id!r} not found in orders")
    cust_id = matching_ids.iloc[0]

    cust_rows = customers.loc[customers["customer_id"] == cust_id]
    if cust_rows.empty:
        return {"customer_id": cust_id, "city": None, "state": None}

    # When several rows match, the first one wins.
    return {
        "customer_id": cust_id,
        "city": cust_rows["customer_city"].iloc[0],
        "state": cust_rows["customer_state"].iloc[0],
    }


In [68]:
import json

def make_order_items(order_id):
    """Return the item structs for one order.

    Selects the rows of the module-level ``order_items`` frame whose
    ``order_id`` matches and converts each row into a dict with the keys
    ``product_id``, ``product_name``, ``quantity``, ``price`` and
    ``product_features``.

    ``product_name`` is always "unknown" and ``quantity`` is always 1.
    ``product_features`` is a JSON string holding the row's product and
    seller ids. An order with no matching rows yields an empty list.
    """
    matching_rows = order_items.loc[order_items["order_id"] == order_id]
    return [
        {
            "product_id": line["product_id"],
            "product_name": "unknown",
            "quantity": 1,
            "price": float(line["price"]),
            "product_features": json.dumps(
                {"product_id": line["product_id"], "seller_id": line["seller_id"]}
            ),
        }
        for _, line in matching_rows.iterrows()
    ]


# Quick check: build the item structs for the order of the first row in order_items.
example_order_id = order_items["order_id"].iat[0]
result = make_order_items(example_order_id)
print(result)

[{'product_id': '4244733e06e7ecb4970a6e2683c13e61', 'product_name': 'unknown', 'quantity': 1, 'price': 58.9, 'product_features': '{"product_id": "4244733e06e7ecb4970a6e2683c13e61", "seller_id": "48436dade18ac8b2bce089ec2a041202"}'}]


In [69]:
def make_campaign_details():
    """Return one randomly chosen campaign serialised as a JSON string.

    The pool holds two coupon campaigns plus an empty dict, which serialises
    to "{}", so the result is one of three possible strings.
    """
    # Imported here so the function works no matter which other cells have
    # run: no cell above this one imports `random`.
    import random

    campaigns = [
        {"coupon_code": "WELCOME10", "discount": 10.0, "channel": "web"},
        {"coupon_code": "SUMMER20", "discount": 20.0, "channel": "mobile"},
        {}
    ]
    return json.dumps(random.choice(campaigns))

In [70]:
# Dump every order_items row as one JSON object per line (newline-delimited JSON).
json_records = order_items.to_dict(orient="records")
with open("order_items_ndjson.json", "w", encoding="utf-8") as ndjson_file:
    ndjson_file.writelines(json.dumps(record) + "\n" for record in json_records)

In [71]:
import numpy as np

# Campaign pool indexed by the integer stored in the `campaign` column.
# It has to be defined at module level here: the list of the same name inside
# make_campaign_details() is local to that function, so without this
# definition the lines below raise NameError on a fresh kernel.
campaigns = [
    {"coupon_code": "WELCOME10", "discount": 10.0, "channel": "web"},
    {"coupon_code": "SUMMER20", "discount": 20.0, "channel": "mobile"},
    {},
]

# Fixed seed so that Restart & Run All assigns the same campaign to each row.
campaign_rng = np.random.default_rng(42)
order_items['campaign'] = campaign_rng.integers(len(campaigns), size=len(order_items))
order_items['campaign_details'] = order_items['campaign'].apply(lambda i: campaigns[i])

In [72]:
# Derive campaign_details from the integer `campaign` column by indexing into
# the module-level `campaigns` list.
# NOTE(review): the preceding cell already fills this column the same way, so
# this cell looks redundant - confirm and consider deleting it.
order_items['campaign_details'] = order_items['campaign'].map(lambda idx: campaigns[idx])

In [None]:
import random, json

# Index order-item records by order_id in one pass over the frame. Each record
# keeps every order_items column, including order_id itself. This avoids
# groupby(...).apply(...), whose handling of the grouping column is deprecated
# in recent pandas and which builds one intermediate DataFrame per order.
grouped_items = {}
for item_record in order_items.to_dict(orient="records"):
    grouped_items.setdefault(item_record["order_id"], []).append(item_record)

customer_map = customers.set_index("customer_id")[["customer_city", "customer_state"]].to_dict(orient="index")
# Not needed by the loop below (each order row already carries its
# customer_id); still defined in case other cells look it up.
order_to_customer = orders.set_index("order_id")["customer_id"].to_dict()

# Loop-invariant pieces, built once instead of once per order.
campaign_options = [
    {"coupon_code": "WELCOME10", "discount": 10.0, "channel": "web"},
    {"coupon_code": "SUMMER20", "discount": 20.0, "channel": "mobile"},
    {},
]
unknown_customer = {"customer_city": None, "customer_state": None}
# Dedicated seeded generator so re-running the cell reproduces the same
# campaign for every order without touching the global `random` state.
campaign_rng = random.Random(42)

# One nested record per order: scalar fields, a customer struct, the list of
# item records for the order, and a randomly assigned campaign struct.
full_data = []
for order_id, customer_id, purchased_at, status in zip(
    orders["order_id"],
    orders["customer_id"],
    orders["order_purchase_timestamp"],
    orders["order_status"],
):
    cust_info = customer_map.get(customer_id, unknown_customer)
    record = {
        "order_id": order_id,
        "order_timestamp": purchased_at,
        "customer": {
            "customer_id": customer_id,
            "city": cust_info["customer_city"],
            "state": cust_info["customer_state"]
        },
        # Orders without any item rows get an empty list.
        "order_items": grouped_items.get(order_id, []),
        # Copy so that records never share one mutable campaign dict.
        "campaign_details": dict(campaign_rng.choice(campaign_options)),
        "order_status": status
    }
    full_data.append(record)

# Newline-delimited JSON: one order record per line.
with open("olist_bigquery_nested_orders.json", "w", encoding="utf-8") as f:
    for rec in full_data:
        f.write(json.dumps(rec) + "\n")

In [None]:
# Print the first five lines of the nested export. readline() keeps each
# line's trailing newline and print() adds another, so the printed records
# are separated by blank lines.
with open("olist_bigquery_nested_orders.json", encoding="utf-8") as preview_file:
    for _ in range(5):
        print(preview_file.readline())

In [None]:
import json

def convert_values_to_str(obj):
    """Recursively replace every ``int`` inside ``obj`` with its ``str`` form.

    Dicts are rebuilt with the same keys and converted values, lists are
    rebuilt element by element, and any other value (floats, strings,
    ``None``, tuples, ...) is returned unchanged.

    Note: ``bool`` is a subclass of ``int``, so ``True``/``False`` are also
    turned into the strings "True"/"False".
    """
    if isinstance(obj, int):
        return str(obj)
    if isinstance(obj, list):
        return list(map(convert_values_to_str, obj))
    if isinstance(obj, dict):
        return {key: convert_values_to_str(value) for key, value in obj.items()}
    return obj

# Rewrite the nested export, overwriting the existing file, with every integer
# value stringified by convert_values_to_str. One JSON record per line.
with open("olist_bigquery_nested_orders.json", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(convert_values_to_str(order_record)) + "\n"
        for order_record in full_data
    )


In [None]:
# Export each table as newline-delimited JSON (one record per line).
# Each table gets its own file: writing all three to the same path would leave
# only the last table written (customers) on disk.
orders.to_json("orders_output.json", orient="records", lines=True)
order_items.to_json("order_items_output.json", orient="records", lines=True)
customers.to_json("customers_output.json", orient="records", lines=True)

