# 01 — Build Bronze Layer (PySpark on Dataproc, No Hive Required)

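We **avoid Hive dependencies** by writing each dataset as **Parquet** under a base path (`GCS_BASE`) and registering **temporary views** over the written files for SQL access. Raw CSV inputs are read from `DATA_BASE`. Both locations can be overridden through environment variables; note that the default `GCS_BASE` (`/user/nb`) is a scheme-less path rather than a `gs://` URI, so set `GCS_BASE` explicitly to write to a bucket.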

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import *
# Obtain the Spark session for this notebook (created on first use).
spark = (
    SparkSession.builder
    .appName("bronze-dataproc")
    .getOrCreate()
)

# Input and output locations; each can be overridden via an environment
# variable of the same name.
DATA_BASE = os.environ.get("DATA_BASE", "gs://spark-training-pp/etl_spark/raw")  # raw CSV inputs
GCS_BASE = os.environ.get("GCS_BASE", "/user/nb")  # root under which bronze Parquet is written
print("DATA_BASE:", DATA_BASE)
print("GCS_BASE:", GCS_BASE)

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType
# Bronze schemas: table name -> ordered list of (column name, Spark data type).
# Column order follows the order in which fields are declared here.
schemas = {
    "crm_cust_info": [
        ("cst_id", IntegerType()),
        ("cst_key", StringType()),
        ("cst_firstname", StringType()),
        ("cst_lastname", StringType()),
        ("cst_marital_status", StringType()),
        ("cst_gndr", StringType()),
        ("cst_create_date", DateType()),
    ],
    "crm_prd_info": [
        ("prd_id", IntegerType()),
        ("prd_key", StringType()),
        ("prd_nm", StringType()),
        ("prd_cost", IntegerType()),
        ("prd_line", StringType()),
        ("prd_start_dt", TimestampType()),
        ("prd_end_dt", TimestampType()),
    ],
    "crm_sales_details": [
        ("sls_ord_num", StringType()),
        ("sls_prd_key", StringType()),
        ("sls_cust_id", IntegerType()),
        # The three date columns are kept as raw integers at this layer.
        ("sls_order_dt", IntegerType()),
        ("sls_ship_dt", IntegerType()),
        ("sls_due_dt", IntegerType()),
        ("sls_sales", IntegerType()),
        ("sls_quantity", IntegerType()),
        ("sls_price", IntegerType()),
    ],
    "erp_loc_a101": [
        ("cid", StringType()),
        ("cntry", StringType()),
    ],
    "erp_cust_az12": [
        ("cid", StringType()),
        ("bdate", DateType()),
        ("gen", StringType()),
    ],
    "erp_px_cat_g1v2": [
        ("id", StringType()),
        ("cat", StringType()),
        ("subcat", StringType()),
        ("maintenance", StringType()),
    ],
}
def struct_schema(fields):
    """Build a ``StructType`` from ``(name, data_type)`` pairs.

    Every resulting field is declared nullable, and field order follows
    the order of ``fields``.
    """
    columns = []
    for name, dtype in fields:
        columns.append(StructField(name, dtype, True))
    return StructType(columns)

### Dataset: `crm_cust_info`

**Intent:** Raw, minimally-typed landing for reproducibility.

| Column | Type | Nullable | Description |
|---|---|---|---|
| `cst_id` | `int` | Yes | Natural numeric ID from CRM; used to join sales |
| `cst_key` | `string` | Yes | Business/customer code; maps ERP attributes |
| `cst_firstname` | `string` | Yes | Raw given name |
| `cst_lastname` | `string` | Yes | Raw surname |
| `cst_marital_status` | `string` | Yes | Raw code S/M/etc |
| `cst_gndr` | `string` | Yes | Raw gender code |
| `cst_create_date` | `date` | Yes | Creation date as delivered |

In [None]:
# Land the CRM customer CSV as bronze Parquet and expose it as a temp view.
table = "crm_cust_info"
inp = f"{DATA_BASE}/cust_info.csv"
out = f"{GCS_BASE}/bronze/{table}"
view = f"bronze_{table}"  # derived from `table` so the two cannot drift apart

# Read the CSV with the schema declared for this table, then overwrite any
# previous bronze output at the target path.
df = spark.read.csv(inp, header=True, schema=struct_schema(schemas[table]))
df.write.mode("overwrite").parquet(out)

# Read the written dataset back once and reuse that DataFrame for both the
# view registration and the row count.
bronze_df = spark.read.parquet(out)
bronze_df.createOrReplaceTempView(view)
print("Wrote Parquet:", out, "| Rows:", bronze_df.count())
spark.sql(f"SELECT * FROM {view} LIMIT 10").show(truncate=False)

### Dataset: `crm_prd_info`

**Intent:** Raw, minimally-typed landing for reproducibility.

| Column | Type | Nullable | Description |
|---|---|---|---|
| `prd_id` | `int` | Yes | Natural product ID |
| `prd_key` | `string` | Yes | Composite key (category+sku) |
| `prd_nm` | `string` | Yes | Product name |
| `prd_cost` | `int` | Yes | Unit cost |
| `prd_line` | `string` | Yes | Product line code M/R/S/T |
| `prd_start_dt` | `timestamp` | Yes | Start timestamp |
| `prd_end_dt` | `timestamp` | Yes | End timestamp (nullable) |

In [None]:
# Land the CRM product CSV as bronze Parquet and expose it as a temp view.
table = "crm_prd_info"
inp = f"{DATA_BASE}/prd_info.csv"
out = f"{GCS_BASE}/bronze/{table}"
view = f"bronze_{table}"  # derived from `table` so the two cannot drift apart

# Read the CSV with the schema declared for this table, then overwrite any
# previous bronze output at the target path.
df = spark.read.csv(inp, header=True, schema=struct_schema(schemas[table]))
df.write.mode("overwrite").parquet(out)

# Read the written dataset back once and reuse that DataFrame for both the
# view registration and the row count.
bronze_df = spark.read.parquet(out)
bronze_df.createOrReplaceTempView(view)
print("Wrote Parquet:", out, "| Rows:", bronze_df.count())
spark.sql(f"SELECT * FROM {view} LIMIT 10").show(truncate=False)

### Dataset: `crm_sales_details`

**Intent:** Raw, minimally-typed landing for reproducibility.

| Column | Type | Nullable | Description |
|---|---|---|---|
| `sls_ord_num` | `string` | Yes | Order number / line identifier |
| `sls_prd_key` | `string` | Yes | Product key references product |
| `sls_cust_id` | `int` | Yes | Customer id references customer |
| `sls_order_dt` | `int` | Yes | Integer date YYYYMMDD |
| `sls_ship_dt` | `int` | Yes | Integer date YYYYMMDD |
| `sls_due_dt` | `int` | Yes | Integer date YYYYMMDD |
| `sls_sales` | `int` | Yes | Extended amount |
| `sls_quantity` | `int` | Yes | Units sold |
| `sls_price` | `int` | Yes | Unit price |

In [None]:
# Land the CRM sales-details CSV as bronze Parquet and expose it as a temp view.
table = "crm_sales_details"
inp = f"{DATA_BASE}/sales_details.csv"
out = f"{GCS_BASE}/bronze/{table}"
view = f"bronze_{table}"  # derived from `table` so the two cannot drift apart

# Read the CSV with the schema declared for this table, then overwrite any
# previous bronze output at the target path.
df = spark.read.csv(inp, header=True, schema=struct_schema(schemas[table]))
df.write.mode("overwrite").parquet(out)

# Read the written dataset back once and reuse that DataFrame for both the
# view registration and the row count.
bronze_df = spark.read.parquet(out)
bronze_df.createOrReplaceTempView(view)
print("Wrote Parquet:", out, "| Rows:", bronze_df.count())
spark.sql(f"SELECT * FROM {view} LIMIT 10").show(truncate=False)

### Dataset: `erp_cust_az12`

**Intent:** Raw, minimally-typed landing for reproducibility.

| Column | Type | Nullable | Description |
|---|---|---|---|
| `cid` | `string` | Yes | ERP customer code |
| `bdate` | `date` | Yes | Birthdate |
| `gen` | `string` | Yes | Gender raw |

In [None]:
# Land the ERP customer (AZ12) CSV as bronze Parquet and expose it as a temp view.
table = "erp_cust_az12"
inp = f"{DATA_BASE}/CUST_AZ12.csv"
out = f"{GCS_BASE}/bronze/{table}"
view = f"bronze_{table}"  # derived from `table` so the two cannot drift apart

# Read the CSV with the schema declared for this table, then overwrite any
# previous bronze output at the target path.
df = spark.read.csv(inp, header=True, schema=struct_schema(schemas[table]))
df.write.mode("overwrite").parquet(out)

# Read the written dataset back once and reuse that DataFrame for both the
# view registration and the row count.
bronze_df = spark.read.parquet(out)
bronze_df.createOrReplaceTempView(view)
print("Wrote Parquet:", out, "| Rows:", bronze_df.count())
spark.sql(f"SELECT * FROM {view} LIMIT 10").show(truncate=False)

### Dataset: `erp_loc_a101`

**Intent:** Raw, minimally-typed landing for reproducibility.

| Column | Type | Nullable | Description |
|---|---|---|---|
| `cid` | `string` | Yes | ERP customer code |
| `cntry` | `string` | Yes | Country code/name |

In [None]:
# Land the ERP location (A101) CSV as bronze Parquet and expose it as a temp view.
table = "erp_loc_a101"
inp = f"{DATA_BASE}/LOC_A101.csv"
out = f"{GCS_BASE}/bronze/{table}"
view = f"bronze_{table}"  # derived from `table` so the two cannot drift apart

# Read the CSV with the schema declared for this table, then overwrite any
# previous bronze output at the target path.
df = spark.read.csv(inp, header=True, schema=struct_schema(schemas[table]))
df.write.mode("overwrite").parquet(out)

# Read the written dataset back once and reuse that DataFrame for both the
# view registration and the row count.
bronze_df = spark.read.parquet(out)
bronze_df.createOrReplaceTempView(view)
print("Wrote Parquet:", out, "| Rows:", bronze_df.count())
spark.sql(f"SELECT * FROM {view} LIMIT 10").show(truncate=False)

### Dataset: `erp_px_cat_g1v2`

**Intent:** Raw, minimally-typed landing for reproducibility.

| Column | Type | Nullable | Description |
|---|---|---|---|
| `id` | `string` | Yes | Category id |
| `cat` | `string` | Yes | Category |
| `subcat` | `string` | Yes | Subcategory |
| `maintenance` | `string` | Yes | Maintenance attribute |

In [None]:
# Land the ERP product-category (G1V2) CSV as bronze Parquet and expose it as a temp view.
table = "erp_px_cat_g1v2"
inp = f"{DATA_BASE}/PX_CAT_G1V2.csv"
out = f"{GCS_BASE}/bronze/{table}"
view = f"bronze_{table}"  # derived from `table` so the two cannot drift apart

# Read the CSV with the schema declared for this table, then overwrite any
# previous bronze output at the target path.
df = spark.read.csv(inp, header=True, schema=struct_schema(schemas[table]))
df.write.mode("overwrite").parquet(out)

# Read the written dataset back once and reuse that DataFrame for both the
# view registration and the row count.
bronze_df = spark.read.parquet(out)
bronze_df.createOrReplaceTempView(view)
print("Wrote Parquet:", out, "| Rows:", bronze_df.count())
spark.sql(f"SELECT * FROM {view} LIMIT 10").show(truncate=False)

## Summary
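- Bronze Parquet datasets are written under `GCS_BASE/bronze/<table>`, and a temporary view named `bronze_<table>` is registered over each one. Temporary views exist only for the current Spark session, so they must be re-registered from the Parquet paths in any new session.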