# Retail data — Spark SQL only
This notebook loads headerless CSVs from `/content`, registers temp views, and demonstrates common analytics using **pure SQL**.

## Setup

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType
# Directory that holds the headerless CSV files read later in the notebook.
INPUT_ROOT = "/content"

# Single Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Retail-Data-PySpark-SQL")
    .getOrCreate()
)

## Define schemas (match the column order in the CSV files)

In [None]:
# Columns of orders.csv, listed in file order; every column is nullable.
orders_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("order_date", TimestampType(), True)
    .add("order_customer_id", IntegerType(), True)
    .add("order_status", StringType(), True)
)

# Columns of order_items.csv, listed in file order; every column is nullable.
order_items_schema = (
    StructType()
    .add("order_item_id", IntegerType(), True)
    .add("order_item_order_id", IntegerType(), True)
    .add("order_item_product_id", IntegerType(), True)
    .add("order_item_quantity", IntegerType(), True)
    .add("order_item_subtotal", DoubleType(), True)
    .add("order_item_product_price", DoubleType(), True)
)

# Columns of customers.csv, listed in file order; every column is nullable.
# The zipcode is declared as a string rather than a number.
customers_schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("customer_fname", StringType(), True)
    .add("customer_lname", StringType(), True)
    .add("customer_email", StringType(), True)
    .add("customer_password", StringType(), True)
    .add("customer_street", StringType(), True)
    .add("customer_city", StringType(), True)
    .add("customer_state", StringType(), True)
    .add("customer_zipcode", StringType(), True)
)

# Columns of categories.csv, listed in file order; every column is nullable.
categories_schema = (
    StructType()
    .add("category_id", IntegerType(), True)
    .add("category_department_id", IntegerType(), True)
    .add("category_name", StringType(), True)
)

# Columns of products.csv, listed in file order; every column is nullable.
products_schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("product_category_id", IntegerType(), True)
    .add("product_name", StringType(), True)
    .add("product_description", StringType(), True)
    .add("product_price", DoubleType(), True)
    .add("product_image", StringType(), True)
)

# Columns of departments.csv, listed in file order; every column is nullable.
departments_schema = (
    StructType()
    .add("department_id", IntegerType(), True)
    .add("department_name", StringType(), True)
)

## Load CSVs (header=False) and register temp views

In [None]:
# Read each headerless CSV with its explicit schema, so column names and types
# come from the StructType definitions rather than from the files.
orders = spark.read.csv(f"{INPUT_ROOT}/orders.csv", schema=orders_schema, header=False)
order_items = spark.read.csv(f"{INPUT_ROOT}/order_items.csv", schema=order_items_schema, header=False)
customers = spark.read.csv(f"{INPUT_ROOT}/customers.csv", schema=customers_schema, header=False)
categories = spark.read.csv(f"{INPUT_ROOT}/categories.csv", schema=categories_schema, header=False)
products = spark.read.csv(f"{INPUT_ROOT}/products.csv", schema=products_schema, header=False)
departments = spark.read.csv(f"{INPUT_ROOT}/departments.csv", schema=departments_schema, header=False)

# Register every DataFrame as a temp view named after its Python variable so
# the SQL cells can reference it by that name.
for view_name, frame in (
    ("orders", orders),
    ("order_items", order_items),
    ("customers", customers),
    ("categories", categories),
    ("products", products),
    ("departments", departments),
):
    frame.createOrReplaceTempView(view_name)

## SQL — count of orders

In [None]:
# Count all rows in the orders view.
order_count_query = """
SELECT COUNT(*) AS orders_count
FROM orders
"""
spark.sql(order_count_query).show(truncate=False)

## SQL — distinct order status

In [None]:
# List each distinct order status once, sorted alphabetically.
distinct_status_query = """
SELECT DISTINCT order_status
FROM orders
ORDER BY order_status
"""
spark.sql(distinct_status_query).show(truncate=False)

## SQL — derive `line_amount` on order_items

In [None]:
# Derive a per-line amount (quantity x unit price) and preview five rows.
line_amount_query = """
SELECT
  order_item_id,
  order_item_order_id,
  order_item_quantity * order_item_product_price AS line_amount
FROM order_items
LIMIT 5
"""
spark.sql(line_amount_query).show(truncate=False)

## SQL — total revenue per order (top 10)

In [None]:
# Top 10 orders by total revenue, where revenue is the sum of
# quantity x unit price over the order's line items, rounded to 2 decimals.
# The secondary sort key (order_id) breaks ties between orders with equal
# revenue; without it, which tied orders survive LIMIT 10 (and their relative
# order) can differ from run to run.
spark.sql("""
SELECT
  order_item_order_id AS order_id,
  ROUND(SUM(order_item_quantity * order_item_product_price), 2) AS order_revenue
FROM order_items
GROUP BY order_item_order_id
ORDER BY order_revenue DESC, order_id
LIMIT 10
""").show(truncate=False)

## SQL — concatenate full name

In [None]:
# Build a full name from first and last name, separated by a single space,
# and preview five customers.
full_name_query = """
SELECT
  customer_id,
  CONCAT_WS(' ', customer_fname, customer_lname) AS full_name
FROM customers
LIMIT 5
"""
spark.sql(full_name_query).show(truncate=False)

## SQL — filter orders where `order_status = 'COMPLETE'`

In [None]:
# Preview five orders whose status is exactly 'COMPLETE'.
complete_orders_query = """
SELECT order_id, order_status
FROM orders
WHERE order_status = 'COMPLETE'
LIMIT 5
"""
spark.sql(complete_orders_query).show(truncate=False)

## SQL — revenue for a specific order (order_id = 2)

In [None]:
# Total revenue (quantity x unit price, rounded to 2 decimals) across the
# line items that belong to order 2.
order_2_revenue_query = """
SELECT ROUND(SUM(order_item_quantity * order_item_product_price), 2) AS revenue_for_order_2
FROM order_items
WHERE order_item_order_id = 2
"""
spark.sql(order_2_revenue_query).show(truncate=False)

## SQL — revenue by category (order_items → products → categories)

In [None]:
# Top 10 categories by revenue. Each line item is joined to its product and
# the product to its category (inner joins, so items without a matching
# product or category are dropped), then quantity x unit price is summed per
# category and rounded to 2 decimals.
# c.category_id is a secondary sort key so that categories with equal revenue
# come back in a stable order and the LIMIT 10 cutoff is deterministic.
spark.sql("""
SELECT
  c.category_id,
  c.category_name,
  ROUND(SUM(oi.order_item_quantity * oi.order_item_product_price), 2) AS revenue
FROM order_items oi
JOIN products p
  ON oi.order_item_product_id = p.product_id
JOIN categories c
  ON p.product_category_id = c.category_id
GROUP BY c.category_id, c.category_name
ORDER BY revenue DESC, c.category_id
LIMIT 10
""").show(truncate=False)

## SQL — top products by revenue

In [None]:
# Top 10 products by revenue. Each line item is joined to its product (inner
# join), then quantity x unit price is summed per product and rounded to
# 2 decimals.
# p.product_id is a secondary sort key so that products with equal revenue
# come back in a stable order and the LIMIT 10 cutoff is deterministic.
spark.sql("""
SELECT
  p.product_id,
  p.product_name,
  ROUND(SUM(oi.order_item_quantity * oi.order_item_product_price), 2) AS revenue
FROM order_items oi
JOIN products p
  ON oi.order_item_product_id = p.product_id
GROUP BY p.product_id, p.product_name
ORDER BY revenue DESC, p.product_id
LIMIT 10
""").show(truncate=False)