<a href="https://colab.research.google.com/github/nptan2005/spark401_colab/blob/main/notebooks/lab3_test_join.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install -y openjdk-17-jdk

!wget https://archive.apache.org/dist/spark/spark-4.0.1/spark-4.0.1-bin-hadoop3.tgz
!tar xf spark-4.0.1-bin-hadoop3.tgz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  at-spi2-core fonts-dejavu-core fonts-dejavu-extra gsettings-desktop-schemas
  libatk-bridge2.0-0 libatk-wrapper-java libatk-wrapper-java-jni libatk1.0-0
  libatk1.0-data libatspi2.0-0 libgail-common libgail18 libgtk2.0-0
  libgtk2.0-bin libgtk2.0-common librsvg2-common libxcomposite1 libxt-dev
  libxtst6 libxxf86dga1 openjdk-17-jre session-migration x11-utils
Suggested packages:
  gvfs libxt-doc openjdk-17-demo openjdk-17-source visualvm mesa-utils
The following NEW packages will be installed:
  at-spi2-core fonts-dejavu-core fonts-dejavu-extra gsettings-desktop-schemas
  libatk-bridge2.0-0 libatk-wrapper-java libatk-wrapper-java-jni libatk1.0-0
  libatk1.0-data libatspi2.0-0 libgail-common libgail18 libgtk2.0-0
  libgtk2.0-bin libgtk2.0-common librsvg2-common libxcomposite1 libxt-dev
  libxtst6 libxxf86dga1 openjdk-17-jdk openjdk-17-jr

In [2]:
# ===============================
# Spark 4.0.1 Setup (REQUIRED)
# ===============================
import os

# Locations of the JDK and of the unpacked Spark distribution.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-4.0.1-bin-hadoop3"

# Add Spark's bin directory to PATH only if it is not there yet, so that
# re-running this cell does not keep appending the same entry.
spark_bin = os.path.join(os.environ["SPARK_HOME"], "bin")
if spark_bin not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + spark_bin

from pyspark.sql import SparkSession

# Local session using all cores; shuffle partitions are set to 4 here.
spark = (
    SparkSession.builder
    .appName("Spark401-Training")
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

print("Spark version:", spark.version)

Spark version: 4.0.1


In [52]:
# Prepare data
from __future__ import annotations
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, concat, date_add, date_sub, element_at, expr, floor,
    format_string, lit, pmod, rand, to_date, when,
)

# --------------------
# CONFIG
# --------------------

# Root directory for every dataset written by this notebook.
BASE = "/content/sliver"

# Dimension / entity outputs.
CUSTOMER_PATH = BASE + "/customers"
ORDER_PATH = BASE + "/orders"
PRODUCT_PATH = BASE + "/products"

# Order outputs: the raw dump and the fact table keyed by dt.
ORDERS_RAW_PATH = BASE + "/orders_raw"
ORDERS_FACT_PATH = BASE + "/orders_fact_dt"

# Row counts for the synthetic data.
N_CUSTOMER = 50_000       # fifty thousand customers
N_ORDER = 20_000_000      # twenty million orders


def build_spark(app:str) -> SparkSession:
    """Configure a SparkSession builder for this notebook and call getOrCreate().

    Sets ``spark.sql.shuffle.partitions`` to "100", enables adaptive query
    execution, and applies ``app`` as the application name.

    Args:
        app: Application name passed to ``appName``.

    Returns:
        The session returned by ``getOrCreate()``.
    """
    session_builder = SparkSession.builder
    session_builder = session_builder.config("spark.sql.shuffle.partitions", "100")
    session_builder = session_builder.config("spark.sql.adaptive.enabled", "true")
    session_builder = session_builder.appName(app)
    return session_builder.getOrCreate()

spark = build_spark("sliver")

In [33]:
def generate_vn_customers(spark: SparkSession, n_rows: int):
    """Build a synthetic Vietnamese customer dimension with ``n_rows`` rows.

    Rows come from ``spark.range(0, n_rows)``; every attribute is derived either
    from the row ``id`` or from a seeded ``rand()`` column.

    Args:
        spark: Session used to create the base range.
        n_rows: Number of customers to generate; ``customer_id`` runs 1..n_rows.

    Returns:
        A DataFrame with the columns customer_id, cif_number, id_number,
        birth_date, segment, created_date, full_name, phone, age, age_segment,
        city, district, address_line, job, customer_segment, customer_score,
        customer_score_desc and risk_tier. Helper columns are dropped at the end.
    """
    return (
        spark.range(0, n_rows)
        # Stage 1: base columns, plus helper columns reused by the later stages.
        .select(
            (col("id") + 1).alias("customer_id"),
            # CIF: 8-digit number, starting at 10000001 for id 0.
            (col("id") + 10000001).cast("string").alias("cif_number"),
            # National ID (CMND/CCCD): 12 digits, "001" + zero-padded row id.
            format_string("001%09d", col("id")).alias("id_number"),
            # Birth date: 2026-01-01 minus a random offset of 18..60 years in days.
            date_add(
                to_date(lit("2026-01-01")),
                (-floor(rand(42) * (60-18) * 365 + (18 * 365))).cast("int")
            ).alias("birth_date"),
            # Phone prefix picked from five carrier prefixes.
            element_at(
                expr("array('090', '091', '098', '035', '086')"),
                (pmod(floor(rand(10) * 100), 5) + 1).cast("int")
            ).alias("prefix"),
            (pmod(floor(rand(55) * 10), 3)).alias("city_idx"),
            # Segment cycles deterministically with the row id.
            element_at(
                expr("array('MASS','AFFLUENT','SME')"),
                (pmod(col("id"), 3) + 1).cast("int")
            ).alias("segment"),
            # Creation date: 0..364 days before 2026-01-11, cycling with the row id.
            date_sub(expr("date('2026-01-11')"), pmod(col("id"), 365).cast("int")).alias("created_date"),
            # Base random draws used for name, job and address below.
            # NOTE(review): r_name uses seed 42 like birth_date, and r_job uses
            # seed 123 like phone, so those pairs are presumably correlated --
            # confirm whether that is intended.
            rand(42).alias("r_name"),
            rand(123).alias("r_job"),
            rand(7).alias("r_addr")
        )

        # Name parts, picked from sample arrays via r_name.
        .withColumn("first_name", element_at(
            expr("array('Nguyễn', 'Trần', 'Lê', 'Phạm', 'Hoàng', 'Phan', 'Vũ', 'Đặng')"),
            (pmod(floor(col("r_name") * 100), 8) + 1).cast("int")
        ))
        .withColumn("last_name", element_at(
            expr("array('Anh', 'Bình', 'Chi', 'Dũng', 'Giang', 'Hương', 'Khánh', 'Linh')"),
            (pmod(floor(col("r_name") * 1000), 8) + 1).cast("int")
        ))
        .withColumn("full_name", concat(col("first_name"), lit(" "), col("last_name")))
        # Phone: prefix followed by a zero-padded 7-digit random number.
        .withColumn("phone",
            format_string("%s%07d", col("prefix"), floor(rand(123) * 10000000).cast("int"))
        )
        # Age (difference in calendar years to 2026) and its generation label.
        .withColumn("age", expr("year(cast('2026-01-01' as date)) - year(birth_date)"))
        .withColumn("age_segment",
            when(col("age") < 25, "Gen Z")
            .when(col("age").between(25, 40), "Millennials")
            .when(col("age").between(41, 55), "Gen X")
            .otherwise("Boomers")
        )
        .withColumn("city",
            when(col("city_idx") == 0, "Hà Nội")
            .when(col("city_idx") == 1, "TP.HCM")
            .otherwise("Đà Nẵng")
        )
        # District within the chosen city (indexed by customer_id, not id).
        .withColumn("district",
            when(col("city") == "Hà Nội",
                 element_at(expr("array('Hoàn Kiếm', 'Ba Đình', 'Cầu Giấy', 'Đống Đa')"),
                            (pmod(col("customer_id"), 4) + 1).cast("int")))
            .when(col("city") == "TP.HCM",
                 element_at(expr("array('Quận 1', 'Quận 3', 'Quận 7', 'Thủ Đức')"),
                            (pmod(col("customer_id"), 4) + 1).cast("int")))
            .otherwise(
                 element_at(expr("array('Hải Châu', 'Thanh Khê', 'Sơn Trà')"),
                            (pmod(col("customer_id"), 3) + 1).cast("int")))
        )
        # Detailed address line: house number plus street name, both from r_addr.
        .withColumn("address_line", concat(
            lit("Số "), floor(col("r_addr") * 500),
            lit(", Đường "),
            element_at(expr("array('Lê Lợi', 'Nguyễn Huệ', 'Cách Mạng Tháng 8', 'Pasteur')"),
                       (pmod(floor(col("r_addr") * 10), 4) + 1).cast("int"))
        ))
        .withColumn("job", element_at(
            expr("array('Kỹ sư', 'Bác sĩ', 'Giáo viên', 'Kinh doanh', 'IT', 'Nội trợ', 'Tự do')"),
            (pmod(floor(col("r_job") * 100), 7) + 1).cast("int")
        ))
        # Customer tier from a random "wealth" draw:
        # top 5% DIAMOND, next 15% GOLD, next 30% SILVER, remaining 50% MASS.
        .withColumn("wealth_segment", rand(99))
        .withColumn("customer_segment",
            when(col("wealth_segment") > 0.95, "DIAMOND")
            .when(col("wealth_segment") > 0.80, "GOLD")
            .when(col("wealth_segment") > 0.50, "SILVER")
            .otherwise("MASS")
        )
        .withColumn("wealth_score", rand(13))
        # customer_score is an integer in 0..99.
        .withColumn("customer_score",
          (col("wealth_score") * 100).cast("int")
        )
        # Bands are expressed on the 0..99 scale of customer_score. The earlier
        # cut-offs (6000, 5000, ...) could never be reached, so every row was
        # labelled "VERY BAD".
        .withColumn("customer_score_desc",
             when(col("customer_score") > 60, "VERY GOOD")
             .when(col("customer_score") > 50, "GOOD")
             .when(col("customer_score") > 40, "MEDIUM")
             .when(col("customer_score") > 30, "NOT BAD")
             .when(col("customer_score") > 20, "BAD")
             .otherwise("VERY BAD")
        )
        .withColumn("risk_tier",element_at(
            expr("array('LOW','MED','HIGH')"),
            (pmod(floor(col("r_job") * 100), 3) + 1).cast("int")
          )
        )
        # Remove the helper columns that only existed to derive the ones above.
        .drop("city_idx", "prefix","r_name", "r_job", "r_addr", "wealth_segment", "wealth_score","first_name", "last_name")
    )

# Thực thi
# df = generate_vn_customers(spark, 100)
# df.show(10, False)

In [43]:
def generate_orders_v2(spark: SparkSession, n_rows: int, n_customers: int = 50_000):
    """Build a synthetic orders DataFrame with a deliberately skewed customer key.

    Rows come from ``spark.range(0, n_rows)``. Rows where ``rand(7) < 0.25`` are
    assigned ``customer_id`` '1' (the skew used later for salting tests); the
    remaining rows are spread over customer ids 2..n_customers.

    Args:
        spark: Session used to create the base range.
        n_rows: Number of orders to generate.
        n_customers: Size of the customer dimension the orders refer to. The
            default of 50_000 reproduces the previously hard-coded modulus 49999.

    Returns:
        A DataFrame with the columns order_id, customer_id, amount, category,
        channel, country, status, shop, order_ts, fee, is_promo and dt.

    Raises:
        ValueError: If ``n_customers`` is smaller than 2, since ids 2..n_customers
            would then be an empty range.
    """
    if n_customers < 2:
        raise ValueError("n_customers must be >= 2")
    # Number of distinct non-skewed customer ids (2..n_customers).
    regular_ids = int(n_customers) - 1

    return (
        spark.range(0, n_rows)
        .select(
            (col("id") + 1000000).cast("string").alias("order_id"),

            # Skewed key: rows with rand(7) < 0.25 all go to customer '1'.
            expr(f"""
                CASE
                  WHEN rand(7) < 0.25 THEN '1'
                  ELSE cast(pmod(id * 17, {regular_ids}) + 2 as string)
                END
            """).alias("customer_id"),

            # Transaction amount.
            (floor(rand(11) * 1000000,0)).alias("amount"),

            # Fixed base timestamp and a 0..29 day offset; combined into order_ts below.
            expr("timestamp('2026-01-12 10:23:17')").alias("base_ts"),
            (pmod(col("id"), 30).cast("int")).alias("day_back"),

            # Product category, cycling with the row id.
            element_at(
                expr("array('Electronics', 'Fashion', 'Groceries', 'Home Decor', 'Beauty')"),
                (pmod(col('id'), 5) + 1).cast("int")
            ).alias("category"),

            element_at(expr("array('POS','ECOM','ATM')"),
                       (pmod(col('id'), 3) + 1).cast("int")).alias("channel"),

            element_at(expr("array('VN','SG','TH','ID','MY')"),
                       (pmod(col('id'), 5) + 1).cast("int")).alias("country"),

            element_at(expr("array('SUCCESS','FAILED','REVERSED')"),
                       (pmod(col('id'), 3) + 1).cast("int")).alias("status")
        )
        .withColumn("shop", element_at(
            expr("array('Shop A', 'Shop B', 'Shop C')"),
            (pmod(floor(rand(789) * 100), 3) + 1).cast("int")
        ))
        # Order timestamp: base timestamp shifted back by day_back days.
        .withColumn("order_ts", expr("base_ts - make_interval(0,0,0,day_back,0,0,0)"))
        # Transaction fee: 1.5% of the amount, rounded to 2 decimals.
        .withColumn("fee", expr("round(amount * 0.015, 2)"))
        # Random promo flag: "YES" when rand(42) > 0.7.
        .withColumn("is_promo", when(rand(42) > 0.7, "YES").otherwise("NO"))
        # Calendar date of the order.
        .withColumn("dt", to_date(col("order_ts")))
        .drop("base_ts", "day_back")
    )

# Thực thi
# spark = SparkSession.builder.getOrCreate()
# df_orders = generate_orders_v2(spark, 100)
# df_orders.show(10, False)

+--------+-----------+------+-----------+-------+-------+--------+------+-------------------+--------+--------+----------+
|order_id|customer_id|amount|category   |channel|country|status  |shop  |order_ts           |fee     |is_promo|dt        |
+--------+-----------+------+-----------+-------+-------+--------+------+-------------------+--------+--------+----------+
|1000000 |2          |34226 |Electronics|POS    |VN     |SUCCESS |Shop B|2026-01-12 10:23:17|513.39  |NO      |2026-01-12|
|1000001 |19         |116822|Fashion    |ECOM   |SG     |FAILED  |Shop A|2026-01-11 10:23:17|1752.33 |NO      |2026-01-11|
|1000002 |36         |325222|Groceries  |ATM    |TH     |REVERSED|Shop C|2026-01-10 10:23:17|4878.33 |YES     |2026-01-10|
|1000003 |53         |435885|Home Decor |POS    |ID     |SUCCESS |Shop C|2026-01-09 10:23:17|6538.28 |NO      |2026-01-09|
|1000004 |70         |278315|Beauty     |ECOM   |MY     |FAILED  |Shop C|2026-01-08 10:23:17|4174.73 |NO      |2026-01-08|
|1000005 |87    

In [None]:
# write data

spark = build_spark("sliver")

# =========================
# 1) CUSTOMERS
# =========================
df_customers = generate_vn_customers(spark, N_CUSTOMER)

df_customers.write.mode("overwrite").parquet(CUSTOMER_PATH)

# Count what was written instead of df_customers itself: counting the generator
# DataFrame would execute the whole generation plan a second time.
print("✔ customers:", spark.read.parquet(CUSTOMER_PATH).count())

# =========================
# 2) ORDERS RAW
# =========================
df_orders = generate_orders_v2(spark, N_ORDER)

df_orders.write.mode("overwrite").parquet(ORDERS_RAW_PATH)

# From here on work from the written files, so the N_ORDER-row generator plan
# is executed once instead of once per action.
orders_raw = spark.read.parquet(ORDERS_RAW_PATH)
print("✔ orders_raw:", orders_raw.count())

# =========================
# 3) ORDERS FACT PARTITIONED BY dt
#    This is the "close to real life" part: fact tables are usually
#    partitioned by day.
# =========================

dates = [row.dt for row in orders_raw.select("dt").distinct().collect()]

# repartition("dt") groups each day's rows together before the write, and
# partitionBy("dt") lays them out as dt=<date>/ directories. mode("overwrite")
# keeps the cell idempotent: the previous per-day "append" loop duplicated the
# whole fact table every time the cell was re-run.
(
    orders_raw
    .repartition("dt")
    .write.mode("overwrite")
    .partitionBy("dt")
    .parquet(ORDERS_FACT_PATH)
)
print(f"Processed {len(dates)} dates")
print("✔ orders_fact_dt written:", ORDERS_FACT_PATH)

