In [1]:
# Diagnostic: show which SparkContext (if any) is currently registered as
# active in this kernel before a session is built.
from pyspark import SparkContext

active_context = SparkContext._active_spark_context
print(active_context)

None


In [2]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session and register the PostgreSQL and
# ClickHouse JDBC driver jars.
spark = (
    SparkSession.builder
    .appName("ETL Star Schema Reports to ClickHouse")
    .config("spark.jars", "postgresql-42.6.0.jar,clickhouse-jdbc-0.4.6.jar")
    .getOrCreate()
)

# Connection settings are taken from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks are the values
# this notebook used previously, so it keeps working unchanged when none of
# the variables are set.
#   PG_JDBC_URL / PG_USER / PG_PASSWORD  -> PostgreSQL source
#   CH_JDBC_URL / CH_USER / CH_PASSWORD  -> ClickHouse target

# PostgreSQL
pg_url = os.environ.get("PG_JDBC_URL", "jdbc:postgresql://postgres:5432/spark_db")
pg_props = {
    "user": os.environ.get("PG_USER", "spark_user"),
    "password": os.environ.get("PG_PASSWORD", "spark_password"),
    "driver": "org.postgresql.Driver",
}

# ClickHouse
ch_url = os.environ.get("CH_JDBC_URL", "jdbc:clickhouse://clickhouse:8123/default")
ch_props = {
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
    "user": os.environ.get("CH_USER", "custom_user"),
    "password": os.environ.get("CH_PASSWORD", "custom_password"),
}

print("✔️ SparkSession и JDBC настроены")


✔️ SparkSession и JDBC настроены


In [3]:
from pyspark.sql import functions as F


def _read_pg_table(table_name):
    """Return a DataFrame for PostgreSQL table `table_name`, read over JDBC
    with the shared `pg_url` / `pg_props` connection settings."""
    return spark.read.jdbc(url=pg_url, table=table_name, properties=pg_props)


# Fact table of the star schema.
fact_sales = _read_pg_table("fact_sales")

# Dimension tables.
dim_products = _read_pg_table("dim_products")
dim_customers = _read_pg_table("dim_customers")
dim_dates = _read_pg_table("dim_dates")
dim_stores = _read_pg_table("dim_stores")
dim_suppliers = _read_pg_table("dim_suppliers")
dim_countries = _read_pg_table("dim_countries")
dim_cities = _read_pg_table("dim_cities")

print("✔️ Данные из PostgreSQL загружены")


✔️ Данные из PostgreSQL загружены


In [14]:
# Cell 3a: sales_by_product — base data mart
from pyspark.sql.window import Window

# Sale rows joined (inner) to their product, trimmed to the columns the three
# product reports need.
prod_metrics = (
    fact_sales
      .join(dim_products, "product_id")
      .select("product_id", "product_name", "category_id", "quantity", "total_price", "rating", "reviews")
)

# 1) Top-10 best-selling products by units sold.
#    product_id is a secondary sort key so that ties on units_sold always
#    resolve the same way and the written table is reproducible.
top10_products = (
    prod_metrics
      .groupBy("product_id", "product_name")
      .agg(F.sum("quantity").alias("units_sold"))
      .orderBy(F.col("units_sold").desc(), F.col("product_id").asc())
      .limit(10)
)

# 2) Total revenue per product category.
revenue_by_category = (
    prod_metrics
      .groupBy("category_id")
      .agg(F.sum("total_price").alias("revenue"))
)

# 3) Average rating and number of reviews per product.
# NOTE(review): both aggregates run over sale rows. If rating/reviews are
# product-level attributes coming from dim_products, total_reviews is
# multiplied by the number of sales of each product — confirm which table
# owns these columns.
rating_reviews = (
    prod_metrics
      .groupBy("product_id", "product_name")
      .agg(
         F.avg("rating").alias("avg_rating"),
         F.sum("reviews").alias("total_reviews")
      )
)

# Write all three reports to ClickHouse.
for tbl, df, order_cols in [
    ("top10_products",      top10_products,      ["units_sold"]),
    ("revenue_by_category", revenue_by_category, ["category_id"]),
    ("rating_reviews",      rating_reviews,      ["product_id"])
]:
    # Cache so the write and the row-count report share one computation;
    # without it df.count() re-reads PostgreSQL and re-runs the whole
    # join/aggregation a second time.
    df.cache()
    try:
        (
            df.write.format("jdbc")
              .option("url", ch_url)
              .option("dbtable", tbl)
              .options(**ch_props)
              .option("createTableOptions", f"ENGINE = MergeTree() ORDER BY ({','.join(order_cols)})")
              .mode("overwrite")
              .save()
        )
        print(f"✔️ {tbl}: {df.count()} строк")
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()


✔️ top10_products: 3 строк
✔️ revenue_by_category: 2 строк
✔️ rating_reviews: 3 строк


In [15]:
# Cell 4a: sales_by_customer — all metrics
# 1) Top-10 customers by total amount spent.
#    customer_id is a secondary sort key so that ties on total_spent always
#    resolve the same way and the written table is reproducible.
top10_customers = (
    fact_sales
      .join(dim_customers, "customer_id")
      .groupBy("customer_id", "first_name", "last_name")
      .agg(F.sum("total_price").alias("total_spent"))
      .orderBy(F.col("total_spent").desc(), F.col("customer_id").asc())
      .limit(10)
)

# 2) Distribution of customers by country.
#    Only customers that appear in fact_sales are counted. The left join keeps
#    customers whose country_id has no match; they end up in the group with a
#    null country_name.
customers_by_country = (
    fact_sales
      .join(dim_customers, "customer_id")
      .join(dim_countries, ["country_id"], "left")
      .groupBy("country_name")
      .agg(F.countDistinct("customer_id").alias("unique_customers"))
)

# 3) Average check per customer: total revenue divided by the number of sale rows.
avg_check_per_customer = (
    fact_sales
      .groupBy("customer_id")
      .agg((F.sum("total_price") / F.count("*")).alias("avg_check"))
)

# Write all three reports to ClickHouse.
for tbl, df, order_cols in [
    ("top10_customers",        top10_customers,        ["total_spent"]),
    ("customers_by_country",   customers_by_country,   ["country_name"]),
    ("avg_check_per_customer", avg_check_per_customer, ["customer_id"])
]:
    # Cache so the write and the row-count report share one computation;
    # without it df.count() re-reads PostgreSQL and re-runs the whole
    # join/aggregation a second time.
    df.cache()
    try:
        (
            df.write.format("jdbc")
              .option("url", ch_url)
              .option("dbtable", tbl)
              .options(**ch_props)
              .option("createTableOptions", f"ENGINE = MergeTree() ORDER BY ({','.join(order_cols)})")
              .mode("overwrite")
              .save()
        )
        print(f"✔️ {tbl}: {df.count()} строк")
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()


✔️ top10_customers: 10 строк
✔️ customers_by_country: 204 строк
✔️ avg_check_per_customer: 10000 строк


In [16]:
# Cell 5a: sales_by_time — all metrics

# Sale rows tagged with the calendar year and month of their date.
# Built once and shared by the monthly reports below, which previously each
# repeated this join and the year/month derivation.
sales_by_period = (
    fact_sales
      .join(dim_dates.withColumnRenamed("full_date", "sale_date"), "date_id")
      .withColumn("year",  F.year("sale_date"))
      .withColumn("month", F.month("sale_date"))
)

# 1) Monthly sales trend (revenue per year/month).
monthly_trends = (
    sales_by_period
      .groupBy("year", "month")
      .agg(F.sum("total_price").alias("revenue"))
)

# 2) Revenue comparison across periods (year over year), rolled up from the
#    monthly figures.
yearly_revenue = (
    monthly_trends
      .groupBy("year")
      .agg(F.sum("revenue").alias("yearly_revenue"))
)

# 3) Average order size per month: revenue divided by the number of sale rows.
avg_order_size_by_month = (
    sales_by_period
      .groupBy("year", "month")
      .agg((F.sum("total_price") / F.count("*")).alias("avg_order_size"))
)

# Write all three reports to ClickHouse.
for tbl, df, order_cols in [
    ("monthly_trends",          monthly_trends,          ["year", "month"]),
    ("yearly_revenue",          yearly_revenue,          ["year"]),
    ("avg_order_size_by_month", avg_order_size_by_month, ["year", "month"])
]:
    # Cache so the write and the row-count report share one computation;
    # without it df.count() re-reads PostgreSQL and re-runs the whole
    # join/aggregation a second time.
    df.cache()
    try:
        (
            df.write.format("jdbc")
              .option("url", ch_url)
              .option("dbtable", tbl)
              .options(**ch_props)
              .option("createTableOptions", f"ENGINE = MergeTree() ORDER BY ({','.join(order_cols)})")
              .mode("overwrite")
              .save()
        )
        print(f"✔️ {tbl}: {df.count()} строк")
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()


✔️ monthly_trends: 12 строк
✔️ yearly_revenue: 1 строк
✔️ avg_order_size_by_month: 12 строк


In [17]:
# Cell 6a: sales_by_store — all metrics
# 1) Top-5 stores by revenue.
#    store_id is a secondary sort key so that ties on revenue always resolve
#    the same way and the written table is reproducible.
top5_stores = (
    fact_sales
      .join(dim_stores, "store_id")
      .groupBy("store_id", "store_name")
      .agg(F.sum("total_price").alias("revenue"))
      .orderBy(F.col("revenue").desc(), F.col("store_id").asc())
      .limit(5)
)

# 2) Distribution of sales by store city and country.
#    The left joins keep sales whose store has no matching city/country; those
#    rows are grouped under null city_name / country_name.
sales_by_city_country = (
    fact_sales
      .join(dim_stores, "store_id")
      .join(dim_cities,    ["city_id"],    "left")
      .join(dim_countries, ["country_id"], "left")
      .groupBy("city_name", "country_name")
      .agg(F.sum("total_price").alias("revenue"))
)

# 3) Average check per store: revenue divided by the number of sale rows.
avg_check_per_store = (
    fact_sales
      .groupBy("store_id")
      .agg((F.sum("total_price") / F.count("*")).alias("avg_check"))
)

# Write all three reports to ClickHouse.
for tbl, df, order_cols in [
    ("top5_stores",           top5_stores,           ["revenue"]),
    ("sales_by_city_country", sales_by_city_country, ["city_name"]),
    ("avg_check_per_store",   avg_check_per_store,   ["store_id"])
]:
    # Cache so the write and the row-count report share one computation;
    # without it df.count() re-reads PostgreSQL and re-runs the whole
    # join/aggregation a second time.
    df.cache()
    try:
        (
            df.write.format("jdbc")
              .option("url", ch_url)
              .option("dbtable", tbl)
              .options(**ch_props)
              .option("createTableOptions", f"ENGINE = MergeTree() ORDER BY ({','.join(order_cols)})")
              .mode("overwrite")
              .save()
        )
        print(f"✔️ {tbl}: {df.count()} строк")
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()


✔️ top5_stores: 5 строк
✔️ sales_by_city_country: 383 строк
✔️ avg_check_per_store: 383 строк


In [18]:
# Cell 7a: sales_by_supplier — all metrics

# Sale rows tagged with the supplier of the sold product.
sales_with_supp = fact_sales.join(dim_products.select("product_id", "supplier_id"), "product_id")

# 1) Top-5 suppliers by revenue.
#    supplier_id is a secondary sort key so that ties on revenue always
#    resolve the same way and the written table is reproducible.
top5_suppliers = (
    sales_with_supp
      .groupBy("supplier_id")
      .agg(F.sum("total_price").alias("revenue"))
      .orderBy(F.col("revenue").desc(), F.col("supplier_id").asc())
      .limit(5)
)

# 2) Average price of goods per supplier, computed as revenue divided by
#    units sold (i.e. weighted by sold quantity).
avg_price_by_supplier = (
    sales_with_supp
      .groupBy("supplier_id")
      .agg((F.sum("total_price") / F.sum("quantity")).alias("avg_price"))
)

# 3) Distribution of sales by supplier country.
#    The left join keeps suppliers whose country_id has no match; their sales
#    are grouped under a null country_name.
sales_by_supplier_country = (
    sales_with_supp
      .join(dim_suppliers, "supplier_id")
      .join(dim_countries, ["country_id"], "left")
      .groupBy("country_name")
      .agg(F.sum("total_price").alias("revenue"))
)

# Write all three reports to ClickHouse.
for tbl, df, order_cols in [
    ("top5_suppliers",            top5_suppliers,            ["revenue"]),
    ("avg_price_by_supplier",     avg_price_by_supplier,     ["supplier_id"]),
    ("sales_by_supplier_country", sales_by_supplier_country, ["country_name"])
]:
    # Cache so the write and the row-count report share one computation;
    # without it df.count() re-reads PostgreSQL and re-runs the whole
    # join/aggregation a second time.
    df.cache()
    try:
        (
            df.write.format("jdbc")
              .option("url", ch_url)
              .option("dbtable", tbl)
              .options(**ch_props)
              .option("createTableOptions", f"ENGINE = MergeTree() ORDER BY ({','.join(order_cols)})")
              .mode("overwrite")
              .save()
        )
        print(f"✔️ {tbl}: {df.count()} строк")
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()


✔️ top5_suppliers: 3 строк
✔️ avg_price_by_supplier: 3 строк
✔️ sales_by_supplier_country: 3 строк


In [20]:
# Cell 8a (fixed): product_quality
# These two imports are no longer used in this cell; they are retained in case
# other cells rely on the names being defined here.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Sale rows joined (inner) to their product, trimmed to the columns the
# quality reports need.
quality = (
    fact_sales
      .join(dim_products, "product_id")
      .select("product_id", "product_name", "rating", "reviews", "quantity")
)

# Single row with the highest / lowest rating.
# A sort + limit(1) replaces the earlier row_number() over an un-partitioned
# window, which forced every joined sale row into one partition just to pick
# one record. product_id and quantity act as tie-breakers so the chosen row is
# reproducible when several rows share the same rating.
highest_rating = (
    quality
      .orderBy(F.col("rating").desc(), F.col("product_id").asc(), F.col("quantity").desc())
      .limit(1)
)
lowest_rating = (
    quality
      .orderBy(F.col("rating").asc(), F.col("product_id").asc(), F.col("quantity").desc())
      .limit(1)
)

# Pearson correlation between rating and quantity.
# NOTE(review): this is computed over individual sale rows (per-row quantity),
# while the message printed below labels it "units_sold" — confirm whether a
# per-product aggregate was intended.
corr_val = quality.stat.corr("rating", "quantity")

# Top-10 products by summed reviews.
# NOTE(review): the sum runs over sale rows; if reviews is a product-level
# attribute from dim_products, the total is multiplied by the number of sales
# of each product — confirm which table owns the column.
# product_id breaks ties on total_reviews so the result is reproducible.
most_reviewed = (
    quality
      .groupBy("product_id", "product_name")
      .agg(F.sum("reviews").alias("total_reviews"))
      .orderBy(F.col("total_reviews").desc(), F.col("product_id").asc())
      .limit(10)
)

# Write all three reports to ClickHouse.
for tbl, df, order_cols in [
    ("highest_rating", highest_rating, ["rating"]),
    ("lowest_rating",  lowest_rating,  ["rating"]),
    ("most_reviewed",  most_reviewed,  ["total_reviews"])
]:
    # Cache so the write and the row-count report share one computation;
    # without it df.count() re-reads PostgreSQL and re-runs the whole
    # join/sort a second time.
    df.cache()
    try:
        (
            df.write.format("jdbc")
              .option("url", ch_url)
              .option("dbtable", tbl)
              .options(**ch_props)
              .option("createTableOptions", f"ENGINE = MergeTree() ORDER BY ({','.join(order_cols)})")
              .mode("overwrite")
              .save()
        )
        print(f"✔️ {tbl}: {df.count()} строк")
    finally:
        # Release the cached blocks whether or not the write succeeded.
        df.unpersist()

print(f"🧮 Корреляция rating↔units_sold = {corr_val:.4f}")


✔️ highest_rating: 1 строк
✔️ lowest_rating: 1 строк
✔️ most_reviewed: 3 строк
🧮 Корреляция rating↔units_sold = 0.0076
