In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id, current_timestamp

In [2]:
spark = SparkSession.builder \
    .appName("Postgres_Star_Schema_ETL") \
    .master("spark://spark-master:7077") \
    .config("spark.jars", "/opt/spark/jars/postgresql-42.7.8.jar") \
    .getOrCreate()

props = {
    "user": "user",
    "password": "user",
    "driver": "org.postgresql.Driver"
}
url = "jdbc:postgresql://postgres:5432/lab2_db_postgres"

In [3]:
mock_data = spark.read.jdbc(url=url, table="mock_data", properties=props).cache()

In [4]:
# Размерность покупателя
customers_df = (mock_data.select(
        col("customer_first_name").alias("first_name"),
        col("customer_last_name").alias("last_name"),
        col("customer_age").alias("age"),
        col("customer_email").alias("email"),
        col("customer_country").alias("country"),
        col("customer_postal_code").alias("postal_code"),
        col("customer_pet_type").alias("pet_type"),
        col("customer_pet_name").alias("pet_name"),
        col("customer_pet_breed").alias("pet_breed"),
        col("pet_category").alias("pet_category")
    )
    .distinct()
    .withColumn("customer_id", monotonically_increasing_id())
    .withColumn("created_at", current_timestamp())
)

customers_df.write.mode("append").jdbc(url=url, table="dim_customer", properties=props)

In [5]:
# Размерность продавцов
sellers_df = (mock_data.select(
        col("seller_first_name").alias("first_name"),
        col("seller_last_name").alias("last_name"),
        col("seller_email").alias("email"),
        col("seller_country").alias("country"),
        col("seller_postal_code").alias("postal_code")
    )
    .distinct()
    .withColumn("seller_id", monotonically_increasing_id())
    .withColumn("created_at", current_timestamp())
)

sellers_df.write.mode("append").jdbc(url=url, table="dim_seller", properties=props)

In [6]:
# Размерность продуктов
products_df = (mock_data.select(
        col("product_name").alias("name"),
        col("product_category").alias("category"),
        col("product_price").alias("price"),
        col("product_weight").alias("weight"),
        col("product_color").alias("color"),
        col("product_size").alias("size"),
        col("product_brand").alias("brand"),
        col("product_material").alias("material"),
        col("product_description").alias("description"),
        col("product_rating").alias("rating"),
        col("product_reviews").alias("reviews"),
        col("product_release_date").alias("release_date"),
        col("product_expiry_date").alias("expiry_date")
    )
    .distinct()
    .withColumn("product_id", monotonically_increasing_id())
    .withColumn("created_at", current_timestamp())
)

products_df.write.mode("append").jdbc(url=url, table="dim_product", properties=props)

In [7]:
# Размерность магазинов
stores_df = (mock_data.select(
        col("store_name").alias("name"),
        col("store_location").alias("location"),
        col("store_city").alias("city"),
        col("store_state").alias("state"),
        col("store_country").alias("country"),
        col("store_phone").alias("phone"),
        col("store_email").alias("email")
    )
    .distinct()
    .withColumn("store_id", monotonically_increasing_id())
    .withColumn("created_at", current_timestamp())
)

stores_df.write.mode("append").jdbc(url=url, table="dim_store", properties=props)

In [8]:
# Размерность постващиков
suppliers_df = (mock_data.select(
        col("supplier_name").alias("name"),
        col("supplier_contact").alias("contact"),
        col("supplier_email").alias("email"),
        col("supplier_phone").alias("phone"),
        col("supplier_address").alias("address"),
        col("supplier_city").alias("city"),
        col("supplier_country").alias("country")
    )
    .distinct()
    .withColumn("supplier_id", monotonically_increasing_id())
    .withColumn("created_at", current_timestamp())
)

suppliers_df.write.mode("append").jdbc(url=url, table="dim_supplier", properties=props)

In [9]:
# Факты продаж
sales_df = (mock_data.alias("m")
    .join(
        customers_df.alias("c"),
        (col("m.customer_email") == col("c.email")) & 
        (col("m.customer_first_name") == col("c.first_name")),
        "left"
    )
    .join(
        sellers_df.alias("s"),
        col("m.seller_email") == col("s.email"),
        "left"
    )
    .join(
        products_df.alias("p"),
        (col("m.product_name") == col("p.name")) & 
        (col("m.product_category") == col("p.category")) &
        (col("m.product_price") == col("p.price")),
        "left"
    )
    .join(
        stores_df.alias("st"),
        (col("m.store_name") == col("st.name")) & 
        (col("m.store_city") == col("st.city")),
        "left"
    )
    .join(
        suppliers_df.alias("sup"),
        (col("m.supplier_name") == col("sup.name")) & 
        (col("m.supplier_email") == col("sup.email")),
        "left"
    )
    .select(
        col("m.sale_date"),
        col("c.customer_id"),
        col("s.seller_id"),
        col("p.product_id"),
        col("st.store_id"),
        col("sup.supplier_id"),
        col("m.sale_quantity").alias("quantity"),
        col("m.sale_total_price").alias("total_price")
    )
    .withColumn("sales_id", monotonically_increasing_id())
    .withColumn("created_at", current_timestamp())
    
    .select(
        "sales_id", "sale_date", "customer_id", "seller_id", 
        "product_id", "store_id", "supplier_id", 
        "quantity", "total_price", "created_at"
    )
)

sales_df.write.mode("append").jdbc(url=url, table="fact_sales", properties=props)

In [10]:
spark.stop()