## Dim_Products

In [0]:
# Publish the product dimension: copy the silver staging table to the gold layer as-is.
product_path = '/mnt/silver/SalesLT/STG_Products/'
output_path = '/mnt/gold/SalesLT/Dim_Products/'

dim_product = spark.read.format("delta").load(product_path)

# The whole staging table is read on every run, so "append" would duplicate every
# product each time this cell is re-executed. "overwrite" keeps the cell idempotent.
# NOTE(review): this assumes STG_Products is a full snapshot - confirm. If it only
# holds an incremental batch, use a Delta MERGE on the business key instead.
dim_product.write.format("delta").mode("overwrite").save(output_path)

## Dim_Customers

In [0]:
# Publish the customer dimension: copy the silver staging table to the gold layer as-is.
customers_path = '/mnt/silver/SalesLT/STG_Customers/'
output_path = '/mnt/gold/SalesLT/Dim_Customers/'

dim_customer = spark.read.format("delta").load(customers_path)

# The whole staging table is read on every run, so "append" would duplicate every
# customer each time this cell is re-executed. "overwrite" keeps the cell idempotent.
# NOTE(review): this assumes STG_Customers is a full snapshot - confirm. If it only
# holds an incremental batch, use a Delta MERGE on the business key instead.
dim_customer.write.format("delta").mode("overwrite").save(output_path)

## Fact_Sales = SalesOrderHeader + SalesOrderDetail

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, FloatType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window


# Build Fact_Sales: one row per order line, produced by joining the order header
# (order-level attributes) with the order detail (line-level attributes).
path_sales_order_detail = '/mnt/silver/SalesLT/STG_SalesOrderDetail/'
path_sales_order_header = '/mnt/silver/SalesLT/STG_SalesOrderHeader/'
output_path = '/mnt/gold/SalesLT/Fact_Sales/'

sales_order_detail_df = spark.read.format("delta").load(path_sales_order_detail)
sales_order_header_df = spark.read.format("delta").load(path_sales_order_header)

# Target schema of the fact table (without the surrogate key). It is the single
# source of truth for column order and types: the select below casts to it.
# NOTE(review): FloatType is single precision; for monetary columns a DecimalType
# would avoid rounding, but changing it alters the published table schema.
schema = StructType([
    StructField("SalesOrderID", IntegerType(), True),
    StructField("OrderDate", DateType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("ProductID", IntegerType(), True),
    StructField("OrderQty", IntegerType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("LineTotal", FloatType(), True)
])
fact_columns = [field.name for field in schema.fields]

# Which side of the join each fact column is taken from.
source_columns = {
    "SalesOrderID": sales_order_header_df["SalesOrderID"],
    "OrderDate": sales_order_header_df["OrderDate"],
    "CustomerID": sales_order_header_df["CustomerID"],
    "ProductID": sales_order_detail_df["ProductID"],
    "OrderQty": sales_order_detail_df["OrderQty"],
    "UnitPrice": sales_order_detail_df["UnitPrice"],
    "LineTotal": sales_order_detail_df["LineTotal"],
}

# Inner join header -> detail, then cast every column to its target type. Because
# the casts already yield the target schema, no union with an empty typed
# DataFrame is needed to "enforce" it.
joined_df = (
    sales_order_header_df.alias("soh")
    .join(
        sales_order_detail_df.alias("sod"),
        sales_order_header_df["SalesOrderID"] == sales_order_detail_df["SalesOrderID"],
        "inner",
    )
    .select([
        source_columns[field.name].cast(field.dataType).alias(field.name)
        for field in schema.fields
    ])
)

# Surrogate key: consecutive numbers starting at 1. SalesOrderID stays the primary
# sort key; the remaining columns are tie-breakers, because an order has several
# detail lines and ordering by SalesOrderID alone would number those lines in an
# arbitrary, run-dependent order.
# A window without partitionBy moves all rows to a single partition; acceptable
# only while the fact table is small.
window_spec = Window.orderBy(*[col(name) for name in fact_columns])

# Add SaleKey and place it first.
fact_sales = (
    joined_df
    .withColumn("SaleKey", row_number().over(window_spec))
    .select("SaleKey", *fact_columns)
)

# SaleKey is recomputed from 1 over the full silver data on every run, so "append"
# would produce duplicate rows and colliding keys on re-execution. "overwrite"
# rebuilds the table atomically and keeps the cell idempotent.
# NOTE(review): assumes the silver tables are full snapshots - confirm.
fact_sales.write.format("delta").mode("overwrite").save(output_path)

## Dim_Date

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, date_format, year, month, dayofmonth, dayofweek, weekofyear, quarter, when
from pyspark.sql.types import DateType
import datetime

# Build Dim_Date: one row per calendar day with the usual date attributes.
output_path = '/mnt/gold/SalesLT/Dim_Date/'

# Date range covered by the dimension (both ends inclusive).
start_date = datetime.date(1900, 1, 1)
end_date = datetime.date(2050, 12, 31)

# Generate every day in the range as a Python date on the driver.
total_days = (end_date - start_date).days + 1
date_range = [start_date + datetime.timedelta(days=offset) for offset in range(total_days)]

# Initial single-column DataFrame holding the dates.
df = spark.createDataFrame(date_range, DateType()).toDF("date")

# Derive the additional Dim_Date columns.
dim_date = (
    df
    .withColumn("year", year(col("date")))
    .withColumn("month", month(col("date")))
    .withColumn("day", dayofmonth(col("date")))
    .withColumn("day_of_week", dayofweek(col("date")))  # 1 = Sunday, 7 = Saturday
    .withColumn("week_of_year", weekofyear(col("date")))
    .withColumn("quarter", quarter(col("date")))
    .withColumn("is_weekend", when(col("day_of_week").isin(1, 7), lit(True)).otherwise(lit(False)))  # True on Saturday or Sunday
    .withColumn("date_key", date_format(col("date"), "yyyyMMdd").cast("int"))  # Unique key in YYYYMMDD form
)

# Save to Delta. The calendar is regenerated in full on every run, so "append"
# would duplicate every date (and every date_key) each time the cell is executed;
# "overwrite" makes the cell safe to re-run.
dim_date.write.format("delta").mode("overwrite").save(output_path)