# Gold Layer - Data Modelling

## 1) Create Connection to Azure Storage account

In [0]:
# Service-principal identity used to reach the ADLS Gen2 account over OAuth.
storage_account = "team04sa"
application_id = "7ab46e7b-cc68-4f3f-9903-9a6bae8e347a"
directory_id = "b7a954b3-aa07-453e-b8a3-97101aeffcad"

# SECURITY: the client secret must not live in the notebook source. It is
# fetched from a Databricks secret scope at run time instead.
# NOTE(review): the scope/key names below are placeholders - point them at the
# scope that actually holds this service principal's secret, and rotate the
# secret that was previously committed in this cell.
client_secret = dbutils.secrets.get(scope="team04-kv", key="sp-client-secret")

# Every ABFS OAuth setting is suffixed with the account's DFS endpoint host.
account_host = f"{storage_account}.dfs.core.windows.net"

oauth_settings = {
    f"fs.azure.account.auth.type.{account_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{account_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{account_host}": application_id,
    f"fs.azure.account.oauth2.client.secret.{account_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{account_host}": f"https://login.microsoftonline.com/{directory_id}/oauth2/token",
}

# Register the settings on the active Spark session.
for setting_name, setting_value in oauth_settings.items():
    spark.conf.set(setting_name, setting_value)

## 2) Read the Delta table from the Silver Layer

In [0]:
# Output root for the gold tables and input location of the silver table.
GOLD_BASE = "abfss://fooddata@team04sa.dfs.core.windows.net/gold/"
SILVER_PATH = "abfss://fooddata@team04sa.dfs.core.windows.net/silver/"

# Load the silver layer, which is stored in Delta format.
silver_df = spark.read.load(SILVER_PATH, format="delta")

# Mark the frame for caching; the count below is the first action on it and
# the same frame is reused by every dimension and fact built afterwards.
silver_df.cache()
row_count = silver_df.count()
print("rows:", row_count)
silver_df.printSchema()

## 3) Create DIM Tables

In [0]:
from pyspark.sql import functions as F, Window

def _with_surrogate_key(dim_df, sk_name, columns):
    """Return ``dim_df`` with a deterministic surrogate key as first column.

    ``monotonically_increasing_id()`` depends on the partition layout, so the
    generated ids are sparse and may change whenever the frame is recomputed
    (for example when a later cell joins the dimension again after it was
    written). ``row_number()`` over a total ordering of the already-distinct
    rows yields stable, consecutive keys starting at 1 instead.

    The key is cast to ``long`` so the column type matches what
    ``monotonically_increasing_id()`` produced before.

    Args:
        dim_df: DataFrame holding the distinct rows of the dimension.
        sk_name: Name of the surrogate-key column to add.
        columns: Attribute columns of the dimension, in output order. They
            are also used as the ordering that makes the key deterministic.

    Returns:
        DataFrame with columns ``[sk_name, *columns]``.
    """
    # No partitionBy: all rows go through a single partition, which is
    # acceptable here because each dimension only holds distinct values.
    ordering = Window.orderBy(*columns)
    return (
        dim_df
        .withColumn(sk_name, F.row_number().over(ordering).cast("long"))
        .select(sk_name, *columns)
    )


# Country
dim_country = _with_surrogate_key(
    silver_df
    .select(F.col("adm0_id").alias("country_id"),
            F.col("adm0_name").alias("country"))
    .distinct(),
    "country_sk",
    ["country_id", "country"],
)

# Product
dim_product = _with_surrogate_key(
    silver_df
    .select(F.col("cm_id").alias("product_id"),
            F.col("cm_name").alias("product_name"),
            F.col("cm_group").alias("product_group"))
    .distinct(),
    "product_sk",
    ["product_id", "product_name", "product_group"],
)

# Time (no surrogate key; one row per distinct month combination)
dim_time = (
    silver_df
    .select("date_month", "mp_year", "mp_month", "year_month")
    .distinct()
    # Quarter: floor((month-1)/3)+1  => 1..4, rendered as "Q1".."Q4"
    .withColumn(
        "qtr",
        F.concat(
            F.lit("Q"),
            (F.floor((F.col("mp_month").cast("int") - F.lit(1)) / F.lit(3)) + F.lit(1)).cast("string")
        )
    )
    .withColumnRenamed("mp_year", "year")
    .withColumnRenamed("mp_month", "month")
    .withColumnRenamed("date_month", "date")
)

# Unit
dim_unit = _with_surrogate_key(
    silver_df.select(F.col("std_unit_group")).distinct(),
    "unit_sk",
    ["std_unit_group"],
)

# Market
dim_market = _with_surrogate_key(
    silver_df.select("mkt_name", "mkt_id").distinct(),
    "market_sk",
    ["mkt_name", "mkt_id"],
)

# Write DIMs (each run fully replaces the previous content)
(dim_country.write.mode("overwrite").format("delta").save(GOLD_BASE+"dim_country"))
(dim_product.write.mode("overwrite").format("delta").save(GOLD_BASE+"dim_product"))
(dim_time.write.mode("overwrite").format("delta").save(GOLD_BASE+"dim_time"))
(dim_unit.write.mode("overwrite").format("delta").save(GOLD_BASE+"dim_unit"))
(dim_market.write.mode("overwrite").format("delta").option("mergeSchema", "true").save(GOLD_BASE+"dim_market"))

## 4) Create Fact table - fact_Food_Price_Comparison

In [0]:
# ---------- FACT: Country Difference (KAZ vs AFG) on ProductGroup ----------
from pyspark.sql import functions as F

# Grain of the price comparison: one row per product group and month.
keys = ["cm_group", "year_month"]

# Upper-cased country name so the country filters below ignore letter case.
df = silver_df.withColumn("adm0_up", F.upper(F.col("adm0_name")))


def _monthly_group_avg(frame, country_upper, tag):
    """Aggregate one country's prices per product group and month.

    Args:
        frame: DataFrame carrying ``adm0_up``, ``mp_price_eur`` and the
            columns listed in ``keys``.
        country_upper: Upper-case country name to keep.
        tag: Short country tag used in the output column names.

    Returns:
        DataFrame with the ``keys`` columns plus ``avg_<tag>_eur`` (mean of
        ``mp_price_eur``) and ``n_<tag>`` (row count).
    """
    return (
        frame.filter(F.col("adm0_up") == country_upper)
             .groupBy(*keys)
             .agg(
                 F.avg("mp_price_eur").alias(f"avg_{tag}_eur"),
                 F.count("*").alias(f"n_{tag}"),
             )
    )


kaz = _monthly_group_avg(df, "KAZAKHSTAN", "kaz")
afg = _monthly_group_avg(df, "AFGHANISTAN", "afg")

# Keep only group/month combinations present for both countries, then derive
# the absolute and relative price difference (relative to Afghanistan).
both = (
    kaz.join(afg, keys, "inner")
       .withColumn("delta", F.col("avg_kaz_eur") - F.col("avg_afg_eur"))
       .withColumn(
           "delta_pct",
           # Avoid division by zero: the percentage is NULL when the base is 0.
           F.when(F.col("avg_afg_eur") == 0, F.lit(None).cast("double"))
            .otherwise(F.col("delta") / F.col("avg_afg_eur"))
       )
)

# Read the dimensions back from the gold layer instead of reusing the
# in-memory frames: re-evaluating their lineage could generate surrogate keys
# that differ from the ones already written, leaving product_sk in this fact
# out of sync with the persisted dim_product.
dim_product_gold = spark.read.format("delta").load(GOLD_BASE + "dim_product")
dim_country_gold = spark.read.format("delta").load(GOLD_BASE + "dim_country")

# Country ids come from dim_country (columns: country_id, country_sk, country).
# NOTE(review): dim_product is matched on product_group only, so every
# group/month row is repeated once per product of that group - confirm that
# this grain is intended.
fcd = (
    both.alias("p")
    .join(
        dim_product_gold.alias("dp"),
        F.col("p.cm_group").cast("string") == F.col("dp.product_group").cast("string"),
        "left"
    )
    # The two country joins have conditions on the dimension side only, so
    # each one attaches the matching country row(s) to every fact row.
    .join(
        dim_country_gold.alias("dc_kaz"),
        F.upper(F.trim(F.col("dc_kaz.country"))) == F.lit("KAZAKHSTAN"),
        "left"
    )
    .join(
        dim_country_gold.alias("dc_afg"),
        F.upper(F.trim(F.col("dc_afg.country"))) == F.lit("AFGHANISTAN"),
        "left"
    )
    .select(
        F.col("p.cm_group"),
        F.col("p.year_month"),
        F.col("p.avg_kaz_eur").alias("avg_kaz"),
        F.col("p.avg_afg_eur").alias("avg_afg"),
        F.col("p.delta"),
        F.col("p.delta_pct"),
        F.col("p.n_kaz"),
        F.col("p.n_afg"),
        F.col("dp.product_id").cast("string").alias("product_id"),
        F.col("dp.product_sk").alias("product_sk"),
        F.col("dp.product_name").alias("product_name"),
        F.col("dc_kaz.country_id").alias("country_id_kaz"),
        F.col("dc_afg.country_id").alias("country_id_afg")
    )
)

display(fcd)

(fcd.write.mode("overwrite").format("delta").option("mergeSchema", "true").save(GOLD_BASE+"fact_Food_Price_Comparison"))