In [None]:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum, current_timestamp
import os

# Configuration - Silver to Gold data flow.
# These four values identify the Silver lakehouse table that feeds this cell.
WORKSPACE_NAME = "Fabric_MAAG"
SOURCE_LAKEHOUSE_NAME = "MAAG_LH_Silver"
SOURCE_SCHEMA = "sales"
SOURCE_TABLE = "orderline"

# Source: absolute OneLake (abfss) path to the Silver lakehouse table,
# assembled from the configuration values above.
SOURCE_TABLE_PATH = (
    f"abfss://{WORKSPACE_NAME}@onelake.dfs.fabric.microsoft.com/"
    f"{SOURCE_LAKEHOUSE_NAME}.Lakehouse/Tables/{SOURCE_SCHEMA}/{SOURCE_TABLE}"
)

# Read the OrderLine table from the lakehouse as a Delta table.
# `spark` is the session object provided by the notebook runtime.
df = spark.read.load(SOURCE_TABLE_PATH, format="delta")

# Ensure OrderDate is present on the order lines; when it is missing, bring it
# in from the Order table via OrderId. Then derive the calendar Year from it.
from pyspark.sql.functions import year

# Spark resolves column names case-insensitively by default
# (spark.sql.caseSensitive=false), so look the column up without regard to
# case. An exact-case check would treat e.g. "orderdate" as missing, join in a
# second OrderDate column, and make the later col("OrderDate") ambiguous.
order_date_col = next(
    (name for name in df.columns if name.lower() == "orderdate"),
    None,
)

if order_date_col is None:
    # NOTE(review): unlike the OrderLine read, this table is looked up by name
    # through the session catalog rather than by absolute path - confirm it
    # resolves to the intended lakehouse.
    order_df = spark.read.table("sales.Order")
    # Left join keeps every order line; lines without a matching order get a
    # null OrderDate (and therefore a null Year).
    df = df.join(order_df.select("OrderId", "OrderDate"), on="OrderId", how="left")
    order_date_col = "OrderDate"

# Reference the column by the name it actually has in the DataFrame.
df = df.withColumn("Year", year(col(order_date_col)))

# Total sales per (Year, ProductName): the sum of LineTotal over all order
# lines that share the same year and product name.
sales_by_product_year = df.groupBy("Year", "ProductName").agg(
    spark_sum("LineTotal").alias("TotalSales")
)

# Get top 2 products for each year (2020, 2021, 2022)
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

years_to_plot = [2020, 2021, 2022]

# Rank products within each year by TotalSales, highest first. ProductName is
# a secondary sort key so that products with equal TotalSales are always
# ranked the same way; row_number() over a tied ordering may otherwise pick
# either row, making the "top 2" differ between runs.
windowSpec2 = Window.partitionBy("Year").orderBy(
    col("TotalSales").desc(),
    col("ProductName").asc(),
)

top2_products = (
    sales_by_product_year
    # Restrict to the plotted years first. The window is partitioned by Year,
    # so dropping other years does not change the ranks within the kept ones.
    .filter(col("Year").isin(years_to_plot))
    .withColumn("rank", row_number().over(windowSpec2))
    .filter(col("rank") <= 2)
)

# Collect the small result (at most 2 rows per year) to the driver as a pandas
# DataFrame, ordered by year and rank, for plotting.
pdf_top2 = top2_products.orderBy("Year", "rank").toPandas()

# Plot top 2 selling products for each year (2020, 2021, 2022) in one diagram:
# a grouped bar chart with one group per year and one bar per product.
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter


def _format_thousands(value, _pos):
    """Render a tick value as whole thousands with a 'K' suffix.

    The value is divided by 1000 and truncated by int(), then formatted with
    thousands separators (e.g. 1250000 -> '1,250K').
    """
    return f'{int(value/1000):,}K'


fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=pdf_top2, x="Year", y="TotalSales", hue="ProductName", ax=ax)

ax.set_ylabel("Total Sales (Thousands)")
ax.set_xlabel("Year")

# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title="Product Name", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.yaxis.set_major_formatter(FuncFormatter(_format_thousands))

fig.tight_layout()
plt.show()

