# Reading the raw data

In [0]:
from pyspark.sql.functions import col, split, when
from pyspark.sql.types import *

In [0]:
# Load the raw Parquet data from the bronze container into a Spark DataFrame.
df = spark.read.parquet(
    "abfss://bronze@storageadlsgen3.dfs.core.windows.net/rawdata"
)
          

In [0]:
# Render the raw DataFrame in the notebook output for a quick visual check.
df.display()

# Transformation

In [0]:
# Build the cleaned DataFrame from the raw data:
#   * Model_Category - the part of Model_ID before the first '-'
#   * price          - Revenue divided by Units_Sold
#   * BranchName / DealerName renamed to Branch_Name / Dealer_Name
#   * missing Model_Category values replaced with 'Other'
#   * exact duplicate rows removed
df_clean = (
    df.withColumn("Model_Category", split(col("Model_ID"), "-")[0])
    # Only divide when Units_Sold is non-zero: a zero divisor yields NULL
    # with ANSI mode off but raises DIVIDE_BY_ZERO with ANSI mode on, so the
    # explicit guard (no otherwise => NULL) gives NULL in both cases.
    .withColumn(
        "price",
        when(col("Units_Sold") != 0, col("Revenue") / col("Units_Sold")),
    )
    .withColumnRenamed("BranchName", "Branch_Name")
    .withColumnRenamed("DealerName", "Dealer_Name")
    .fillna("Other", subset=["Model_Category"])
    .dropDuplicates()
)


In [0]:
#from pyspark.sql.functions import expr, to_date
#df_date = df_category.withColumn("Date", expr("concat(cast(Day as string), '/', cast(Month as string), '/', cast(Year as string))"))

In [0]:
# Preview the first two rows of the cleaned DataFrame.
df_clean.limit(2).display()

# Saving the data into the silver container

In [0]:
# Persist the cleaned data as Parquet in the silver container, replacing any
# existing output at that path.
(
    df_clean.write
    .mode("overwrite")
    .parquet("abfss://silver@storageadlsgen3.dfs.core.windows.net/carsales")
)

In [0]:
# NOTE: this import shadows Python's built-in sum() from this cell onwards.
from pyspark.sql.functions import sum

# Total revenue per (Year, Branch_Name), ordered by Year ascending and then
# by Total_Revenue descending.
display(
    df_clean.groupBy("Year", "Branch_Name")
    .agg(sum("Revenue").alias("Total_Revenue"))
    .orderBy(col("Year").asc(), col("Total_Revenue").desc())
)

In [0]:
# Total units sold per (Year, Branch_Name), ordered by Year ascending and
# then by Total_Units descending.
display(
    df_clean.groupBy("Year", "Branch_Name")
    .agg(sum("Units_Sold").alias("Total_Units"))
    .orderBy(col("Year").asc(), col("Total_Units").desc())
)

Databricks visualization. Run in Databricks to view.

In [0]:
#df_clean.groupBy("Year", "BranchName").agg(sum("Units_Sold").alias("Total_Units_Sold")).orderBy(col("Year").asc(), col("Total_Units_Sold").desc()).display()

# Querying Silver Data

In [0]:
# Read the cleaned data back from the silver container.
# Note: this rebinds `df`, which was first assigned from the bronze raw data.
df = spark.read.parquet(
    "abfss://silver@storageadlsgen3.dfs.core.windows.net/carsales"
)

In [0]:
#df.createOrReplaceTempView("carsales")

In [0]:
%sql
-- Query the Parquet files in the silver container directly by path.
SELECT
  *
FROM
  parquet.`abfss://silver@storageadlsgen3.dfs.core.windows.net/carsales`