# Import libraries and configure storage paths

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Notebook parameters. Every widget is declared before it is read so the
# notebook also runs interactively on a fresh cluster; when a job or a parent
# notebook passes a value for a widget, that value is used instead of the
# default given here.
dbutils.widgets.text('storage_account', '')
dbutils.widgets.text('table_name', '')
dbutils.widgets.text('date_id', 'DT00000')

storage_account = dbutils.widgets.get('storage_account')
table_name = dbutils.widgets.get('table_name')
date_id = dbutils.widgets.get('date_id')

# Every abfss:// URI below embeds the account name, so fail here with a clear
# message instead of later with an opaque storage error.
if not storage_account.strip():
    raise ValueError(
        "The 'storage_account' widget must be set to the ADLS storage account name."
    )

# Root URI of each container, keyed by layer name.
layers = ['bronze', 'silver', 'gold']
adls = {
    layer: f'abfss://{layer}@{storage_account}.dfs.core.windows.net'
    for layer in layers
}

# Convenience aliases used by the cells below.
adls_bronze = adls['bronze']
adls_silver = adls['silver']
adls_gold = adls['gold']

In [0]:
# Print the top-level listing of the bronze, silver and gold containers, in
# that order; each call fails if its container path cannot be listed.
for container_root in (adls_bronze, adls_silver, adls_gold):
    print(dbutils.fs.ls(container_root))

[FileInfo(path='abfss://bronze@carhuymdatalake.dfs.core.windows.net/raw_data/', name='raw_data/', size=0, modificationTime=1738311839000)]
[FileInfo(path='abfss://silver@carhuymdatalake.dfs.core.windows.net/car_sales/', name='car_sales/', size=0, modificationTime=1738311859000)]
[FileInfo(path='abfss://gold@carhuymdatalake.dfs.core.windows.net/dim_branch/', name='dim_branch/', size=0, modificationTime=1738262139000), FileInfo(path='abfss://gold@carhuymdatalake.dfs.core.windows.net/dim_date/', name='dim_date/', size=0, modificationTime=1738259033000), FileInfo(path='abfss://gold@carhuymdatalake.dfs.core.windows.net/dim_dealer/', name='dim_dealer/', size=0, modificationTime=1738258376000), FileInfo(path='abfss://gold@carhuymdatalake.dfs.core.windows.net/dim_model/', name='dim_model/', size=0, modificationTime=1738258878000), FileInfo(path='abfss://gold@carhuymdatalake.dfs.core.windows.net/fact_sales/', name='fact_sales/', size=0, modificationTime=1738262253000)]


# Data Reading

In [0]:
# Load the raw car-sales extract for the requested date_id from the bronze
# container. Parquet files carry their own schema, so no CSV-style
# 'header' / 'inferSchema' reader options are needed.
df = spark.read.parquet(f'{adls_bronze}/raw_data/carsales-{date_id}.parquet')

# Data Transformation

In [0]:
# Derive model_category as the part of Model_ID before the first '-'
# (e.g. 'Jee-M10' -> 'Jee'); an ID without a '-' is kept whole.
model_id_parts = split(col('Model_ID'), '-')
df = df.withColumn('model_category', model_id_parts[0])

In [0]:
# Average revenue per unit sold. The division is guarded so that a row with
# Units_Sold == 0 yields NULL instead of raising a divide-by-zero error when
# ANSI SQL mode is enabled; with ANSI mode off the result is the same NULL the
# plain division would produce.
units_sold = col('Units_Sold')
df = df.withColumn(
    'RevPerUnit',
    when(units_sold != 0, col('Revenue') / units_sold),
)

# Ad-hoc Analysis

In [0]:
# Total units sold per (Year, BranchName), ordered by Year ascending and then
# by Total_Units_Sold descending, showing the first 10 rows.
# 'Year' is spelled the same way in the grouping and the sort so the query
# does not depend on case-insensitive column resolution.
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the star
# import at the top of the notebook), not the Python builtin.
(
    df.groupBy('Year', 'BranchName')
    .agg(sum('Units_Sold').alias('Total_Units_Sold'))
    .sort('Year', 'Total_Units_Sold', ascending=[True, False])
    .limit(10)
    .display()
)

year,BranchName,Total_Units_Sold
2020,Power Ranger Motors,6
2020,DataFam Motors,3
2020,Puma Motors,3
2020,huym,3
2020,SH Bro,3
2020,hanh,3
2020,nhi,3
2020,Premier Motors,1


Databricks visualization. Run in Databricks to view.

# Data Writing

In [0]:
# Persist the transformed frame as Parquet under the silver container, one
# folder per date_id; an existing folder for the same date_id is replaced.
silver_output_path = f'{adls_silver}/car_sales/{date_id}'
df.write.mode('overwrite').parquet(silver_output_path)

In [0]:

# Read back the Parquet output for this date_id and show 10 rows as a sanity
# check. The DataFrame reader is used instead of interpolating the path into
# SQL text, so characters in the path (e.g. a backtick in the date_id widget
# value) cannot break or alter the query.
(
    spark.read.parquet(f'{adls_silver}/car_sales/{date_id}')
    .limit(10)
    .display()
)

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Day,Month,Year,BranchName,DealerName,Product_Name,model_category,RevPerUnit
BR9546,DLR0060,Jee-M10,7223451,1,DT01246,28,5,2020,Premier Motors,maihuy-dealer,Jeep,Jee,7223451.0
BR9666,DLR0062,Jee-M12,22093020,3,DT01246,30,5,2020,Puma Motors,Ford Australia Motors,Jeep,Jee,7364340.0
BR9726,DLR0063,Jee-M13,22372413,3,DT01247,31,5,2020,Power Ranger Motors,Ford do Brasil Motors,Jeep,Jee,7457471.0
BR0131,DLR0063,Jee-M13,22372413,3,DT01247,31,5,2020,SH Bro,Ford do Brasil Motors,Jeep,Jee,7457471.0
XYZ9726,XYZ0063,ZYXM13,22372413,3,DT01247,31,5,2020,DataFam Motors,Datafam Dealers,Surprise,ZYXM13,7457471.0
XYZ9727,XYZ0063,ZYXM13,22372413,3,DT01247,31,5,2020,huym,Datafam Dealers,Surprise,ZYXM13,7457471.0
XYZ9727,XYZ0063,ZYXM13,22372413,3,DT01248,31,5,2020,nhi,Datafam Dealers,Surprise,ZYXM13,7457471.0
XYZ9727,XYZ0063,ZYXM13,22372413,3,DT01248,31,5,2020,hanh,Datafam Dealers,Surprise,ZYXM13,7457471.0
BR9726,DLR0063,Jee-M13,22372413,3,DT01248,31,5,2020,Power Ranger Motors,maihuydealer,Jeep,Jee,7457471.0
