In [1]:
import re
from pyspark.sql.functions import when, col
from pyspark.sql import SparkSession
from os.path import abspath
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, weekofyear, dayofweek, floor, date_format, when, lit, concat
from datetime import date, timedelta


In [2]:

# Build (or reuse) the Hive-enabled Spark session used by every cell below.
# The warehouse URI is written with three slashes ("hdfs:///user/...") so that
# "user" is the first path segment; with only two slashes it would be parsed as
# the NameNode host. This matches the "hdfs:///user/hive/warehouse/..." paths
# used for the external tables later in this notebook.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Caching")
    .config("spark.sql.warehouse.dir", "hdfs:///user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext


In [3]:

# Build a calendar (date) dimension covering start_date..end_date inclusive
# and persist it as a Hive table.
start_date = date(2016, 1, 1)
end_date = date(2018, 12, 31)
# "+ 1" makes the range inclusive of end_date.
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

date_df = spark.createDataFrame([(d,) for d in date_range], ["date"]).withColumn("date", col("date").cast("date"))

date_dim = date_df.withColumn("year", year(col("date"))) \
    .withColumn("month", month(col("date"))) \
    .withColumn("day", dayofmonth(col("date"))) \
    .withColumn("week", weekofyear(col("date")))

# Zero-padded day-month-year key. Concatenating the unpadded day/month/year
# integers is ambiguous: 1 Nov 2016 and 11 Jan 2016 would both yield "1112016",
# so the key would not identify a single date.
# NOTE(review): any fact-side date key joined against this column must use the
# same ddMMyyyy format - confirm against the upstream job.
date_dim = date_dim.withColumn("date_sur_key", date_format(col("date"), "ddMMyyyy"))

database_name = "sales_schema"
table_name = "date_dim"

# Switch the current database so the unqualified table name below resolves
# inside database_name.
spark.sql(f"USE {database_name}")

date_dim.write.mode('overwrite').saveAsTable(table_name)
print(f"Date dimension table saved to Hive table {database_name}.{table_name}")





Date dimension table saved to Hive table sales_schema.date_dim


In [4]:
# Load the silver-layer sales transactions parquet file; the customer, product
# and audit dimensions and the fact tables below are all derived from it.
df_transactions = spark.read.parquet(
    "/user/silver/sales_transaction/sales_transactions.parquet"
)

In [6]:
# Display the column names of the transactions DataFrame.
df_transactions.columns

['transaction_date',
 'transaction_id',
 'customer_id',
 'customer_fname',
 'customer_lname',
 'sales_agent_id',
 'branch_id',
 'product_id',
 'product_name',
 'product_category',
 'units',
 'unit_price',
 'is_online',
 'payment_method',
 'shipping_address',
 'customer_email',
 'offers',
 'total_price',
 'city',
 'state',
 'postal_code',
 'row_num',
 'audit_sur',
 'source_file',
 'created_at',
 'created_by',
 'is_valid_email',
 'is_positive_units',
 'is_positive_unit_price',
 'is_valid_transaction',
 'customer_sur',
 'date_sur']

In [5]:
# Split the transactions into the two sales channels.
# NOTE(review): online rows are selected by a non-empty shipping_address while
# branch rows are selected by is_online == "no" - confirm that these two
# criteria really produce complementary subsets.
df_online = df_transactions.where(col("shipping_address") != "")
df_branch = df_transactions.where(col("is_online") == "no")

In [8]:
def create_customer_dimension(df_transactions):
    """Build the customer dimension from the transactions DataFrame.

    Projects the customer columns and drops duplicate rows.

    Args:
        df_transactions: DataFrame that contains the customer columns
            selected below.

    Returns:
        DataFrame holding the distinct combinations of the selected columns.
    """
    customer_attributes = (
        "customer_id",
        "customer_fname",
        "customer_lname",
        "customer_email",
        "customer_sur",
    )
    projected = df_transactions.select(*customer_attributes)
    return projected.dropDuplicates()

customer_dim = create_customer_dimension(df_transactions)

# Publish the same customer dimension to each schema, replacing any existing table.
for customer_table in (
    "sales_schema.dim_customer",
    "online_sales_schema.dim_customer",
    "branches_sales_schema.dim_customer",
):
    customer_dim.write.mode("overwrite").saveAsTable(customer_table)

In [9]:
def create_product_dimension(df_transactions):
    """Build the product dimension from the transactions DataFrame.

    Projects the product columns and drops duplicate rows.

    Args:
        df_transactions: DataFrame that contains the product columns
            selected below.

    Returns:
        DataFrame holding the distinct combinations of the selected columns.
    """
    product_attributes = (
        "product_id",
        "product_name",
        "product_category",
        "unit_price",
    )
    projected = df_transactions.select(*product_attributes)
    return projected.dropDuplicates()

product_dim = create_product_dimension(df_transactions)

# Publish the same product dimension to each schema, replacing any existing table.
for product_table in (
    "sales_schema.dim_product",
    "branches_sales_schema.dim_product",
    "online_sales_schema.dim_product",
):
    product_dim.write.mode("overwrite").saveAsTable(product_table)

In [10]:
def create_online_fact(df_online):
    """Project the online-sales fact columns out of the given DataFrame.

    Args:
        df_online: DataFrame that contains the columns selected below.

    Returns:
        DataFrame restricted to the online fact columns, in the listed order.
    """
    online_fact_cols = (
        "transaction_date",
        "transaction_id",
        "customer_id",
        "product_id",
        "units",
        "unit_price",
        "payment_method",
        "offers",
        "total_price",
        "postal_code",
        "audit_sur",
    )
    return df_online.select(*online_fact_cols)

In [11]:
def create_super_fact(df_transactions):
    """Project the combined ("super") fact columns out of the transactions.

    Args:
        df_transactions: DataFrame that contains the columns selected below.

    Returns:
        DataFrame restricted to the super fact columns, in the listed order.
    """
    super_fact_cols = (
        "transaction_date",
        "transaction_id",
        "customer_id",
        "product_id",
        "units",
        "unit_price",
        "payment_method",
        "offers",
        "total_price",
    )
    return df_transactions.select(*super_fact_cols)

In [12]:
def create_branch_fact(df_transactions):
    """Project the branch-sales fact columns out of the given DataFrame.

    The projection is applied to the DataFrame passed in as the argument
    rather than to a module-level variable, so the function works for any
    input and does not depend on notebook cell execution order.

    Args:
        df_transactions: DataFrame that contains the columns selected below
            (the notebook passes the branch-only subset).

    Returns:
        DataFrame restricted to the branch fact columns, in the listed order.
    """
    branch_fact_cols = [
        'transaction_date',
        'transaction_id',
        'customer_id',
        'sales_agent_id',
        'branch_id',
        'product_id',
        'units',
        'unit_price',
        'payment_method',
        'offers',
        'total_price',
    ]
    return df_transactions.select(branch_fact_cols)

In [26]:
# Online fact: fact-column projection of the online subset of the transactions.
online_fact = create_online_fact(df_online)

In [27]:
# Super fact: fact-column projection of all transactions (no channel filter).
super_fact = create_super_fact(df_transactions)

In [28]:
# Branch fact: fact-column projection of the branch subset of the transactions.
branch_fact = create_branch_fact(df_branch)

In [35]:
#external
# Write each fact DataFrame, partitioned by transaction_date, to an explicit
# HDFS path and register it under the given Hive table name.
external_fact_targets = (
    (
        super_fact,
        "hdfs:///user/hive/warehouse/sales_schema.db/super_fact",
        "sales_schema.super_fact_external",
    ),
    (
        branch_fact,
        "hdfs:///user/hive/warehouse/branches_sales_schema.db/branch_fact",
        "branches_sales_schema.factbranches",
    ),
    (
        online_fact,
        "hdfs:///user/hive/warehouse/online_sales_schema.db/online_fact",
        "online_sales_schema.online_fact",
    ),
)

for fact_df, hdfs_path, hive_table in external_fact_targets:
    writer = fact_df.write.mode("overwrite").partitionBy("transaction_date")
    writer.option("path", hdfs_path).saveAsTable(hive_table)

In [None]:
# Write each fact DataFrame, partitioned by transaction_date, as a Hive table
# without an explicit path.
# NOTE(review): "branches_sales_schema.factbranches" and
# "online_sales_schema.online_fact" are also the table names written by the
# "#external" fact cell - running both cells overwrites one with the other.
managed_fact_targets = (
    (super_fact, "sales_schema.super_fact"),
    (branch_fact, "branches_sales_schema.factbranches"),
    (online_fact, "online_sales_schema.online_fact"),
)

for fact_df, hive_table in managed_fact_targets:
    writer = fact_df.write.mode("overwrite").partitionBy("transaction_date")
    writer.saveAsTable(hive_table)

In [18]:
def create_audit_dimension(df_transactions):
    """Project the audit columns out of the transactions DataFrame.

    No de-duplication is applied, so the result keeps one row per input row.

    Args:
        df_transactions: DataFrame that contains the columns selected below.

    Returns:
        DataFrame restricted to the audit columns, in the listed order.
    """
    # NOTE(review): the online fact carries an "audit_sur" column that is not
    # selected here - confirm that transaction_id is the intended join key.
    audit_attributes = (
        "transaction_id",
        "source_file",
        "created_at",
        "created_by",
        "is_valid_email",
        "is_positive_units",
        "is_positive_unit_price",
        "is_valid_transaction",
    )
    return df_transactions.select(*audit_attributes)

audit_dim = create_audit_dimension(df_transactions)


# Publish the same audit dimension to each schema, replacing any existing table.
for audit_table in (
    "sales_schema.dim_audit",
    "branches_sales_schema.dim_audit",
    "online_sales_schema.dim_audit",
):
    audit_dim.write.mode("overwrite").saveAsTable(audit_table)

In [None]:
#external
# Write the audit dimension to an explicit HDFS path for each schema and
# register it under the given Hive table name.
# NOTE(review): these are the same table names as the path-less dim_audit
# writes earlier in the notebook - running both replaces one with the other.
#
# A dangling '.option("path", ".../online_sales_schema.db/online_fact")\'
# fragment that used to end this cell was removed: it was not attached to any
# expression and made the whole cell a SyntaxError.

audit_dim.write\
    .mode("overwrite")\
    .option("path","hdfs:///user/hive/warehouse/sales_schema.db/dim_audit")\
    .saveAsTable("sales_schema.dim_audit")

audit_dim.write\
    .mode("overwrite")\
    .option("path","hdfs:///user/hive/warehouse/branches_sales_schema.db/dim_audit")\
    .saveAsTable("branches_sales_schema.dim_audit")

audit_dim.write\
    .mode("overwrite")\
    .option("path","hdfs:///user/hive/warehouse/online_sales_schema.db/dim_audit")\
    .saveAsTable("online_sales_schema.dim_audit")

In [31]:
#branch_fact.show()

+-------------------+----------------+-----------+--------------+---------+----------+-----+----------+--------------+------+------------------+
|   transaction_date|  transaction_id|customer_id|sales_agent_id|branch_id|product_id|units|unit_price|payment_method|offers|       total_price|
+-------------------+----------------+-----------+--------------+---------+----------+-----+----------+--------------+------+------------------+
|2022-01-01 00:00:00|trx-742931471887|      85520|             2|        2|         2|    6|    699.99|   Credit Card|     0|4199.9400000000005|
|2022-01-01 00:00:00|trx-290800105795|      85543|             5|        5|        29|    4|     39.99|   Credit Card|    15|            159.96|
|2022-01-01 00:00:00|trx-578598481204|      85515|            11|        2|         1|    9|    999.99|   Credit Card|    10|           8999.91|
|2022-01-01 00:00:00|trx-855082289495|      85514|             7|        5|         8|    3|     79.99|   Credit Card|     0|239.9

In [79]:
#spark.sql("select * from sales_schema.dim_branch")

_c0,_c1,_c2,_c3
branch_id,location,establish_date,class
1,New York,2017-01-15,A
2,Los Angeles,2016-07-28,B
3,Chicago,2015-03-10,A
4,Houston,2016-11-05,D
5,Phoenix,2017-09-20,C


In [43]:
# Query every column of the sales-agent dimension table; as the last expression
# of the cell, the resulting DataFrame is what the notebook displays.
spark.sql("select * from sales_schema.dim_sales_agent")

sales_person_id,name,hire_date
1,John Doe,2020-6-3
2,Jane Smith,2018-5-13
3,Michael Johnson,2021-10-3
4,Emily Brown,2020-10-25
5,David Wilson,2021-4-8
6,Emma Taylor,2019-3-28
7,Christopher Miller,2020-1-11
8,Olivia Davis,2021-10-24
9,Daniel Martinez,2018-10-8
10,Sophia Moore,2019-5-25


In [40]:
#spark.sql("select * from sales_schema.dim_audit")

source_file,created_at,created_by,is_valid_email,is_positive_units,is_positive_unit_price,is_valid_transaction
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True
hdfs://localhost:...,2024-07-01 05:48:...,Eslam Fayez,True,True,True,True


In [30]:
# Load every parquet file from the silver "branches/scd2" folder and publish
# the result as a Hive table, replacing any existing one.
branches = spark.read.parquet("/user/silver/branches/scd2/*.parquet")

branches_writer = branches.write.mode("overwrite")
branches_writer.saveAsTable("branches_sales_schema.branches_dim")

In [20]:
# Print a preview of the branch rows that were loaded.
branches.show()

+---------+-----------+-------------------+-----+------------+--------------+---------------+------------+
|branch_id|   location|     establish_date|class|current_flag|effective_date|expiration_date|sk_branch_id|
+---------+-----------+-------------------+-----+------------+--------------+---------------+------------+
|        2|Los Angeles|2016-07-28 00:00:00|    B|        true|    2024-07-12|     9999-12-31|           2|
|        1|   New York|2017-01-15 00:00:00|    A|       false|    2024-07-12|     2024-07-12|           1|
|        3|    Chicago|2015-03-10 00:00:00|    A|        true|    2024-07-12|     9999-12-31|           3|
|        5|    Phoenix|2017-09-20 00:00:00|    C|        true|    2024-07-12|     9999-12-31|           5|
|        4|    Houston|2016-11-05 00:00:00|    D|        true|    2024-07-12|     9999-12-31|           4|
|        1|      tanta|2017-01-15 00:00:00|    A|        true|    2024-07-12|     9999-12-31|           7|
|        6|       Alex|2017-09-21 00:

In [24]:
# Load the silver-layer sales agent parquet file and publish it as a Hive
# table, replacing any existing one.
sales_agents = spark.read.parquet("/user/silver/sales_agent/sales_agent.parquet")

sales_agents_writer = sales_agents.write.mode("overwrite")
sales_agents_writer.saveAsTable("branches_sales_schema.sales_agent_dim")

In [25]:
# Print a preview of the sales agent rows that were loaded.
sales_agents.show()

+---------------+------------------+-------------------+
|sales_person_id|              name|          hire_date|
+---------------+------------------+-------------------+
|              1|          John Doe|2020-06-03 00:00:00|
|              2|        Jane Smith|2018-05-13 00:00:00|
|              3|   Michael Johnson|2021-10-03 00:00:00|
|              4|       Emily Brown|2020-10-25 00:00:00|
|              5|      David Wilson|2021-04-08 00:00:00|
|              6|       Emma Taylor|2019-03-28 00:00:00|
|              7|Christopher Miller|2020-01-11 00:00:00|
|              8|      Olivia Davis|2021-10-24 00:00:00|
|              9|   Daniel Martinez|2018-10-08 00:00:00|
|             10|      Sophia Moore|2019-05-25 00:00:00|
+---------------+------------------+-------------------+

