# Import Libraries

In [26]:
%run utilities/global_class

StatementMeta(, 17, -1, Finished, Available)

In [27]:
%run utilities/package

StatementMeta(, 17, -1, Finished, Available)

In [28]:
# Storage location shared by all layers, followed by the staging-layer settings.
# The staging format, header flag and folder are consumed by the staging read below.
container_name = "hanifsystem"
account_name = "hanifdatalake"
stg_fileformat = "parquet"
stg_delimiter = ","
stg_withheader = "true"
stg_main_folder = "synapse/workspaces/AdventureWorks2022/humanresources_employeedepartmenthistory"
# Curated-layer settings (Delta), passed to ReadFile together with the curated path.
# NOTE(review): this folder uses "AdventureWorks" while staging and mart use
# "AdventureWorks2022" — confirm the difference is intentional.
cur_fileformat = "delta"
cur_delimiter = ","
cur_withheader = "true"
cur_main_folder = "synapse/workspaces/curated/AdventureWorks/humanresources_employeedepartmenthistory"
# Mart-layer settings; only mart_main_folder is referenced by the cells visible in this notebook.
mart_fileformat = "parquet"
mart_delimiter = ","
mart_withheader = "true"
mart_main_folder = "synapse/workspaces/datamart/AdventureWorks2022/humanresources_employeedepartmenthistory"

StatementMeta(smallnotebook, 17, 22, Finished, Available)

In [29]:
# Size of the look-back window in days (negative = into the past).
ingest_range_day_structured = -2

# Today's date as yyyymmdd; parsing it back yields midnight of the current day.
start_date = datetime.now().strftime('%Y%m%d')
date_1 = datetime.strptime(start_date, '%Y%m%d')

# Shift by the look-back window and render the cut-off in the same yyyymmdd layout.
result_1 = date_1 + timedelta(days=ingest_range_day_structured)
filter_date = f"{result_1:%Y%m%d}"

print(filter_date)

StatementMeta(smallnotebook, 17, 23, Finished, Available)

20230716


In [30]:
# Build one PathConstructor per layer folder (staging first, then curated),
# resolve both paths, and print them in that order.
set_stg_path, set_cur_path = (
    PathConstructor(container_name, account_name, folder)
    for folder in (stg_main_folder, cur_main_folder)
)
stg_path = set_stg_path.pathconstructor()
cur_path = set_cur_path.pathconstructor()
print(stg_path, cur_path, sep="\n")

StatementMeta(smallnotebook, 17, 24, Finished, Available)

abfss://hanifsystem@hanifdatalake.dfs.core.windows.net/synapse/workspaces/AdventureWorks2022/humanresources_employeedepartmenthistory
abfss://hanifsystem@hanifdatalake.dfs.core.windows.net/synapse/workspaces/curated/AdventureWorks/humanresources_employeedepartmenthistory


# Staging Section

In [31]:
# Read the raw staging data, letting Spark resolve the schema, and preview ten rows.
# NOTE(review): 'header' and 'inferSchema' look like CSV reader options while
# stg_fileformat is "parquet" here — confirm whether they are still needed.
df_stg_without_schema = (
    spark.read.format(stg_fileformat)
    .options(header=stg_withheader, inferSchema='true')
    .load(stg_path)
)
display(df_stg_without_schema.limit(10))

StatementMeta(smallnotebook, 17, 25, Finished, Available)

SynapseWidget(Synapse.DataFrame, 09751bd2-15a2-477a-8bb4-f075bc048da6)

In [32]:
# Print the schema Spark resolved for the raw staging read.
df_stg_without_schema.printSchema()

StatementMeta(smallnotebook, 17, 26, Finished, Available)

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- DepartmentID: integer (nullable = true)
 |-- ShiftID: integer (nullable = true)
 |-- StartDate: date (nullable = true)
 |-- EndDate: date (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [33]:
# Re-label every staging column with its lower-case name; data is left untouched.
col_name = df_stg_without_schema.columns
lower_col_name = list(map(str.lower, col_name))
df_stg = df_stg_without_schema.toDF(*lower_col_name)

StatementMeta(smallnotebook, 17, 27, Finished, Available)

In [34]:
# Confirm that the column names are now lower case.
df_stg.printSchema()

StatementMeta(smallnotebook, 17, 28, Finished, Available)

root
 |-- businessentityid: integer (nullable = true)
 |-- departmentid: integer (nullable = true)
 |-- shiftid: integer (nullable = true)
 |-- startdate: date (nullable = true)
 |-- enddate: date (nullable = true)
 |-- modifieddate: timestamp (nullable = true)



In [35]:
# Preview ten rows of the renamed staging DataFrame.
display(df_stg.limit(10))

StatementMeta(smallnotebook, 17, 29, Finished, Available)

SynapseWidget(Synapse.DataFrame, f2f3060e-45cd-4e4a-a589-893cb1cb907f)

In [36]:
# Rename the source change-timestamp column; only the label changes, the values are not converted.
# NOTE(review): the new name implies UTC — confirm the source timestamps really are UTC.
df_stg = df_stg.withColumnRenamed(
    existing='modifieddate',
    new='modifiedutcdate',
)
df_stg.printSchema()

StatementMeta(smallnotebook, 17, 30, Finished, Available)

root
 |-- businessentityid: integer (nullable = true)
 |-- departmentid: integer (nullable = true)
 |-- shiftid: integer (nullable = true)
 |-- startdate: date (nullable = true)
 |-- enddate: date (nullable = true)
 |-- modifiedutcdate: timestamp (nullable = true)



In [37]:
# Key used to de-duplicate staging rows: one surviving row per business entity.
partition_column = ["businessentityid"]

# Newest modification first. The extra sort keys only break ties between rows that
# share the same modifiedutcdate, so the surviving row is the same on every run
# instead of depending on Spark's physical row order.
latest_first = Window.partitionBy(*partition_column).orderBy(
    desc("modifiedutcdate"),
    desc("startdate"),
    desc("departmentid"),
    desc("shiftid"),
)

# Keep the top-ranked row per key, drop the helper column, and stamp the rows
# with the time this notebook curated them.
df_stg_final = (
    df_stg.withColumn("rank", row_number().over(latest_first))
          .where("rank == 1")
          .drop("rank")
          .withColumn("curated_date", F.lit(datetime.now()))
)

StatementMeta(smallnotebook, 17, 31, Finished, Available)

# Curated Section

In [38]:
# Load the curated table. If the read fails, seed the curated location from the
# de-duplicated staging data and read it back.
try:
    set_df_cur = ReadFile(cur_path, cur_fileformat, cur_delimiter, cur_withheader)
    df_cur = set_df_cur.readfrompath()
except Exception as ex:
    # Make the reason visible: a failed read is not necessarily a missing table.
    print("Curated read failed, attempting first-time initialisation:", str(ex))

    df_final = df_stg_final.coalesce(1)
    # 'errorifexists' instead of 'overwrite': if the read failed for any reason other
    # than a missing table (permissions, transient storage error, ...), the existing
    # curated data must not be replaced by the current staging extract. The write
    # then fails loudly and leaves the table untouched.
    df_final.write.format(cur_fileformat) \
            .mode('errorifexists') \
            .save(cur_path)

    set_df_cur = ReadFile(cur_path, cur_fileformat, cur_delimiter, cur_withheader)
    df_cur = set_df_cur.readfrompath()

StatementMeta(smallnotebook, 17, 32, Finished, Available)

In [39]:
# Show the curated DataFrame as loaded (or freshly seeded) by the previous cell.
display(df_cur)

StatementMeta(smallnotebook, 17, 33, Finished, Available)

SynapseWidget(Synapse.DataFrame, cd078735-249a-4922-a8e5-8fbb94d2d982)

In [41]:
# Register the curated data (merge target) and the de-duplicated staging data
# (merge source) as temp views so the MERGE statement can refer to them by name.
df_cur.createOrReplaceTempView("targetTableName")
df_stg_final.createOrReplaceTempView("updatesTableName")

StatementMeta(smallnotebook, 17, 35, Finished, Available)

In [42]:
# Print the curated schema; the MERGE below assigns each of these columns except the key.
df_cur.printSchema()

StatementMeta(smallnotebook, 17, 36, Finished, Available)

root
 |-- businessentityid: integer (nullable = true)
 |-- departmentid: integer (nullable = true)
 |-- shiftid: integer (nullable = true)
 |-- startdate: date (nullable = true)
 |-- enddate: date (nullable = true)
 |-- modifiedutcdate: timestamp (nullable = true)
 |-- curated_date: timestamp (nullable = true)



In [44]:
# Upsert recent staging rows into the curated table.
# - Only source rows modified on or after filter_date (yyyymmdd) take part: the date
#   predicate is in the ON clause for updates and repeated on the insert branch.
# - The timestamp is cast to DATE explicitly, so the comparison is DATE >= DATE
#   rather than a formatted string compared to a DATE via implicit coercion.
# - filter_date is produced earlier in this notebook, not taken from user input.
merge_sql = """
        MERGE INTO targetTableName
        USING updatesTableName
        ON CAST(updatesTableName.modifiedutcdate AS DATE) >= TO_DATE('{filter_date}', 'yyyyMMdd') AND
           targetTableName.businessentityid = updatesTableName.businessentityid
        WHEN MATCHED THEN UPDATE SET
                targetTableName.departmentid = updatesTableName.departmentid,
                targetTableName.shiftid = updatesTableName.shiftid,
                targetTableName.startdate = updatesTableName.startdate,
                targetTableName.enddate = updatesTableName.enddate,
                targetTableName.modifiedutcdate = updatesTableName.modifiedutcdate,
                targetTableName.curated_date = updatesTableName.curated_date
        WHEN NOT MATCHED AND (CAST(updatesTableName.modifiedutcdate AS DATE) >= TO_DATE('{filter_date}', 'yyyyMMdd')) THEN INSERT *
""".format(filter_date=filter_date)

# Left as the last expression so the cell still shows the merge metrics DataFrame.
spark.sql(merge_sql)

StatementMeta(smallnotebook, 17, 38, Finished, Available)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

# Mart Section

In [45]:
def WriteFile(df, final_path, file_format='parquet', write_mode='overwrite', raise_on_error=False):
    """Write ``df`` to ``final_path`` as a single output partition.

    Args:
        df: DataFrame to write; it is coalesced to one partition first.
        final_path: Destination path handed to the DataFrame writer.
        file_format: Writer format name. Defaults to ``'parquet'``.
        write_mode: Writer save mode. Defaults to ``'overwrite'``.
        raise_on_error: When True, re-raise the write exception after it has been
            reported. When False (default) the failure is only reported.

    Returns:
        True when the write succeeded, False when it failed and
        ``raise_on_error`` is False.

    Raises:
        Exception: whatever the write raised, only when ``raise_on_error`` is True.
    """
    try:
        # Set before every write, exactly as the original implementation did.
        spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
        df.coalesce(1).write.format(file_format).mode(write_mode).save(final_path)
    except Exception as ex:
        print("Write Failed", str(ex))
        if raise_on_error:
            raise
        # Give the caller a status to act on instead of a silent None.
        return False

    print("Write Success")
    return True

StatementMeta(smallnotebook, 17, 39, Finished, Available)

In [46]:
# Resolve the mart folder with the same PathConstructor helper used for the other
# layers, then print the curated (source) and mart (destination) paths side by side.
set_mart_path = PathConstructor(container_name, account_name, mart_main_folder)
mart_path = set_mart_path.pathconstructor()

print("Source: ", cur_path, "Mart: ", mart_path)

StatementMeta(smallnotebook, 17, 40, Finished, Available)

Source:  abfss://hanifsystem@hanifdatalake.dfs.core.windows.net/synapse/workspaces/curated/AdventureWorks/humanresources_employeedepartmenthistory Mart:  abfss://hanifsystem@hanifdatalake.dfs.core.windows.net/synapse/workspaces/datamart/AdventureWorks2022/humanresources_employeedepartmenthistory


In [47]:
# The mart layer is built from the curated DataFrame; df_mart is just another name for it.
# (The empty placeholder DataFrame that used to be created first was overwritten
# immediately and has been removed.)
# NOTE(review): df_cur was created before the MERGE cell ran — confirm that it
# reflects the merged rows when it is evaluated here.
df_mart = df_cur

# print(df_mart.count())

StatementMeta(smallnotebook, 17, 41, Finished, Available)

In [48]:
from dateutil.relativedelta import relativedelta

# First day of the month two months before today, rendered as yyyy-MM-dd.
periode = (datetime.now() + relativedelta(months=-2)).replace(day=1).strftime("%Y-%m-%d")
print(periode)

# Keep only rows curated on or after that cut-off.
df_mart_final = df_mart.where(col("curated_date") >= lit(periode))

StatementMeta(smallnotebook, 17, 42, Finished, Available)

2023-05-01


In [50]:
# Distinct (year, month) pairs of curated_date present in the mart data, oldest first.
# Both sort keys go into one orderBy: chaining a second orderBy replaces the first,
# which previously left the list sorted by month only.
partition_list = (
    df_mart_final
    .select(
        year(col("curated_date")).alias("year"),
        month(col("curated_date")).alias("month"),
    )
    .dropDuplicates()
    .orderBy(col("year").asc(), col("month").asc())
    .collect()
)

print(partition_list)

StatementMeta(smallnotebook, 17, 44, Finished, Available)

[Row(year=2023, month=7)]


In [51]:
# Rewrite one mart folder per (year, month) found in the curated data.
for partition in partition_list:
    # Folder name is <mart_path>/<yyyy><mm>, with the month zero-padded to two digits.
    final_path = mart_path + '/' + str(partition.year) + str(partition.month).zfill(2)
    print('Partition path', final_path)

    try:
        # Clear any previous output for this partition (recursive delete).
        mssparkutils.fs.rm(final_path, True)
    except Exception as e:
        # Cleanup stays best-effort, but the reason is reported instead of being
        # dropped silently.
        print('Partition cleanup skipped:', str(e))

    # Rows whose curated_date falls in this partition's year and month.
    df_final = df_mart_final.filter(
        (year(col("curated_date")) == partition.year)
        & (month(col("curated_date")) == partition.month)
    )

    WriteFile(df_final, final_path)

StatementMeta(smallnotebook, 17, 45, Finished, Available)

Partition path abfss://hanifsystem@hanifdatalake.dfs.core.windows.net/synapse/workspaces/datamart/AdventureWorks2022/humanresources_employeedepartmenthistory/202307
Write Success


In [52]:
# df_final is whatever the last iteration of the partition loop assigned, so this
# shows only the final partition written, not the whole mart data set.
display(df_final)

StatementMeta(smallnotebook, 17, 46, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4c5cdde8-a2f9-40a7-bd20-134ae3ff8a0a)

In [53]:
# Print the schema of the last partition DataFrame written by the loop.
df_final.printSchema()

StatementMeta(smallnotebook, 17, 47, Finished, Available)

root
 |-- businessentityid: integer (nullable = true)
 |-- departmentid: integer (nullable = true)
 |-- shiftid: integer (nullable = true)
 |-- startdate: date (nullable = true)
 |-- enddate: date (nullable = true)
 |-- modifiedutcdate: timestamp (nullable = true)
 |-- curated_date: timestamp (nullable = true)

