In [1]:
# Load every pipe-delimited text file under Files/bronze into a single DataFrame.
# The first line of each file is used as the column names ("header") and column
# types are inferred from the data ("inferSchema").
dims_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", "|")
    .csv("Files/bronze/*.txt")
)

StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 3, Finished, Available)

In [2]:
from pyspark.sql.functions import input_file_name, regexp_extract

# Tag every row with the base name (letters/underscores, without extension) of the
# file it was read from, so rows from different bronze files can be told apart.
# The pattern is anchored with `$` so it only matches a name that is immediately
# followed by a literal ".txt" at the very end of the path; the earlier `\.txt*`
# made the final "t" optional/repeatable and could match anywhere in the path.
dims_df_w_names = dims_df.withColumn(
    "filename",
    regexp_extract(input_file_name(), r"([a-zA-Z_]+)\.txt$", 1),
)

StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 4, Finished, Available)

In [3]:
from pyspark.sql.functions import lit,current_timestamp, col, monotonically_increasing_id, row_number, trim, concat
from pyspark.sql.window import Window


# Number rows within each source file. Ordering is by Description, with Code as a
# tie-breaker so that rows sharing a Description always receive the same key on re-run.
window = Window.partitionBy("filename").orderBy("Description", "Code")

# Code/Description are trimmed FIRST so that the row_number() ordering below sees the
# same cleaned values that end up stored (stray leading whitespace would otherwise
# change the key order). Replacing existing columns keeps their position, and the new
# columns are appended in the same order as before: start_date, end_date, status, key.
dims_df_w_names = (
    dims_df_w_names
    .withColumn("Code", trim(col("Code")))
    .withColumn("Description", trim(col("Description")))
    .withColumn("start_date", current_timestamp())        # load time of this run
    .withColumn("end_date", lit(None).cast("TIMESTAMP"))  # null end date for every row
    .withColumn("status", lit(1))
    .withColumn("key", row_number().over(window))         # 1-based, restarts per filename
)


StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 5, Finished, Available)

In [4]:
# Persist the enriched dimensions as Parquet, one partition directory per source
# file name; any existing data at the target path is replaced.
(
    dims_df_w_names.write
    .mode("overwrite")
    .partitionBy('filename')
    .parquet('Files/silver/dims-silver')
)

StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 6, Finished, Available)

In [5]:
# Read the silver dataset back and preview it to check what was written.
# Parquet files carry their own schema, so the CSV-only "header" / "inferSchema"
# reader options were dropped.
dims_df_post_updated = spark.read.parquet("Files/silver/dims-silver")
dims_df_post_updated.show()

StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 7, Finished, Available)

+-----+--------------------+--------------------+--------+------+---+--------+
| Code|         Description|          start_date|end_date|status|key|filename|
+-----+--------------------+--------------------+--------+------+---+--------+
|   R6|ACP (African, Car...|2024-04-05 14:29:...|    null|     1|  1|DIM_AREA|
|   5U|ADC (Andean Devel...|2024-04-05 14:29:...|    null|     1|  2|DIM_AREA|
|   7H|AFREXIMBANK (Afri...|2024-04-05 14:29:...|    null|     1|  3|DIM_AREA|
|   5M|AMF (Arab Monetar...|2024-04-05 14:29:...|    null|     1|  4|DIM_AREA|
|  R16|APEC (Asia-Pacifi...|2024-04-05 14:29:...|    null|     1|  5|DIM_AREA|
|   R4|ASEAN (Countries ...|2024-04-05 14:29:...|    null|     1|  6|DIM_AREA|
|  R45|             ASEAN-5|2024-04-05 14:29:...|    null|     1|  7|DIM_AREA|
|4J842|    ATHENA Mechanism|2024-04-05 14:29:...|    null|     1|  8|DIM_AREA|
| XR29|Advanced Economie...|2024-04-05 14:29:...|    null|     1|  9|DIM_AREA|
| XR23|Advanced Economie...|2024-04-05 14:29:...|   

In [6]:
# Preview only the rows that originated from the DIM_ACCOUNTING file.
is_accounting = dims_df_post_updated["filename"] == "DIM_ACCOUNTING"
dims_df_post_updated.filter(is_accounting).show()

StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 8, Finished, Available)

+-----+--------------------+--------------------+--------+------+---+--------------+
| Code|         Description|          start_date|end_date|status|key|      filename|
+-----+--------------------+--------------------+--------+------+---+--------------+
|    A|Assets (Net Acqui...|2024-04-05 14:29:...|    null|     1|  1|DIM_ACCOUNTING|
|   AS|Assets - short po...|2024-04-05 14:29:...|    null|     1|  2|DIM_ACCOUNTING|
|    B|Balance (Credits ...|2024-04-05 14:29:...|    null|     1|  3|DIM_ACCOUNTING|
|   CL|Contingent liabil...|2024-04-05 14:29:...|    null|     1|  4|DIM_ACCOUNTING|
| C_EN|Credit (Environme...|2024-04-05 14:29:...|    null|     1|  5|DIM_ACCOUNTING|
|    C|  Credit (Resources)|2024-04-05 14:29:...|    null|     1|  6|DIM_ACCOUNTING|
| D_EN|Debit (Environmen...|2024-04-05 14:29:...|    null|     1|  7|DIM_ACCOUNTING|
|    D|        Debit (Uses)|2024-04-05 14:29:...|    null|     1|  8|DIM_ACCOUNTING|
|   EC|Economy to enviro...|2024-04-05 14:29:...|    null|     1|

In [7]:
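# NOTE(review): everything in this cell is commented-out scratch code. It appears to be
# an earlier, DIM_AREA-only version of the transform/write/read steps performed in the
# cells above -- confirm it is no longer needed and consider deleting the cell.
#area_df = dims_df_w_names.where(dims_df_w_names.filename == "DIM_AREA" )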

#from pyspark.sql.functions import lit,current_timestamp, col, monotonically_increasing_id, row_number
#from pyspark.sql.window import Window

#window = Window.orderBy("Description")
#area_df = area_df.withColumn("start_date",current_timestamp())\
#        .withColumn("end_date", lit(None).cast("TIMESTAMP"))\
#        .withColumn("status", lit(1))\
#        .withColumn("key",  row_number().over(window))

#area_df.write.parquet("Files/dims-silver/staging_dim_area.parquet")

#dims_df_post = (
#    spark.read.option("header", True)
#    .option("inferSchema", True)
#    .parquet("Files/dims-silver/staging_dim_area.parquet")
#)
#dims_df_post.show()

StatementMeta(, ae56760a-eb4d-489c-adcd-be5267f13d92, 9, Finished, Available)