In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import *

In [0]:
# Three-part name (catalog.schema.table) of the table this notebook reads.
source_catalog_name = "bronze"
source_database_name = "ingestion"
source_table_name = "video_games_sales"

# Three-part name (catalog.schema.table) of the table this notebook writes.
dest_catalog_name = "silver"
dest_database_name = "refined"
dest_table_name = "video_games_sales"

# Echo the configuration in the cell output. One print call with a newline
# separator emits the same lines (blank separator line included) as printing
# each entry individually.
print(
    f" > source_catalog_name:  {source_catalog_name}",
    f" > source_database_name: {source_database_name}",
    f" > source_table_name:    {source_table_name}",
    "",
    f" > dest_catalog_name:    {dest_catalog_name}",
    f" > dest_database_name:   {dest_database_name}",
    f" > dest_table_name:      {dest_table_name}",
    sep="\n",
)

In [0]:
# Create the destination catalog if it is not there yet, so later cells can
# reference it. The call stays the last expression so its result is displayed.
create_catalog_stmt = f"CREATE CATALOG if NOT EXISTS {dest_catalog_name}"
spark.sql(create_catalog_stmt)

In [0]:
# Create the destination schema inside the destination catalog if it is not
# there yet. The call stays the last expression so its result is displayed.
create_schema_stmt = (
    f"CREATE SCHEMA if NOT EXISTS {dest_catalog_name}.{dest_database_name}"
)
spark.sql(create_schema_stmt)

In [0]:
# Look up the largest ingestion_timestamp stored in the source table.
# The un-grouped aggregate comes back as a single one-column row, which is
# destructured straight into filter_max.
source_table_fqn = (
    f"{source_catalog_name}.{source_database_name}.{source_table_name}"
)
[(filter_max,)] = (
    spark.table(source_table_fqn)
    .select(f.max("ingestion_timestamp"))
    .collect()
)
print(f" > filter_max: {filter_max}")


In [0]:
# Keep only the rows whose ingestion_timestamp equals filter_max (the largest
# value found in the source table by the previous cell).
#
# The predicate is built from Column expressions instead of formatting
# filter_max into a SQL string: an interpolated datetime or text value ends up
# unquoted (e.g. `ingestion_timestamp = 2024-01-01 00:00:00`), which Spark
# cannot parse, while f.lit() yields a literal of the matching type.
if filter_max is None:
    # Without a timestamp there is no batch to select; fail here with a clear
    # message rather than let an empty DataFrame overwrite the destination.
    raise ValueError(
        "No ingestion_timestamp found in "
        f"{source_catalog_name}.{source_database_name}.{source_table_name}; "
        "nothing to process."
    )

df = (
    spark
    .table(f"{source_catalog_name}.{source_database_name}.{source_table_name}")
    .filter(f.col("ingestion_timestamp") == f.lit(filter_max))
)

In [0]:
def _matching_or_default(column_name, pattern, default):
    """Return the column's value when it matches `pattern`, otherwise `default`.

    `rlike` evaluates to NULL for a NULL input, so NULL values take the
    `otherwise` branch and receive the default as well.
    """
    value = f.col(column_name)
    return f.when(value.rlike(pattern), value).otherwise(f.lit(default))


def _or_default(column_name, default):
    """Return the column's value with NULL replaced by `default`."""
    return f.coalesce(f.col(column_name), f.lit(default))


def _as_decimal(column_name):
    """Return the column cast to decimal(10,2)."""
    return f.col(column_name).cast("decimal(10,2)")


# Normalise types and fill gaps, replacing each column in place:
#   * Year_of_Release: digits only, otherwise 1900; cast to int.
#   * *_Sales: cast to decimal(10,2).
#   * Critic_Score / Critic_Count / User_Count: NULL -> 0; cast to int.
#   * User_Score: plain non-negative number, otherwise 0; cast to decimal(10,2).
#   * Developer / Rating / Name / Genre: NULL -> "N/A".
df = (
    df
    .withColumn(
        "Year_of_Release",
        _matching_or_default("Year_of_Release", "^[0-9]+$", "1900").cast("int"),
    )
    .withColumn("NA_Sales", _as_decimal("NA_Sales"))
    .withColumn("EU_Sales", _as_decimal("EU_Sales"))
    .withColumn("JP_Sales", _as_decimal("JP_Sales"))
    .withColumn("Global_Sales", _as_decimal("Global_Sales"))
    .withColumn("Critic_Score", _or_default("Critic_Score", 0).cast("int"))
    .withColumn("Critic_Count", _or_default("Critic_Count", 0).cast("int"))
    .withColumn("Other_Sales", _as_decimal("Other_Sales"))
    .withColumn(
        "User_Score",
        # Positive match instead of `isNull | (rlike == False)`: NULL and
        # non-numeric values both fall through to the "0" default.
        _matching_or_default("User_Score", "^[0-9]+(\\.[0-9]+)?$", "0")
        .cast("decimal(10,2)"),
    )
    .withColumn("User_Count", _or_default("User_Count", 0).cast("int"))
    .withColumn("Developer", _or_default("Developer", "N/A"))
    .withColumn("Rating", _or_default("Rating", "N/A"))
    .withColumn("Name", _or_default("Name", "N/A"))
    .withColumn("Genre", _or_default("Genre", "N/A"))
)

In [0]:
# Overwrite the destination table with the refined rows, stored as Delta.
# Format and mode are handed to saveAsTable as keyword arguments, which
# configures the writer the same way as chaining .format() and .mode().
dest_table_fqn = f"{dest_catalog_name}.{dest_database_name}.{dest_table_name}"
df.write.saveAsTable(dest_table_fqn, format="delta", mode="overwrite")


In [0]:
# Render up to 10 rows of the destination table as a quick visual check.
preview_rows = spark.table(
    f"{dest_catalog_name}.{dest_database_name}.{dest_table_name}"
).limit(10)
display(preview_rows)