### Clean/Format title_basics table

In [0]:
# Declare the "p_data_source" text widget (default: empty string) and read its
# current value; the value is used further down as the data_source column.
dbutils.widgets.text(name="p_data_source", defaultValue="")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Load bronze delta table

In [0]:
# Read the bronze title_basics table into a DataFrame.
title_basics_raw_df = spark.read.table("imdb_dev.bronze.title_basics")

In [0]:
# Preview the bronze rows before any transformation is applied.
display(title_basics_raw_df)

##### Step 2 - Convert columns to the correct types

In [0]:
from pyspark.sql.functions import col, lit, split, when

In [0]:
# isAdult: compare against 1 so the column becomes a boolean instead of 0/1.
# genres: transform the column into an array via split_to_array, a project helper
# (presumably brought in by the %run of common_functions; its body is not visible here).
title_basics_type_df = (
    title_basics_raw_df
    .withColumn("isAdult", col("isAdult") == lit(1))
    .withColumn("genres", split_to_array("genres"))
)

##### Step 3 - Rename Columns

In [0]:
# Source column name -> snake_case name used from here on.
column_renames = {
    "titleType": "title_type",
    "primaryTitle": "primary_title",
    "originalTitle": "original_title",
    "isAdult": "adult",
    "startYear": "start_year",
    "endYear": "end_year",
    "runtimeMinutes": "runtime_minutes",
}

# Start from the typed DataFrame each time so re-running this cell is repeatable.
title_basics_renamed_df = title_basics_type_df
for old_name, new_name in column_renames.items():
    title_basics_renamed_df = title_basics_renamed_df.withColumnRenamed(old_name, new_name)

# Tag every row with the data source received through the notebook widget.
title_basics_renamed_df = title_basics_renamed_df.withColumn("data_source", lit(v_data_source))

##### Step 4 - Add ingestion date

In [0]:
# add_ingestion_date is a project helper (presumably defined in
# ../includes/common_functions — confirm). Judging by its name it adds an
# ingestion date column, but its body is not visible in this notebook.
title_basics_final_df = add_ingestion_date(title_basics_renamed_df)

In [0]:
# Preview the DataFrame returned by add_ingestion_date.
display(title_basics_final_df)

##### Step 5 - Filter to keep only "movies" with a known start year

In [0]:
# Keep only rows whose title_type is "movie" and whose start_year is non-null.
# NOTE(review): raw IMDb files mark missing values with the literal "\N"; this
# assumes the bronze load already turned those into real nulls — confirm.
is_movie = col("title_type") == "movie"
has_start_year = col("start_year").isNotNull()
df_final = title_basics_final_df.where(is_movie & has_start_year)

In [0]:
# Preview the filtered movie rows that are written out in the load step.
display(df_final)

#### Load

##### Step 6 - Write data to the data lake as a Delta table

In [0]:
# Overwrite the silver movies table with the filtered result. The
# overwriteSchema option lets the overwrite replace the table schema as well.
silver_movies_writer = (
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_movies_writer.saveAsTable("imdb_dev.silver.movies")