In [1]:
from pyspark.sql import SparkSession

# Reuse the session that is already running, or start a new one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

25/11/15 18:35:23 WARN Utils: Your hostname, user-HP-Pavilion-x360-Convertible-14-dh0xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.24 instead (on interface wlo1)
25/11/15 18:35:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/15 18:35:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

1. Read Bronze table

The columns in the bronze table are as follows:    
**id**, **full_name**, **email**, **age**, **raw_ts**

#### Step 1: Handle NULL values

#### Let's consider a dataframe and practise .na.fill()

In [3]:
# Small practice frame: each column contains at least one missing value.
sample_rows = [
    (10, 80.5, "Alice", None),
    (5, None, "Bob", None),
    (None, None, "Tom", None),
    (None, None, None, True),
]
sample_columns = ["age", "height", "name", "bool"]

df = spark.createDataFrame(sample_rows, schema=sample_columns)
df.show()

                                                                                

+----+------+-----+----+
| age|height| name|bool|
+----+------+-----+----+
|  10|  80.5|Alice|NULL|
|   5|  NULL|  Bob|NULL|
|NULL|  NULL|  Tom|NULL|
|NULL|  NULL| NULL|true|
+----+------+-----+----+



In [4]:
# Replacement value for nulls, keyed by column name.
null_defaults = {"age": 0, "height": 0, "name": "unknown", "bool": False}

df2 = df.na.fill(null_defaults)
df2.show()

+---+------+-------+-----+
|age|height|   name| bool|
+---+------+-------+-----+
| 10|  80.5|  Alice|false|
|  5|   0.0|    Bob|false|
|  0|   0.0|    Tom|false|
|  0|   0.0|unknown| true|
+---+------+-------+-----+



#### Drop rows with critical nulls

In [5]:
# Show only the rows that have a non-null age.
has_age = F.col("age").isNotNull()
df.where(has_age).show()

+---+------+-----+----+
|age|height| name|bool|
+---+------+-----+----+
| 10|  80.5|Alice|NULL|
|  5|  NULL|  Bob|NULL|
+---+------+-----+----+



#### Step 2: Clean unwanted characters

In [None]:
# NOTE(review): `silver_df` is not defined by any cell visible in this notebook;
# presumably it is the bronze table loaded into a DataFrame. Confirm the read step exists.
#
# Remove every character that is neither a letter nor whitespace from "full_name".
# \p{L} matches letters in any script, so accented names such as "José" stay intact
# instead of being cut down to "Jos", which the ASCII-only class [a-zA-Z] did.
# Digits and punctuation (including hyphens and apostrophes) are still removed.
silver_df = silver_df.withColumn(
    "full_name_clean",
    F.regexp_replace("full_name", r"[^\p{L}\s]", "")  # keep only letters + spaces
)

#### Step 3: Split the "full_name_clean" into two columns

In [None]:
# First create "name_split" by splitting "full_name_clean" on runs of whitespace.
# Leading/trailing whitespace is stripped before the split: otherwise a value such as
# " John Smith" splits into ["", "John", "Smith"] and the empty string becomes first_name.
silver_df = silver_df.withColumn(
    "name_split",
    F.split(
        F.regexp_replace(F.col("full_name_clean"), r"^\s+|\s+$", ""),
        r"\s+",
    ),
)

# Then name_split[0] becomes first_name and name_split[1] becomes last_name.
# NOTE(review): index 1 is the second token, so a three-word name yields its middle
# word as last_name. Confirm this is the intended behaviour.
silver_df = silver_df.withColumn("first_name", F.col("name_split").getItem(0))
silver_df = silver_df.withColumn("last_name", F.col("name_split").getItem(1))


#### Step 4: Drop duplicate records

In [None]:
from pyspark.sql.window import Window

# Keep exactly one row per id: the one with the most recent raw_ts.
# row_number() requires an ordered window; Spark raises an AnalysisException when it
# is applied over a bare partitionBy("id"), so the window is ordered by raw_ts descending.
# NOTE(review): rows that tie on raw_ts are ranked arbitrarily. Add a tie-breaker
# column to the ordering if the surviving row must be deterministic.
w = Window.partitionBy("id").orderBy(F.col("raw_ts").desc())

silver_df = (
    silver_df
    .withColumn("rn", F.row_number().over(w))  # 1 = newest record for this id
    .filter(F.col("rn") == 1)
    .drop("rn")
)


#### Step 5: Data Validation

In [None]:
# Shape check for an e-mail address: local part, "@", domain, and a top-level
# domain of at least two letters. Anchored so the whole value must match.
email_pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
email_matches = F.col("email").rlike(email_pattern)

silver_df = silver_df.withColumn("is_email_valid", email_matches)


#### Step 6: Add silver-layer metadata columns

In [None]:
# Stamp every row with the load time and the name of the upstream layer.
silver_df = (
    silver_df
    .withColumn("ingest_ts", F.current_timestamp())
    .withColumn("source", F.lit("bronze_layer_table"))
)