In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType, DoubleType


Run the helper notebook `functions.ipynb` so that the functions it defines are available in this notebook.

In [None]:
# Execute the helper notebook so the functions it defines can be called from the cells below
%run 'functions.ipynb'

Extract

In [3]:
# Extract: load the parquet files written by the Bronze stage as the input to cleaning.
# NOTE(review): `spark` is not created in any cell visible here - presumably it is supplied
# by the runtime or by the helper notebook; confirm before running on a fresh kernel.
bronze_path = 'PipelineStages/Bronze/'
df_bronze = spark.read.parquet(bronze_path)



Transform

In [None]:
# Give each Bronze column its proper Spark type so calculations can be run on the values later
silver_column_types = {
    "timestamp": TimestampType(),
    "turbine_id": IntegerType(),
    "wind_speed": DoubleType(),
    "wind_direction": IntegerType(),
    "power_output": DoubleType(),
}

# Apply the casts one column at a time, in the order listed above;
# withColumn replaces the existing column of the same name with its cast version
for column_name, target_type in silver_column_types.items():
    df_bronze = df_bronze.withColumn(column_name, F.col(column_name).cast(target_type))

In [7]:
# Run the typed Bronze data through both helper functions to obtain the cleaned dataframe.
# NOTE(review): the helpers live outside this notebook - presumably the first drops rows
# containing nulls and the second drops outliers; confirm against their definitions.
df_cleaned = extract_outliers(extract_rows_with_null_data(df_bronze))

# Columns that together identify a record when comparing the original data with the cleaned data
record_key_columns = ["timestamp", "turbine_id", "wind_speed", "wind_direction", "power_output"]

# Anti-join: keep only the original rows that have no match in the cleaned dataframe,
# i.e. the records that the cleaning steps removed
df_invalid_data = df_bronze.join(df_cleaned, on=record_key_columns, how="left_anti")

Load

In [8]:
# Persist the records that were filtered out, in case we want to analyse them later.
# 'overwrite' mode replaces whatever a previous run wrote to this folder.
df_invalid_data.write.mode('overwrite').parquet('PipelineStages/Silver/Invalid')



In [None]:
# Write the cleaned dataframe to its own folder so that downstream stages can ingest it later.
# 'overwrite' mode replaces whatever a previous run wrote to this folder.
df_cleaned.write.mode('overwrite').parquet('PipelineStages/Silver/Valid')

In [None]:
# Last cell: reaching this line means every cell above ran without raising
silver_status_message = "Silver completed successfully"
print(silver_status_message)