In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType
from pyspark.sql import DataFrame
from pyspark.sql.window import Window

Silver Functions

In [21]:
# Remove any entries where data is missing in any of the columns
def extract_rows_with_null_data(df_input: DataFrame) -> DataFrame:
    """
        Return only the rows that have a value in every required column.

        A row is kept when timestamp, turbine_id, wind_speed, wind_direction
        and power_output are all non-null. Note that, despite the function
        name, rows containing nulls are dropped rather than returned.
    """

    required_columns = (
        'timestamp',
        "turbine_id",
        "wind_speed",
        "wind_direction",
        "power_output",
    )

    # AND together one "is not null" check per required column
    all_present = F.col(required_columns[0]).isNotNull()
    for column_name in required_columns[1:]:
        all_present = all_present & F.col(column_name).isNotNull()

    return df_input.filter(all_present)

In [22]:
# Extract any outliers which fall outside of some set ranges
def extract_outliers(df_input: DataFrame) -> DataFrame:
    """
    Keep only the rows whose readings all sit inside their accepted ranges.

    The ranges are inclusive at both ends:
        wind_speed      0.0 to 18.0
        wind_direction  0 to 359
        power_output    1.0 to 9.0

    Rows failing any one of the checks are dropped from the result.
    """

    # One inclusive range check per column
    wind_speed_ok = F.col('wind_speed').between(0.0, 18.0)
    wind_direction_ok = F.col('wind_direction').between(0, 359)
    power_output_ok = F.col('power_output').between(1.0, 9.0)

    return df_input.filter(wind_speed_ok & wind_direction_ok & power_output_ok)

Gold Functions

In [None]:
def generate_summary_statistics_gold_df(input_df: DataFrame) -> DataFrame:
    """
        Build per-turbine, per-day power output statistics.

        A 'date' column is derived by casting 'timestamp' to a date, then the
        rows are grouped by 'turbine_id' and 'date'. The result has one row per
        group with min_power_output, max_power_output and avg_power_output.
    """

    reading_date = F.col("timestamp").cast(DateType())

    # Aggregations computed for every (turbine_id, date) group
    power_stats = [
        F.min("power_output").alias("min_power_output"),
        F.max("power_output").alias("max_power_output"),
        F.avg("power_output").alias("avg_power_output"),
    ]

    return (
        input_df
        .withColumn("date", reading_date)
        .groupBy("turbine_id", "date")
        .agg(*power_stats)
    )

In [None]:
def generate_anomalies_gold_df(input_df: DataFrame) -> DataFrame:
    """
    Flag power output readings that are unusual for their turbine and day.

    For every (turbine_id, date) group the mean and standard deviation of
    'power_output' are computed (using F.stddev). A reading is reported as an
    anomaly when it is strictly below mean - 2 * stddev or strictly above
    mean + 2 * stddev.

    The result contains only the anomalous rows, with the columns timestamp,
    turbine_id, power_output, mean_power_output, stddev_power_output,
    lower_bound and upper_bound.
    """

    # Statistics are computed over all readings of one turbine on one day
    per_turbine_day = Window.partitionBy("turbine_id", "date")
    dated_df = input_df.withColumn("date", F.col("timestamp").cast(DateType()))

    with_stats_df = (
        dated_df
        .withColumn("mean_power_output", F.avg("power_output").over(per_turbine_day))
        .withColumn("stddev_power_output", F.stddev("power_output").over(per_turbine_day))
    )

    # Normal band is mean ± 2 * stddev
    with_bounds_df = (
        with_stats_df
        .withColumn("lower_bound", F.col("mean_power_output") - 2 * F.col("stddev_power_output"))
        .withColumn("upper_bound", F.col("mean_power_output") + 2 * F.col("stddev_power_output"))
    )

    # A reading is anomalous when it falls on either side of the band
    too_low = F.col("power_output") < F.col("lower_bound")
    too_high = F.col("power_output") > F.col("upper_bound")

    output_columns = [
        "timestamp",
        "turbine_id",
        "power_output",
        "mean_power_output",
        "stddev_power_output",
        "lower_bound",
        "upper_bound",
    ]

    return with_bounds_df.filter(too_low | too_high).select(*output_columns)