# Z-Score to identify and remove outliers

## ● Identifies anomalies: Identify any turbines that have significantly deviated from their expected power output over the same time period. Anomalies can be defined as turbines whose output is outside of 2 standard deviations from the mean.

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from get_data import main
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, mean, stddev, min, max, avg
import pyspark.sql.functions as F

df = main.enrich_data(spark)

def get_anomalies(data: DataFrame) -> DataFrame:
    # https://medium.com/@datasciencejourney100_83560/z-score-to-identify-and-remove-outliers-c17382a4a739
    data = data.groupBy("event_date", "turbine_id").agg(
        avg("power_output").alias("avg_power_output")
    )
    
    overall_avg = data.agg(mean("avg_power_output")).collect()[0][0]
    overall_stddev = data.agg(stddev("avg_power_output")).collect()[0][0]
    
    anomalies = data.withColumn(
        "anomalies_ind",
        F.when(
            (col("avg_power_output") < overall_avg - 2 * overall_stddev) | 
            (col("avg_power_output") > overall_avg + 2 * overall_stddev),
            "Y"
        ).otherwise("N")
    )
    
    return anomalies.filter(col("anomalies_ind")== "Y")

df = get_anomalies(df)
df.write.mode("overwrite").saveAsTable("cleaned_data")
display(df)

NameError: name 'spark' is not defined