In [46]:
#default and various
import pandas as pd
import numpy as np
import plotly.express as px
import os
import math

#datetime
from datetime import datetime
from datetime import timedelta

#ml-sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [47]:
# Data directories, relative to the notebook's working directory.
g_dir: str = "g_drive_data"        # holds scada_data.csv and pc.csv
ip_dir: str = "ip_forecast_data"   # holds the weather_forecast_*.csv files
main_dir: str = "main_data"        # target directory for the saved result

# Turbine measurements and the power-curve lookup table.
df_scada = pd.read_csv(os.path.join(g_dir, "scada_data.csv"))
df_pc = pd.read_csv(os.path.join(g_dir, "pc.csv"))

# One forecast frame per file, read in the order 6, 12, 3.
df_fc_6, df_fc_12, df_fc_3 = (
    pd.read_csv(os.path.join(ip_dir, file_name))
    for file_name in (
        "weather_forecast_6.csv",
        "weather_forecast_12.csv",
        "weather_forecast_3.csv",
    )
)

## 2 Calculate predicted power output and deviation
### 2.1 Join the scada data and transformed weather forecast data

In [48]:
# Collect the three forecast frames so the following cells can apply the same
# step to each of them in a loop.
df_fc_list : list = [df_fc_6, df_fc_12, df_fc_3] #used for later iterations

In [49]:
# Sanity check: print the summary statistics of every forecast frame so that
# errors and impossible values (counts, min/max) can be spotted by eye.
for summary in map(pd.DataFrame.describe, df_fc_list):
    print(summary)

           temp_6.00  wind_speed_6.00  wind_direction_6.00
count  118297.000000    118297.000000        118297.000000
mean       24.259810         7.646981           138.637419
std         3.190744         2.710094            30.535595
min        16.200000         0.000000             0.000000
25%        21.800000         5.788565           128.333333
50%        24.027778         7.823814           135.666667
75%        26.700000         9.562440           142.000000
max        34.600000        15.647629           359.500000
          temp_12.00  wind_speed_12.00  wind_direction_12.00
count  118297.000000     118297.000000         118297.000000
mean       24.356345          7.858555            138.156933
std         3.298455          2.952758             26.227646
min        15.700000          0.000000              0.000000
25%        21.788889          5.813529            129.000000
50%        24.111111          8.041143            136.000000
75%        26.933333         10.030870    

In [50]:
# Build a timezone-naive "time" column in every frame; it is the join key used
# by the following cells.

# SCADA: derived from the "utc_time" column.
scada_time = pd.to_datetime(df_scada["utc_time"])
df_scada["time"] = scada_time.dt.tz_localize(None)

# Forecasts: the existing "time" column is converted in place.
for df_fc in df_fc_list:
    forecast_time = pd.to_datetime(df_fc["time"])
    df_fc["time"] = forecast_time.dt.tz_localize(None)
    print(df_fc[["time"]].dtypes)

time    datetime64[ns]
dtype: object
time    datetime64[ns]
dtype: object
time    datetime64[ns]
dtype: object


In [51]:
# Move "time" into the index of every frame (forecasts first, then SCADA) so
# they can be joined on the index, and print the first three index values of
# each as a quick check.
for frame in (*df_fc_list, df_scada):
    frame.set_index("time", inplace=True)
    print(frame.head(3).index)

DatetimeIndex(['2020-01-01 09:00:00', '2020-01-01 09:10:00',
               '2020-01-01 09:20:00'],
              dtype='datetime64[ns]', name='time', freq=None)
DatetimeIndex(['2020-01-01 15:00:00', '2020-01-01 15:10:00',
               '2020-01-01 15:20:00'],
              dtype='datetime64[ns]', name='time', freq=None)
DatetimeIndex(['2020-01-01 06:00:00', '2020-01-01 06:10:00',
               '2020-01-01 06:20:00'],
              dtype='datetime64[ns]', name='time', freq=None)
DatetimeIndex(['2020-01-01 06:00:00', '2020-01-01 06:00:00',
               '2020-01-01 06:00:00'],
              dtype='datetime64[ns]', name='time', freq=None)


In [52]:
# Remove the helper list now that the loops over it are done.
# Note: this only drops the list object itself - the forecast frames remain
# alive through df_fc_6, df_fc_12 and df_fc_3, which are still needed for the join.
del df_fc_list

In [53]:
# Master frame: SCADA rows inner-joined (on the "time" index) with the 6, 12
# and 3 forecast frames, in that order. Only timestamps present in all four
# frames survive the inner joins.
df_main_ip = (
    df_scada
    .join(other=df_fc_6, how="inner")
    .join(other=df_fc_12, how="inner")
    .join(other=df_fc_3, how="inner")
)

# The individual forecast frames are not needed any more.
del df_fc_6, df_fc_12, df_fc_3

In [54]:
# Preview the first rows of the joined master frame.
df_main_ip.head()

Unnamed: 0_level_0,wt_id,utc_time,wind_speed_ms,power_kw,wind_direction,nacelle_direction,blade_angle_avg,rotor_speed,temp_environment,error_flag,...,wind_speed_6.00,wind_direction_6.00,init_12,temp_12.00,wind_speed_12.00,wind_direction_12.00,init_3,temp_3.00,wind_speed_3.00,wind_direction_3.00
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 15:00:00,26,2020-01-01 15:00:00+00:00,4.001,82.300167,,12.672984,1.367166,0.554133,23.039833,,...,2.960362,325.0,2020-01-01 03:00:00,26.4,4.916316,327.0,2020-01-01 12:00:00,25.55,2.458158,310.5
2020-01-01 15:00:00,29,2020-01-01 15:00:00+00:00,4.5435,124.5675,,356.938166,0.711501,0.553795,22.7335,,...,2.960362,325.0,2020-01-01 03:00:00,26.4,4.916316,327.0,2020-01-01 12:00:00,25.55,2.458158,310.5
2020-01-01 15:00:00,34,2020-01-01 15:00:00+00:00,3.9195,-20.268333,,25.268175,7.408832,0.580293,22.784333,,...,2.960362,325.0,2020-01-01 03:00:00,26.4,4.916316,327.0,2020-01-01 12:00:00,25.55,2.458158,310.5
2020-01-01 15:00:00,32,2020-01-01 15:00:00+00:00,2.844167,-9.254,,224.617608,34.342,0.06973,22.502167,,...,2.960362,325.0,2020-01-01 03:00:00,26.4,4.916316,327.0,2020-01-01 12:00:00,25.55,2.458158,310.5
2020-01-01 15:00:00,33,2020-01-01 15:00:00+00:00,2.846333,-8.911167,,209.598734,34.849307,0.065901,22.956,,...,2.960362,325.0,2020-01-01 03:00:00,26.4,4.916316,327.0,2020-01-01 12:00:00,25.55,2.458158,310.5


### 2.2 Apply the power curve table with the forecast data to get the predicted power output(y_hat_fc)
The air density has to be approximated from the temperature. The data points were taken from the source below.

 - Get temperature/density table data from the source
 - Create a linear function for the approximation of the density
 - Calculate the forecasted density in df_main_ip
 - Calculate the corresponding power output

Table / Data source for approximation: https://www.engineeringtoolbox.com/air-density-specific-weight-d_600.html


In [55]:
# Linear approximation of the air density from the temperature, fitted through
# the two table points a = (15, 1.2250) and b = (40, 1.1270):
#     f(x) = -0.00392 * x + 1.2838
DENSITY_SLOPE = -0.00392
DENSITY_INTERCEPT = 1.2838

# The density is snapped to multiples of 1/40 = 0.025 and the wind speed to one
# decimal so that both can be used as exact lookup keys into the power-curve table.
DENSITY_STEPS_PER_UNIT = 40
WIND_SPEED_DECIMALS = 1


def temp_to_dense(x):
    """Approximate the air density for temperature ``x``, rounded to 0.025.

    Args:
        x: Temperature as a scalar number (presumably degrees Celsius - the
           fit points are 15 and 40; confirm against the data source).

    Returns:
        The density as a float that is a multiple of 0.025, or NaN when ``x``
        is missing (a bare ``round(nan)`` would raise ``ValueError``).
    """
    if pd.isna(x):
        return float("nan")
    density = x * DENSITY_SLOPE + DENSITY_INTERCEPT
    return round(density * DENSITY_STEPS_PER_UNIT) / DENSITY_STEPS_PER_UNIT


def round_wind_speed(x):
    """Round wind speed ``x`` to one decimal for the power-curve lookup."""
    return round(x, WIND_SPEED_DECIMALS)

In [56]:
# Derive one density column per forecast from its temperature column
# (created in the order 6.00, 12.00, 3.00).
for horizon in ("6.00", "12.00", "3.00"):
    forecast_temp = df_main_ip[f"temp_{horizon}"]
    df_main_ip[f"density_{horizon}"] = forecast_temp.apply(temp_to_dense)

In [57]:
# Overwrite each forecast wind speed with its value rounded to one decimal so
# it can serve as a lookup key for the power output.
for horizon in ("6.00", "12.00", "3.00"):
    speed_col = f"wind_speed_{horizon}"
    df_main_ip[speed_col] = df_main_ip[speed_col].apply(round_wind_speed)


In [58]:
# One copy of the power-curve table per forecast, with every column name
# suffixed so that it lines up with the matching forecast columns.
df_pc_6, df_pc_12, df_pc_3 = (
    df_pc.add_suffix(suffix) for suffix in ("_6.00", "_12.00", "_3.00")
)

In [59]:
# Look up the power-curve row for every forecast: left-merge the suffixed
# table on (rounded wind speed, rounded density). Rows without a matching
# table entry keep NaN in the columns coming from the table.
for pc_table, horizon in ((df_pc_6, "6.00"), (df_pc_12, "12.00"), (df_pc_3, "3.00")):
    lookup_keys = [f"wind_speed_{horizon}", f"density_{horizon}"]
    df_main_ip = df_main_ip.merge(
        pc_table,
        how="left",
        left_on=lookup_keys,
        right_on=lookup_keys,
    )

In [60]:
# Preview the frame after the power-curve lookups.
df_main_ip.head()

Unnamed: 0,wt_id,utc_time,wind_speed_ms,power_kw,wind_direction,nacelle_direction,blade_angle_avg,rotor_speed,temp_environment,error_flag,...,init_3,temp_3.00,wind_speed_3.00,wind_direction_3.00,density_6.00,density_12.00,density_3.00,power_6.00,power_12.00,power_3.00
0,26,2020-01-01 15:00:00+00:00,4.001,82.300167,,12.672984,1.367166,0.554133,23.039833,,...,2020-01-01 12:00:00,25.55,2.5,310.5,1.175,1.175,1.175,40.0,284.3,20.0
1,29,2020-01-01 15:00:00+00:00,4.5435,124.5675,,356.938166,0.711501,0.553795,22.7335,,...,2020-01-01 12:00:00,25.55,2.5,310.5,1.175,1.175,1.175,40.0,284.3,20.0
2,34,2020-01-01 15:00:00+00:00,3.9195,-20.268333,,25.268175,7.408832,0.580293,22.784333,,...,2020-01-01 12:00:00,25.55,2.5,310.5,1.175,1.175,1.175,40.0,284.3,20.0
3,32,2020-01-01 15:00:00+00:00,2.844167,-9.254,,224.617608,34.342,0.06973,22.502167,,...,2020-01-01 12:00:00,25.55,2.5,310.5,1.175,1.175,1.175,40.0,284.3,20.0
4,33,2020-01-01 15:00:00+00:00,2.846333,-8.911167,,209.598734,34.849307,0.065901,22.956,,...,2020-01-01 12:00:00,25.55,2.5,310.5,1.175,1.175,1.175,40.0,284.3,20.0


In [61]:
# First look at the forecast data: measured vs. forecast wind speed for
# turbine 30 (rows 10000-10999 of that turbine).
#
# The x axis must come from the same subset that is plotted. The merges
# replaced the time index with a plain RangeIndex, so the index of the
# unfiltered frame is just a run of unrelated row numbers. Plot against the
# subset's own timestamp column instead.
wt30_window = (
    df_main_ip.loc[df_main_ip["wt_id"] == 30]
    .iloc[10000:11000]
    .assign(utc_time=lambda frame: pd.to_datetime(frame["utc_time"]))
)

fig = px.line(
    data_frame = wt30_window,
    x = "utc_time",
    y = ["wind_speed_ms","wind_speed_3.00","wind_speed_6.00", "wind_speed_12.00"],
    title = "wind speeds"
)

fig.show()

In [62]:
# Measured vs. forecast power output for turbine 30 (rows 10000-10999 of that
# turbine).
#
# The x axis must come from the same subset that is plotted. The frame carries
# a plain RangeIndex at this point, so the index of the unfiltered frame is
# just a run of unrelated row numbers. Plot against the subset's own timestamp
# column instead.
wt30_window = (
    df_main_ip.loc[df_main_ip["wt_id"] == 30]
    .iloc[10000:11000]
    .assign(utc_time=lambda frame: pd.to_datetime(frame["utc_time"]))
)

fig = px.line(
    data_frame = wt30_window,
    x = "utc_time",
    y = ["power_kw","power_3.00","power_6.00", "power_12.00"],
    title = "Power output"
)

fig.show()

### 2.3 Calculate the theoretical output from scada wind speed and power curve (y_hat_th)

In [63]:
# Rows without a measured temperature or wind speed cannot be looked up in the
# power curve, so discard them.
required_measurements = ["temp_environment", "wind_speed_ms"]
df_main_ip = df_main_ip.dropna(subset=required_measurements)
df_main_ip.shape

(1007801, 28)

In [64]:
# Density at measurement time, approximated from the measured temperature.
measured_temp = df_main_ip["temp_environment"]
df_main_ip["density"] = measured_temp.apply(temp_to_dense)

# Overwrite the measured wind speed with its one-decimal rounding so it can be
# used as a lookup key into the power-curve table.
measured_speed = df_main_ip["wind_speed_ms"]
df_main_ip["wind_speed_ms"] = measured_speed.apply(round_wind_speed)

In [65]:
# Power-curve table with "_theoretical" appended to every column name.
df_pc_theoretical = df_pc.add_suffix("_theoretical")

# Left-merge on (measured wind speed, measured density); because the key names
# differ between the two sides, the table's key columns are kept in the result
# next to its remaining columns.
measured_keys = ["wind_speed_ms", "density"]
table_keys = ["wind_speed_theoretical", "density_theoretical"]
df_main_ip = df_main_ip.merge(
    df_pc_theoretical,
    how="left",
    left_on=measured_keys,
    right_on=table_keys,
)

In [66]:
# Compare the measured power with the power-curve value for the first rows.
df_main_ip[["density","wind_speed_ms","power_kw","power_theoretical"]].head()

Unnamed: 0,density,wind_speed_ms,power_kw,power_theoretical
0,1.2,4.0,82.300167,139.0
1,1.2,4.5,124.5675,224.5
2,1.2,3.9,-20.268333,129.1
3,1.2,2.8,-9.254,32.0
4,1.2,2.8,-8.911167,32.0


### 2.4 Calculate the deviation and mean deviation of the power-curve predicted output from the SCADA power output (y)

In [67]:
def zero_max(x):
    """Return 0 for a negative ``x`` and ``x`` itself otherwise (NaN stays NaN)."""
    if x < 0:
        return 0
    return x


# Set negative power_kw values to zero. Series.clip does the same as applying
# zero_max element by element (NaN is left untouched) but runs vectorized,
# which matters on a frame of this size. zero_max stays defined as the scalar
# form of the same rule.
df_main_ip["power_kw"] = df_main_ip["power_kw"].clip(lower=0)

In [68]:
# Deviation = predicted power minus measured power, for the theoretical curve
# and each forecast. A positive value means the prediction was too high.
measured_power = df_main_ip["power_kw"]
for suffix in ("_theoretical", "_3.00", "_6.00", "_12.00"):
    predicted_power = df_main_ip["power" + suffix]
    df_main_ip["deviation" + suffix] = predicted_power - measured_power

df_main_ip.head()

Unnamed: 0,wt_id,utc_time,wind_speed_ms,power_kw,wind_direction,nacelle_direction,blade_angle_avg,rotor_speed,temp_environment,error_flag,...,power_12.00,power_3.00,density,wind_speed_theoretical,power_theoretical,density_theoretical,deviation_theoretical,deviation_3.00,deviation_6.00,deviation_12.00
0,26,2020-01-01 15:00:00+00:00,4.0,82.300167,,12.672984,1.367166,0.554133,23.039833,,...,284.3,20.0,1.2,4.0,139.0,1.2,56.699833,-62.300167,-42.300167,201.999833
1,29,2020-01-01 15:00:00+00:00,4.5,124.5675,,356.938166,0.711501,0.553795,22.7335,,...,284.3,20.0,1.2,4.5,224.5,1.2,99.9325,-104.5675,-84.5675,159.7325
2,34,2020-01-01 15:00:00+00:00,3.9,0.0,,25.268175,7.408832,0.580293,22.784333,,...,284.3,20.0,1.2,3.9,129.1,1.2,129.1,20.0,40.0,284.3
3,32,2020-01-01 15:00:00+00:00,2.8,0.0,,224.617608,34.342,0.06973,22.502167,,...,284.3,20.0,1.2,2.8,32.0,1.2,32.0,20.0,40.0,284.3
4,33,2020-01-01 15:00:00+00:00,2.8,0.0,,209.598734,34.849307,0.065901,22.956,,...,284.3,20.0,1.2,2.8,32.0,1.2,32.0,20.0,40.0,284.3


In [69]:
# Mean (signed) deviation per turbine, shown sorted by the 3.00 forecast column.
deviation_cols = ["deviation_theoretical", "deviation_3.00", "deviation_6.00", "deviation_12.00"]
df_ip_mean_dev = df_main_ip.groupby("wt_id")[deviation_cols].mean()
df_ip_mean_dev.head(20).sort_values(by="deviation_3.00")

Unnamed: 0_level_0,deviation_theoretical,deviation_3.00,deviation_6.00,deviation_12.00
wt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
28,70.029808,-429.017303,-64.47203,-16.150338
34,83.415683,-415.974119,-50.100325,-1.258397
29,70.311261,-384.28851,-20.85075,27.53244
35,83.503754,-376.411609,-12.032874,36.399292
27,69.3677,-334.615413,30.018487,78.008703
26,73.082621,-320.347127,44.972561,93.212648
30,99.20264,-215.775562,149.518351,198.120897
31,114.445201,-173.83763,190.787486,239.235384
33,120.877565,-173.608047,190.587588,238.807232
32,131.30378,-171.645162,193.499276,242.984081


In [70]:
# Grouped bars: one group per turbine id, one bar per deviation column.
plotted_deviations = ["deviation_theoretical", "deviation_3.00", "deviation_6.00", "deviation_12.00"]
fig = px.bar(
    df_ip_mean_dev,
    x=df_ip_mean_dev.index,
    y=plotted_deviations,
    barmode="group",
)
fig.show()

In [71]:
# Overall mean (signed) deviation across all turbines and rows.
total_deviation_cols = ["deviation_theoretical", "deviation_3.00", "deviation_6.00", "deviation_12.00"]
df_ip_mean_dev_total = df_main_ip.loc[:, total_deviation_cols].mean()
df_ip_mean_dev_total

deviation_theoretical     91.525692
deviation_3.00          -299.611815
deviation_6.00            65.131975
deviation_12.00          113.627500
dtype: float64

The turbines differ heavily in the deviation between the predicted and the actual value.
They do not all deviate in the same way, meaning there is a bias for each turbine.

Furthermore, the deviation for the 3-hour forecast is the most off, and the 6-hour forecast seems to be the most accurate.

### 2.5 Comparing the deviation of the raw vs interpolated values

In [72]:
# Non-interpolated subset: keep only the rows whose "init_6" timestamp occurs
# in the "init" column of forecasts_temp.csv (both compared timezone-naive).
# NOTE(review): only init_6 is used for the filter, although the 3.00 and 12.00
# deviations are compared on the same subset below - confirm this is intended.
relevant_timestamps = pd.read_csv(os.path.join(g_dir, "forecasts_temp.csv"))
data_points = pd.to_datetime(relevant_timestamps["init"]).dt.tz_localize(None)

init_6_naive = pd.to_datetime(df_main_ip["init_6"]).dt.tz_localize(None)
is_raw_row = init_6_naive.isin(data_points)
df_main_raw = df_main_ip[is_raw_row]

# Mean deviation per turbine on that subset, shown sorted by the 6.00 column.
raw_deviation_cols = ["deviation_theoretical", "deviation_3.00", "deviation_6.00", "deviation_12.00"]
df_raw_mean_dev = df_main_raw.groupby("wt_id")[raw_deviation_cols].mean()
df_raw_mean_dev.head(20).sort_values(by="deviation_6.00")

Unnamed: 0_level_0,deviation_theoretical,deviation_3.00,deviation_6.00,deviation_12.00
wt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
28,72.633179,-446.179365,-91.392369,-37.833676
34,85.657163,-437.469711,-81.982916,-27.228098
29,75.907288,-399.045915,-45.169668,8.91189
35,84.012229,-396.636175,-42.927807,11.587335
27,72.335472,-347.184925,8.183737,61.983135
26,72.863136,-334.099793,21.640603,75.486129
30,105.184056,-246.174526,109.728205,163.847106
31,118.85229,-203.505127,150.347747,204.101509
33,127.057757,-202.417419,151.518457,205.039925
32,139.745905,-197.532687,156.495194,211.743416


In [73]:
# Relative difference between the raw and the interpolated mean deviation,
# |raw / ip - 1|, rounded to two decimals. Values blow up where the
# interpolated mean deviation is close to zero.
mean_dev_ratio = df_raw_mean_dev / df_ip_mean_dev
(mean_dev_ratio - 1).abs().round(2)

Unnamed: 0_level_0,deviation_theoretical,deviation_3.00,deviation_6.00,deviation_12.00
wt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26,0.0,0.04,0.52,0.19
27,0.04,0.04,0.73,0.21
28,0.04,0.04,0.42,1.34
29,0.08,0.04,1.17,0.68
30,0.06,0.14,0.27,0.17
31,0.04,0.17,0.21,0.15
32,0.06,0.15,0.19,0.13
33,0.05,0.17,0.2,0.14
34,0.03,0.05,0.64,20.64
35,0.01,0.05,2.57,0.68


In [74]:
# Absolute difference between the raw and the interpolated mean deviation,
# |raw - ip|, rounded to two decimals.
mean_dev_gap = df_raw_mean_dev - df_ip_mean_dev
mean_dev_gap.abs().round(2)

Unnamed: 0_level_0,deviation_theoretical,deviation_3.00,deviation_6.00,deviation_12.00
wt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
26,0.22,13.75,23.33,17.73
27,2.97,12.57,21.83,16.03
28,2.6,17.16,26.92,21.68
29,5.6,14.76,24.32,18.62
30,5.98,30.4,39.79,34.27
31,4.41,29.67,40.44,35.13
32,8.44,25.89,37.0,31.24
33,6.18,28.81,39.07,33.77
34,2.24,21.5,31.88,25.97
35,0.51,20.22,30.89,24.81


In [75]:
# Save the pre-processed data frame.
# Create the output directory first so the write does not fail when it is
# missing; exist_ok makes this a no-op when the directory is already there.
os.makedirs(main_dir, exist_ok=True)

# index=False: the RangeIndex carries no information, so it is not written.
df_main_ip.to_csv(os.path.join(main_dir, "df_main_interpolated.csv"), index=False)
#df_main_raw.to_csv(os.path.join(main_dir, "df_main_rawdata.csv"), index=False)