In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
# Put the parent directory on the import path (presumably where the `src`
# package imported below lives) unless it is already listed.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


In [84]:
from src.utils import (
    load_data,
    remove_duplicates_in_coloumn,
    convert_from_degree_to_ciruclar,
    resample_hourly,
    create_time_features,
    find_repeated_indices,
    load_val_dates
)

import pandas as pd

In [92]:
# Load everything the notebook works on. The cells below index the result as
# data[location][frame_name], with frame names "y", "X_train_observed",
# "X_train_estimated" and "X_test_estimated".
data = load_data()

In [93]:
# Remove duplicate rows: every frame is passed through the duplicate-removal
# helper together with the name of its timestamp column.
timestamp_column = {
    "y": "time",
    "X_test_estimated": "date_forecast",
    "X_train_estimated": "date_forecast",
    "X_train_observed": "date_forecast",
}

for location in data.keys():
    frames = data[location]

    for frame_name, column in timestamp_column.items():
        frames[frame_name] = remove_duplicates_in_coloumn(frames[frame_name], column)

    data[location] = frames

In [94]:
# Drop columns: the features listed here are removed from every feature frame.
columns_to_drop = [
    "ceiling_height_agl:m",
    "cloud_base_agl:m",
    "snow_density:kgm3",
    "elevation:m",
    "precip_5min:mm",
    "precip_type_5min:idx",
    "pressure_50m:hPa",
    "snow_drift:idx",
    "wind_speed_u_10m:ms",
    "wind_speed_v_10m:ms",
    "wind_speed_w_1000hPa:ms",
    "date_calc"
]

feature_frame_names = ("X_test_estimated", "X_train_estimated", "X_train_observed")

for location in data.keys():
    frames = data[location]

    for frame_name in feature_frame_names:
        # errors="ignore": a listed column that a frame does not have is
        # skipped instead of raising.
        frames[frame_name] = frames[frame_name].drop(
            columns=columns_to_drop, errors="ignore"
        )

    data[location] = frames

In [95]:
# Sun azimuth feature engineering: run the degree-to-circular helper on the
# "sun_azimuth:d" column of every feature frame.
for location in data.keys():
    frames = data[location]

    frames.update(
        {
            frame_name: convert_from_degree_to_ciruclar(frames[frame_name], "sun_azimuth:d")
            for frame_name in ("X_test_estimated", "X_train_estimated", "X_train_observed")
        }
    )

    data[location] = frames

In [96]:
# Reduce granularity of data to hourly.
# NOTE(review): "sum" is passed for every feature frame; confirm that summing
# is what is wanted for flag-like columns such as "is_day:idx" - TODO confirm.
for location in data.keys():
    frames = data[location]

    for frame_name in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
        frames[frame_name] = resample_hourly(frames[frame_name], func="sum")

    data[location] = frames

In [97]:
# Filters one location's frames down to rows that have both a target value and
# matching training features.
def drop_nan_rows_in_target_and_train(df):
    """Drop target rows without a measurement and keep features/targets in sync.

    Parameters
    ----------
    df : dict
        Frames for one location. Must contain "y" (with "time" and
        "pv_measurement" columns) plus "X_train_observed" and
        "X_train_estimated" (each with a "date_forecast" column). Any other
        entry (e.g. "X_test_estimated") is carried over unchanged.

    Returns
    -------
    dict
        A new dict with the same keys. "X_train_observed" and
        "X_train_estimated" keep only rows whose "date_forecast" matches a
        target row with a non-NaN "pv_measurement"; "y" keeps only such rows
        whose "time" appears in at least one of the two filtered feature
        frames. The dict passed in is not modified.
    """
    # Shallow copy: the caller's dict keeps pointing at the unfiltered frames,
    # so calling this function twice on the same input gives the same result.
    cleaned = dict(df)

    targets = df["y"].dropna(subset=["pv_measurement"])
    valid_dates = targets["time"]

    observed = df["X_train_observed"]
    observed = observed[observed["date_forecast"].isin(valid_dates)]

    estimated = df["X_train_estimated"]
    estimated = estimated[estimated["date_forecast"].isin(valid_dates)]

    # A target row survives when either feature frame has a row for its timestamp.
    has_features = (
        targets["time"].isin(observed["date_forecast"])
        | targets["time"].isin(estimated["date_forecast"])
    )

    cleaned["y"] = targets[has_features]
    cleaned["X_train_observed"] = observed
    cleaned["X_train_estimated"] = estimated

    return cleaned

In [98]:
# Run the NaN filter on every location and store the returned frames back.
for location in data.keys():
    data[location] = drop_nan_rows_in_target_and_train(data[location])

In [99]:
# Create time features: each feature frame goes through the helper together
# with the name of its timestamp column.
for location in data.keys():
    frames = data[location]

    for frame_name in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
        frames[frame_name] = create_time_features(frames[frame_name], "date_forecast")

    data[location] = frames

In [100]:
# Sanity check: per location, print the shapes of the estimated features, the
# observed features and the targets.
for location in ("A", "B", "C"):
    frames = data[location]

    print(f"{location}:")
    print(
        frames["X_train_estimated"].shape,
        frames["X_train_observed"].shape,
        frames["y"].shape,
    )


A:
(4418, 43) (29667, 43) (34085, 2)
B:
(3625, 43) (29218, 43) (32843, 2)
C:
(2954, 43) (23141, 43) (26095, 2)


In [101]:
# Build the combined training and validation sets across all locations.
#
# Produces X_train / y_train and X_validate / y_validate. Both feature frames
# get one integer indicator column per location, in the iteration order of
# `data` (A, B, C for the usual keys), so they share one schema.

# Share of each location's rows (counted from the start of the frame) that goes
# to training; the remainder is held out for validation. With both set to 1,
# every row is used for training and the validation frames come out empty.
percent_observed_train = 1
percent_estimated_train = 1

locations = list(data.keys())


def _prepare_split(X_loc, y_all, location):
    """Align one location's feature rows with its targets and tag both.

    Returns ``(X, y)``: rows are restricted to timestamps present on both
    sides, target rows flagged by ``find_repeated_indices`` are removed, the
    timestamp columns are dropped and a "location" column is added.
    """
    y_loc = y_all[y_all["time"].isin(X_loc["date_forecast"])]

    # NOTE(review): the returned indices are applied after the index has been
    # reset, so they are presumably positional - confirm against
    # find_repeated_indices.
    repeated_indices = find_repeated_indices(y_loc, "pv_measurement", 24)
    y_loc = y_loc.reset_index(drop=True).drop(repeated_indices)
    X_loc = X_loc[X_loc["date_forecast"].isin(y_loc["time"])]

    # drop()/reset_index() return new frames, so the column assignments below
    # never write into a filtered slice.
    X_loc = X_loc.drop(columns="date_forecast").reset_index(drop=True)
    y_loc = y_loc.drop(columns="time").reset_index(drop=True)
    X_loc["location"] = location
    y_loc["location"] = location

    return X_loc, y_loc


def _stack_locations(parts, encode_location):
    """Concatenate per-location frames, optionally one-hot encoding "location"."""
    if not parts:
        return pd.DataFrame()

    # Last location first: this is the row order the exported training set has
    # been written in, and keeping it avoids reshuffling that file.
    stacked = pd.concat(parts[::-1], ignore_index=True)
    if not encode_location:
        return stacked

    # Encode after stacking so every location gets a complete 0/1 column
    # (no NaN for rows of other locations) in a fixed column order.
    one_hot = pd.DataFrame(
        {loc: (stacked["location"] == loc).astype(int) for loc in locations},
        index=stacked.index,
    )
    return pd.concat([stacked.drop(columns="location"), one_hot], axis=1)


X_train_parts, y_train_parts = [], []
X_validate_parts, y_validate_parts = [], []

for location in locations:
    frames = data[location]
    observed = frames["X_train_observed"]
    estimated = frames["X_train_estimated"]

    split_index_obs = int(len(observed) * percent_observed_train)
    split_index_est = int(len(estimated) * percent_estimated_train)

    # Leading rows of both feature frames form the training part ...
    X_train_loc, y_train_loc = _prepare_split(
        pd.concat([observed[:split_index_obs], estimated[:split_index_est]]),
        frames["y"],
        location,
    )
    # ... and whatever is left forms the validation part.
    X_validate_loc, y_validate_loc = _prepare_split(
        pd.concat([observed[split_index_obs:], estimated[split_index_est:]]),
        frames["y"],
        location,
    )

    X_train_parts.append(X_train_loc)
    y_train_parts.append(y_train_loc)
    X_validate_parts.append(X_validate_loc)
    y_validate_parts.append(y_validate_loc)

X_train = _stack_locations(X_train_parts, encode_location=True)
y_train = _stack_locations(y_train_parts, encode_location=False)
X_validate = _stack_locations(X_validate_parts, encode_location=True)
y_validate = _stack_locations(y_validate_parts, encode_location=False)

In [113]:
# Tag each location's test features with its location label, then stack them.
for location in ("A", "B", "C"):
    data[location]["X_test_estimated"]["location"] = location

X_test = pd.concat([data[key]["X_test_estimated"] for key in ("A", "B", "C")])

# Keep only the timestamps returned by load_val_dates(), then remove the
# forecast timestamp column.
has_valid_date = X_test["date_forecast"].isin(load_val_dates())
X_test = X_test[has_valid_date].drop("date_forecast", axis=1)

# Replace the location label by one 0/1 column per location.
X_test = X_test.reset_index().drop(columns="index")
one_hot = pd.get_dummies(X_test["location"]).astype(int)
X_test = pd.merge(
    X_test.drop("location", axis=1), one_hot, left_index=True, right_index=True
)

In [103]:
# Columns that are left out of min-max scaling: the one-hot location columns
# and index-style flags.
# NOTE(review): "_in_shadow:idx" looks like a truncated column name (perhaps
# "is_in_shadow:idx"); if so, that column is currently being scaled. Confirm
# against X_train.columns before changing it.
columns_to_exclude = ["A", "B", "C", "dew_or_rime:idx", "is_day:idx", "_in_shadow:idx"]

columns_to_normalize = [col for col in X_train.columns if col not in columns_to_exclude]

# Min-max statistics are taken from the training set only and reused for the
# validation and test sets, so all three share one scale.
X_min = X_train[columns_to_normalize].min()
X_max = X_train[columns_to_normalize].max()

# A column that is constant in training has max == min. Dividing by that zero
# range would yield NaN (0/0) or inf, which is never repaired in X_test. Using
# a range of 1 for such columns maps them to x - min instead.
X_range = (X_max - X_min).replace(0, 1)

# Apply min-max scaling to the columns to be normalized
X_train[columns_to_normalize] = (X_train[columns_to_normalize] - X_min) / X_range
X_validate[columns_to_normalize] = (X_validate[columns_to_normalize] - X_min) / X_range
X_test[columns_to_normalize] = (X_test[columns_to_normalize] - X_min) / X_range


In [112]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the training target; the fitted scaler stays available as
# `y_scaler`.
y_scaler = MinMaxScaler()
# The scaler works on 2-D input, hence the reshape to a single column.
pv_column = y_train["pv_measurement"].values.reshape(-1, 1)
y_train["pv_measurement"] = y_scaler.fit_transform(pv_column)

In [109]:
# Renumber the rows of all four frames so features and targets can be paired
# through their index.
X_train = X_train.reset_index().drop("index", axis=1)
y_train = y_train.reset_index().drop("index", axis=1)
X_validate = X_validate.reset_index().drop("index", axis=1)
y_validate = y_validate.reset_index().drop("index", axis=1)

# Attach the target column to the features (index-on-index merge). Missing
# values are replaced by 0 in the training table only.
new_train = X_train.merge(
    y_train["pv_measurement"], left_index=True, right_index=True
).fillna(0)
new_validate = X_validate.merge(
    y_validate["pv_measurement"], left_index=True, right_index=True
)

In [111]:
# Write the processed training table and test features to disk, without the
# row index.
for frame, csv_path in (
    (new_train, "../data/processed/train.csv"),
    (X_test, "../data/processed/X_test.csv"),
):
    frame.to_csv(csv_path, index=False)
