In [134]:
%load_ext autoreload
%autoreload 2

import os
import sys
# Put the parent of the current working directory on the import path so that
# packages living next to the notebook folder (e.g. `src`) can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [165]:
from src.utils import (
    load_data,
    remove_duplicates_in_coloumn,
    convert_from_degree_to_ciruclar,
    resample_hourly,
    create_time_features,
    find_repeated_indices,
    load_val_dates,
    create_lag_features
)

import pandas as pd
import numpy as np

In [136]:
# Load the raw data. The cells below index the result as
# data[<location>][<frame name>], using the frame names "y",
# "X_train_observed", "X_train_estimated" and "X_test_estimated".
data = load_data()

In [137]:
# De-duplicate every frame on its timestamp column: the target frame is keyed
# by "time", the feature frames by "date_forecast".
for location in data:
    frames = data[location]

    for frame_key, timestamp_column in (
        ("y", "time"),
        ("X_test_estimated", "date_forecast"),
        ("X_train_estimated", "date_forecast"),
        ("X_train_observed", "date_forecast"),
    ):
        frames[frame_key] = remove_duplicates_in_coloumn(
            frames[frame_key], timestamp_column
        )

    data[location] = frames

In [138]:
# Feature columns removed from every feature frame before modelling.
columns_to_drop = [
    "ceiling_height_agl:m",
    "cloud_base_agl:m",
    "snow_density:kgm3",
    "elevation:m",
    "precip_5min:mm",
    "precip_type_5min:idx",
    "pressure_50m:hPa",
    "snow_drift:idx",
    "wind_speed_u_10m:ms",
    "wind_speed_v_10m:ms",
    "wind_speed_w_1000hPa:ms",
    "date_calc",

    # Marked as duplicates of other columns by the original author
    "diffuse_rad_1h:J",
    "direct_rad_1h:J",
    "clear_sky_energy_1h:J",
]

for location in data:
    frames = data[location]

    # errors="ignore" lets a frame that lacks some of these columns pass through.
    for frame_key in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
        frames[frame_key] = frames[frame_key].drop(
            columns=columns_to_drop, errors="ignore"
        )

    data[location] = frames

In [139]:
# Encode the sun azimuth angle through the circular-encoding helper for every
# feature frame (the column listing further down shows the resulting
# "sun_azimuth:d_sin" / "sun_azimuth:d_cos" columns).
for location in data:
    frames = data[location]

    for frame_key in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
        frames[frame_key] = convert_from_degree_to_ciruclar(
            frames[frame_key], "sun_azimuth:d"
        )

    data[location] = frames

In [140]:
# Bring every feature frame down to hourly resolution; values within an hour
# are combined with the "sum" aggregation.
for location in data:
    frames = data[location]

    for frame_key in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
        frames[frame_key] = resample_hourly(frames[frame_key], func="sum")

    data[location] = frames

In [141]:
# Inspect which feature columns location A's test frame has at this point.
data["A"]["X_test_estimated"].columns

Index(['date_forecast', 'absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'clear_sky_rad:W', 'dew_or_rime:idx', 'dew_point_2m:K', 'diffuse_rad:W',
       'direct_rad:W', 'effective_cloud_cover:p', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'pressure_100m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_depth:cm', 'snow_melt_10min:mm', 'snow_water:kgm2',
       'sun_elevation:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'sun_azimuth:d_sin', 'sun_azimuth:d_cos'],
      dtype='object')

In [142]:
# Add shifted copies of the cloud-cover and humidity columns to every feature
# frame, first with lag=-1 and then with lag=-2.
# NOTE(review): the direction a negative lag shifts in is decided by
# create_lag_features - confirm against the helper.
lag_source_columns = ["effective_cloud_cover:p", "absolute_humidity_2m:gm3"]

for location in data:
    frames = data[location]

    for lag_steps in (-1, -2):
        for frame_key in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
            frames[frame_key] = create_lag_features(
                frames[frame_key],
                columns=list(lag_source_columns),
                lag=lag_steps,
            )

    data[location] = frames

In [143]:
def drop_nan_rows_in_target_and_train(df):
    """Keep only timestamps that have both a target value and training features.

    Parameters
    ----------
    df : mapping of frame name -> DataFrame
        Must contain "y" (columns "time" and "pv_measurement") as well as
        "X_train_observed" and "X_train_estimated" (column "date_forecast").

    Returns
    -------
    The same mapping object, updated in place:
    * "y" without rows whose "pv_measurement" is NaN, restricted to times
      present in at least one of the two training frames;
    * both training frames restricted to times that have a non-NaN target.
    """
    targets = df["y"].dropna(subset=["pv_measurement"])
    df["y"] = targets
    valid_dates = targets["time"]

    # Training rows survive only if a measured target exists for their timestamp.
    for frame_key in ("X_train_observed", "X_train_estimated"):
        features = df[frame_key]
        df[frame_key] = features[features["date_forecast"].isin(valid_dates)]

    # Targets survive only if some training frame still has their timestamp.
    seen_in_observed = targets["time"].isin(df["X_train_observed"]["date_forecast"])
    seen_in_estimated = targets["time"].isin(df["X_train_estimated"]["date_forecast"])
    df["y"] = targets[seen_in_observed | seen_in_estimated]

    return df

In [144]:
# Align targets and training features for every location.
for location in data:
    data[location] = drop_nan_rows_in_target_and_train(data[location])

In [145]:
# Derive time-based features from the "date_forecast" timestamp of every
# feature frame.
for location in data:
    frames = data[location]

    for frame_key in ("X_test_estimated", "X_train_estimated", "X_train_observed"):
        frames[frame_key] = create_time_features(frames[frame_key], "date_forecast")

    data[location] = frames

In [146]:
# Sanity check: per location, print the shapes of the estimated features, the
# observed features and the targets.
for site in ("A", "B", "C"):
    site_frames = data[site]
    print(f"{site}:")
    print(
        site_frames["X_train_estimated"].shape,
        site_frames["X_train_observed"].shape,
        site_frames["y"].shape,
    )


A:
(4418, 44) (29667, 44) (34085, 2)
B:
(3625, 44) (29218, 44) (32843, 2)
C:
(2954, 44) (23141, 44) (26095, 2)


In [147]:
# Build the combined training and validation sets across all locations
# (the loop below runs over every key of `data`, not only "A").

X_train = pd.DataFrame()
y_train = pd.DataFrame()

X_validate = pd.DataFrame()
y_validate = pd.DataFrame()

for location in data.keys():
    # Fraction of each frame, counted from the start, that goes to training.
    # With both fractions at 1 the split index equals the frame length, so the
    # "*_last_25" slices below are empty and the validation sets stay empty.
    percent_observed_train_a = 1
    percent_estimated_train_a = 1

    split_index_obs = int(
        len(data[location]["X_train_observed"]) * percent_observed_train_a
    )
    split_index_est = int(
        len(data[location]["X_train_estimated"]) * percent_estimated_train_a
    )

    # NOTE(review): the 75/25 in these names is not tied to the fractions
    # above; the actual split is whatever the two percent_* values say.
    X_train_observed_first_75 = data[location]["X_train_observed"][:split_index_obs]
    X_train_observed_last_25 = data[location]["X_train_observed"][split_index_obs:]

    X_train_estimated_first_75 = data[location]["X_train_estimated"][:split_index_est]
    X_train_estimated_last_25 = data[location]["X_train_estimated"][split_index_est:]

    # Training features: observed head followed by estimated head. Targets are
    # the rows of y whose "time" occurs among those features' "date_forecast".
    X_train_loc = pd.concat([X_train_observed_first_75, X_train_estimated_first_75])
    y_train_loc = data[location]["y"][
        data[location]["y"]["time"].isin(X_train_loc["date_forecast"])
    ]

    # Validation features/targets are selected the same way from the tails.
    X_validate_loc = pd.concat([X_train_observed_last_25, X_train_estimated_last_25])
    y_validate_loc = data[location]["y"][
        data[location]["y"]["time"].isin(X_validate_loc["date_forecast"])
    ]

    # Drop target rows flagged by find_repeated_indices, then keep only the
    # feature rows whose timestamp still has a target.
    # NOTE(review): the indices are computed before reset_index() but dropped
    # as labels of the fresh 0..n-1 index, so the helper presumably returns
    # row positions - confirm. reset_index() without drop=True also keeps the
    # old index as an extra "index" column in the target frame.
    repeated_indices = find_repeated_indices(y_train_loc, "pv_measurement", 24)
    y_train_loc = y_train_loc.reset_index()
    y_train_loc = y_train_loc.drop(repeated_indices)
    X_train_loc = X_train_loc[X_train_loc["date_forecast"].isin(y_train_loc["time"])]

    repeated_indices = find_repeated_indices(y_validate_loc, "pv_measurement", 24)
    y_validate_loc = y_validate_loc.reset_index()
    y_validate_loc = y_validate_loc.drop(repeated_indices)
    X_validate_loc = X_validate_loc[
        X_validate_loc["date_forecast"].isin(y_validate_loc["time"])
    ]

    # Renumber rows 0..n-1. Features and targets are matched only through
    # timestamp membership above and are paired by row position later on.
    # NOTE(review): that presumes both share the same time ordering and have
    # no duplicate timestamps - confirm.
    y_train_loc.reset_index(drop=True, inplace=True)
    X_train_loc.reset_index(drop=True, inplace=True)
    y_validate_loc.reset_index(drop=True, inplace=True)
    X_validate_loc.reset_index(drop=True, inplace=True)

    # Tag every row with its location.
    X_train_loc["location"] = location
    y_train_loc["location"] = location
    X_validate_loc["location"] = location
    y_validate_loc["location"] = location

    # The timestamp columns are no longer needed once rows are matched.
    X_train_loc.drop("date_forecast", axis=1, inplace=True)
    y_train_loc.drop("time", axis=1, inplace=True)
    X_validate_loc.drop("date_forecast", axis=1, inplace=True)
    y_validate_loc.drop("time", axis=1, inplace=True)

    # One-hot encode the location for the training features only.
    # NOTE(review): inside the loop "location" holds a single value, so
    # get_dummies yields just that location's column; the other locations'
    # columns appear as NaN after the concat below and are only zeroed by the
    # later fillna(0) on new_train. X_validate_loc is not encoded and keeps
    # the string "location" column.
    X_train_loc = X_train_loc.reset_index().drop(columns="index")
    one_hot = pd.get_dummies(X_train_loc["location"]).astype(int)
    X_train_loc = X_train_loc.drop("location", axis=1)
    X_train_loc = pd.merge(X_train_loc, one_hot, left_index=True, right_index=True)

    # Each location is prepended, so the last location processed ends up first.
    X_train = pd.concat([X_train_loc, X_train])
    y_train = pd.concat([y_train_loc, y_train])
    X_validate = pd.concat([X_validate_loc, X_validate])
    y_validate = pd.concat([y_validate_loc, y_validate])

In [148]:
# Tag each location's test frame with its location label (this adds a
# "location" column to the frames stored in `data`).
for site in ("A", "B", "C"):
    data[site]["X_test_estimated"]["location"] = site

# Stack the three test frames in A, B, C order.
stacked_test = pd.concat([data[site]["X_test_estimated"] for site in ("A", "B", "C")])

# Keep only the timestamps returned by load_val_dates(), then drop the
# timestamp column and renumber the rows.
is_requested_date = stacked_test["date_forecast"].isin(load_val_dates())
X_test = (
    stacked_test[is_requested_date]
    .drop(columns="date_forecast")
    .reset_index()
    .drop(columns="index")
)

# Replace the string location label with one indicator column per location.
one_hot = pd.get_dummies(X_test["location"]).astype(int)
X_test = X_test.drop(columns="location").merge(
    one_hot, left_index=True, right_index=True
)

In [149]:
# Columns left un-normalised: the one-hot location indicators (A/B/C) and the
# index-like flags. The shadow flag is named "is_in_shadow:idx" in the data;
# the previous "_in_shadow:idx" entry matched no column, so that flag was
# being normalised against the stated intent.
columns_to_exclude = ["A", "B", "C", "dew_or_rime:idx", "is_day:idx", "is_in_shadow:idx"]

columns_to_normalize = [col for col in X_train.columns if col not in columns_to_exclude]

# Min-max scaling. The statistics come from the training set only and are
# reused for the validation and test sets.
X_min = X_train[columns_to_normalize].min()
X_max = X_train[columns_to_normalize].max()

# A column that is constant in the training set has a range of 0, which would
# turn the division below into 0/0 (NaN) or x/0 (inf). Use a unit range for
# such columns so they map to (x - min) instead.
X_range = (X_max - X_min).replace(0, 1)

# Apply the same transformation to all three feature sets.
X_train[columns_to_normalize] = (X_train[columns_to_normalize] - X_min) / X_range
X_validate[columns_to_normalize] = (X_validate[columns_to_normalize] - X_min) / X_range
X_test[columns_to_normalize] = (X_test[columns_to_normalize] - X_min) / X_range


In [150]:
from sklearn.preprocessing import MinMaxScaler

# Scale the training target with a min-max scaler; `y_scaler` is kept so that
# predictions can be mapped back to the original units later.
y_scaler = MinMaxScaler()
pv_as_column = y_train[["pv_measurement"]].to_numpy()  # shape (n_rows, 1)
y_train["pv_measurement"] = y_scaler.fit_transform(pv_as_column)

In [220]:
# Renumber rows so features and targets can be paired by position, attach the
# target column, and replace every remaining NaN in the training table with 0.
X_train = X_train.reset_index().drop(columns="index")
y_train = y_train.reset_index().drop(columns="index")
new_train = X_train.merge(
    y_train["pv_measurement"], left_index=True, right_index=True
).fillna(0)

# Same pairing for the validation split (NaNs are left untouched here).
X_validate = X_validate.reset_index().drop(columns="index")
y_validate = y_validate.reset_index().drop(columns="index")
new_validate = X_validate.merge(
    y_validate["pv_measurement"], left_index=True, right_index=True
)

In [152]:
# Persist the processed training table and test features (no index column).
for processed_frame, csv_path in (
    (new_train, "../data/processed/train.csv"),
    (X_test, "../data/processed/X_test.csv"),
):
    processed_frame.to_csv(csv_path, index=False)


In [256]:
# Number of repeated-value indices found for location C.
# NOTE(review): `repeated_indices_c` is not assigned in any cell visible in
# this notebook - it looks like leftover kernel state; define it or drop this
# cell so that Restart & Run All works.
len(repeated_indices_c)

0

In [251]:
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold

In [None]:
# K-fold cross-validation of a small convolutional network (two
# Conv2D/MaxPooling2D stages followed by dense layers and a softmax output).
# NOTE(review): this cell has no execution count and uses names that are not
# assigned anywhere in the visible notebook (num_folds, inputs, targets,
# input_shape, no_classes, loss_function, optimizer, batch_size, no_epochs,
# verbosity, acc_per_fold, loss_per_fold). It looks like a classification
# template; define those names or remove the cell.

# Define the K-fold Cross Validator
# (shuffle=True without random_state: fold membership changes between runs)
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs, targets):

  # Define the model architecture (a fresh model is built for every fold)
  model = Sequential()
  model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
  model.add(MaxPooling2D(pool_size=(2, 2)))
  model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
  model.add(MaxPooling2D(pool_size=(2, 2)))
  model.add(Flatten())
  model.add(Dense(256, activation='relu'))
  model.add(Dense(128, activation='relu'))
  model.add(Dense(no_classes, activation='softmax'))

  # Compile the model
  model.compile(loss=loss_function,
                optimizer=optimizer,
                metrics=['accuracy'])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model (train/test are index arrays produced by kfold.split)
  history = model.fit(inputs[train], targets[train],
              batch_size=batch_size,
              epochs=no_epochs,
              verbose=verbosity)

  # Generate generalization metrics on the held-out fold;
  # scores[0] is the loss and scores[1] the accuracy metric compiled above
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

In [228]:
import h2o
from h2o.automl import H2OAutoML

# Connect to a running local H2O instance or start one.
h2o.init()

# Upload the pandas tables to the H2O cluster.
h2o_train = h2o.H2OFrame(new_train)
h2o_test = h2o.H2OFrame(X_test)

# AutoML run limited to 20 models, optimised and ranked on MAE. No validation
# frame is passed; only the training frame is supplied.
automl_settings = dict(
    max_models=20,
    seed=1,
    stopping_metric="MAE",
    sort_metric="MAE",
    stopping_tolerance=0.01,
)
aml = H2OAutoML(**automl_settings)
aml.train(
    x=h2o_train.columns,
    y="pv_measurement",
    training_frame=h2o_train,
)

# Keep the leaderboard and the leader's predictions on the test frame.
lb = aml.leaderboard
preds = aml.leader.predict(h2o_test)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.1" 2021-10-19; OpenJDK Runtime Environment Temurin-17.0.1+12 (build 17.0.1+12); OpenJDK 64-Bit Server VM Temurin-17.0.1+12 (build 17.0.1+12, mixed mode)
  Starting server from /Users/mathiasraa/anaconda3/envs/forecasting/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/xd/z_ptq9v136q7kj9lf2f4sblh0000gn/T/tmpiauhqxnm
  JVM stdout: /var/folders/xd/z_ptq9v136q7kj9lf2f4sblh0000gn/T/tmpiauhqxnm/h2o_mathiasraa_started_from_python.out
  JVM stderr: /var/folders/xd/z_ptq9v136q7kj9lf2f4sblh0000gn/T/tmpiauhqxnm/h2o_mathiasraa_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.4
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_mathiasraa_e9goa0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
14:43:44.451: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [230]:
# Score the training table with the leading model and store the predictions
# for comparison.
train_frame_for_scoring = h2o.H2OFrame(new_train)
train_predictions = aml.leader.predict(train_frame_for_scoring).as_data_frame()
train_predictions.to_csv("../data/comparison/model_local.csv", index=False)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [231]:
# Display the AutoML leaderboard (the run was configured with sort_metric="MAE").
lb

model_id,mae,rmse,mse,rmsle,mean_residual_deviance
GBM_grid_1_AutoML_1_20231016_144344_model_5,0.0140406,0.0424401,0.00180116,0.0313323,0.00180116
StackedEnsemble_BestOfFamily_1_AutoML_1_20231016_144344,0.0143641,0.0423158,0.00179063,0.0312811,0.00179063
GBM_4_AutoML_1_20231016_144344,0.014453,0.0433035,0.00187519,0.0320185,0.00187519
StackedEnsemble_AllModels_1_AutoML_1_20231016_144344,0.0145703,0.0411439,0.00169282,0.0304413,0.00169282
GBM_3_AutoML_1_20231016_144344,0.014981,0.0441991,0.00195356,0.0326893,0.00195356
DRF_1_AutoML_1_20231016_144344,0.0151076,0.0446005,0.0019892,0.0330795,0.0019892
GBM_1_AutoML_1_20231016_144344,0.0152928,0.0451105,0.00203496,0.0333356,0.00203496
GBM_2_AutoML_1_20231016_144344,0.0155128,0.0454086,0.00206194,0.0335916,0.00206194
GBM_grid_1_AutoML_1_20231016_144344_model_4,0.0155716,0.042865,0.00183741,0.031767,0.00183741
GBM_5_AutoML_1_20231016_144344,0.0160482,0.0463692,0.0021501,0.0343221,0.0021501


In [249]:
# NOTE(review): this only references the bound method (the cell output shows
# "<bound method H2OCluster.shutdown ...>"); nothing is called, so the cluster
# keeps running. Calling `shutdown()` at this position would stop the cluster
# before the predict cell further down, so add the parentheses only after
# moving this cell below the last use of `aml` / `h2o_test`.
h2o.cluster().shutdown

<bound method H2OCluster.shutdown of <h2o.backend.cluster.H2OCluster object at 0x2944dd3c0>>

In [236]:
# Predict on the test frame with the leading model and pull the result back
# into pandas for inspection.
leader_test_predictions = aml.leader.predict(h2o_test)
preds = leader_test_predictions.as_data_frame()
preds


gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,predict
0,-0.000001
1,0.000014
2,0.000079
3,0.013178
4,0.063545
...,...
2155,0.011058
2156,0.006231
2157,0.002533
2158,0.001854


In [243]:
# Load stored predictions from disk and discard the saved index column.
# NOTE(review): this file is not written by any cell visible in this notebook,
# and `preds` from the cell above is not what gets submitted - confirm the
# file is the intended source.
predictions_from_disk = pd.read_csv("../data/results/predictions_2.csv")
predictions = predictions_from_disk.drop(columns="Unnamed: 0")

In [246]:
# Map the predictions back through the target scaler fitted on
# y_train["pv_measurement"]. Despite the "_scaled" names, the result is in the
# scaler's original (un-normalised) units.
prediction_df_scaled = y_scaler.inverse_transform(predictions)
# inverse_transform returns an array; wrap it in a DataFrame for the next cell.
prediction_df_scaled_df = pd.DataFrame(prediction_df_scaled)

In [247]:
# Assemble the submission table: a running id plus the prediction, with
# negative predictions raised to 0.
resultframe = pd.DataFrame(columns=["id", "prediction"])
resultframe["prediction"] = prediction_df_scaled_df

is_negative = resultframe["prediction"] < 0
resultframe["prediction"] = np.where(is_negative, 0, resultframe["prediction"])

resultframe["id"] = range(len(resultframe))
resultframe.head()

Unnamed: 0,id,prediction
0,0,0.0
1,1,0.0
2,2,0.0
3,3,49.371084
4,4,385.476516


In [248]:
import datetime

# Write the submission under a timestamped name. str(datetime.now()) contains
# spaces and colons, and colons are not allowed in Windows filenames, so the
# timestamp is formatted explicitly. Microseconds are kept so that two exports
# never share a name.
submission_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")
resultframe.to_csv("../data/results/" + submission_stamp + "-submission.csv", index=False)