## Imports

In [842]:
import os

In [843]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.layers import Flatten
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.ensemble import HistGradientBoostingRegressor

## Constants

In [844]:
EPOCHS = 100
BATCH_SIZE = 32
PATIENCE = 10

root = os.getcwd()

## Load and Process Data

In [845]:
def aggregate_data(df):
    """Collapse sub-hourly forecast rows into one row per hour.

    Rows are grouped by ``date_forecast`` floored to the hour. Every column
    other than ``date_forecast`` is reduced with the median; ``date_forecast``
    keeps the first value of each group, floored to the hour.

    Args:
        df: DataFrame with a datetime-like ``date_forecast`` column. The
            frame is not modified.

    Returns:
        A new DataFrame with the non-date columns (in their original order)
        followed by ``date_forecast``, on a fresh 0..n-1 index.
    """
    # Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated in
    # recent pandas releases.
    hourly_timestamp = df['date_forecast'].dt.floor('h').rename('hourly_timestamp')

    aggregations = {col: 'median' for col in df.columns if col != 'date_forecast'}
    aggregations['date_forecast'] = 'first'

    # Grouping by an external key Series (instead of adding a column and
    # re-indexing in place) leaves the caller's frame untouched.
    hourly = df.groupby(hourly_timestamp).agg(aggregations).reset_index(drop=True)
    hourly['date_forecast'] = hourly['date_forecast'].dt.floor('h')

    return hourly

In [846]:
def clean_aggregated_data(df):
    """Add cyclical date features and drop columns that are not used as inputs.

    Works on a copy of ``df``. ``snow_density:kgm3`` is always dropped (a
    missing column raises ``KeyError``); ``date_calc`` is dropped only when
    it is present.

    Returns:
        The cleaned DataFrame.
    """
    cleaned = add_date_features(df.copy())

    unwanted = ['snow_density:kgm3']
    if "date_calc" in cleaned.columns:
        unwanted.append('date_calc')

    return cleaned.drop(columns=unwanted)

In [847]:
 # Function to add date features
def add_date_features(X):
    """Append sine/cosine encodings of hour-of-day and month to ``X``.

    Reads the datetime column ``date_forecast`` and writes four new columns
    (``hour_sin``, ``hour_cos``, ``month_sin``, ``month_cos``) directly into
    ``X``. The hour is mapped onto a 24-step circle and the month onto a
    12-step circle.

    Returns:
        The same DataFrame object that was passed in, now with the extra columns.
    """
    when = X['date_forecast'].dt
    hour_angle = 2 * np.pi * when.hour / 24
    month_angle = 2 * np.pi * when.month / 12

    X['hour_sin'] = np.sin(hour_angle)
    X['hour_cos'] = np.cos(hour_angle)
    X['month_sin'] = np.sin(month_angle)
    X['month_cos'] = np.cos(month_angle)

    return X

In [848]:
def process(data, targets):
    """Build a feature matrix and target vector for one location.

    The raw feature rows are aggregated to hourly resolution and cleaned,
    then inner-joined to ``targets`` on ``date_forecast == time``. Rows
    whose ``pv_measurement`` is missing are discarded.

    Args:
        data: Raw feature DataFrame; a copy is processed.
        targets: DataFrame with ``time`` and ``pv_measurement`` columns.

    Returns:
        ``(X, y)`` where ``y`` is the ``pv_measurement`` Series and ``X`` is
        the merged frame without ``pv_measurement``, ``date_forecast`` and
        ``time``.
    """
    features = clean_aggregated_data(aggregate_data(data.copy()))

    labelled = features.merge(
        targets[['time', 'pv_measurement']],
        how='inner',
        left_on='date_forecast',
        right_on='time',
    )
    labelled = labelled.dropna(subset=['pv_measurement'])

    X = labelled.drop(columns=['pv_measurement', 'date_forecast', 'time'])
    y = labelled["pv_measurement"]
    return X, y

In [849]:
# Directory paths for each group of data
train_dirpath_A, train_dirpath_B, train_dirpath_C = (
    os.path.join(root, "data", location) for location in ("A", "B", "C")
)

# pv_measurement for all training and validation rows
a_targets, b_targets, c_targets = (
    pd.read_parquet(os.path.join(dirpath, 'train_targets.parquet'))
    for dirpath in (train_dirpath_A, train_dirpath_B, train_dirpath_C)
)

# training data
X_train_observed_a, X_train_observed_b, X_train_observed_c = (
    pd.read_parquet(os.path.join(dirpath, 'X_train_observed.parquet'))
    for dirpath in (train_dirpath_A, train_dirpath_B, train_dirpath_C)
)

# validation data
X_train_estimated_a, X_train_estimated_b, X_train_estimated_c = (
    pd.read_parquet(os.path.join(dirpath, 'X_train_estimated.parquet'))
    for dirpath in (train_dirpath_A, train_dirpath_B, train_dirpath_C)
)

# test data
X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = (
    pd.read_parquet(os.path.join(dirpath, 'X_test_estimated.parquet'))
    for dirpath in (train_dirpath_A, train_dirpath_B, train_dirpath_C)
)


In [850]:
# Location A: observed rows are the training split, estimated rows the validation split.
X_train_A, y_train_A = process(X_train_observed_a, a_targets)
X_val_A, y_val_A = process(X_train_estimated_a, a_targets)

# Test features go through the same aggregation and cleaning but have no targets to merge.
X_test_A = clean_aggregated_data(
    aggregate_data(X_test_estimated_a.copy())
).drop(columns=['date_forecast'])

In [851]:
# Location B: observed rows are the training split, estimated rows the validation split.
X_train_B, y_train_B = process(X_train_observed_b, b_targets)
X_val_B, y_val_B = process(X_train_estimated_b, b_targets)

# Test features go through the same aggregation and cleaning but have no targets to merge.
X_test_B = clean_aggregated_data(
    aggregate_data(X_test_estimated_b.copy())
).drop(columns=['date_forecast'])

In [852]:
# Location C: observed rows are the training split, estimated rows the validation split.
X_train_C, y_train_C = process(X_train_observed_c, c_targets)
X_val_C, y_val_C = process(X_train_estimated_c, c_targets)

# Test features go through the same aggregation and cleaning but have no targets to merge.
X_test_C = clean_aggregated_data(
    aggregate_data(X_test_estimated_c.copy())
).drop(columns=['date_forecast'])

## Handle Missing Values

In [853]:
def forward_backward_fill(X):
    """Fill gaps in the two cloud-height columns by forward then backward fill.

    ``ceiling_height_agl:m`` and ``cloud_base_agl:m`` are forward-filled
    first; any leading gaps that remain are then backward-filled. Other
    columns are left as they are.

    Args:
        X: DataFrame containing both columns. It is updated in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for column in ('ceiling_height_agl:m', 'cloud_base_agl:m'):
        # Assign the filled Series back to the frame. The previous chained
        # ``X[col].fillna(method=..., inplace=True)`` form is deprecated and
        # does not modify ``X`` when pandas Copy-on-Write is active.
        X[column] = X[column].ffill().bfill()

    return X

In [854]:
def replace_nan_with_mean(df):
    """Replace missing values in every column with that column's mean.

    Args:
        df: DataFrame whose columns all support ``.mean()``. It is updated
            in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for column in df.columns:
        mean_value = df[column].mean()
        # Assign back to the frame rather than calling fillna(inplace=True)
        # on the selected column: the chained form is deprecated and does
        # not modify ``df`` when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(mean_value)
    return df

In [855]:
# Missing-value strategy applied to every split of every location.
handle_nan = forward_backward_fill

X_train_A, X_val_A, X_test_A = (
    handle_nan(split) for split in (X_train_A, X_val_A, X_test_A)
)

X_train_B, X_val_B, X_test_B = (
    handle_nan(split) for split in (X_train_B, X_val_B, X_test_B)
)

X_train_C, X_val_C, X_test_C = (
    handle_nan(split) for split in (X_train_C, X_val_C, X_test_C)
)

## Normalize Data

In [856]:
# Fit the scaler on location A's training rows only, then apply it to every split.
scaler = RobustScaler().fit(X_train_A)
X_train_A = scaler.transform(X_train_A)
X_val_A = scaler.transform(X_val_A)
X_test_A = scaler.transform(X_test_A)

In [857]:
# Fit the scaler on location B's training rows only, then apply it to every split.
scaler = RobustScaler().fit(X_train_B)
X_train_B = scaler.transform(X_train_B)
X_val_B = scaler.transform(X_val_B)
X_test_B = scaler.transform(X_test_B)

In [858]:
# Fit the scaler on location C's training rows only, then apply it to every split.
scaler = RobustScaler().fit(X_train_C)
X_train_C = scaler.transform(X_train_C)
X_val_C = scaler.transform(X_val_C)
X_test_C = scaler.transform(X_test_C)

## Create and Train Model

In [859]:
def create_mlp(input_shape, lr, weights):
    """Build and compile a two-hidden-layer MLP regressor.

    Args:
        input_shape: Shape of one sample, passed to the ``Flatten`` input layer.
        lr: Learning rate for the Adam optimizer.
        weights: Number of units in the first hidden layer; the second
            hidden layer uses ``weights // 2`` units.

    Returns:
        A compiled ``Sequential`` model with a single linear output unit,
        trained against mean absolute error.
    """
    layers = [
        Flatten(input_shape=input_shape),
        Dense(units=weights, activation='relu'),
        Dense(units=weights//2, activation='relu'),
        Dense(units=1, activation='linear'),
    ]
    model = Sequential(layers)
    model.compile(optimizer=Adam(learning_rate=lr), loss='mean_absolute_error')
    return model


In [860]:
def train_and_save_mlp_model(X_train, y_train, X_val, y_val, model_save_path, lr=1e-3, weights=128):
    """Train an MLP with early stopping and write it to disk.

    Args:
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets; ``val_loss`` on these
            drives early stopping.
        model_save_path: Save location relative to ``root``. The text between
            the last ``_`` and the following ``.`` is printed as the
            location label.
        lr: Learning rate forwarded to ``create_mlp``.
        weights: First hidden-layer width forwarded to ``create_mlp``.

    Returns:
        The trained model, with the best-``val_loss`` weights restored.
    """
    input_shape = X_train.shape[1:]  # per-sample shape, without the batch axis
    model = create_mlp(input_shape, lr, weights)

    early_stop = EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)
    location = model_save_path.split('_')[-1].split('.')[0]
    print("-------------------------------")
    print("Training model for location:", location)
    print("-------------------------------")

    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_val, y_val), callbacks=[early_stop])

    # Create the target directory first so that saving does not fail on a
    # fresh checkout where e.g. ``models/`` does not exist yet.
    save_path = os.path.join(root, model_save_path)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    model.save(save_path)
    return model


In [861]:
# Hyper-parameter grid searched by create_and_tune_lgbm.
lgbm_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [20, 40],
    'num_leaves': [31, 61],
    'boosting_type': ['gbdt', 'dart'],
    'objective': ['regression'],
    'random_state': [501],
    'colsample_bytree': [0.5, 0.7],
    'subsample': [0.5, 0.7],
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is non-zero; with the default of 0 both `subsample` values
    # above would train identical models.
    'subsample_freq': [1],
    'min_split_gain': [0.01],
}

def create_and_tune_lgbm(X_train, y_train, X_val, y_val):
    """Grid-search a LightGBM regressor against a fixed validation split.

    Training and validation rows are stacked into one dataset and a
    ``PredefinedSplit`` marks which rows are which, so every candidate in
    ``lgbm_param_grid`` is fitted on the training rows and scored (negative
    MSE) on the validation rows.

    Returns:
        ``best_estimator_`` of the finished grid search.
    """
    features = np.vstack([X_train, X_val])
    targets = np.hstack([y_train, y_val])

    # -1 keeps a row in the training part of the split; 0 puts it in the
    # single held-out fold.
    fold_of_row = np.repeat([-1, 0], [len(X_train), len(X_val)])

    search = GridSearchCV(
        lgb.LGBMRegressor(verbose=-1),
        lgbm_param_grid,
        cv=PredefinedSplit(fold_of_row),
        scoring='neg_mean_squared_error',
        verbose=1,
    )
    search.fit(features, targets)

    return search.best_estimator_

In [862]:
# Hyper-parameter grid searched by create_and_tune_histgb.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'max_iter': [50, 100],
    'max_leaf_nodes': [31, None],
    'l2_regularization': [0.0, 0.1],
    'max_depth': [None, 10],
    'min_samples_leaf': [1, 20],
    # 'squared_error' is the current name of the loss formerly called
    # 'least_squares'; scikit-learn releases that validate parameters reject
    # the old name, which makes every candidate using it fail to fit.
    'loss': ['squared_error', 'poisson'],
    'early_stopping': [False, True],
    'n_iter_no_change': [10],  # if early_stopping is True
    'tol': [1e-6]
}

def create_and_tune_histgb(X_train, y_train, X_val, y_val):
    """Grid-search a HistGradientBoostingRegressor against a fixed validation split.

    Training and validation rows are stacked into one dataset and a
    ``PredefinedSplit`` marks which rows are which, so every candidate in
    ``param_grid`` is fitted on the training rows and scored (negative MSE)
    on the validation rows.

    Returns:
        ``best_estimator_`` of the finished grid search.
    """
    n_train, n_val = len(X_train), len(X_val)

    stacked_X = np.vstack([X_train, X_val])
    stacked_y = np.hstack([y_train, y_val])

    # Training rows are tagged -1 (never held out); validation rows form fold 0.
    split = PredefinedSplit([-1 for _ in range(n_train)] + [0 for _ in range(n_val)])

    tuner = GridSearchCV(
        HistGradientBoostingRegressor(),
        param_grid,
        cv=split,
        scoring='neg_mean_squared_error',
        verbose=1,
    )
    tuner.fit(stacked_X, stacked_y)

    return tuner.best_estimator_

In [863]:
# MLPs: one per location, trained with early stopping on the validation split
# and saved under models/ (relative to `root`).
mlp_model_A = train_and_save_mlp_model(X_train_A, y_train_A, X_val_A, y_val_A, 'models/model_location_A.h5')
mlp_model_B = train_and_save_mlp_model(X_train_B, y_train_B, X_val_B, y_val_B, 'models/model_location_B.h5')
mlp_model_C = train_and_save_mlp_model(X_train_C, y_train_C, X_val_C, y_val_C, 'models/model_location_C.h5')

# LightGBM: grid search over lgbm_param_grid, scored on the validation split.
lgbm_model_A = create_and_tune_lgbm(X_train_A, y_train_A, X_val_A, y_val_A)
lgbm_model_B = create_and_tune_lgbm(X_train_B, y_train_B, X_val_B, y_val_B)
lgbm_model_C = create_and_tune_lgbm(X_train_C, y_train_C, X_val_C, y_val_C)

# HistGradientBoosting: grid search over param_grid, scored on the validation split.
hist_model_A = create_and_tune_histgb(X_train_A, y_train_A, X_val_A, y_val_A)
hist_model_B = create_and_tune_histgb(X_train_B, y_train_B, X_val_B, y_val_B)
hist_model_C = create_and_tune_histgb(X_train_C, y_train_C, X_val_C, y_val_C)

-------------------------------
Training model for location: A
-------------------------------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
-------------------------------
Training model for location: B
-------------------------------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
-------------------------------
Training model for location: C
-------------------------------
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
E

  return np.exp(raw_prediction, out=out)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 101, in _che

Fitting 1 folds for each of 384 candidates, totalling 384 fits


  return np.exp(raw_prediction, out=out)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 101, in _che

Fitting 1 folds for each of 384 candidates, totalling 384 fits


  return np.exp(raw_prediction, out=out)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 101, in _che

## Predict

In [864]:
def _predict_non_negative(model, X_test):
    """Return ``model.predict(X_test)`` as a flat array with negative values set to 0."""
    predictions = model.predict(X_test).ravel()
    predictions[predictions < 0] = 0
    return predictions


# Predict using each model; negative outputs are clipped to zero
y_pred_a_mlp = _predict_non_negative(mlp_model_A, X_test_A)
y_pred_b_mlp = _predict_non_negative(mlp_model_B, X_test_B)
y_pred_c_mlp = _predict_non_negative(mlp_model_C, X_test_C)

y_pred_a_lgbm = _predict_non_negative(lgbm_model_A, X_test_A)
y_pred_b_lgbm = _predict_non_negative(lgbm_model_B, X_test_B)
y_pred_c_lgbm = _predict_non_negative(lgbm_model_C, X_test_C)

y_pred_a_hist = _predict_non_negative(hist_model_A, X_test_A)
y_pred_b_hist = _predict_non_negative(hist_model_B, X_test_B)
y_pred_c_hist = _predict_non_negative(hist_model_C, X_test_C)

# Compute the geometric mean. Three models are combined, so the product needs
# a cube root; a square root would leave the result on the wrong scale.
# The inputs are non-negative after clipping, so the fractional power is safe.
y_pred_A = (y_pred_a_mlp * y_pred_a_lgbm * y_pred_a_hist) ** (1.0 / 3.0)
y_pred_B = (y_pred_b_mlp * y_pred_b_lgbm * y_pred_b_hist) ** (1.0 / 3.0)
y_pred_C = (y_pred_c_mlp * y_pred_c_lgbm * y_pred_c_hist) ** (1.0 / 3.0)


test_data = pd.read_csv(os.path.join(root, 'test.csv'))
location_a_test = test_data[test_data['location'] == "A"]
location_b_test = test_data[test_data['location'] == "B"]
location_c_test = test_data[test_data['location'] == "C"]

# Pair each location's ids with its predictions. DataFrame construction raises
# if the number of ids and the number of predictions differ.
submission_A = pd.DataFrame({'id': location_a_test['id'], 'predicted_values': y_pred_A})
submission_B = pd.DataFrame({'id': location_b_test['id'], 'predicted_values': y_pred_B})
submission_C = pd.DataFrame({'id': location_c_test['id'], 'predicted_values': y_pred_C})

# Combine all location submissions into one DataFrame
final_submission = pd.concat([submission_A, submission_B, submission_C], ignore_index=True)

# Save the combined predictions to a single CSV file, creating the output
# directory first so the write does not fail when it is missing.
os.makedirs('Submissions', exist_ok=True)
final_submission.to_csv('Submissions/submission_combined.csv', index=False)

