# Imports

In [43]:

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold


import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message="is_sparse is deprecated")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")


# Load datasets

In [44]:
# Inputs are read per location directory (A, B, C). Each directory provides
# X_train_estimated, X_train_observed, X_test_estimated and train_targets.

# Location A
X_train_observed_a = pd.read_parquet('../data/A/X_train_observed.parquet')
X_train_estimated_a = pd.read_parquet('../data/A/X_train_estimated.parquet')
X_test_estimated_a = pd.read_parquet('../data/A/X_test_estimated.parquet')
train_targets_a = pd.read_parquet('../data/A/train_targets.parquet')

# Location B
X_train_observed_b = pd.read_parquet('../data/B/X_train_observed.parquet')
X_train_estimated_b = pd.read_parquet('../data/B/X_train_estimated.parquet')
X_test_estimated_b = pd.read_parquet('../data/B/X_test_estimated.parquet')
train_targets_b = pd.read_parquet('../data/B/train_targets.parquet')

# Location C
X_train_observed_c = pd.read_parquet('../data/C/X_train_observed.parquet')
X_train_estimated_c = pd.read_parquet('../data/C/X_train_estimated.parquet')
X_test_estimated_c = pd.read_parquet('../data/C/X_test_estimated.parquet')
train_targets_c = pd.read_parquet('../data/C/train_targets.parquet')

# Data clean up

In [45]:

# Data set A, B and C clean up

def data_clean_up(x_train_est, x_train_observe, y_train, rows_per_block=4):
  """Build aligned training features and targets for one location.

  Each feature frame is reduced by averaging every `rows_per_block`
  consecutive rows (the block keeps its first `date_forecast`), the two
  reduced frames are stacked (observed first), and the result is inner-joined
  with the non-null targets on `date_forecast == time`.

  Parameters
  ----------
  x_train_est : pandas.DataFrame
    Feature frame with a `date_forecast` column; an optional `date_calc`
    column is ignored.
  x_train_observe : pandas.DataFrame
    Feature frame with a `date_forecast` column.
  y_train : pandas.DataFrame
    Targets with `time` and `pv_measurement` columns. Rows containing any
    missing value are excluded.
  rows_per_block : int, default 4
    Number of consecutive rows averaged into one output row.

  Returns
  -------
  (pandas.DataFrame, pandas.Series)
    The feature columns (without `date_forecast`, `time`, `pv_measurement`)
    and the matching `pv_measurement` values, row-aligned.

  Notes
  -----
  None of the input frames is modified.
  """

  def _downscale(frame):
    # Average each block of consecutive rows; the block is labelled by its first timestamp.
    feature_cols = [col for col in frame.columns if col != 'date_forecast']
    agg_spec = {'date_forecast': 'first', **{col: 'mean' for col in feature_cols}}
    # Group by row position rather than index label so the blocks never depend on
    # whatever index the frame happens to carry.
    block_ids = np.arange(len(frame)) // rows_per_block
    return frame.groupby(block_ids).agg(agg_spec)

  # Non-mutating drop; errors='ignore' covers frames that have no date_calc column.
  x_est = x_train_est.drop(columns='date_calc', errors='ignore')

  # Downscale each source separately before stacking: concatenating first would let
  # rows from both frames that share an index label fall into the same block.
  x_train_downscaled = pd.concat(
    [_downscale(x_train_observe), _downscale(x_est)],
    ignore_index=True,
  )

  targets = y_train.dropna()
  combined_data = pd.merge(x_train_downscaled, targets, left_on='date_forecast', right_on='time')

  target_values = combined_data['pv_measurement']
  # The merge above guarantees all three columns exist, so no membership check is needed.
  features = combined_data.drop(columns=['date_forecast', 'time', 'pv_measurement'])

  return features, target_values

# Run the training clean-up once per location, in A, B, C order.
(x_train_a, y_train_a), (x_train_b, y_train_b), (x_train_c, y_train_c) = [
  data_clean_up(est_frame, observed_frame, target_frame)
  for est_frame, observed_frame, target_frame in (
    (X_train_estimated_a, X_train_observed_a, train_targets_a),
    (X_train_estimated_b, X_train_observed_b, train_targets_b),
    (X_train_estimated_c, X_train_observed_c, train_targets_c),
  )
]


def data_clean_up_test(x_test_est, rows_per_block=4):
  """Downscale a test feature frame the same way the training features are built.

  Every `rows_per_block` consecutive rows are averaged into one output row.
  The `date_forecast` and `date_calc` columns are excluded from the result,
  wherever they sit in the column order.

  Parameters
  ----------
  x_test_est : pandas.DataFrame
    Feature frame containing a `date_forecast` column and, optionally, a
    `date_calc` column.
  rows_per_block : int, default 4
    Number of consecutive rows averaged into one output row.

  Returns
  -------
  pandas.DataFrame
    One row per block, holding the mean of each feature column. The input
    frame is not modified.

  Raises
  ------
  KeyError
    If `date_forecast` is missing, e.g. when an already-cleaned frame is
    passed in again.
  """

  # Fail loudly instead of silently averaging an already-downscaled frame a second time.
  if 'date_forecast' not in x_test_est.columns:
    raise KeyError("Column 'date_forecast' does not exist; was this frame already cleaned?")

  # Exclude the timestamp columns by name instead of relying on column position.
  non_feature_cols = ('date_forecast', 'date_calc')
  feature_cols = [col for col in x_test_est.columns if col not in non_feature_cols]

  # Group by row position so a non-default index cannot shift the block boundaries.
  block_ids = np.arange(len(x_test_est)) // rows_per_block
  X_test_downscaled = x_test_est.groupby(block_ids)[feature_cols].mean()

  return X_test_downscaled

# Feature engineering

In [46]:
# Do something

# Training the model

In [47]:
# One seed for both model families so a Restart & Run All reproduces the same predictions.
RANDOM_STATE = 42

# --- XGBoost: one regressor per location ---
model_a1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
model_b1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
model_c1 = xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)

model_a1.fit(x_train_a, y_train_a)
model_b1.fit(x_train_b, y_train_b)
model_c1.fit(x_train_c, y_train_c)

# --- Histogram gradient boosting: one regressor per location ---
# random_state is set explicitly: the estimator uses it for its internal subsampling and
# for the train/validation split when early stopping is active (see scikit-learn docs).
model_a2 = HistGradientBoostingRegressor(random_state=RANDOM_STATE)
model_b2 = HistGradientBoostingRegressor(random_state=RANDOM_STATE)
model_c2 = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

model_a2.fit(x_train_a, y_train_a)
model_b2.fit(x_train_b, y_train_b)
model_c2.fit(x_train_c, y_train_c)

# NOTE: the raw test frames are rebound to their cleaned versions here, so this cell is
# not idempotent -- re-running it passes already-cleaned frames to data_clean_up_test.
# Reload the datasets before re-running.
X_test_estimated_a = data_clean_up_test(X_test_estimated_a)
X_test_estimated_b = data_clean_up_test(X_test_estimated_b)
X_test_estimated_c = data_clean_up_test(X_test_estimated_c)

y_pred_a1 = model_a1.predict(X_test_estimated_a)
y_pred_b1 = model_b1.predict(X_test_estimated_b)
y_pred_c1 = model_c1.predict(X_test_estimated_c)

y_pred_a2 = model_a2.predict(X_test_estimated_a)
y_pred_b2 = model_b2.predict(X_test_estimated_b)
y_pred_c2 = model_c2.predict(X_test_estimated_c)

# Stack the per-location predictions in A, B, C order
# (presumably the row order of the submission file -- TODO confirm).
y_pred_xgb = np.concatenate((y_pred_a1, y_pred_b1, y_pred_c1), axis=0)
y_pred_hgb = np.concatenate((y_pred_a2, y_pred_b2, y_pred_c2), axis=0)

# Simple ensemble: unweighted mean of the two model families.
average_preds = (y_pred_xgb + y_pred_hgb) / 2.0


# # Load and preprocess your data
# X_train_a, y_train_a = data_clean_up(X_train_estimated_a, X_train_observed_a, train_targets_a)
# X_train_b, y_train_b = data_clean_up(X_train_estimated_b, X_train_observed_b, train_targets_b)
# X_train_c, y_train_c = data_clean_up(X_train_estimated_c, X_train_observed_c, train_targets_c)

# def cv_stack(X_train, y_train):
#     # Define the number of folds for cross-validation
#     n_splits = 5  # Adjust as needed

#     # Initialize arrays to store cross-validation predictions
#     cv_preds = np.zeros((len(y_train), n_splits))

#     # Create a KFold splitter
#     kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

#     # Train base models and generate cross-validation predictions
#     for i, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
#         print("train_idx:", train_idx)
#         print("valid_idx:", valid_idx)
#         X_tr, X_val = X_train[train_idx], X_train[valid_idx]
#         y_tr, y_val = y_train[train_idx], y_train[valid_idx]
        
#         # Train base regression models (e.g., HistGradientBoostingRegressor, XGBRegressor) on X_tr and y_tr
        
#         # Generate predictions from base models on the validation subset
#         hist_gb_model = HistGradientBoostingRegressor()
#         hist_gb_model.fit(X_tr, y_tr)
#         xgb_model = xgb.XGBRegressor()
#         xgb_model.fit(X_tr, y_tr)
        
#         cv_preds[valid_idx, i] = (hist_gb_model.predict(X_val) + xgb_model.predict(X_val)) / 2  # Use an average for stacking

#     return cv_preds

# # Generate cross-validation predictions for the training dataset
# cv_preds_a = cv_stack(X_train_a, y_train_a)
# cv_preds_b = cv_stack(X_train_b, y_train_b)
# cv_preds_c = cv_stack(X_train_c, y_train_c)

# # Combine cross-validation predictions into a new dataset
# stacked_data_a = cv_preds_a.mean(axis=1).reshape(-1, 1)  # Use an average for stacking
# stacked_data_b = cv_preds_b.mean(axis=1).reshape(-1, 1)
# stacked_data_c = cv_preds_c.mean(axis=1).reshape(-1, 1)

# # Train a meta-model (e.g., linear regression) on the stacked data
# meta_model_a = LinearRegression()
# meta_model_a.fit(stacked_data_a, y_train_a)

# meta_model_b = LinearRegression()
# meta_model_b.fit(stacked_data_b, y_train_b)

# meta_model_c = LinearRegression()
# meta_model_c.fit(stacked_data_c, y_train_c)

# # Make final predictions on the test dataset
# final_predictions_a = meta_model_a.predict(X_test_estimated_a)
# final_predictions_b = meta_model_b.predict(X_test_estimated_b)
# final_predictions_c = meta_model_c.predict(X_test_estimated_c)

# # Evaluate the stacked ensemble model (e.g., using mean squared error)
# mse = np.mean_squared_error(y_train, final_predictions_a)
# print(f"Stacked Ensemble MSE A: {mse}")

# mse = np.mean_squared_error(y_train, final_predictions_b)
# print(f"Stacked Ensemble MSE B: {mse}")

# mse = np.mean_squared_error(y_train, final_predictions_c)
# print(f"Stacked Ensemble MSE C: {mse}")



# Make predictions

In [48]:
# Do some more stuff

# Evaluate prediction

In [49]:
# Evaluate the model

# Create submission

In [51]:
# The ensemble output is what gets submitted.
y_test_pred = average_preds

# Attach the predictions to the test rows by position.
test = pd.read_csv('../data/test.csv').assign(prediction=y_test_pred)

# Keep exactly the ids (and their order) listed in the sample submission.
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = pd.merge(
  sample_submission[['id']],
  test[['id', 'prediction']],
  on='id',
  how='left',
)
submission.to_csv('submission.csv', index=False)