# Imports

In [142]:

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")
warnings.filterwarnings("ignore", category=FutureWarning, message="is_sparse is deprecated")



# Load datasets

In [143]:
# Training data arrives as a pair: feature frames first, target frames second.
dataframes = read_data.get_training_data()
X_frames_train, Y_frames_train = dataframes[0], dataframes[1]

# Feature frames to predict on for the submission.
X_frames_test = read_data.get_test_data()


# Data cleanup and aggregation

In [144]:
# Make sure that the target values line up with the feature rows (x values)

def data_allign(x_train, y_train):
  """Align feature rows with their target measurements.

  Target rows containing any NaN are discarded, then the features are
  inner-joined to the remaining targets where
  ``x_train['date_forecast'] == y_train['time']``.

  Neither input frame is modified; callers receive new objects.

  Args:
    x_train: DataFrame of features with a 'date_forecast' column.
    y_train: DataFrame of targets with 'time' and 'pv_measurement' columns.

  Returns:
    A ``(features, target)`` tuple with matching row order: ``features`` is
    the merged frame without the target-side columns, ``target`` is the
    'pv_measurement' Series of the merged frame.

  Raises:
    KeyError: if the join keys or 'pv_measurement' are missing.
  """
  # Work on a copy: an in-place dropna would silently alter the caller's frame.
  targets = y_train.dropna()
  combined_data = pd.merge(x_train, targets, left_on='date_forecast', right_on='time')
  aligned_target = combined_data['pv_measurement']

  # Test each column on its own. `'time' and 'pv_measurement' in cols` only
  # evaluates the second membership because a non-empty string is truthy.
  target_columns = [
      column for column in ('time', 'pv_measurement')
      if column in combined_data.columns
  ]
  features = combined_data.drop(columns=target_columns)

  return features, aligned_target

import data_func.aggregation as data_agg

categorical_features = [
    'dew_or_rime:idx',
    'is_day:idx',
    'is_in_shadow:idx',
    'precip_type_5min:idx',
    'snow_drift:idx',
]

# Training frames: aggregate with the 'mean' strategy, then pair every
# feature row with its target measurement.
for i, train_frame in enumerate(X_frames_train):
    X_frames_train[i] = data_agg.gen_agg(train_frame, 'mean')
    X_frames_train[i], Y_frames_train[i] = data_allign(X_frames_train[i], Y_frames_train[i])

# Test frames only need the aggregation step (there are no targets to align).
for j, test_frame in enumerate(X_frames_test):
    X_frames_test[j] = data_agg.gen_agg(test_frame, 'mean')

# Row counts of the first frame in each collection, one per line.
for frame_group in (X_frames_train, Y_frames_train, X_frames_test):
    print(len(frame_group[0]))

29667
29667
720


# Feature engineering

In [145]:
import data_func.timeseasonality as DTS
import data_func.date_forecast as DTF
import data_func.combine_all_frames as CAF
import data_func.one_hot_encoding as OHE

# Disabled experiments, kept for reference (applied per training frame):
#   - drop 'absolute_humidity_2m:gm3' and 'air_density_2m:kgm3'
#   - fill NaNs in 'ceiling_height_agl:m' with 0
#   - fill NaNs in 'cloud_base_agl:m' with 100000

# Training features: run the seasonal and date_forecast helpers on every
# frame, then stack the frames and one-hot encode the "location" column.
for i, train_frame in enumerate(X_frames_train):
    seasonal_frame = DTS.append_seasonal_columns(train_frame)
    X_frames_train[i] = DTF.date_forecast_columns(seasonal_frame)

X_frames_train_pd = OHE.one_hot_encode(
    CAF.combine_all_frames(X_frames_train),
    ["location"],
)

# Test features go through exactly the same steps.
for i, test_frame in enumerate(X_frames_test):
    seasonal_frame = DTS.append_seasonal_columns(test_frame)
    X_frames_test[i] = DTF.date_forecast_columns(seasonal_frame)

X_frames_test_pd = OHE.one_hot_encode(
    CAF.combine_all_frames(X_frames_test),
    ["location"],
)

# Targets are plain pandas objects, so they are stacked with pd.concat
# (a fresh 0..n-1 index) instead of CAF.combine_all_frames.
Y_frames_train_pd = pd.concat(Y_frames_train, ignore_index=True)

# Preview of the engineered test features.
X_frames_test_pd.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,cosinus_day,sinus_year,cosinus_year,year,month,day,hours,location_0.0,location_1.0,location_2.0
0,4.325,1.28675,912.700012,0.0,0.0,1061.550049,0.0,271.650024,0.0,0.0,...,0.995185,0.87933,-0.476213,2023,5,1,0,1,0,0
1,4.275,1.286,1482.099976,0.0,0.0,1075.100098,0.0,271.450012,0.0,0.0,...,0.935906,0.878988,-0.476843,2023,5,1,1,1,0,0
2,4.15,1.28375,1791.300049,0.0,0.0,1200.400024,0.0,271.049988,0.0,0.0,...,0.812847,0.878646,-0.477473,2023,5,1,2,1,0,0
3,4.025,1.282,2312.875,10124.424805,11.675,1179.849976,0.0,270.649994,9.375,16845.226562,...,0.634393,0.878304,-0.478103,2023,5,1,3,1,0,0
4,3.9,1.281,2198.299805,141748.59375,76.875,920.049988,0.0,270.375,47.400002,102209.703125,...,0.412707,0.877961,-0.478732,2023,5,1,4,1,0,0


In [146]:
def columnsToDrop(df):
    """Return a copy of ``df`` without the features excluded from modelling.

    The input frame is left untouched. Every listed column must be present;
    pandas raises ``KeyError`` otherwise (so calling this twice on the same
    lineage of frames fails on the second call).
    """
    excluded = (
        'wind_speed_10m:ms',
        'wind_speed_u_10m:ms',
        'wind_speed_v_10m:ms',
        'wind_speed_w_1000hPa:ms',
        'snow_density:kgm3',
        'snow_melt_10min:mm',
        'snow_drift:idx',
        'elevation:m',
        'year',
        'prob_rime:p',
    )
    return df.drop(columns=list(excluded))
# Remove the excluded features from both combined frames so that train and
# test keep an identical column layout.
X_frames_train_pd = columnsToDrop(X_frames_train_pd)
X_frames_test_pd = columnsToDrop(X_frames_test_pd)


# Training the model

In [147]:
import optuna

# Hold out 17 % of the combined rows for validation; the fixed seed keeps
# the split reproducible between runs.
x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(
    X_frames_train_pd,
    Y_frames_train_pd,
    test_size=0.17,
    random_state=42,
)
# Earlier per-location splits (disabled):
# x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_frames_train[1], Y_frames_train[1], test_size=0.17, random_state=42)
# x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_frames_train[2], Y_frames_train[2], test_size=0.17, random_state=42)

def objective(trial):
  """Optuna objective for the XGBoost regressor.

  Samples one hyperparameter configuration from ``trial``, fits an
  ``XGBRegressor`` on the module-level training split (``x_train_a``,
  ``y_train_a``) and returns the mean squared error on the validation
  split (``x_val_a``, ``y_val_a``); lower is better.
  """
  search_space = dict(
      n_estimators=trial.suggest_int('n_estimators', 100, 1000),
      max_depth=trial.suggest_int('max_depth', 3, 10),
      learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
      subsample=trial.suggest_float('subsample', 0.5, 1),
      colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1),
  )

  candidate = xgb.XGBRegressor(**search_space)
  candidate.fit(x_train_a, y_train_a)

  # Score the candidate on rows it has not been fitted on.
  return mean_squared_error(y_val_a, candidate.predict(x_val_a))

# Hyperparameter search (disabled). Uncomment the lines below to run 30
# Optuna trials that minimize the validation MSE returned by `objective`.
#study = optuna.create_study(direction='minimize')
#study.optimize(objective, n_trials=30)

#best = study.best_params
# Print the best hyperparameters found
#print("Best hyperparameters:", best)

# Earlier per-location setup (disabled): one tuned XGBRegressor per
# location (a, b, c), each fitted on its own split.
# Use params from hyperparameter tuning using optuna
# params_a = {'random_state': 42, 'n_estimators': 980, 'max_depth': 9, 'learning_rate': 0.029035565559484028, 'subsample': 0.8393121619033767, 'colsample_bytree': 0.7589542758688459}
# params_b = {'random_state': 42, 'n_estimators': 955, 'max_depth': 9, 'learning_rate': 0.02949625834198986, 'subsample': 0.8030196155828968, 'colsample_bytree': 0.72518389089994}
# params_c = {'random_state': 42, 'n_estimators': 727, 'max_depth': 8, 'learning_rate': 0.08609213174337473, 'subsample': 0.8107057409889747, 'colsample_bytree': 0.8763563332327975}
# model_a = xgb.XGBRegressor(**params_a)
# model_b = xgb.XGBRegressor(**params_b)
# model_c = xgb.XGBRegressor(**params_c)

# model_a.fit(x_train_a, y_train_a)
# model_b.fit(x_train_b, y_train_b)
# model_c.fit(x_train_c, y_train_c)

# Preview the training split of the combined feature frame.
x_train_a.head()

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,sinus_day,cosinus_day,sinus_year,cosinus_year,month,day,hours,location_0.0,location_1.0,location_2.0
15623,7.4,1.2605,1148.125,0.0,0.0,409.950012,0.0,279.575012,0.0,0.0,...,0.707107,0.707107,-0.978527,0.206117,10,13,3,0,1,0
14435,4.4,1.259,3417.149902,48257.27,28.199999,2000.850098,0.0,271.849976,17.475,33603.5,...,0.707107,-0.707107,0.401355,0.915923,1,24,9,1,0,0
4074,5.275,1.265125,177.75,507115.9,108.375,177.75,0.0,274.600006,31.099998,122741.0625,...,-0.866025,-0.5,-0.667169,0.744907,11,19,16,1,0,0
22148,11.925,1.22175,,403895.4,162.949997,1950.375,0.0,287.049988,51.025002,148262.65625,...,0.866025,0.5,-0.174195,-0.984711,7,12,4,0,1,0
22815,4.55,1.2675,2043.224976,1401904.0,445.149994,769.349976,0.0,272.575012,152.175003,505702.9375,...,0.866025,-0.5,0.984,-0.178171,4,12,8,0,0,1


In [148]:

# Hyperparameters for the single model trained on all locations combined.
params_combined = {
    'n_estimators': 917,
    'max_depth': 10,
    'learning_rate': 0.1278757907554517,
    'subsample': 0.8955820144088287,
    'colsample_bytree': 0.8123114046955043,
    "random_state": 42,
}
model = xgb.XGBRegressor(**params_combined)

# Fit on the training split only; the validation split stays unseen.
model.fit(x_train_a, y_train_a)

# Evaluate prediction

In [149]:
# Evaluate the combined model on the held-out validation split.
# (Per-location models A/B/C were evaluated the same way in an earlier
# iteration; their figures are recorded in the notes after this cell.)

# Predict once and reuse the result for every metric.
y_val_pred = model.predict(x_val_a)
mse_combined = mean_squared_error(y_val_a, y_val_pred)
MAE_combined = mean_absolute_error(y_val_a, y_val_pred)

print("MSE Combined: ", mse_combined)
print("MAE Combined: ", MAE_combined)

# Score on the validation rows only: scoring the full training frame would
# mostly measure rows the model was fitted on and overstate the result.
score = model.score(x_val_a, y_val_a)
print("Score Combined: ", score)

feature_importance_scores = model.feature_importances_

# Associate each feature the model was fitted on with its importance score,
# most important first.
feature_importance_df = pd.DataFrame({'Feature': x_train_a.columns, 'Importance': feature_importance_scores})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Temporarily raise the row limit so the whole table is printed; the
# context manager restores whatever limit was active before.
with pd.option_context('display.max_rows', 500):
    print(feature_importance_df)
# Print or visualize the feature importance scores

# Get feature importance scores
# models = [(model_a, 'A'), (model_b, 'B'), (model_c, 'C')]
# for model in models:

#     feature_importance_scores = model[0].feature_importances_

# # Create a DataFrame to associate features with their importance scores
#     feature_importance_df1 = pd.DataFrame({'Feature': x_train_a.columns, 'Importance': feature_importance_scores})

# # Sort features by importance in descending order
#     feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

# # Print or visualize the feature importance scores
    # pd.set_option('display.max_rows', 500)
    # feature_importance_df1.head(500)
    # print(f'Model {model[1]}')
    # print(feature_importance_df1)
    # pd.reset_option('display.max_rows')


MSE Combined:  53446.87462226406
MAE Combined:  78.9547748049631
                           Feature  Importance
10                    direct_rad:W    0.277390
43                    location_0.0    0.242283
45                    location_2.0    0.044398
8                    diffuse_rad:W    0.039279
4                  clear_sky_rad:W    0.038672
21                  precip_5min:mm    0.026697
28                   snow_depth:cm    0.023744
40                           month    0.022130
30                   sun_azimuth:d    0.020830
39                    cosinus_year    0.020229
18                      is_day:idx    0.018737
22            precip_type_5min:idx    0.018048
11                 direct_rad_1h:J    0.013657
15               fresh_snow_24h:cm    0.013562
19                is_in_shadow:idx    0.011776
31                 sun_elevation:d    0.011576
13               fresh_snow_12h:cm    0.010066
37                     cosinus_day    0.009607
41                             day    0.00

```
Most_common = ['direct_rad:W', 'clear_sky_rad:W']

MSE for A:  155326.11984010294
MSE for B:  4311.822664627681
MSE for C:  2484.332046556924
Mean MSE:  54040.75818376252
Score A:  0.8869367102250868
Score B:  0.8880678863853381
Score C:  0.9167532450100108

Model A
                 Feature  Importance
9           direct_rad:W    0.558548 <-------- 3
7          diffuse_rad:W    0.077622 <-------- 2
18      is_in_shadow:idx    0.028293 <--------2
3        clear_sky_rad:W    0.026308 <-------- 3
40          cosinus_year    0.024731
24     snow_density:kgm3    0.022149 <-------- 2
29         sun_azimuth:d    0.021677
20  precip_type_5min:idx    0.016280
6         dew_point_2m:K    0.015750
19        precip_5min:mm    0.014176 <-------- 2

Model B
             Feature  Importance
9       direct_rad:W    0.369634     <--------- 3
30   sun_elevation:d    0.165624     <-------- 2
3    clear_sky_rad:W    0.084479     <--------- 3
18  is_in_shadow:idx    0.074470     <---------
17        is_day:idx    0.037519
22   rain_water:kgm2    0.028054
40      cosinus_year    0.024928
7      diffuse_rad:W    0.019828     <--------- 2
39        sinus_year    0.016917
16  fresh_snow_6h:cm    0.015372     <-------- 1/2

Model C
                 Feature  Importance
30       sun_elevation:d    0.737467 <--------- 2
3        clear_sky_rad:W    0.110762 <--------- 3
9           direct_rad:W    0.029844 <--------- 3
10       direct_rad_1h:J    0.023283
20  precip_type_5min:idx    0.010948
12     fresh_snow_12h:cm    0.010607
24     snow_density:kgm3    0.009615 <-------- 2
14     fresh_snow_24h:cm    0.007877 <-------- 1/2
6         dew_point_2m:K    0.005590
19        precip_5min:mm    0.005087 <-------- 2
```

# Make predictions

In [150]:
# Train the model on the entire training data
# `model` above was fitted on the 83 % training split only. Refit a fresh
# estimator with the same hyperparameters on every labelled row, so the
# test predictions use all available data. `model` itself is left untouched
# so the validation metrics computed from it stay meaningful.
final_model = xgb.XGBRegressor(**params_combined)
final_model.fit(X_frames_train_pd, Y_frames_train_pd)

y_pred = final_model.predict(X_frames_test_pd)



In [151]:

# Sanity check: the number of predictions next to the number of test rows.
print(*(len(collection) for collection in (y_pred, X_frames_test_pd)))

2160 2160


In [152]:
# Negative predictions are replaced by zero. A boolean mask updates the
# array in place in one vectorized step instead of a Python loop.
y_pred[y_pred < 0] = 0



In [153]:
## plot y_pred

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime




# Create submission

In [154]:
y_test_pred = y_pred
print(len(y_test_pred))

# Attach the predictions to the test rows (relies on the predictions being
# in the same row order as test.csv).
test = pd.read_csv('../data/test.csv').assign(prediction=y_test_pred)

# Keep exactly the ids requested by the sample submission, in its order.
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = pd.merge(
    sample_submission[['id']],
    test[['id', 'prediction']],
    on='id',
    how='left',
)
submission.to_csv('submission.csv', index=False)

2160
