# Imports

In [76]:

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")
warnings.filterwarnings("ignore", category=FutureWarning, message="is_sparse is deprecated")



# Load datasets

In [77]:
dataframes = read_data.get_training_data()
X_frames_train = dataframes[0]
Y_frames_train = dataframes[1]
X_frames_test = read_data.get_test_data()


In [78]:
XTRA = X_frames_train[0]
XTRB = X_frames_train[1]
XTRC = X_frames_train[2]

YA = Y_frames_train[0]
YB = Y_frames_train[1]
YC = Y_frames_train[2]

In [79]:
start_times_b = [
    '2019-03-24 00:00:00',
'2019-05-31 00:00:00',
'2019-10-28 14:00:00',
'2020-02-23 17:00:00',
'2020-03-26 14:00:00',
'2020-04-02 03:00:00',
'2020-07-12 23:00:00',
'2020-09-24 14:00:00',
'2021-01-15 10:00:00',
'2021-04-29 00:00:00',
'2021-06-05 03:00:00',
'2021-06-13 04:00:00',
'2021-06-22 03:00:00',
'2021-07-03 15:00:00',
'2021-08-26 00:00:00',
'2021-09-08 15:00:00',
'2021-09-19 02:00:00',
'2021-01-30 15:00:00',
'2022-02-10 21:00:00',
'2022-02-16 14:00:00',
'2022-03-19 15:00:00',
'2022-12-05 17:00:00',
'2023-02-24 01:00:00',
'2023-03-07 10:00:00',
'2023-03-25 23:00:00'
]

end_times_b = [
    '2019-03-28 00:00:00',
'2019-06-03 14:00:00',
'2019-10-30 23:00:00',
'2020-03-06 06:00:00',
'2020-03-27 22:00:00',
'2020-04-16 08:00:00',
'2020-08-25 23:00:00',
'2020-09-25 23:00:00',
'2021-04-19 09:00:00',
'2021-05-01 23:00:00',
'2021-06-07 08:00:00',
'2021-06-14 10:00:00',
'2021-06-24 08:00:00',
'2021-07-06 07:00:00',
'2021-09-03 22:00:00',
'2021-09-14 13:00:00',
'2021-09-27 10:00:00',
'2022-02-04 09:00:00',
'2022-02-13 07:00:00',
'2022-02-24 06:00:00',
'2022-04-13 06:00:00',
'2023-01-05 08:00:00',
'2023-02-27 05:00:00',
'2023-03-10 01:00:00',
'2023-03-28 02:00:00'
]

print("before B drop: ", len(YB))

for i in range(len(start_times_b)):
    a = pd.to_datetime(start_times_b[i])
    b = pd.to_datetime(end_times_b[i])
    ind = YB[ (YB['time'] >= a) & (YB['time'] <= b)].index
    
    YB.drop(ind, inplace=True)

print("after B drop: ", len(YB))

print("before C drop: ", len(YC))

a = pd.to_datetime("2020-02-23 17:00:00")
b = pd.to_datetime('2020-03-08 08:00:00')
ind = YC[ (YC['time'] >= a) & (YC['time'] <= b)].index
    
YC.drop(ind, inplace=True)
print("after C drop: ", len(YC))

before B drop:  32848
after B drop:  20628
before C drop:  32155
after C drop:  31827


# Data clean up an aggregation

In [80]:
# Making sure that target values line up with x_values
import data_func.aggregation as data_agg

categorical_col = ['dew_or_rime:idx', 'precip_type_5min:idx', 'is_day:idx', 'is_in_shadow:idx']

def aggregate_correct_x(x: pd.DataFrame) -> pd.DataFrame:
   '''
   Takes a given dataframe and returns an aggregated dataframe based on selected categorical functions. 
   Assumes grouping of 4.
   '''
   categorical = x[["date_forecast"] + categorical_col]
   mean = x.drop(columns=categorical_col)

   categorical = data_agg.gen_agg(categorical, agg_type=data_agg.stocastic_median)
   mean = data_agg.gen_agg(mean, "mean")

   return pd.merge(categorical, mean, on="date_forecast")

def data_allign(x_train, y_train):

  y_train.dropna(inplace=True)
  x_train = aggregate_correct_x(x_train)
  combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
  y_train = combined_data['pv_measurement']

  if 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns=['time', 'pv_measurement'], inplace=True)
    
  return combined_data, y_train



X_train = [XTRA, XTRB, XTRC]
Y_train = [YA, YB, YC]

for i in range(len(X_train)):
    X_train[i], Y_train[i] = data_allign(X_train[i], Y_train[i])

for j in range(len(X_frames_test)):
    X_frames_test[j] = aggregate_correct_x(X_frames_test[j])


In [81]:

print(len(X_train[0]), len(X_train[1]), len(X_train[2]))
print(len(Y_train[0]), len(Y_train[1]), len(Y_train[2]))

29667 17161 22813
29667 17161 22813


# Feature engineering

In [82]:
import data_func.timeseasonality as DTS
import data_func.date_forecast as DTF
import data_func.combine_all_frames as CAF
import data_func.one_hot_encoding as OHE

for i in range(len(X_train)):
    X_train[i] = DTS.append_seasonal_columns(X_train[i])
    X_train[i] = DTF.date_forecast_columns(X_train[i])
    # X_train[i].drop(columns=['absolute_humidity_2m:gm3'], inplace=True)
    # X_train[i].drop(columns=['air_density_2m:kgm3'], inplace=True)
    # X_train[i]['ceiling_height_agl:m'] = X_train[i]['ceiling_height_agl:m'].fillna(0)
    # X_train[i]['cloud_base_agl:m'] = X_train[i]['cloud_base_agl:m'].fillna(100000)

X_train_pd = CAF.combine_all_frames(X_train)
X_train_pd = OHE.one_hot_encode(X_train_pd, ["location"])

for i in range(len(X_frames_test)):
    X_frames_test[i] = DTS.append_seasonal_columns(X_frames_test[i])
    X_frames_test[i] = DTF.date_forecast_columns(X_frames_test[i])

X_frames_test_pd = CAF.combine_all_frames(X_frames_test)
X_frames_test_pd = OHE.one_hot_encode(X_frames_test_pd, ["location"])

# Y_frames_train_pd = CAF.combine_all_frames(Y_frames_train)
Y_frames_train_pd = pd.concat(Y_train, ignore_index=True)

# print(X_frames_train[0]['ceiling_height_agl:m'].isnull().sum())
X_frames_test_pd.head()

Unnamed: 0,dew_or_rime:idx,precip_type_5min:idx,is_day:idx,is_in_shadow:idx,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,...,cosinus_day,sinus_year,cosinus_year,year,month,day,hours,location_0.0,location_1.0,location_2.0
0,0,0,0,1,4.325,1.28675,912.700012,0.0,0.0,1061.550049,...,1.0,0.879458,-0.475977,2023,5,1,0,1,0,0
1,0,0,0,1,4.275,1.286,1482.099976,0.0,0.0,1075.100098,...,0.965926,0.879116,-0.476607,2023,5,1,1,1,0,0
2,0,0,0,1,4.15,1.28375,1791.300049,0.0,0.0,1200.400024,...,0.866025,0.878775,-0.477237,2023,5,1,2,1,0,0
3,0,0,1,0,4.025,1.282,2312.875,10124.424805,11.675,1179.849976,...,0.707107,0.878432,-0.477867,2023,5,1,3,1,0,0
4,0,0,1,0,3.9,1.281,2198.299805,141748.59375,76.875,920.049988,...,0.5,0.87809,-0.478496,2023,5,1,4,1,0,0


In [83]:
def columnsToDrop(df):
    columns_to_drop = [
    'wind_speed_10m:ms',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
    'snow_density:kgm3',
    'snow_melt_10min:mm',
    'snow_drift:idx',
    'elevation:m',
    'year',
    'prob_rime:p'
    ]
    df = df.drop(columns=columns_to_drop)
    return df
X_frames_test_pd = columnsToDrop(X_frames_test_pd)
X_train_pd = columnsToDrop(X_train_pd)


# Training the model

In [84]:
import optuna

# Split the data into training and validation sets

x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_train_pd, Y_frames_train_pd, test_size=0.17, random_state=42)
# x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_frames_train[1], Y_frames_train[1], test_size=0.17, random_state=42)
# x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_frames_train[2], Y_frames_train[2], test_size=0.17, random_state=42)

def objective(trial):

  params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
    }

  model_a = xgb.XGBRegressor(**params) # Change to model to optimize
  model_a.fit(x_train_a, y_train_a)

  # Make predictions on the validation set
  y_pred = model_a.predict(x_val_a)

  # Calculate the Mean Squared Error (MSE) as the metric to optimize
  mse = mean_squared_error(y_val_a, y_pred)

  return mse

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

# best = study.best_params
# #Print the best hyperparameters found
# print("Best hyperparameters:", best)

# Use params from hyperparameter tuning using optuna
# params_a = {'random_state': 42, 'n_estimators': 980, 'max_depth': 9, 'learning_rate': 0.029035565559484028, 'subsample': 0.8393121619033767, 'colsample_bytree': 0.7589542758688459}
# params_b = {'random_state': 42, 'n_estimators': 955, 'max_depth': 9, 'learning_rate': 0.02949625834198986, 'subsample': 0.8030196155828968, 'colsample_bytree': 0.72518389089994}
# params_c = {'random_state': 42, 'n_estimators': 727, 'max_depth': 8, 'learning_rate': 0.08609213174337473, 'subsample': 0.8107057409889747, 'colsample_bytree': 0.8763563332327975}
# model_a = xgb.XGBRegressor(**params_a)
# model_b = xgb.XGBRegressor(**params_b)
# model_c = xgb.XGBRegressor(**params_c)

# model_a.fit(x_train_a, y_train_a)
# model_b.fit(x_train_b, y_train_b)
# model_c.fit(x_train_c, y_train_c)

x_train_a.head()

[I 2023-10-31 09:01:26,307] A new study created in memory with name: no-name-0d2dc177-9c1f-44cb-9219-e02ed7718072


[I 2023-10-31 09:02:00,057] Trial 0 finished with value: 74230.69322379776 and parameters: {'n_estimators': 213, 'max_depth': 10, 'learning_rate': 0.021606246684266153, 'subsample': 0.9432948911024266, 'colsample_bytree': 0.8541858256532062}. Best is trial 0 with value: 74230.69322379776.
[I 2023-10-31 09:02:31,883] Trial 1 finished with value: 64686.99078150789 and parameters: {'n_estimators': 835, 'max_depth': 8, 'learning_rate': 0.06815128077919139, 'subsample': 0.5450449540302948, 'colsample_bytree': 0.7128320613290939}. Best is trial 1 with value: 64686.99078150789.
[I 2023-10-31 09:02:36,841] Trial 2 finished with value: 83344.98807142107 and parameters: {'n_estimators': 425, 'max_depth': 4, 'learning_rate': 0.060273587634094135, 'subsample': 0.9053008780421685, 'colsample_bytree': 0.5693934230665864}. Best is trial 1 with value: 64686.99078150789.
[I 2023-10-31 09:02:47,815] Trial 3 finished with value: 93622.17751045756 and parameters: {'n_estimators': 386, 'max_depth': 4, 'lea

Best hyperparameters: {'n_estimators': 580, 'max_depth': 9, 'learning_rate': 0.05275655498939793, 'subsample': 0.7381280329820211, 'colsample_bytree': 0.6348798516047398}


Unnamed: 0,dew_or_rime:idx,precip_type_5min:idx,is_day:idx,is_in_shadow:idx,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,...,sinus_day,cosinus_day,sinus_year,cosinus_year,month,day,hours,location_0.0,location_1.0,location_2.0
16461,0,0,0,1,5.9,1.2695,,22156.525391,0.0,,...,-0.9659258,0.258819,0.956976,-0.290166,4,18,19,1,0,0
13504,0,0,0,1,6.6,1.2605,421.149994,359.049988,0.0,381.450012,...,-0.5,-0.866025,-0.251596,0.967832,12,16,14,1,0,0
45010,0,0,0,1,6.275,1.2485,1716.75,0.0,0.0,112.175003,...,0.8660254,-0.5,-0.13777,0.990464,12,23,8,0,1,0
6049,0,0,0,1,5.0,1.1885,1261.125,0.0,0.0,1261.125,...,-0.258819,0.965926,0.632862,0.774265,2,9,23,1,0,0
29498,0,0,0,1,5.45,1.27525,,0.0,0.0,40.650002,...,-1.631132e-12,1.0,-0.977122,0.212678,10,14,0,1,0,0


In [86]:

params_combined = {'random_state': 42, 'n_estimators': 580, 'max_depth': 9, 'learning_rate': 0.05275655498939793, 'subsample': 0.7381280329820211, 'colsample_bytree': 0.6348798516047398}
model = xgb.XGBRegressor(**params_combined)

model.fit(x_train_a, y_train_a)

# Evaluate prediction

In [87]:
# Evaluate the model based on the validation data

##mse_a = mean_squared_error(y_val_a, model_a.predict(x_val_a))
# print("MSE for A: ", mse_a)
# mse_b = mean_squared_error(y_val_b, model_b.predict(x_val_b))
# print("MSE for B: ", mse_b)
# mse_c = mean_squared_error(y_val_c, model_c.predict(x_val_c))
# print("MSE for C: ", mse_c)
# print("Mean MSE: ", (mse_a + mse_b + mse_c) / 3)
mse_combined = mean_squared_error(y_val_a, model.predict(x_val_a))
MAE_combined = mean_absolute_error(y_val_a, model.predict(x_val_a))

# Evaluate the predictions

# score_a = model_a.score(x_val_a, y_val_a)
# score_b = model_b.score(x_val_b, y_val_b)
# score_c = model_c.score(x_val_c, y_val_c)

# print("Score A: ", score_a)
# print("Score B: ", score_b)
# print("Score C: ", score_c)
# print('')
print("MSE Combined: ", mse_combined)
print("MAE Combined: ", MAE_combined)
score = model.score(X_train_pd, Y_frames_train_pd)

feature_importance_scores = model.feature_importances_

# Create a DataFrame to associate features with their importance scores
feature_importance_df = pd.DataFrame({'Feature': X_train_pd.columns, 'Importance': feature_importance_scores})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

pd.set_option('display.max_rows', 500)
feature_importance_df.head(500)
print(feature_importance_df)
pd.reset_option('display.max_rows')
# Print or visualize the feature importance scores

# Get feature importance scores
# models = [(model_a, 'A'), (model_b, 'B'), (model_c, 'C')]
# for model in models:

#     feature_importance_scores = model[0].feature_importances_

# # Create a DataFrame to associate features with their importance scores
#     feature_importance_df1 = pd.DataFrame({'Feature': x_train_a.columns, 'Importance': feature_importance_scores})

# # Sort features by importance in descending order
#     feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

# # Print or visualize the feature importance scores
    # pd.set_option('display.max_rows', 500)
    # feature_importance_df1.head(500)
    # print(f'Model {model[1]}')
    # print(feature_importance_df1)
    # pd.reset_option('display.max_rows')


MSE Combined:  64110.845163030775
MAE Combined:  89.44246072676326
                           Feature  Importance
44                    location_1.0    0.182418
13                    direct_rad:W    0.175305
43                    location_0.0    0.125032
45                    location_2.0    0.119214
8                  clear_sky_rad:W    0.047853
14                 direct_rad_1h:J    0.042837
40                           month    0.022346
11                   diffuse_rad:W    0.021769
36                       sinus_day    0.019487
42                           hours    0.018371
22                  precip_5min:mm    0.016894
28                   snow_depth:cm    0.015506
39                    cosinus_year    0.013708
30                   sun_azimuth:d    0.013125
37                     cosinus_day    0.011642
18               fresh_snow_24h:cm    0.010292
31                 sun_elevation:d    0.008126
38                      sinus_year    0.007348
27                sfc_pressure:hPa    0.

```
Most_common = ['direct_rad:W', 'clear_sky_rad:W']

MSE for A:  155326.11984010294
MSE for B:  4311.822664627681
MSE for C:  2484.332046556924
Mean MSE:  54040.75818376252
Score A:  0.8869367102250868
Score B:  0.8880678863853381
Score C:  0.9167532450100108

Model A
                 Feature  Importance
9           direct_rad:W    0.558548 <-------- 3
7          diffuse_rad:W    0.077622 <-------- 2
18      is_in_shadow:idx    0.028293 <--------2
3        clear_sky_rad:W    0.026308 <-------- 3
40          cosinus_year    0.024731
24     snow_density:kgm3    0.022149 <-------- 2
29         sun_azimuth:d    0.021677
20  precip_type_5min:idx    0.016280
6         dew_point_2m:K    0.015750
19        precip_5min:mm    0.014176 <-------- 2

Model B
             Feature  Importance
9       direct_rad:W    0.369634     <--------- 3
30   sun_elevation:d    0.165624     <-------- 2
3    clear_sky_rad:W    0.084479     <--------- 3
18  is_in_shadow:idx    0.074470     <---------
17        is_day:idx    0.037519
22   rain_water:kgm2    0.028054
40      cosinus_year    0.024928
7      diffuse_rad:W    0.019828     <--------- 2
39        sinus_year    0.016917
16  fresh_snow_6h:cm    0.015372     <-------- 1/2

Model C
                 Feature  Importance
30       sun_elevation:d    0.737467 <--------- 2
3        clear_sky_rad:W    0.110762 <--------- 3
9           direct_rad:W    0.029844 <--------- 3
10       direct_rad_1h:J    0.023283
20  precip_type_5min:idx    0.010948
12     fresh_snow_12h:cm    0.010607
24     snow_density:kgm3    0.009615 <-------- 2
14     fresh_snow_24h:cm    0.007877 <-------- 1/2
6         dew_point_2m:K    0.005590
19        precip_5min:mm    0.005087 <-------- 2
```

# Make predictions

In [None]:
# Train the model on the entire training data

y_pred = model.predict(X_frames_test_pd)

In [None]:

print(len(y_pred), len(X_frames_test_pd))

2160 2160


In [None]:
for i in range(len(y_pred)):
    if y_pred[i] < 0: 
        y_pred[i] = 0



In [None]:
## plot y_pred

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime




# Create submission

In [None]:
y_test_pred = y_pred
print(len(y_test_pred))

test = pd.read_csv('../data/test.csv')
test['prediction'] = y_test_pred
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
submission.to_csv('submission.csv', index=False)

2160
