# Imports

In [53]:

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")
warnings.filterwarnings("ignore", category=FutureWarning, message="is_sparse is deprecated")



# Load datasets

In [54]:
dataframes = read_data.get_training_data()
X_frames_train = dataframes[0]
Y_frames_train = dataframes[1]
X_frames_test = read_data.get_test_data()


In [55]:
XTRA = X_frames_train[0]
XTRB = X_frames_train[1]
XTRC = X_frames_train[2]

YA = Y_frames_train[0]
YB = Y_frames_train[1]
YC = Y_frames_train[2]

In [56]:
start_times_b = [
    '2019-03-24 00:00:00',
'2019-05-31 00:00:00',
'2019-10-28 14:00:00',
'2020-02-23 17:00:00',
'2020-03-26 14:00:00',
'2020-04-02 03:00:00',
'2020-07-12 23:00:00',
'2020-09-24 14:00:00',
'2021-01-15 10:00:00',
'2021-04-29 00:00:00',
'2021-06-05 03:00:00',
'2021-06-13 04:00:00',
'2021-06-22 03:00:00',
'2021-07-03 15:00:00',
'2021-08-26 00:00:00',
'2021-09-08 15:00:00',
'2021-09-19 02:00:00',
'2021-01-30 15:00:00',
'2022-02-10 21:00:00',
'2022-02-16 14:00:00',
'2022-03-19 15:00:00',
'2022-12-05 17:00:00',
'2023-02-24 01:00:00',
'2023-03-07 10:00:00',
'2023-03-25 23:00:00'
]

end_times_b = [
    '2019-03-28 00:00:00',
'2019-06-03 14:00:00',
'2019-10-30 23:00:00',
'2020-03-06 06:00:00',
'2020-03-27 22:00:00',
'2020-04-16 08:00:00',
'2020-08-25 23:00:00',
'2020-09-25 23:00:00',
'2021-04-19 09:00:00',
'2021-05-01 23:00:00',
'2021-06-07 08:00:00',
'2021-06-14 10:00:00',
'2021-06-24 08:00:00',
'2021-07-06 07:00:00',
'2021-09-03 22:00:00',
'2021-09-14 13:00:00',
'2021-09-27 10:00:00',
'2022-02-04 09:00:00',
'2022-02-13 07:00:00',
'2022-02-24 06:00:00',
'2022-04-13 06:00:00',
'2023-01-05 08:00:00',
'2023-02-27 05:00:00',
'2023-03-10 01:00:00',
'2023-03-28 02:00:00'
]

print("before B drop: ", len(YB))

for i in range(len(start_times_b)):
    a = pd.to_datetime(start_times_b[i])
    b = pd.to_datetime(end_times_b[i])
    ind = YB[ (YB['time'] >= a) & (YB['time'] <= b)].index
    
    YB.drop(ind, inplace=True)

print("after B drop: ", len(YB))

print("before C drop: ", len(YC))

a = pd.to_datetime("2020-02-23 17:00:00")
b = pd.to_datetime('2020-03-08 08:00:00')
ind = YC[ (YC['time'] >= a) & (YC['time'] <= b)].index
    
YC.drop(ind, inplace=True)
print("after C drop: ", len(YC))

before B drop:  32848
after B drop:  20628
before C drop:  32155
after C drop:  31827


# Data clean up an aggregation

In [57]:
# Making sure that target values line up with x_values
import data_func.aggregation as data_agg

categorical_col = ['dew_or_rime:idx', 'precip_type_5min:idx', 'is_day:idx', 'is_in_shadow:idx']

def aggregate_correct_x(x: pd.DataFrame) -> pd.DataFrame:
   '''
   Takes a given dataframe and returns an aggregated dataframe based on selected categorical functions. 
   Assumes grouping of 4.
   '''
   categorical = x[["date_forecast"] + categorical_col]
   mean = x.drop(columns=categorical_col)

   categorical = data_agg.gen_agg(categorical, agg_type=data_agg.stocastic_median)
   mean = data_agg.gen_agg(mean, "mean")

   return pd.merge(categorical, mean, on="date_forecast")

def data_allign(x_train, y_train):

  y_train.dropna(inplace=True)
  x_train = aggregate_correct_x(x_train)
  combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
  y_train = combined_data['pv_measurement']

  if 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns=['time', 'pv_measurement'], inplace=True)
    
  return combined_data, y_train



X_train = [XTRA, XTRB, XTRC]
Y_train = [YA, YB, YC]

for i in range(len(X_train)):
    X_train[i], Y_train[i] = data_allign(X_train[i], Y_train[i])

for j in range(len(X_frames_test)):
    X_frames_test[j] = aggregate_correct_x(X_frames_test[j])


print(len(X_train[0]), len(Y_train[0]))
print(len(X_train[1]), len(Y_train[1]))
print(len(X_train[2]), len(Y_train[2]))

print(len(X_frames_test[0]))
print(len(X_frames_test[1]))
print(len(X_frames_test[2]))

29667 29667
17161 17161
22813 22813
720
720
720


# Feature engineering

In [58]:
import data_func.timeseasonality as DTS
import data_func.date_forecast as DTF
import data_func.one_hot_encoding as OHE

categorical_features = ['dew_or_rime:idx', 'precip_type_5min:idx']


for i in range(len(X_train)):
    X_train[i] = DTS.append_seasonal_columns(X_train[i])
    X_train[i] = DTF.date_forecast_columns(X_train[i])
    # X_train[i] = OHE.one_hot_encode(X_train[i], categorical_features)
    X_train[i].drop(columns=['snow_drift:idx'], inplace=True)



for i in range(len(X_frames_test)):
    X_frames_test[i] = DTS.append_seasonal_columns(X_frames_test[i])
    X_frames_test[i] = DTF.date_forecast_columns(X_frames_test[i])
    # X_frames_test[i] = OHE.one_hot_encode(X_frames_test[i], categorical_features)
    X_frames_test[i].drop(columns=['snow_drift:idx'], inplace=True)



# Training the model

In [59]:

# Split the data into training and validation sets

x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_train[0], Y_train[0], test_size=0.17, random_state=42)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_train[1], Y_train[1], test_size=0.17, random_state=42)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_train[2], Y_train[2], test_size=0.17, random_state=42)

# Use params from hyperparameter tuning using optuna
params_a = {'random_state': 42, 'n_estimators': 980, 'max_depth': 9, 'learning_rate': 0.029035565559484028, 'subsample': 0.8393121619033767, 'colsample_bytree': 0.7589542758688459}
params_b = {'random_state': 42, 'n_estimators': 955, 'max_depth': 9, 'learning_rate': 0.02949625834198986, 'subsample': 0.8030196155828968, 'colsample_bytree': 0.72518389089994}
params_c = {'random_state': 42, 'n_estimators': 727, 'max_depth': 8, 'learning_rate': 0.08609213174337473, 'subsample': 0.8107057409889747, 'colsample_bytree': 0.8763563332327975}
model_a = xgb.XGBRegressor(**params_a)
model_b = xgb.XGBRegressor(**params_b)
model_c = xgb.XGBRegressor(**params_c)


model_a.fit(x_train_a, y_train_a)
model_b.fit(x_train_b, y_train_b)
model_c.fit(x_train_c, y_train_c)



# Evaluate prediction

In [1]:
# Evaluate the model based on the validation data

mae_a = mean_absolute_error(y_val_a, model_a.predict(x_val_a))
print("MAE for A: ", mae_a)
mae_b = mean_absolute_error(y_val_b, model_b.predict(x_val_b))
print("MAE for B: ", mae_b)
mae_c = mean_absolute_error(y_val_c, model_c.predict(x_val_c))
print("MAE for C: ", mae_c)
print("Mean MAE: ", (mae_a + mae_b + mae_c) / 3)

# Evaluate the predictions

score_a = model_a.score(x_val_a, y_val_a)
score_b = model_b.score(x_val_b, y_val_b)
score_c = model_c.score(x_val_c, y_val_c)

print("Score A: ", score_a)
print("Score B: ", score_b)
print("Score C: ", score_c)
print('')

# Get feature importance scores
models = [(model_a, 'A'), (model_b, 'B'), (model_c, 'C')]
for model in models:

    feature_importance_scores = model[0].feature_importances_

# Create a DataFrame to associate features with their importance scores
    feature_importance_df1 = pd.DataFrame({'Feature': x_train_a.columns, 'Importance': feature_importance_scores})

# Sort features by importance in descending order
    feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

# Print or visualize the feature importance scores
    pd.set_option('display.max_rows', 500)
    feature_importance_df1.head(500)
    print(f'Model {model[1]}')
    print(feature_importance_df1)
    pd.reset_option('display.max_rows')


NameError: name 'mean_squared_error' is not defined

```
Most_common = ['direct_rad:W', 'clear_sky_rad:W']

MSE for A:  155326.11984010294
MSE for B:  4311.822664627681
MSE for C:  2484.332046556924
Mean MSE:  54040.75818376252
Score A:  0.8869367102250868
Score B:  0.8880678863853381
Score C:  0.9167532450100108

Model A
                 Feature  Importance
9           direct_rad:W    0.558548 <-------- 3
7          diffuse_rad:W    0.077622 <-------- 2
18      is_in_shadow:idx    0.028293 <--------2
3        clear_sky_rad:W    0.026308 <-------- 3
40          cosinus_year    0.024731
24     snow_density:kgm3    0.022149 <-------- 2
29         sun_azimuth:d    0.021677
20  precip_type_5min:idx    0.016280
6         dew_point_2m:K    0.015750
19        precip_5min:mm    0.014176 <-------- 2

Model B
             Feature  Importance
9       direct_rad:W    0.369634     <--------- 3
30   sun_elevation:d    0.165624     <-------- 2
3    clear_sky_rad:W    0.084479     <--------- 3
18  is_in_shadow:idx    0.074470     <---------
17        is_day:idx    0.037519
22   rain_water:kgm2    0.028054
40      cosinus_year    0.024928
7      diffuse_rad:W    0.019828     <--------- 2
39        sinus_year    0.016917
16  fresh_snow_6h:cm    0.015372     <-------- 1/2

Model C
                 Feature  Importance
30       sun_elevation:d    0.737467 <--------- 2
3        clear_sky_rad:W    0.110762 <--------- 3
9           direct_rad:W    0.029844 <--------- 3
10       direct_rad_1h:J    0.023283
20  precip_type_5min:idx    0.010948
12     fresh_snow_12h:cm    0.010607
24     snow_density:kgm3    0.009615 <-------- 2
14     fresh_snow_24h:cm    0.007877 <-------- 1/2
6         dew_point_2m:K    0.005590
19        precip_5min:mm    0.005087 <-------- 2
```

# Make predictions

In [61]:
# Train the model on the entire training data
model_a.fit(X_train[0], Y_train[0])
model_b.fit(X_train[1], Y_train[1])
model_c.fit(X_train[2], Y_train[2])

y_pred_a = model_a.predict(X_frames_test[0])
y_pred_b = model_b.predict(X_frames_test[1])
y_pred_c = model_c.predict(X_frames_test[2])
print(len(y_pred_a))

y_pred = np.concatenate((y_pred_a, y_pred_b, y_pred_c), axis=0)



720


In [62]:
print(len(y_pred_a), len(X_frames_test[0]))
print(len(y_pred_b), len(X_frames_test[1]))
print(len(y_pred_c), len(X_frames_test[2]))

720 720
720 720
720 720


In [63]:
for i in range(len(y_pred)):
    if y_pred[i] < 0: 
        y_pred[i] = 0



# Create submission

In [64]:
y_test_pred = y_pred
print(len(y_test_pred))

test = pd.read_csv('../data/test.csv')
test['prediction'] = y_test_pred
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
submission.to_csv('submission.csv', index=False)

2160
