# Imports

In [1]:

import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import data_func.aggregation as data_agg
import data_func.read_data as read_data
import data_func.timeseasonality as DTS

warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")



# Load datasets

In [2]:
# Pull the training data and the test inputs through the project's read_data
# helper. The training result is indexed: position 0 is bound as the feature
# frames, position 1 as the target frames.
dataframes = read_data.get_training_data()
X_frames_train, Y_frames_train = dataframes[0], dataframes[1]
X_frames_test = read_data.get_test_data()


# Data cleanup and aggregation

In [3]:
# Make sure that the target values line up with the feature rows

def data_allign(x_train, y_train):
    """Align feature rows with their target values.

    Rows of ``y_train`` that contain any missing value are discarded, then the
    two frames are joined (pandas' default inner merge) on
    ``x_train['date_forecast'] == y_train['time']`` so that only timestamps
    present in both frames survive.

    Args:
        x_train: DataFrame of features with a ``date_forecast`` column.
        y_train: DataFrame of targets with ``time`` and ``pv_measurement``
            columns. It is left unmodified.

    Returns:
        A ``(features, target)`` tuple: the merged frame without the ``time``
        and ``pv_measurement`` columns, and the ``pv_measurement`` Series.
        Both share the merged frame's row order.
    """
    # Work on a NaN-free copy instead of mutating the caller's frame.
    targets = y_train.dropna()
    combined_data = pd.merge(x_train, targets, left_on='date_forecast', right_on='time')
    target = combined_data['pv_measurement']

    # Check each helper column individually: the previous
    # `'time' and 'pv_measurement' in columns` only tested the second name.
    helper_columns = [col for col in ('time', 'pv_measurement') if col in combined_data.columns]
    features = combined_data.drop(columns=helper_columns)

    return features, target

# Aggregate every training frame with the 'mean' mode of data_agg.gen_agg
# (imported in the top import cell), then align it with its target frame so
# that features and targets end up with the same number of rows.
for i in range(len(X_frames_train)):
    aggregated_train = data_agg.gen_agg(X_frames_train[i], 'mean')
    X_frames_train[i], Y_frames_train[i] = data_allign(aggregated_train, Y_frames_train[i])

# The test frames have no targets, so they only get the same aggregation.
for j in range(len(X_frames_test)):
    X_frames_test[j] = data_agg.gen_agg(X_frames_test[j], 'mean')

# Sanity check: the first two numbers (features / targets of the first
# training frame) must be equal.
print(len(X_frames_train[0]))
print(len(Y_frames_train[0]))
print(len(X_frames_test[0]))


29667
29667
720


# Feature engineering

In [4]:
# Append the seasonal columns produced by DTS.append_seasonal_columns
# (imported in the top import cell; presumably the sinus_*/cosinus_* features
# that show up in the feature-importance output -- confirm in
# data_func.timeseasonality), then remove the raw 'date_forecast' column so it
# is not passed to the model. Training frames are processed first, test frames
# second.
for frames in (X_frames_train, X_frames_test):
    for i in range(len(frames)):
        with_seasonality = DTS.append_seasonal_columns(frames[i])
        frames[i] = with_seasonality.drop(columns=['date_forecast'])

# Training the model

In [5]:

# Split the data into training and validation sets

# A fixed seed makes the shuffled split (and therefore the validation scores
# below) reproducible across Restart & Run All.
SEED = 42
# Share of each training frame that is held out for validation.
VAL_FRACTION = 0.17

# NOTE(review): train_test_split shuffles rows; if the rows are time-ordered,
# a chronological hold-out may give a more honest estimate -- confirm.
x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(X_frames_train[0], Y_frames_train[0], test_size=VAL_FRACTION, random_state=SEED)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(X_frames_train[1], Y_frames_train[1], test_size=VAL_FRACTION, random_state=SEED)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(X_frames_train[2], Y_frames_train[2], test_size=VAL_FRACTION, random_state=SEED)

# One regressor per training frame (A, B, C), each with its own hyper-parameters.
model_a = xgb.XGBRegressor(objective='reg:squarederror', random_state=SEED, learning_rate=0.25, max_depth=15, min_child_weight=3, gamma=8, reg_lambda=44)
model_b = xgb.XGBRegressor(objective='reg:squarederror', random_state=SEED, learning_rate=0.25, max_depth=10, min_child_weight=4, gamma=34, reg_lambda=20)
model_c = xgb.XGBRegressor(objective='reg:squarederror', random_state=SEED, learning_rate=0.24, max_depth=10, min_child_weight=3, gamma=8, reg_lambda=44)
# max_depth = 6 gives best
# NOTE(review): the note above contradicts the max_depth values actually used
# (15 / 10 / 10) -- confirm which setting is intended.


model_a.fit(x_train_a, y_train_a)
model_b.fit(x_train_b, y_train_b)
model_c.fit(x_train_c, y_train_c)



# Evaluate prediction

In [6]:
# Evaluate the model based on the validation data

mse_a = mean_squared_error(y_val_a, model_a.predict(x_val_a))
print("MSE for A: ", mse_a)
mse_b = mean_squared_error(y_val_b, model_b.predict(x_val_b))
print("MSE for B: ", mse_b)
mse_c = mean_squared_error(y_val_c, model_c.predict(x_val_c))
print("MSE for C: ", mse_c)
print("Mean MSE: ", (mse_a + mse_b + mse_c) / 3)

# Evaluate the predictions with each estimator's own score() method

score_a = model_a.score(x_val_a, y_val_a)
score_b = model_b.score(x_val_b, y_val_b)
score_c = model_c.score(x_val_c, y_val_c)

print("Score A: ", score_a)
print("Score B: ", score_b)
print("Score C: ", score_c)
print('')

# Get feature importance scores
models = [(model_a, 'A'), (model_b, 'B'), (model_c, 'C')]
# Each model's importances must be labelled with the columns of the frame it
# was fitted on; using model A's columns for all three would mislabel (or
# fail) as soon as the frames differ.
training_columns = {'A': x_train_a.columns, 'B': x_train_b.columns, 'C': x_train_c.columns}
for model in models:

    feature_importance_scores = model[0].feature_importances_

    # Create a DataFrame to associate features with their importance scores
    feature_importance_df1 = pd.DataFrame({'Feature': training_columns[model[1]], 'Importance': feature_importance_scores})

    # Sort features by importance in descending order
    feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

    # Print the ten most important features of this model
    print(f'Model {model[1]}')
    print(feature_importance_df1.head(10))
    print('')


MSE for A:  152142.9309509963
MSE for B:  3800.655202913581
MSE for C:  2637.166722211876
Mean MSE:  52860.25095870725
Score A:  0.8939208780307045
Score B:  0.905450785281629
Score C:  0.9149393602433711

Model A
                 Feature  Importance
10          direct_rad:W    0.544118
8          diffuse_rad:W    0.067302
4        clear_sky_rad:W    0.041510
33    snow_melt_10min:mm    0.031026
48          cosinus_year    0.022969
20      is_in_shadow:idx    0.020353
23  precip_type_5min:idx    0.017036
35         sun_azimuth:d    0.015271
22        precip_5min:mm    0.014586
45             sinus_day    0.013040

Model B
             Feature  Importance
10      direct_rad:W    0.355808
36   sun_elevation:d    0.135486
4    clear_sky_rad:W    0.104658
20  is_in_shadow:idx    0.094286
48      cosinus_year    0.026637
18  fresh_snow_6h:cm    0.024173
8      diffuse_rad:W    0.022668
19        is_day:idx    0.021480
47        sinus_year    0.017070
27   rain_water:kgm2    0.015288

Model 

```
Most_common = ['direct_rad:W', 'clear_sky_rad:W']

MSE for A:  155326.11984010294
MSE for B:  4311.822664627681
MSE for C:  2484.332046556924
Mean MSE:  54040.75818376252
Score A:  0.8869367102250868
Score B:  0.8880678863853381
Score C:  0.9167532450100108

Model A
                 Feature  Importance
9           direct_rad:W    0.558548 <-------- 3
7          diffuse_rad:W    0.077622 <-------- 2
18      is_in_shadow:idx    0.028293 <--------2
3        clear_sky_rad:W    0.026308 <-------- 3
40          cosinus_year    0.024731
24     snow_density:kgm3    0.022149 <-------- 2
29         sun_azimuth:d    0.021677
20  precip_type_5min:idx    0.016280
6         dew_point_2m:K    0.015750
19        precip_5min:mm    0.014176 <-------- 2

Model B
             Feature  Importance
9       direct_rad:W    0.369634     <--------- 3
30   sun_elevation:d    0.165624     <-------- 2
3    clear_sky_rad:W    0.084479     <--------- 3
18  is_in_shadow:idx    0.074470     <---------
17        is_day:idx    0.037519
22   rain_water:kgm2    0.028054
40      cosinus_year    0.024928
7      diffuse_rad:W    0.019828     <--------- 2
39        sinus_year    0.016917
16  fresh_snow_6h:cm    0.015372     <-------- 1/2

Model C
                 Feature  Importance
30       sun_elevation:d    0.737467 <--------- 2
3        clear_sky_rad:W    0.110762 <--------- 3
9           direct_rad:W    0.029844 <--------- 3
10       direct_rad_1h:J    0.023283
20  precip_type_5min:idx    0.010948
12     fresh_snow_12h:cm    0.010607
24     snow_density:kgm3    0.009615 <-------- 2
14     fresh_snow_24h:cm    0.007877 <-------- 1/2
6         dew_point_2m:K    0.005590
19        precip_5min:mm    0.005087 <-------- 2
```

# Make predictions

In [7]:
""" 
model_a = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, learning_rate=0.25, max_depth=10, min_child_weight=2, gamma=150, reg_lambda=20)
model_b = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, learning_rate=0.25, max_depth=10, min_child_weight=4, gamma=34, reg_lambda=20)
model_c = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, learning_rate=0.24, max_depth=10, min_child_weight=3, gamma=8, reg_lambda=44)
 """
# max_depth = 6 gives best
# Refit the three models on their complete training frames (no validation
# hold-out this time), then predict the matching test frames.
site_models = (model_a, model_b, model_c)
for site_idx, site_model in enumerate(site_models):
    site_model.fit(X_frames_train[site_idx], Y_frames_train[site_idx])

y_pred_a, y_pred_b, y_pred_c = [
    site_model.predict(X_frames_test[site_idx])
    for site_idx, site_model in enumerate(site_models)
]
print(len(y_pred_a))

# Stack the per-model predictions in A, B, C order into one flat array.
y_pred = np.concatenate([y_pred_a, y_pred_b, y_pred_c], axis=0)



720


In [8]:
# Each prediction array should be exactly as long as the test frame it was
# computed from; print the two lengths side by side for A, B and C.
for frame_idx, frame_pred in enumerate((y_pred_a, y_pred_b, y_pred_c)):
    print(len(frame_pred), len(X_frames_test[frame_idx]))

720 720
720 720
720 720


In [None]:
# Replace every negative prediction with 0, in place.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# recorded submission may have been written without this clamp -- re-run the
# notebook top to bottom before submitting.
y_pred[y_pred < 0] = 0



# Create submission

In [9]:
# Attach the concatenated predictions to the rows of test.csv, then left-join
# them onto the ids of the sample submission so the output keeps that file's
# ids and row order.
y_test_pred = y_pred
print(len(y_test_pred))

test = pd.read_csv('../data/test.csv').assign(prediction=y_test_pred)
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = pd.merge(
    sample_submission[['id']],
    test[['id', 'prediction']],
    on='id',
    how='left',
)
submission.to_csv('submission_2.csv', index=False)

2160
