# Imports

In [1]:

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")
warnings.filterwarnings("ignore", category=FutureWarning, message="is_sparse is deprecated")


# Load datasets

In [2]:
# The training loader returns an indexable result: position 0 is used as the
# feature frames and position 1 as the target frames.
dataframes = read_data.get_training_data()
X_frames_train, Y_frames_train = dataframes[0], dataframes[1]

# The test set only has feature frames.
X_frames_test = read_data.get_test_data()


In [3]:
# Give the first three feature frames and their matching target frames
# individual names (suffixes A, B and C).
XTRA, XTRB, XTRC = (X_frames_train[k] for k in range(3))
YA, YB, YC = (Y_frames_train[k] for k in range(3))

drop 0 since we want to match on the "whole" hour.

In [4]:
# Replace every test frame with a version that no longer has the "date_calc" column.
for position, test_frame in enumerate(X_frames_test):
    X_frames_test[position] = test_frame.drop(columns=["date_calc"])

# Data clean up

In [5]:
# Make sure that the target values line up with the x values
import data_func.aggregation as data_agg

# Columns that are aggregated separately from the numeric columns
# (they are not averaged with the rest).
categorical_col = [
    "dew_or_rime:idx",
    "precip_type_5min:idx",
]

def aggregate_correct_x(x: pd.DataFrame) -> pd.DataFrame:
    """Aggregate a feature frame, handling categorical and other columns separately.

    The columns named in ``categorical_col`` (together with ``date_forecast``)
    are aggregated with ``data_agg.stocastic_median``; every other column is
    aggregated with ``"mean"``. Both results are then joined on
    ``date_forecast``.

    Args:
        x: Feature frame containing ``date_forecast`` and the columns in
            ``categorical_col``.

    Returns:
        The merged frame of aggregated categorical and mean-aggregated columns.
    """
    # Split first, aggregate second, so each part gets its own aggregation rule.
    categorical_part = x[["date_forecast"] + categorical_col]
    numeric_part = x.drop(columns=categorical_col)

    categorical_agg = data_agg.gen_agg(categorical_part, data_agg.stocastic_median)
    numeric_agg = data_agg.gen_agg(numeric_part, "mean")

    return pd.merge(categorical_agg, numeric_agg, on="date_forecast")

def data_allign(x_train, y_train):
    """Align aggregated features with their target values.

    Rows of ``y_train`` containing missing values are discarded, ``x_train`` is
    aggregated with ``aggregate_correct_x``, and the two are inner-joined on
    ``date_forecast`` (features) == ``time`` (targets), so only timestamps
    present on both sides survive.

    Args:
        x_train: Raw feature frame with a ``date_forecast`` column.
        y_train: Target frame with ``time`` and ``pv_measurement`` columns.
            It is not modified.

    Returns:
        A ``(features, target)`` pair with identical row order: ``features``
        is the joined frame without ``time`` / ``pv_measurement`` and
        ``target`` is the ``pv_measurement`` column.
    """
    # Work on a NaN-free copy instead of mutating the caller's frame in place.
    y_train = y_train.dropna()
    x_train = aggregate_correct_x(x_train)
    combined_data = pd.merge(x_train, y_train, left_on='date_forecast', right_on='time')
    target = combined_data['pv_measurement']

    # Both columns are guaranteed to exist here: 'time' is the right merge key
    # and 'pv_measurement' was just read. The previous guard
    # (`'time' and 'pv_measurement' in ...`) only ever tested 'pv_measurement'.
    features = combined_data.drop(columns=['time', 'pv_measurement'])

    return features, target



# Align each (features, target) pair, in A, B, C order, and split the results
# back into parallel lists.
aligned_pairs = [
    data_allign(features, target)
    for features, target in zip((XTRA, XTRB, XTRC), (YA, YB, YC))
]
X_train = [features for features, _ in aligned_pairs]
Y_train = [target for _, target in aligned_pairs]

# The test frames have no targets to align with, so they are only aggregated.
for position, test_frame in enumerate(X_frames_test):
    X_frames_test[position] = aggregate_correct_x(test_frame)




In [6]:
# Print the row count of every feature frame next to that of its target,
# so a mismatch is easy to spot.
for features, target in zip(X_train, Y_train):
    print("x,y: ", len(features), len(target))



x,y:  29667 29667
x,y:  29218 29218
x,y:  23141 23141


# Feature engineering

In [7]:
import data_func.timeseasonality as DTS
import data_func.one_hot_encoding as OHE
# Apply the same two steps to every training frame and then every test frame:
# run DTS.append_seasonal_columns, then remove 'date_forecast' from the result.
for frame_list in (X_train, X_frames_test):
    for slot in range(len(frame_list)):
        frame_list[slot] = DTS.append_seasonal_columns(frame_list[slot])
        frame_list[slot].drop(columns=['date_forecast'], inplace=True)

# THIS SECTION CAN ONLY WORK IF THE PREVIOUS AGGREGATION IS DONE INDIVIDUALLY FOR CATEGORICAL DATA
# import data_func.one_hot_encoding as OHE

# for i in range(len(X_train)):
#      X_train[i] = OHE.one_hot_encode(X_train[i], ['dew_or_rime:idx', 'precip_type_5min:idx'])

# for i in range(len(X_frames_test)):
#      X_frames_test[i] = OHE.one_hot_encode(X_frames_test[i], ['dew_or_rime:idx', 'precip_type_5min:idx'])
#      X_frames_test[i]["dew_or_rime:idx_-1"] = 0 
#      X_frames_test[i]["precip_type_5min:idx_2"] = 0 
#      X_frames_test[i]["precip_type_5min:idx_3"] = 0 



In [8]:
# Summary statistics of the first training frame, one row per feature.
X_train[0].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
dew_or_rime:idx,29667.0,0.016854,0.171208,-1.0,0.0,0.0,0.0,1.0
precip_type_5min:idx,29667.0,0.068123,0.32312,0.0,0.0,0.0,0.0,5.0
absolute_humidity_2m:gm3,29667.0,6.379581,2.612027,0.7,4.4,5.975,8.1,16.05
air_density_2m:kgm3,29667.0,1.252307,0.034827,1.145,1.229,1.251,1.274,1.42625
ceiling_height_agl:m,24681.0,3043.045898,2555.343994,27.849998,1183.925049,2080.0,4255.549805,12285.65
clear_sky_energy_1h:J,29667.0,565528.5625,825693.5625,0.0,0.0,47827.3,959101.6875,2988628.0
clear_sky_rad:W,29667.0,157.091171,229.956268,0.0,0.0,13.25,273.575012,835.1
cloud_base_agl:m,28055.0,1746.57666,1822.560547,27.9,598.125,1160.9,2088.106201,11673.62
dew_point_2m:K,29667.0,276.139557,6.374989,251.074997,271.875,276.05,280.924988,291.95
diffuse_rad:W,29667.0,42.953587,61.441681,0.0,0.0,6.1625,72.262501,332.275


In [9]:
# Summary statistics of the first test frame, one row per feature.
X_frames_test[0].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
dew_or_rime:idx,720.0,0.02916667,0.1683905,0.0,0.0,0.0,0.0,1.0
precip_type_5min:idx,720.0,0.06388889,0.2447249,0.0,0.0,0.0,0.0,1.0
absolute_humidity_2m:gm3,720.0,8.205903,2.178425,3.2,6.69375,8.05,9.98125,13.675
air_density_2m:kgm3,720.0,1.23315,0.03210283,1.151,1.209,1.238625,1.26,1.29975
ceiling_height_agl:m,532.0,3096.333,2945.732,53.299999,953.0,1712.7,4097.362,11430.78
clear_sky_energy_1h:J,720.0,1227651.0,1101479.0,0.0,48799.869141,933790.0,2276145.0,2987530.0
clear_sky_rad:W,720.0,341.014,306.9119,0.0,17.95,299.575,661.7375,834.95
cloud_base_agl:m,650.0,1891.609,2128.155,30.225,509.556274,1035.875,2437.175,11256.7
dew_point_2m:K,720.0,280.8002,4.339592,268.100006,278.024994,281.05,284.3812,289.55
diffuse_rad:W,720.0,84.90232,78.5902,0.0,8.0875,75.3,134.2313,307.5


## Dropping data

In [12]:
# Columns removed from every training and test frame before modelling.
drop_col = ["snow_density:kgm3",
            "pressure_100m:hPa",
            "pressure_50m:hPa",
            "snow_depth:cm",
            "ceiling_height_agl:m",
            "fresh_snow_24h:cm",
            "fresh_snow_3h:cm"]

# Each list is walked by its own length: indexing the test frames with the
# training list's length would raise (or silently skip frames) if the two
# lists ever differ in size.
for i in range(len(X_train)):
    X_train[i] = X_train[i].drop(columns=drop_col)

for i in range(len(X_frames_test)):
    X_frames_test[i] = X_frames_test[i].drop(columns=drop_col)


# Training the model

In [13]:
# Share of the rows held out for validation, and a fixed seed so the split
# (and therefore every validation metric below) is reproducible across runs.
# With random_state=None each run shuffled differently.
VALIDATION_FRACTION = 0.17
SPLIT_SEED = 42

x_train_a, x_val_a, y_train_a, y_val_a = train_test_split(
    X_train[0], Y_train[0], test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)
x_train_b, x_val_b, y_train_b, y_val_b = train_test_split(
    X_train[1], Y_train[1], test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)
x_train_c, x_val_c, y_train_c, y_val_c = train_test_split(
    X_train[2], Y_train[2], test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)

In [14]:
# Hyperparameters found with Optuna tuning, one set per dataset (A, B, C).
params_a = {
    'random_state': 42,
    'n_estimators': 980,
    'max_depth': 9,
    'learning_rate': 0.029035565559484028,
    'subsample': 0.8393121619033767,
    'colsample_bytree': 0.7589542758688459,
}
params_b = {
    'random_state': 42,
    'n_estimators': 955,
    'max_depth': 9,
    'learning_rate': 0.02949625834198986,
    'subsample': 0.8030196155828968,
    'colsample_bytree': 0.72518389089994,
}
params_c = {
    'random_state': 42,
    'n_estimators': 727,
    'max_depth': 8,
    'learning_rate': 0.08609213174337473,
    'subsample': 0.8107057409889747,
    'colsample_bytree': 0.8763563332327975,
}

# One unfitted regressor per dataset, each built from its own parameter set.
model_a = xgb.XGBRegressor(**params_a)
model_b = xgb.XGBRegressor(**params_b)
model_c = xgb.XGBRegressor(**params_c)



In [15]:

# Fit each model on the training part of its own split only; the matching
# x_val_* / y_val_* rows are not passed to these fits.
model_a.fit(x_train_a, y_train_a)
model_b.fit(x_train_b, y_train_b)
model_c.fit(x_train_c, y_train_c)

# Make predictions

In [17]:
model_a.fit(X_train[0], Y_train[0])
model_b.fit(X_train[1], Y_train[1])
model_c.fit(X_train[2], Y_train[2])

In [18]:
# Do some more stuff
y_pred_a = model_a.predict(X_frames_test[0])
y_pred_b = model_b.predict(X_frames_test[1])
y_pred_c = model_c.predict(X_frames_test[2])


# Evaluate prediction

In [16]:

# Evaluate the model based on the validation data.
# These numbers are only meaningful while model_a / model_b / model_c have
# been fitted on the x_train_* rows alone, i.e. have never seen x_val_*.

mse_a = mean_squared_error(y_val_a, model_a.predict(x_val_a))
print("MSE for A: ", mse_a)
mse_b = mean_squared_error(y_val_b, model_b.predict(x_val_b))
print("MSE for B: ", mse_b)
mse_c = mean_squared_error(y_val_c, model_c.predict(x_val_c))
print("MSE for C: ", mse_c)
print("Mean MSE: ", (mse_a + mse_b + mse_c) / 3)

# Evaluate the predictions with each estimator's own score() on the validation rows

score_a = model_a.score(x_val_a, y_val_a)
score_b = model_b.score(x_val_b, y_val_b)
score_c = model_c.score(x_val_c, y_val_c)

print("Score A: ", score_a)
print("Score B: ", score_b)
print("Score C: ", score_c)
print('')

# Get feature importance scores
models = [(model_a, 'A'), (model_b, 'B'), (model_c, 'C')]
# Label each model's importances with the columns of ITS OWN training frame,
# rather than reusing dataset A's columns for every model.
feature_columns = [x_train_a.columns, x_train_b.columns, x_train_c.columns]
for (fitted_model, label), columns in zip(models, feature_columns):

    feature_importance_scores = fitted_model.feature_importances_

    # Create a DataFrame to associate features with their importance scores
    feature_importance_df1 = pd.DataFrame({'Feature': columns, 'Importance': feature_importance_scores})

    # Sort features by importance in descending order
    feature_importance_df1 = feature_importance_df1.sort_values(by='Importance', ascending=False)

    # Print the ten most important features for this model
    print(f'Model {label}')
    print(feature_importance_df1.head(10))
    print('')


MSE for A:  153312.4186017793
MSE for B:  4016.0402458779813
MSE for C:  2535.913950434679
Mean MSE:  53288.12426603065
Score A:  0.8946682377882834
Score B:  0.9022832738988866
Score C:  0.9154874520770043

Model A
             Feature  Importance
10      direct_rad:W    0.393947
11   direct_rad_1h:J    0.126903
8      diffuse_rad:W    0.049902
5    clear_sky_rad:W    0.042737
18  is_in_shadow:idx    0.027637
20    precip_5min:mm    0.027032
41      cosinus_year    0.025505
29   sun_elevation:d    0.024073
28     sun_azimuth:d    0.024016
38         sinus_day    0.019216

Model B
              Feature  Importance
10       direct_rad:W    0.250389
5     clear_sky_rad:W    0.122218
18   is_in_shadow:idx    0.103694
29    sun_elevation:d    0.098762
17         is_day:idx    0.050124
16   fresh_snow_6h:cm    0.045439
14  fresh_snow_12h:cm    0.034243
41       cosinus_year    0.026628
11    direct_rad_1h:J    0.024638
8       diffuse_rad:W    0.021780

Model C
                  Feature  Im

# Create submission

In [19]:
# Join the three prediction arrays end to end, in A, B, C order.
y_pred = np.concatenate([y_pred_a, y_pred_b, y_pred_c])

In [20]:
y_test_pred = y_pred

# Attach the predictions to the test rows by position: row k of test.csv
# receives element k of the concatenated prediction array.
test = pd.read_csv('../data/test.csv').assign(prediction=y_test_pred)

# Keep exactly the ids (and their order) of the sample submission; the left
# join pulls in each id's prediction.
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = pd.merge(
    sample_submission[['id']],
    test[['id', 'prediction']],
    on='id',
    how='left',
)
submission.to_csv('submission_xg_y.csv', index=False)