# Imports

In [87]:

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")



# Load datasets

In [88]:
X_train_estimated_a = pd.read_parquet('../data/A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('../data/B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('../data/C/X_train_estimated.parquet')

X_train_observed_a = pd.read_parquet('../data/A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('../data/B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('../data/C/X_train_observed.parquet')

X_test_estimated_a = pd.read_parquet('../data/A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('../data/B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('../data/C/X_test_estimated.parquet')

train_targets_a = pd.read_parquet('../data/A/train_targets.parquet')
train_targets_b = pd.read_parquet('../data/B/train_targets.parquet')
train_targets_c = pd.read_parquet('../data/C/train_targets.parquet')


# Data clean up

In [89]:

# Data set A, B and C clean up

def data_clean_up(x_train_est, x_train_observe, y_train):

  if 'date_calc' in x_train_est.columns:
    x_train_est.drop(columns="date_calc", inplace=True)

  x_train = pd.concat([x_train_observe, x_train_est])
  x_train = polynomial_feature(x_train, ['direct_rad:W', 'diffuse_rad:W'])


  # Group the rows into blocks of 4 and apply the aggregation function
  agg_func = {col: 'mean' for col in x_train.columns[1:]}
  X_train_downscaled = x_train.groupby(x_train.index // 4).agg({**{'date_forecast': 'first'}, **agg_func})

  y_train.dropna(inplace=True)
  combined_data = pd.merge(X_train_downscaled, y_train, left_on='date_forecast', right_on='time')
  y_train = combined_data['pv_measurement']

  if 'date_forecast' and 'time' and 'pv_measurement' in combined_data.columns:
    combined_data.drop(columns="date_forecast", inplace=True)
    combined_data.drop(columns="time", inplace=True)
    combined_data.drop(columns="pv_measurement", inplace=True)


  return combined_data, y_train

x_train_a, y_train_a = data_clean_up(X_train_estimated_a, X_train_observed_a, train_targets_a)
x_train_b, y_train_b = data_clean_up(X_train_estimated_b, X_train_observed_b, train_targets_b)
x_train_c, y_train_c = data_clean_up(X_train_estimated_c, X_train_observed_c, train_targets_c)


def data_clean_up_test(x_test_est):

  x_test_est = polynomial_feature(x_test_est, ['direct_rad:W', 'diffuse_rad:W'])

  # Group the rows into blocks of 4 and apply the aggregation function
  agg_func = {col: 'mean' for col in x_test_est.columns[1:]}
  X_test_downscaled = x_test_est.groupby(x_test_est.index // 4).agg({**{'date_forecast': 'first'}, **agg_func})

  if 'date_forecast' in X_test_downscaled.columns:
    X_test_downscaled.drop(columns="date_forecast", inplace=True)

  return X_test_downscaled

X_test_estimated_a = data_clean_up_test(X_test_estimated_a)
X_test_estimated_b = data_clean_up_test(X_test_estimated_b)
X_test_estimated_c = data_clean_up_test(X_test_estimated_c)


# Feature engineering

In [90]:
# Do feature selection etc.

# Polynomial features of degree 2 of most important features

def polynomial_feature(x_dataset, features):
  
  for feature in features:
    x_dataset[feature + ':squared'] = x_dataset[feature] ** 2

  return x_dataset

# x_train_a = polynomial_feature(x_train_a, ['direct_rad:W', 'diffuse_rad:W'])

# x_train_b = polynomial_feature(x_train_b, ['direct_rad:W', 'diffuse_rad:W'])
# x_train_c = polynomial_feature(x_train_c, ['direct_rad:W', 'diffuse_rad:W'])

# X_test_estimated_a = polynomial_feature(X_test_estimated_a, ['direct_rad:W', 'diffuse_rad:W'])
# X_test_estimated_b = polynomial_feature(X_test_estimated_b, ['direct_rad:W', 'diffuse_rad:W'])
# X_test_estimated_c = polynomial_feature(X_test_estimated_c, ['direct_rad:W', 'diffuse_rad:W'])


# Training the model

In [91]:

model_a = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model_b = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model_c = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

model_a.fit(x_train_a, y_train_a)
model_b.fit(x_train_b, y_train_b)
model_c.fit(x_train_c, y_train_c)

# Make predictions

In [92]:


y_pred_a = model_a.predict(X_test_estimated_a)
y_pred_b = model_b.predict(X_test_estimated_b)
y_pred_c = model_c.predict(X_test_estimated_c)

y_pred = np.concatenate((y_pred_a, y_pred_b, y_pred_c), axis=0)

# Evaluate prediction

In [93]:
score_a = model_a.score(x_train_a, y_train_a)
score_b = model_b.score(x_train_b, y_train_b)
score_c = model_c.score(x_train_c, y_train_c)

print("Score A: ", score_a)
print("Score B: ", score_b)
print("Score C: ", score_c)

# Get feature importance scores
model_a.fit(x_train_a, y_train_a)
feature_importance_scores = model_a.feature_importances_

# Create a DataFrame to associate features with their importance scores
feature_importance_df = pd.DataFrame({'Feature': x_train_a.columns, 'Importance': feature_importance_scores})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print or visualize the feature importance scores
print(feature_importance_df)


Score A:  0.961681689548398
Score B:  0.9610641128057382
Score C:  0.986476100342621
                           Feature  Importance
10                    direct_rad:W    0.346458
45            direct_rad:W:squared    0.220429
46           diffuse_rad:W:squared    0.065861
4                  clear_sky_rad:W    0.035590
8                    diffuse_rad:W    0.026510
30               snow_density:kgm3    0.024261
23            precip_type_5min:idx    0.024014
35                   sun_azimuth:d    0.017253
22                  precip_5min:mm    0.016460
20                is_in_shadow:idx    0.016233
7                   dew_point_2m:K    0.016019
16               fresh_snow_24h:cm    0.013379
27                 rain_water:kgm2    0.010945
15                fresh_snow_1h:cm    0.009567
24               pressure_100m:hPa    0.008595
40                    visibility:m    0.007645
34                 snow_water:kgm2    0.007326
31                   snow_depth:cm    0.007158
39             total_c

# Create submission

In [94]:
y_test_pred = y_pred

test = pd.read_csv('../data/test.csv')
test['prediction'] = y_test_pred
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = sample_submission[['id']].merge(test[['id', 'prediction']], on='id', how='left')
submission.to_csv('submission.csv', index=False)