# Imports

In [1]:

import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

import warnings
import data_func.read_data as read_data
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")



# Load datasets

In [2]:
# Fetch the training data once; position 0 is used as the list of feature
# frames and position 1 as the list of matching target frames.
dataframes = read_data.get_training_data()
X_frames_train, Y_frames_train = dataframes[0], dataframes[1]

# Feature frames for the rows that have to be predicted.
X_frames_test = read_data.get_test_data()


# Data cleanup and aggregation

In [3]:
# Make sure the target values line up with the feature rows

def data_allign(x_train, y_train):
    """Align feature rows with their target values.

    Rows of ``y_train`` that contain missing values are discarded, then the
    remaining targets are inner-joined onto ``x_train`` by matching
    ``x_train['date_forecast']`` against ``y_train['time']``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature frame; must contain a ``date_forecast`` column.
    y_train : pandas.DataFrame
        Target frame; must contain ``time`` and ``pv_measurement`` columns.
        The frame passed in is not modified.

    Returns
    -------
    tuple
        ``(features, target)``: ``features`` is the merged frame without the
        ``time`` and ``pv_measurement`` columns, ``target`` is the
        ``pv_measurement`` Series of the merged frame (same row order).
    """
    # dropna() returns a new frame, so the caller's y_train keeps its rows.
    complete_targets = y_train.dropna()
    merged = pd.merge(x_train, complete_targets, left_on='date_forecast', right_on='time')
    target = merged['pv_measurement']

    # Test each helper column on its own: the expression
    # `'time' and 'pv_measurement' in columns` only checks the second name.
    helper_columns = [name for name in ('time', 'pv_measurement') if name in merged.columns]
    features = merged.drop(columns=helper_columns)

    return features, target

import data_func.aggregation as data_agg

# Training frames: aggregate each feature frame with the 'mean' setting, then
# align it with the target frame stored at the same position.
for idx, features in enumerate(X_frames_train):
    aggregated = data_agg.gen_agg(features, 'mean')
    X_frames_train[idx], Y_frames_train[idx] = data_allign(aggregated, Y_frames_train[idx])

# Test frames only need the aggregation step; there are no targets to align.
for idx, features in enumerate(X_frames_test):
    X_frames_test[idx] = data_agg.gen_agg(features, 'mean')

# Row counts of the first train features, train targets and test features.
for frame in (X_frames_train[0], Y_frames_train[0], X_frames_test[0]):
    print(len(frame))


29667
29667
720


# Feature engineering

In [4]:
import data_func.timeseasonality as DTS

# Training and test frames get the same treatment: run each frame through
# DTS.append_seasonal_columns, then remove the raw 'date_forecast' column.
for frames in (X_frames_train, X_frames_test):
    for idx in range(len(frames)):
        with_seasonality = DTS.append_seasonal_columns(frames[idx])
        frames[idx] = with_seasonality.drop(columns=['date_forecast'])

# Training the model

In [5]:

# One independent regressor per dataset (a, b, c), otherwise using the
# default hyper-parameters.
#
# A fixed random_state is passed on purpose: scikit-learn turns early stopping
# on automatically for training sets with more than 10 000 samples, and the
# validation split it draws for that is random. Without a seed the fitted
# models, and therefore the predictions, change from run to run.
SEED = 42

model_a = HistGradientBoostingRegressor(random_state=SEED)
model_b = HistGradientBoostingRegressor(random_state=SEED)
model_c = HistGradientBoostingRegressor(random_state=SEED)

# Make predictions

In [6]:
# NOTE(review): an earlier note here reported max_depth = 6 as the best
# setting; confirm whether the regressors are meant to be built with it.
models = (model_a, model_b, model_c)

# Fit every regressor on the training frame/target pair at its position.
for position, regressor in enumerate(models):
    regressor.fit(X_frames_train[position], Y_frames_train[position])

# Predict each test frame with the regressor fitted for the same position.
y_pred_a, y_pred_b, y_pred_c = (
    regressor.predict(X_frames_test[position])
    for position, regressor in enumerate(models)
)
print(len(y_pred_a))

# Stack the three prediction vectors end to end, in a, b, c order.
y_pred = np.concatenate([y_pred_a, y_pred_b, y_pred_c])



720


In [7]:
# Sanity check: print, per dataset, the number of predictions next to the
# number of rows in the corresponding test frame.
for position, predictions in enumerate((y_pred_a, y_pred_b, y_pred_c)):
    print(len(predictions), len(X_frames_test[position]))

720 720
720 720
720 720


In [8]:
# Replace every negative prediction with zero. The boolean mask updates
# y_pred in place in a single vectorised step instead of a Python-level loop
# over the array elements.
y_pred[y_pred < 0] = 0



# Create submission

In [9]:
y_test_pred = y_pred
print(len(y_test_pred))

# Attach the predictions to the test rows by position.
test = pd.read_csv('../data/test.csv').assign(prediction=y_test_pred)

# Keep the id order of the sample submission and look up each id's
# prediction; ids without a match in the test file end up as NaN.
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = pd.merge(
    sample_submission[['id']],
    test[['id', 'prediction']],
    on='id',
    how='left',
)
submission.to_csv('submission_3.csv', index=False)

2160
