In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
%matplotlib inline

# Raise pandas' display limits so up to 200 rows and 200 columns are rendered
# before the output is truncated.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

In [5]:
# Load the two CSV files from new_features_datasets/: the frame that carries
# the 'pv_measurement' target (used for fitting and validation) and the frame
# that predictions are generated for.
training_and_validation_read = pd.read_csv('new_features_datasets/F_training_and_validation.csv')
test_read = pd.read_csv('new_features_datasets/F_test.csv')

In [6]:
# Columns that exist in both the labelled frame and the test frame.
common_columns = training_and_validation_read.columns.intersection(test_read.columns)

# Restrict both frames to the shared columns so they have identical columns in
# identical order.
test = test_read[common_columns]
# .copy() makes this an independent frame: a new column is added to it below,
# and writing into a column-selection of another frame is what triggers
# pandas' SettingWithCopyWarning.
training_and_validation = training_and_validation_read[common_columns].copy()

# Attach the target column, taken from the original labelled frame
# (training_and_validation_read), to the column-aligned training frame.
training_and_validation['pv_measurement'] = training_and_validation_read['pv_measurement']

In [7]:
# Parse the timestamp column so it can be compared against the split date.
# .assign builds a new frame instead of writing into the existing one, which
# keeps the cell safe to re-run and avoids SettingWithCopyWarning when
# training_and_validation is a column selection of another frame.
training_and_validation = training_and_validation.assign(
    date_forecast=pd.to_datetime(training_and_validation['date_forecast'])
)

# Rows at or before this timestamp are used for fitting; later rows are held out.
split_date = pd.to_datetime('2022-10-22')

# Chronological split (no shuffling): fitting rows first, hold-out rows after.
# Both comparisons are written out explicitly so rows with a missing timestamp
# fall into neither subset, exactly as with a plain <= / > pair.
train_fit = training_and_validation[
    training_and_validation['date_forecast'] <= split_date
].reset_index(drop=True)
# Features are everything except the target and the raw timestamp.
X_train = train_fit.drop(columns=['pv_measurement', 'date_forecast'])
y_train = train_fit['pv_measurement']  # Target variable

# Hold-out (validation) rows: everything strictly after split_date.
test_fit = training_and_validation[
    training_and_validation['date_forecast'] > split_date
].reset_index(drop=True)
X_test = test_fit.drop(columns=['pv_measurement', 'date_forecast'])
y_test = test_fit['pv_measurement']  # Target variable

### Random forest

In [8]:
# Fit a random forest on the rows up to split_date. random_state is fixed so
# the forest's internal randomness, and therefore the score printed below, is
# reproducible from one run to the next.
rf_model = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict on the held-out rows (those after split_date).
y_pred = rf_model.predict(X_test)

# Mean absolute error (MAE) between the held-out targets and the predictions.
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 68.24


In [9]:
# Build the full labelled set (fit + hold-out rows together) used to train the
# final models. Every step returns a new object rather than mutating in place,
# so re-running this cell gives the same result.
training2 = training_and_validation.drop(columns=['pv_measurement'])
x_training = training2.drop(columns=['date_forecast']).reset_index(drop=True)

# Target for the full labelled set. The non-inplace reset_index returns a fresh
# Series instead of altering the column object taken from the DataFrame.
y_training = training_and_validation['pv_measurement'].reset_index(drop=True)

# The test features must match x_training, which has no timestamp column.
# errors='ignore' keeps this idempotent: on a re-run 'date_forecast' is already
# gone, and a plain drop would raise KeyError.
test = test.drop(columns=['date_forecast'], errors='ignore')

In [10]:
# Final random forest: same hyperparameters as the validated model, fitted on
# the full labelled set. random_state is fixed so the generated predictions
# are reproducible across runs.
rf_model2 = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
rf_model2.fit(x_training, y_training)

# Predict on the test features. No targets are used here, so this step
# produces predictions only and does not evaluate the model.
y_pred_rf = rf_model2.predict(test)

In [11]:
# Attach the random-forest predictions to the rows of test.csv. The assignment
# is positional, so this assumes test.csv is in the same row order as the
# feature frame the predictions were made on -- TODO confirm.
test2 = pd.read_csv('test.csv')
test2['prediction'] = y_pred_rf

# Keep only the ids requested by the sample submission and look up each id's
# prediction; with how='left', ids without a match end up with NaN.
rf_prediction_by_id = test2[['id', 'prediction']]
sample_submission = pd.read_csv('sample_submission.csv')[['id']]
sample_submission = sample_submission.merge(rf_prediction_by_id, on='id', how='left')
# The write for this model is commented out in this notebook:
#sample_submission.to_csv('Submissions/submission_7.csv', index=False)

### HistGradientBoosting (HGB) regression

In [12]:
# Histogram-based gradient boosting with default hyperparameters. random_state
# is fixed because, per the scikit-learn documentation, the estimator uses
# randomness for the binning subsample and for the early-stopping validation
# split when those apply; seeding keeps the score below reproducible.
hgb_regressor = HistGradientBoostingRegressor(random_state=42)

# Fit on the rows up to split_date.
hgb_regressor.fit(X_train, y_train)

# Predict on the held-out rows (those after split_date).
y_pred = hgb_regressor.predict(X_test)

# Mean absolute error (MAE) between the held-out targets and the predictions.
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 70.07


In [13]:
# Refit the existing HGB estimator on the full labelled set, then predict on
# the test features. fit() returns the estimator itself, so the two calls can
# be chained. No targets are involved, so nothing is evaluated here.
y_pred_hgb = hgb_regressor.fit(x_training, y_training).predict(test)

In [None]:
# Attach the HGB predictions to the rows of test.csv. The assignment is
# positional, so this assumes test.csv is in the same row order as the feature
# frame the predictions were made on -- TODO confirm.
test3 = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
test3['prediction'] = y_pred_hgb

# Keep only the ids requested by the sample submission and look up each id's
# prediction; with how='left', ids without a match end up with NaN.
sample_submission = sample_submission[['id']].merge(test3[['id', 'prediction']], on='id', how='left')

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('Submissions', exist_ok=True)
sample_submission.to_csv('Submissions/submission_9.csv', index=False)