In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from statsmodels.tsa.arima_model import ARIMA
%matplotlib inline

# Raise pandas' display limits to 200 rows and 200 columns.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

In [6]:
# Load the engineered-feature files: the test set first, then the combined
# training/validation set.
test_read, training_and_validation_read = (
    pd.read_csv(csv_path)
    for csv_path in (
        'new_features_datasets/F_test.csv',
        'new_features_datasets/F_training_and_validation.csv',
    )
)

In [7]:
# Columns that exist in both the training/validation file and the test file.
common_columns = training_and_validation_read.columns.intersection(test_read.columns)

# Restrict the test frame to the shared columns, in the order of 'common_columns'.
test = test_read[common_columns]

# Restrict the training/validation frame to the same columns and attach the
# target, which comes from the training/validation file itself.
# `assign` returns a new DataFrame, so the target is never written onto a
# column-subset of 'training_and_validation_read' (which is what triggers
# pandas' SettingWithCopyWarning).
training_and_validation = training_and_validation_read[common_columns].assign(
    pv_measurement=training_and_validation_read['pv_measurement']
)

In [8]:
# Parse the forecast timestamps so they can be compared with the split date.
training_and_validation['date_forecast'] = pd.to_datetime(training_and_validation['date_forecast'])

# Rows up to and including this timestamp are used for fitting;
# strictly later rows are held out for validation.
split_date = pd.to_datetime('2022-10-22')

# --- Fitting partition ---
in_fit_period = training_and_validation['date_forecast'] <= split_date
train_fit = training_and_validation[in_fit_period].reset_index(drop=True)
X_train = train_fit.drop(columns=['pv_measurement', 'date_forecast'])  # features only
y_train = train_fit['pv_measurement']  # target

# --- Held-out partition ---
in_holdout_period = training_and_validation['date_forecast'] > split_date
test_fit = training_and_validation[in_holdout_period].reset_index(drop=True)
X_test = test_fit.drop(columns=['pv_measurement', 'date_forecast'])  # features only
y_test = test_fit['pv_measurement']  # target

### Random forest

In [9]:
# Fit a random forest on the fitting partition.
# A fixed random_state makes the bootstrap samples and split choices
# repeatable, so the reported error is the same after Restart & Run All.
rf_model = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict on the held-out partition.
y_pred = rf_model.predict(X_test)

# Score with the mean absolute error (MAE).
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 65.15


In [12]:
# Pair each input column with the importance stored on the fitted forest and
# list the columns from most to least important.
your_feature_names = list(X_test.columns)
feature_importance = rf_model.feature_importances_
feature_names = pd.DataFrame(
    {'Feature': your_feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)
print(feature_names)

                            Feature    Importance
8                      direct_rad:W  4.203274e-01
42                       Location_A  2.171567e-01
11                      elevation:m  1.977701e-01
6                     diffuse_rad:W  7.541393e-02
3                   clear_sky_rad:W  1.887394e-02
32                    sun_azimuth:d  1.410063e-02
33                  sun_elevation:d  4.159436e-03
2             clear_sky_energy_1h:J  3.831982e-03
37                     visibility:m  3.725661e-03
35                      t_1000hPa:K  3.651602e-03
9                   direct_rad_1h:J  3.399684e-03
36              total_cloud_cover:p  3.207193e-03
5                    dew_point_2m:K  3.157795e-03
39              wind_speed_u_10m:ms  2.719442e-03
7                  diffuse_rad_1h:J  2.663120e-03
10          effective_cloud_cover:p  2.496259e-03
45                             Year  1.997852e-03
26      relative_humidity_1000hPa:p  1.904930e-03
38                wind_speed_10m:ms  1.452486e-03


In [18]:
# Minimum importance a feature needs in order to be kept (tunable).
threshold = 0.001
# Names of the features whose importance is strictly above the threshold.
is_important = feature_names['Importance'] > threshold
selected_features = feature_names.loc[is_important, 'Feature']

In [36]:
# Feature matrix and target built from ALL labelled rows (no hold-out).
training2 = training_and_validation.drop(columns=['pv_measurement'])
x_training = training2.drop(columns=['date_forecast']).reset_index(drop=True)

y_training = training_and_validation['pv_measurement'].reset_index(drop=True)  # target

# NOTE(review): `test` is left untouched here; as built from 'common_columns'
# it still includes 'date_forecast', which is not a column of `x_training`.

In [37]:
# Refit the random forest on every labelled row, using the same
# hyperparameters as the validated model. The fixed random_state keeps the
# resulting predictions repeatable across kernel restarts.
rf_model2 = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
rf_model2.fit(x_training, y_training)

# Predict on the unlabelled test set. `test` is restricted to exactly the
# columns the model was fitted on (in the same order); it otherwise still
# carries 'date_forecast', which was dropped from `x_training`.
y_pred_rf = rf_model2.predict(test[x_training.columns])

In [38]:
# Build the submission file for the random-forest predictions.
test2 = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Predictions are attached positionally.
# NOTE(review): assumes the rows of 'test.csv' are in the same order as the
# rows the predictions were made for -- TODO confirm.
test2['prediction'] = y_pred_rf

# Keep only the ids requested by the sample submission; ids without a match
# in `test2` get a missing prediction (left join).
sample_submission = sample_submission[['id']].merge(test2[['id', 'prediction']], on='id', how='left')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('Submissions', exist_ok=True)
sample_submission.to_csv('Submissions/submission_10.csv', index=False)

### HGB regression

In [39]:
# Histogram-based gradient boosting with default hyperparameters.
# random_state is fixed because the estimator draws random numbers (binning
# subsample, and the internal validation split when early stopping is on);
# without it the reported error can change between runs.
hgb_regressor = HistGradientBoostingRegressor(random_state=42)

# Fit on the fitting partition.
hgb_regressor.fit(X_train, y_train)

# Predict on the held-out partition.
y_pred = hgb_regressor.predict(X_test)

# Score with the mean absolute error (MAE).
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 64.06


In [40]:
# Refit the same HGB estimator on every labelled row.
hgb_regressor.fit(x_training, y_training)

# Predict on the unlabelled test set. `test` is restricted to exactly the
# columns the model was fitted on (in the same order); it otherwise still
# carries 'date_forecast', which was dropped from `x_training`.
y_pred_hgb = hgb_regressor.predict(test[x_training.columns])

In [42]:
# Build the submission file for the HGB predictions.
test3 = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Predictions are attached positionally.
# NOTE(review): assumes the rows of 'test.csv' are in the same order as the
# rows the predictions were made for -- TODO confirm.
test3['prediction'] = y_pred_hgb

# Keep only the ids requested by the sample submission; ids without a match
# in `test3` get a missing prediction (left join).
sample_submission = sample_submission[['id']].merge(test3[['id', 'prediction']], on='id', how='left')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('Submissions', exist_ok=True)
sample_submission.to_csv('Submissions/submission_11.csv', index=False)

In [47]:
from sklearn.inspection import permutation_importance

# Permutation importance of the current `hgb_regressor`, measured on the
# fitting partition with 30 shuffles per feature and a fixed seed.
result = permutation_importance(
    hgb_regressor,
    X_train,
    y_train,
    n_repeats=30,
    random_state=0,
)

# Mean importance over the repeats, alongside the matching column names.
importances = result.importances_mean
feature_names = X_train.columns

# One row per feature, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

feature_importance_df

Unnamed: 0,Feature,Importance
11,elevation:m,0.653411
8,direct_rad:W,0.521204
3,clear_sky_rad:W,0.081528
6,diffuse_rad:W,0.041569
32,sun_azimuth:d,0.033536
42,Location_A,0.020005
10,effective_cloud_cover:p,0.011847
35,t_1000hPa:K,0.010164
39,wind_speed_u_10m:ms,0.009569
45,Year,0.009412


In [56]:
# Keep the features whose permutation importance is strictly positive.
selected_features = feature_importance_df[feature_importance_df['Importance'] > 0]

# These location indicators are always kept, with importance 0 when they did
# not pass the filter above.
new_features = ['Location_B', 'Location_C']

# Only append the ones that are not already selected; appending a feature
# twice would produce duplicate columns when this list is used to index the
# feature matrices.
already_selected = set(selected_features['Feature'])
features_to_add = [name for name in new_features if name not in already_selected]

if features_to_add:
    forced_rows = pd.DataFrame({'Feature': features_to_add, 'Importance': [0] * len(features_to_add)})
    selected_features = pd.concat([selected_features, forced_rows], ignore_index=True)
else:
    # Nothing to append: still renumber the rows, as the concat would have.
    selected_features = selected_features.reset_index(drop=True)

In [57]:
# Fitting-partition features restricted to the selected columns.
train_X2 = X_train.loc[:, selected_features['Feature'].tolist()]

In [58]:
# Re-train the existing `hgb_regressor` on the reduced feature set and score
# it on the held-out partition.
selected_columns = selected_features['Feature']

hgb_regressor.fit(X_train[selected_columns], y_train)
y_pred = hgb_regressor.predict(X_test[selected_columns])

# Mean absolute error on the held-out partition.
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 63.21


In [59]:
# Refit on every labelled row using only the selected columns, then predict
# the unlabelled test set with the same columns.
final_columns = selected_features['Feature']

hgb_regressor.fit(x_training[final_columns], y_training)
y_pred_hgb = hgb_regressor.predict(test[final_columns])

In [60]:
# Build the submission file for the HGB model trained on the selected features.
test3 = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Predictions are attached positionally.
# NOTE(review): assumes the rows of 'test.csv' are in the same order as the
# rows the predictions were made for -- TODO confirm.
test3['prediction'] = y_pred_hgb

# Keep only the ids requested by the sample submission; ids without a match
# in `test3` get a missing prediction (left join).
sample_submission = sample_submission[['id']].merge(test3[['id', 'prediction']], on='id', how='left')

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('Submissions', exist_ok=True)
sample_submission.to_csv('Submissions/submission_13.csv', index=False)