In [17]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor
from statsmodels.tsa.arima_model import ARIMA
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [18]:
test_read = pd.read_csv('new_features_datasets/F_test.csv')
training_and_validation_read = pd.read_csv('new_features_datasets/F_training_and_validation.csv')

In [19]:
common_columns = training_and_validation_read.columns.intersection(test_read.columns)

# Reorder the columns in both DataFrames to match the order of 'common_columns'
test = test_read[common_columns]
training_and_validation = training_and_validation_read[common_columns]

# Add the 'pv_measurement' column from 'test' to 'training_and_validation'
training_and_validation['pv_measurement'] = training_and_validation_read['pv_measurement']

In [20]:
training_and_validation['date_forecast'] = pd.to_datetime(training_and_validation['date_forecast'])

# Define the date for the split
split_date = pd.to_datetime('2022-10-22')

# Split the DataFrame into training and test sets
train_fit = training_and_validation[training_and_validation['date_forecast'] <= split_date]
train_fit.reset_index(drop=True, inplace=True)
X_train = train_fit.drop(columns=['pv_measurement'])
X_train = X_train.drop(columns=['date_forecast'])

y_train = train_fit['pv_measurement']  # Target variable
test_fit = training_and_validation[training_and_validation['date_forecast'] > split_date]
test_fit.reset_index(drop=True, inplace=True)
X_test = test_fit.drop(columns=['pv_measurement'])
X_test = X_test.drop(columns=['date_forecast'])

y_test = test_fit['pv_measurement']  # Target variable

### DecisionTree

In [21]:
from sklearn.tree import DecisionTreeRegressor

# Step 2: Create and train the Random Forest model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = dt_model.predict(X_test)

# Calculate the mean squared error (MSE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 83.32


### GradientBooster

In [22]:
from sklearn.ensemble import GradientBoostingRegressor
# Step 2: Create and train the Gradient Boosting model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = gb_model.predict(X_test)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 79.68


In [28]:
feature_importance = gb_model.feature_importances_
your_feature_names = X_test.columns.tolist()
feature_names = pd.DataFrame({'Feature': your_feature_names, 'Importance': feature_importance})
# Sort by importance in descending order
feature_names = feature_names.sort_values(by='Importance', ascending=False)
# Print or view the sorted feature importance
print(feature_names)

                            Feature  Importance
8                      direct_rad:W    0.369722
42                       Location_A    0.283722
11                      elevation:m    0.166571
3                   clear_sky_rad:W    0.104497
6                     diffuse_rad:W    0.050281
32                    sun_azimuth:d    0.006933
37                     visibility:m    0.005336
33                  sun_elevation:d    0.002193
35                      t_1000hPa:K    0.001213
45                             Year    0.001080
2             clear_sky_energy_1h:J    0.001030
7                  diffuse_rad_1h:J    0.000990
21             precip_type_5min:idx    0.000900
9                   direct_rad_1h:J    0.000843
0          absolute_humidity_2m:gm3    0.000738
5                    dew_point_2m:K    0.000662
23                 pressure_50m:hPa    0.000547
39              wind_speed_u_10m:ms    0.000318
38                wind_speed_10m:ms    0.000315
22                pressure_100m:hPa    0

In [25]:
training2 = training_and_validation.drop(columns=['pv_measurement'])
x_training = training2.drop(columns=['date_forecast'])
x_training.reset_index(drop=True, inplace=True)

y_training = training_and_validation['pv_measurement']  # Target variable
y_training.reset_index(drop=True, inplace=True)

#test = test.drop(columns = ['date_forecast'])

### KNN

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingRegressor
# Step 2: Create and train the Gradient Boosting model
knn_model=KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = knn_model.predict(X_test)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 147.92


### SVC

In [8]:
"""# Step 2: Create and train the Gradient Boosting model
sv_model=SVC(C=5, gamma =1, kernel= 'rbf')
sv_model.fit(X_train, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = sv_model.predict(X_test)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')"""

"# Step 2: Create and train the Gradient Boosting model\nsv_model=SVC(C=5, gamma =1, kernel= 'rbf')\nsv_model.fit(X_train, y_train)\n\n# Step 3: Evaluate the model's performance on the validation data\ny_pred = sv_model.predict(X_test)\n\n# Calculate the mean absolute error (MAE)\nmae = mean_absolute_error(y_test, y_pred)\nprint(f'Mean absolute error on validation data: {mae:.2f}')"

### GaussianNB

In [9]:
# Step 2: Create and train the Gradient Boosting model
nb_model=GaussianNB()
nb_model.fit(X_train, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = nb_model.predict(X_test)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 273.89


### XG BOOST

In [10]:
from xgboost import XGBClassifier

# Initialize a label encoder
encoder = LabelEncoder()

# Fit the encoder to y_train and transform it
y_train_encoded = encoder.fit_transform(y_train)

# Transform y_test with the same encoder
y_test_encoded = encoder.transform(y_test)

# Now, proceed with training
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train_encoded)
y_pred_encoded = xgb_model.predict(X_test)

# Calculate MAE with the encoded labels
mae = mean_absolute_error(y_test_encoded, y_pred_encoded)
print(f'Mean absolute error on validation data: {mae:.2f}')

ValueError: y contains previously unseen labels: [1085, 1222, 1369, 1394, 1532, 1548, 1653, 1708, 1812, 1814, 1933, 1950, 1978, 1992, 1993, 2034, 2050, 2052, 2167, 2199, 2318, 2435, 2445, 2493, 2567, 2586, 2795, 2810, 2825, 2864, 2866, 2914, 2939, 2968, 2970, 2972, 3057, 3181, 3254, 3334, 3374, 3385, 3416, 3419, 3526, 3608, 3619, 3647, 3666, 3747, 3765, 3846, 3866, 3948, 3949, 3953, 3968, 3984, 4008, 4209, 4221, 4250, 4318, 4335, 4413, 4420, 4475, 4510, 4533, 4565, 4569, 4574, 4608, 4612, 4635, 4690, 4706, 4709, 4807, 4842, 4848, 4861, 4881, 4890, 4895, 4897, 4976, 5002, 5043]

### ARIMA

In [6]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error

# Ensure your data is sorted chronologically if it's a time series.

# Step 2: Create and train the ARIMA model
# Here, p, d, and q are the parameters for ARIMA:
# p is the number of lag observations included (lag order)
# d is the number of times that the raw observations are differenced (degree of differencing)
# q is the size of the moving average window (order of moving average)
# You may need to experiment and tune these parameters.
p, d, q = 1, 1, 1

arima_model = ARIMA(y_train, order=(p,d,q))
arima_model_fit = arima_model.fit()

# Step 3: Evaluate the model's performance on the validation data
y_pred = arima_model_fit.forecast(steps=len(y_test))

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Mean absolute error on validation data: 169.24


In [9]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error

# Ensure your data is sorted chronologically.

# Step 2: Create and train the SARIMA model
p, d, q = 1, 1, 1
P, D, Q, s = 1, 1, 1, 8760  # s is set to 8760 for hourly data with yearly seasonality

sarima_model = SARIMAX(y_train, order=(p,d,q), seasonal_order=(P,D,Q,s), initialization='approximate_diffuse')
sarima_model_fit = sarima_model.fit(low_memory=True)

# Step 3: Evaluate the model's performance on the validation data
y_pred = sarima_model_fit.forecast(steps=len(y_test))

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

MemoryError: Unable to allocate 11.9 GiB for an array with shape (90785, 17521) and data type float64

### SVM

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Scaling data can often help SVM converge faster and perform better
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Create and train the Support Vector Machine model with a linear kernel and other modified parameters
svr_model = SVR(kernel='linear', C=1.0, tol=0.01)  # Adjust parameters as needed
svr_model.fit(X_train_scaled, y_train)

# Step 3: Evaluate the model's performance on the validation data
y_pred = svr_model.predict(X_test_scaled)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

In [8]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error

# Define alphas that you want to test
alphas = [0.01, 0.1, 1.0, 10.0, 100.0]

# Step 2: Create and train the Ridge Regression model with cross-validation
ridge_cv_model = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_cv_model.fit(X_train, y_train)

# Print the best alpha value
print(f'Best alpha: {ridge_cv_model.alpha_}')

# Step 3: Evaluate the model's performance on the validation data
y_pred = ridge_cv_model.predict(X_test)

# Calculate the mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

Best alpha: 100.0
Mean absolute error on validation data: 6905.65


In [14]:
# Install and import necessary libraries
!pip install fbprophet

Collecting fbprophet

  error: subprocess-exited-with-error
  
  × Building wheel for fbprophet (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [60 lines of output]
        from pkg_resources import (
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib
      creating build\lib\fbprophet
      creating build\lib\fbprophet\stan_model
      Traceback (most recent call last):
        File "c:\users\malin\onedrive - ntnu\5.klasse\hÃ¸st 2023\maskinlÃ¦ring\main exercise\solar prediction\.venv\lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 353, in <module>
          main()
        File "c:\users\malin\onedrive - ntnu\5.klasse\hÃ¸st 2023\maskinlÃ¦ring\main exercise\solar prediction\.venv\lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 335, in main
          json_out['return_val'] = hook(**hook_input['kwargs'])
        File "c:\users\malin\onedrive - ntnu\5.klasse\hÃ¸st 202


  Using cached fbprophet-0.7.1.tar.gz (64 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting Cython>=0.22 (from fbprophet)
  Obtaining dependency information for Cython>=0.22 from https://files.pythonhosted.org/packages/bd/48/a32de24cffce9da801a7fe9c509d81c2d9fdc63954155076b061091c0d5b/Cython-3.0.4-cp37-cp37m-win_amd64.whl.metadata
  Using cached Cython-3.0.4-cp37-cp37m-win_amd64.whl.metadata (3.2 kB)
Collecting cmdstanpy==0.9.5 (from fbprophet)
  Using cached cmdstanpy-0.9.5-py3-none-any.whl (37 kB)
Collecting pystan>=2.14 (from fbprophet)
  Using cached pystan-3.3.0-py3-none-any.whl (13 kB)
Collecting LunarCalendar>=0.0.9 (from fbprophet)
  Using cached LunarCalendar-0.0.9-py2.py3-none-any.whl 

In [15]:


import pandas as pd
from fbprophet import Prophet
from sklearn.metrics import mean_absolute_error

# Assuming X_train, y_train, X_test, and y_test are already loaded as pandas DataFrames or Series

# Format data for Prophet: 'ds' for timestamp and 'y' for the value
train_df = pd.DataFrame({
    'ds': X_train['your_timestamp_column'].values,
    'y': y_train.values
})

test_df = pd.DataFrame({
    'ds': X_test['your_timestamp_column'].values,
    'y': y_test.values
})

# Create and train the Prophet model with yearly seasonality
prophet = Prophet(yearly_seasonality=True)
prophet.fit(train_df)

# Generate future dataframe for predictions
future = prophet.make_future_dataframe(periods=len(test_df))

# Predict
forecast = prophet.predict(future)

# Extract predicted values for the test set
y_pred = forecast['yhat'][-len(test_df):].values

# Evaluate the model's performance on the validation data
mae = mean_absolute_error(test_df['y'], y_pred)
print(f'Mean absolute error on validation data: {mae:.2f}')

ModuleNotFoundError: No module named 'fbprophet'