In [None]:
# Libraries import

import warnings
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
# NOTE(review): with no category/module filter this silences every warning
# for the rest of the session, deprecation warnings from libraries included.
warnings.filterwarnings('ignore')

In [None]:
# Selecting folder
#
# The raw data lives in <project root>/data/raw. The project root is taken to
# be the parent of the current working directory (the original code sliced the
# last 10 characters off the cwd, i.e. a separator plus a 9-character folder
# name such as "notebooks" -- TODO confirm the notebook folder name).
# os.path is used so the path is not tied to Windows separators.

data_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "raw")

# os.listdir returns entries in arbitrary order; sort them so that pop(0) and
# files[i] always refer to the same entries on every machine.
files = sorted(os.listdir(data_dir))

# Drop the first entry, as the original did.
# NOTE(review): presumably a non-data placeholder file -- verify against the folder.
files.pop(0)

In [None]:
# Loading the data (change 'files' index to modify meteorological station)
# header=1: the column names are read from the second row of the sheet.
# os.path.join is used instead of a hard-coded backslash separator.

df = pd.read_excel(os.path.join(data_dir, files[0]), header=1)

In [None]:
# Standardizing variable names, defining date index and adding new time variables
#
# NOTE: this cell mutates `df` in place, so it can only be run once per load;
# re-run the loading cell before executing it again.

df.columns = ['date', 'wind speed', 't max', 't min', 'humidity max', 'humidity min', 'vpd', 'evaporation', 'solar radiation']

# Keep a copy of the date column (with the original positional index) before
# it becomes the frame's index; later cells reference `aux`.
aux = df['date']

df = df.set_index('date')
df['year'] = df.index.year
df['month'] = df.index.month
# DatetimeIndex.weekday_name was removed in pandas 1.0; day_name() returns the
# same English day names and is available since pandas 0.23.
df['weekday name'] = df.index.day_name()

df

In [None]:
# Chronological data split: 1998-2008 is used for training, 2009-2012 for testing.
# The target is 'solar radiation'; the calendar helper columns and the target
# itself are removed from the feature matrices.

target_col = 'solar radiation'
non_feature_cols = ['year', 'month', 'weekday name', target_col]

train_part = df.loc['1998':'2008']
test_part = df.loc['2009':'2012']

y_train = train_part[target_col]
y_test = test_part[target_col]

X_train = train_part.drop(columns=non_feature_cols)
X_test = test_part.drop(columns=non_feature_cols)

In [None]:
# Display the training feature matrix to check which columns remain as predictors
X_train

### A) Multiple Linear Regression Model Using The Original Variables

In [None]:
# Fitting multiple linear regression model to the training set
#
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. Its old value (False) was the
# default, so dropping it leaves the fitted model unchanged.

regressor_orig = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None)
regressor_orig.fit(X_train, y_train)

In [None]:
# Predicting the test set results: apply the fitted original-variable model
# (regressor_orig) to the test features X_test

y_pred_orig = regressor_orig.predict(X_test)

In [None]:
# Calculation of metrics (all evaluated on the test set)
#
# - RMSE is derived as sqrt(MSE) instead of using the `squared=` keyword of
#   mean_squared_error, which is deprecated in recent scikit-learn releases and
#   later removed; the values are identical.
# - Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), where n must be the
#   number of samples R² was computed on. R² here is a test-set score, so n is
#   the test sample size (the original used the training sample size).

mse_orig = metrics.mean_squared_error(y_test, y_pred_orig)
r2_orig = metrics.r2_score(y_test, y_pred_orig)
n_test_orig, n_features_orig = X_test.shape

print("MSE:", mse_orig)
print("RMSE:", np.sqrt(mse_orig))
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_orig))
print("R²:", r2_orig)
print("R²_adj:", 1 - (1 - r2_orig) * (n_test_orig - 1) / (n_test_orig - n_features_orig - 1))

In [None]:
# Report the fitted parameters of the original-variable model:
# the per-feature coefficients first, then the intercept (independent term)

fitted_params_orig = (
    ("Coefficients:", regressor_orig.coef_),
    ("Intercept:", regressor_orig.intercept_),
)
for label, value in fitted_params_orig:
    print(label, value)

In [None]:
# Plot predicted vs. observed solar radiation over the test period.
#
# The predictions are indexed with y_test.index so they always line up with
# the observed series. The original used the hard-coded row offset
# aux[4018:], which only matches when the training slice holds exactly 4018
# rows and the data ends with the test period.
y_pred_orig_df = pd.DataFrame(data=y_pred_orig, index=y_test.index)

plt.figure()
plt.plot(y_test, label='Real')
plt.plot(y_pred_orig_df, color='red', label='Predicted')
plt.title('Prediction vs Real Values')
plt.xlabel('Date')
plt.ylabel('Solar radiation')
plt.legend()
plt.show()

### B) Multiple Linear Regression Model Adding Polynomial Features

In [None]:
# Creating polynomial features (interaction_only=False)
#
# NOTE(review): the original comment announced degree = 2 but the code has
# always used degree 1, which only adds the constant bias column to the
# original variables. The runtime value is kept at 1 so reported results do
# not change silently; set POLY_DEGREE = 2 to add squared and interaction terms.
POLY_DEGREE = 1

# Fit the transformer on the training features only and reuse it for the test
# features, instead of fitting a second transformer on the test set.
poly_train = PolynomialFeatures(POLY_DEGREE, interaction_only=False)
X_train_pol = poly_train.fit_transform(X_train)
X_test_pol = poly_train.transform(X_test)

# Backward-compatible alias: earlier versions exposed a separate `poly_test`.
poly_test = poly_train

In [None]:
# Fitting multiple linear regression model to the training set (polynomial features)
#
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. Its old value (False) was the
# default, so dropping it leaves the fitted model unchanged.

regressor_pol = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None)
regressor_pol.fit(X_train_pol, y_train)

In [None]:
# Predicting the test set results: apply the model fitted on the polynomial
# features (regressor_pol) to the transformed test features X_test_pol

y_pred_pol = regressor_pol.predict(X_test_pol)

In [None]:
# Calculation of metrics (all evaluated on the test set)
#
# - RMSE is derived as sqrt(MSE) instead of using the `squared=` keyword of
#   mean_squared_error, which is deprecated in recent scikit-learn releases and
#   later removed; the values are identical.
# - Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), where n must be the
#   number of samples R² was computed on. R² here is a test-set score, so n is
#   the test sample size (the original used the training sample size).
# - p counts predictors only. PolynomialFeatures is created without
#   include_bias, so by default its output carries a constant bias column,
#   which is excluded from p here.

mse_pol = metrics.mean_squared_error(y_test, y_pred_pol)
r2_pol = metrics.r2_score(y_test, y_pred_pol)
n_test_pol = X_test_pol.shape[0]
n_features_pol = X_test_pol.shape[1] - 1

print("MSE:", mse_pol)
print("RMSE:", np.sqrt(mse_pol))
print("MAE:", metrics.mean_absolute_error(y_test, y_pred_pol))
print("R²:", r2_pol)
print("R²_adj:", 1 - (1 - r2_pol) * (n_test_pol - 1) / (n_test_pol - n_features_pol - 1))

In [None]:
# Report the fitted parameters of the polynomial-feature model:
# the per-column coefficients first, then the intercept (independent term)

fitted_params_pol = (
    ("Coefficients:", regressor_pol.coef_),
    ("Intercept:", regressor_pol.intercept_),
)
for label, value in fitted_params_pol:
    print(label, value)

In [None]:
# Plot predicted vs. observed solar radiation over the test period
# (polynomial-feature model).
#
# The predictions are indexed with y_test.index so they always line up with
# the observed series. The original used the hard-coded row offset
# aux[4018:], which only matches when the training slice holds exactly 4018
# rows and the data ends with the test period.
y_pred_pol_df = pd.DataFrame(data=y_pred_pol, index=y_test.index)

plt.figure()
plt.plot(y_test, label='Real')
plt.plot(y_pred_pol_df, color='red', label='Predicted')
plt.title('Prediction vs Real Values')
plt.xlabel('Date')
plt.ylabel('Solar radiation')
plt.legend()
plt.show()

In [None]:
# TODO: add a K-fold cross-validation procedure (KFold) combined with a scikit-learn Pipeline