## Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Preprocessing

In [None]:
# Location of the hourly household dataset, relative to this notebook.
csv_path = '../data/household_data_60min_singleindex.csv'

# Read the CSV with both timestamp columns parsed as datetimes; the UTC
# timestamp becomes the row index.
df = pd.read_csv(
    csv_path,
    index_col="utc_timestamp",
    parse_dates=["utc_timestamp", "cet_cest_timestamp"],
)

# Only the 'DE_KN_residential2' household (residential building in a
# suburban area) is analysed: every column without that prefix is removed.
columns_to_drop = list(
    filter(lambda name: not name.startswith('DE_KN_residential2'), df.columns)
)

# Drop the unrelated columns, then discard rows that are empty in every
# remaining column.
df = df.drop(columns=columns_to_drop).dropna(how='all')

# Report the resulting size and preview the first rows.
print(f'Data shape: {df.shape}')
df.head()

## Processing data

In [None]:
# Replace every column by its row-to-row difference (missing differences,
# including the first row, become 0), then add the differences up per
# calendar day.
df = (
    df.diff()
    .fillna(0)
    .resample('D')
    .sum()
)
df.head()

In [None]:
# Outlier removal: keep a row only if, in every column, its value lies within
# K standard deviations of that column's mean (bounds inclusive).
K = 8  # half-width of the accepted band, in standard deviations

# The row mask is accumulated while `df` itself stays untouched, so each
# column's mean/std is computed from the same unfiltered data and the result
# does not depend on the order in which the columns are visited.
keep_rows = pd.Series(True, index=df.index)

for column_name in df:
    mean = df[column_name].mean()
    std_dev = df[column_name].std()

    # Accepted range for this column: mean +/- K standard deviations.
    lower_bound = mean - K * std_dev
    upper_bound = mean + K * std_dev

    # `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
    keep_rows &= df[column_name].between(lower_bound, upper_bound)

# Apply the combined mask once, after all columns have been inspected.
df = df[keep_rows]

In [None]:
# Preview the first rows that remain after the outlier filter.
df.head()

In [None]:
# Set the grid-import series aside under its own name, then remove that
# column from the frame so `df` keeps only the remaining columns.
grid_import = df.loc[:, 'DE_KN_residential2_grid_import']
df = df.drop('DE_KN_residential2_grid_import', axis='columns')

## Graphs

In [None]:
# Line chart of two appliance columns against the frame's date index.
plt.figure(figsize=(10, 6))

# (column name, legend label) pairs, drawn in this order.
for column, legend_label in (
    ('DE_KN_residential2_circulation_pump', 'Pompa obiegowa'),
    ('DE_KN_residential2_freezer', 'Zamrażarka'),
):
    plt.plot(df.index, df[column], label=legend_label)

# Title and axis captions.
plt.title('Zużycie energi elektrycznej w czasie')
plt.xlabel('Data')
plt.ylabel('Zużycie')

# Legend, slanted date ticks so they do not overlap, compact layout.
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Normality

In [None]:
from scipy.stats import normaltest

# Normality test applied to the whole frame; with the default axis this
# yields one statistic and one p-value per column.
# The null hypothesis is that the sample comes from a normal distribution,
# so a SMALL p-value is evidence AGAINST normality.
statistic, p_value = normaltest(df)

print(f'Test statistic: {statistic}')
print(f'P-values, if lower than 0.05 then normality is rejected (not a normal distribution): {p_value}')

## Split data 

In [None]:
# Single input feature: every index timestamp converted to an integer and
# floor-divided by 10**9, shaped as one column.
# NOTE(review): this yields whole seconds since the Unix epoch only if the
# index has nanosecond resolution - confirm.
# Alternative feature (disabled): grid_import.values.reshape(-1,1)
features = np.asarray(df.index.astype(np.int64) // 10**9).reshape(-1, 1)

# All remaining columns of the frame are predicted at once.
target = df

# Hold out 10% of the rows for testing; the seed fixes the shuffle.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(
    features,
    target,
    random_state=11,
    test_size=0.1,
)


## LinearRegression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Build the linear regression model and fit it on the training split
# (`fit` returns the estimator itself).
model_LR = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model_LR.predict(X_test)

# Score the predictions with the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Scatter of held-out actual values (x) against the current predictions (y).
plt.scatter(x=y_test, y=y_pred)

plt.title("Actual vs Predicted Values")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.show()

In [None]:
# Residual = actual minus predicted; points scattered evenly around the
# dashed zero line indicate no systematic over- or under-prediction.
residuals = y_test - y_pred

plt.scatter(x=y_pred, y=residuals)
plt.axhline(y=0, linestyle='--', color='r')
plt.title("Residual Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

In [None]:
# Show the fitted parameters of the linear model.
coefficients, intercept = model_LR.coef_, model_LR.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

## DecisionTreeRegressor Model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Regression tree capped at depth 8 (max_depth is a tunable hyperparameter),
# fitted on the training split.
model_tree = DecisionTreeRegressor(max_depth=8)
model_tree.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out rows with the tree (this rebinds `y_pred`) and score
# the result with the mean squared error.
y_pred = model_tree.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Scatter of held-out actual values (x) against the current predictions (y).
plt.scatter(x=y_test, y=y_pred)

plt.title("Actual vs Predicted Values")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.show()

In [None]:
# Residual = actual minus predicted, plotted against the prediction with a
# dashed reference line at zero.
residuals = y_test - y_pred

plt.scatter(x=y_pred, y=residuals)
plt.title('Residuals vs. Predicted')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.axhline(y=0, linestyle='--', color='r')
plt.show()

In [None]:
# Distribution of the residuals, grouped into 10 bins.
plt.hist(x=residuals, bins=10)

plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

## GaussianProcessRegressor Model

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

In [None]:
# Create the GaussianProcessRegressor model: its kernel is an RBF with
# length scale 1.0, scaled by a constant factor of 1.0.
kernel = 1.0 * RBF(length_scale=1.0)
model_GP = GaussianProcessRegressor(
    kernel=kernel,
)


In [None]:
# Fit the Gaussian process on the training split.
model_GP.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out rows; `return_std=True` additionally returns the
# standard deviation of each prediction.
y_pred_GP, sigma = model_GP.predict(
    np.reshape(X_test, (-1, 1)),
    return_std=True,
)

In [None]:
# Predicted-vs-actual chart for the Gaussian process model.
plt.figure(figsize=(10, 6))

# Plot the Gaussian-process predictions (`y_pred_GP`); `y_pred` still holds
# the decision-tree output and would show the wrong model here.
plt.scatter(y_test, y_pred_GP, color='blue', label='Predicted vs Actual')

# Reference diagonal (prediction == actual) drawn once across the observed
# range, so the legend gets a single entry even when `y_test` has several
# columns.
actual_values = np.asarray(y_test)
diagonal = [actual_values.min(), actual_values.max()]
plt.plot(diagonal, diagonal, color='red', label='Perfect Prediction Line')

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Predicted vs Actual Values')
plt.legend()
plt.show()

In [None]:
# Compute the mean squared error of the Gaussian process predictions.
mse_GP = mean_squared_error(y_true=y_test, y_pred=y_pred_GP)
print("Mean Squared Error (Gaussian Process):", mse_GP)