## Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import normaltest

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

## Pre-processing

In [None]:
# Load the hourly household data set, indexed by its UTC timestamp column.
csv_path = '../data/household_data_60min_singleindex.csv'
timestamp_columns = ["utc_timestamp", "cet_cest_timestamp"]

df = pd.read_csv(csv_path, index_col="utc_timestamp", parse_dates=timestamp_columns)
print('input data: ', df.shape)

# Keep only the columns of one household: every column whose name does not
# start with 'DE_KN_residential2' (residential building in suburban area)
# is discarded, then rows with no value in any remaining column are removed.
columns_to_drop = [col for col in df.columns if not col.startswith('DE_KN_residential2')]
df = df.drop(columns=columns_to_drop).dropna(how='all')

# Data shape and sample
print(f'Data shape after droping NaN {df.shape}')
df.head()

## Processing data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

### NaN values

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Replace every value by its change from the previous row; the first row
# (which has no predecessor) and any other missing difference become 0.
# NOTE(review): presumably the raw columns are cumulative meter readings, so
# the differences are per-interval consumption - confirm against the data set.
row_changes = df.diff()
df = row_changes.fillna(0)
df.head()

In [None]:
K = 2  # number of standard deviations tolerated around each column mean

# Outlier removal: keep only the rows in which every column lies within
# mean +/- K standard deviations of that column.
#
# The bounds are computed once, from the unfiltered frame. Filtering column by
# column while recomputing mean/std on the already-reduced frame would make
# the thresholds (and therefore the surviving rows) depend on column order.
column_means = df.mean()
column_stds = df.std()
lower_bounds = column_means - K * column_stds
upper_bounds = column_means + K * column_stds

# Comparing a DataFrame with a Series aligns the Series on the column labels,
# so each column is checked against its own bounds.
within_bounds = ((df >= lower_bounds) & (df <= upper_bounds)).all(axis=1)
df = df[within_bounds]

In [None]:
# Set the grid-import column aside and remove it from the working frame
grid_column = 'DE_KN_residential2_grid_import'
grid_import = df[grid_column]
df = df.drop(columns=[grid_column])

In [None]:
# Total of all remaining numeric columns for each row, stored as column 'sum'
row_totals = df.sum(axis=1, numeric_only=True)
df['sum'] = row_totals
df.head()

## Graphs

In [None]:
# Energy consumption plot: one line per appliance, aggregated to daily sums
df_daily_sum = df.resample('D').sum()

# Column name -> legend label, in drawing order
appliance_labels = {
    'DE_KN_residential2_dishwasher': 'Zmywarka',
    'DE_KN_residential2_freezer': 'Zamrażarka',
    'DE_KN_residential2_circulation_pump': 'Pompa obiegowa',
    'DE_KN_residential2_washing_machine': 'Pralka',
}

plt.figure(figsize=(10, 6))
for column, label in appliance_labels.items():
    plt.plot(df_daily_sum.index, df_daily_sum[column], label=label)

plt.title('Zużycie energi elektrycznej w czasie')
plt.xlabel('Data')
plt.ylabel('Zużycie')

plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# (rows, columns) of the daily-resampled frame
df_daily_sum.shape

In [None]:
# Sum energy consumption plot: daily total of the 'sum' column
daily_total = df_daily_sum['sum']

plt.figure(figsize=(10, 6))
plt.plot(daily_total.index, daily_total, label='Domostwo')

plt.title('Zużycie energi elektrycznej w czasie')
plt.xlabel('Data')
plt.ylabel('Zużycie energii [kWh]')

plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the columns of df (pandas defaults to Pearson)
correlation_matrix = df.corr()
# Preview the first rows of the matrix
correlation_matrix.head()

In [None]:
# One histogram per column of df, drawn on a shared figure
hist_figsize = (10, 8)
df.hist(figsize=hist_figsize)
plt.suptitle("Histograms of Data Columns")
plt.show()

In [None]:
# Annotated heatmap of the correlation matrix; the diverging colour map is
# centred on zero so positive and negative correlations are distinguishable.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Pairwise scatter plots (scatter_matrix) of the appliance columns and their sum
scatter_columns = [
    'DE_KN_residential2_circulation_pump',
    'DE_KN_residential2_dishwasher',
    'DE_KN_residential2_freezer',
    'DE_KN_residential2_washing_machine',
    'sum',
]
scatter_matrix(df[scatter_columns], figsize=(15, 15))
plt.suptitle("Scatter Matrix of Selected Features")
plt.show()

## Split data 

In [None]:
# From here on `df` refers to the daily sums rather than the hourly rows
df = pd.DataFrame(df_daily_sum)

# Single feature: the timestamp index as an integer divided by 10**9
# (epoch seconds, assuming the index has nanosecond resolution), shaped as
# an (n_samples, 1) column. Target: the daily 'sum' column.
epoch_seconds = df.index.astype(np.int64) // 10**9
features = epoch_seconds.values.reshape(-1, 1)
# Alternative feature kept for reference: grid_import.values.reshape(-1,1)
target = df['sum']

# Hold out 20% of the days for testing; fixed seed for a repeatable split
split_parts = train_test_split(features, target, test_size=0.2, random_state=11)
X_train, X_test, y_train, y_test = split_parts


## LinearRegression model

In [None]:
# Create the Linear Regression model and train it on the training split
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)

# Coefficient of determination on the held-out split
print(f'R^2: {model_LR.score(X_test,y_test)}')

# Make predictions on the testing data
y_pred = model_LR.predict(X_test)

# Evaluate model performance using Mean Squared Error and report it, matching
# the output of the decision-tree and random-forest cells.
mse_LR = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_LR)

In [None]:
# Actual and predicted values side by side, indexed like y_test
y_pred = model_LR.predict(X_test)
results_df = y_test.to_frame('Actual')
results_df['Predicted'] = y_pred

# One line per column, drawn against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=results_df, ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.set_title("Actual vs Predicted Values Over Time for Linear Regression model")
ax.legend()
plt.show()

In [None]:
# Predicted-vs-actual scatter for the linear model; points on the red
# diagonal (y_test plotted against itself) would be perfect predictions.
# The legend label previously ended in a dangling "for ".
plt.scatter(y_test, y_pred, label='Predicted vs Actual')
plt.plot(y_test, y_test, color='r', label='Perfect Prediction Line')
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values for Linear Regression model")
plt.legend()
plt.show()

## DecisionTreeRegressor Model

In [None]:
# Regression tree limited to 15 levels, trained on the training split
model_tree = DecisionTreeRegressor(max_depth=15)  # You can adjust hyperparameters like max_depth
# fit() is the last expression of the cell, so the fitted estimator is displayed
model_tree.fit(X_train, y_train)

In [None]:
# Score the fitted tree on the held-out split: R^2 first, then the MSE
r2_DTR = model_tree.score(X_test, y_test)
print(f'R^2: {r2_DTR}')

y_pred = model_tree.predict(X_test)
mse_DTR = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_DTR)

In [None]:
# Tree predictions next to the true values, sharing y_test's index
y_pred = model_tree.predict(X_test)
results_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)

# Both columns are drawn as lines against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=results_df, ax=ax)
ax.set_title("Actual vs Predicted Values Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.legend()
plt.show()

In [None]:
# Predicted-vs-actual scatter for the decision tree; the red diagonal
# (y_test against itself) marks perfect predictions.
plt.scatter(y_test, y_pred, label='Predicted vs Actual')
plt.plot(y_test, y_test, color='r', label='Perfect Prediction Line')

plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values")
# The labels above are only rendered once a legend is requested
plt.legend()
plt.show()

## RandomForestRegressor Model

In [None]:
# Forest of 100 trees with a fixed seed; fit() returns the estimator itself
model_rfr = RandomForestRegressor(n_estimators=100, random_state=100).fit(X_train, y_train)

# Held-out R^2 followed by the mean squared error of the predictions
forest_r2 = model_rfr.score(X_test, y_test)
print(f'R^2: {forest_r2}')

y_pred = model_rfr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Forest predictions paired with the true values on y_test's index
y_pred = model_rfr.predict(X_test)
results_df = y_test.to_frame('Actual')
results_df['Predicted'] = y_pred

# Line per column, x axis taken from the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=results_df, ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.set_title("Actual vs Predicted Values Over Time")
ax.legend()
plt.show()

In [None]:
# Predicted-vs-actual scatter for the random forest; the red diagonal
# (y_test against itself) marks perfect predictions.
plt.scatter(y_test, y_pred, label='Predicted vs Actual')
plt.plot(y_test, y_test, color='r', label='Perfect Prediction Line')

plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values")
# The labels above are only rendered once a legend is requested
plt.legend()
plt.show()

## GaussianProcessRegressor Model

In [None]:
# Prepare features and target again for the Gaussian-process section:
# the integer timestamp index divided by 10**9 as a single feature column,
# and the daily 'sum' column as target.
timestamp_seconds = df.index.astype(np.int64) // 10**9
features = timestamp_seconds.values.reshape(-1, 1)
# Alternative feature kept for reference: grid_import.values.reshape(-1,1)
target = df['sum']

# Split data for training and testing (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=11,
)


In [None]:
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, WhiteKernel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The feature is a raw integer timestamp (seconds), so neighbouring daily
# samples are 86400 units apart. RBF's default length-scale bounds stop at
# 1e5, which prevents the optimiser from learning any trend longer than about
# a day on that scale. Standardising the feature first puts it on a scale the
# default kernel hyper-parameter bounds can handle.
kernel_rbf = RBF(length_scale=1.0)
# WhiteKernel lets the model attribute part of the variance to observation
# noise instead of interpolating every training point exactly.
kernel = kernel_rbf + WhiteKernel(noise_level=1.0)

# normalize_y=True centres/scales the target so the zero-mean GP prior is
# appropriate; random_state makes the optimiser restarts repeatable.
# Wrapping everything in a pipeline keeps the fit/predict/score interface, so
# model_rbf can still be called with the unscaled X_train / X_test.
model_rbf = make_pipeline(
    StandardScaler(),
    GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=10,
        normalize_y=True,
        random_state=11,
    ),
)
model_rbf.fit(X_train, y_train)
y_pred_rbf = model_rbf.predict(X_test)
print(f'R^2: {model_rbf.score(X_test,y_test)}')

In [None]:
# Gaussian-process predictions next to the true values, indexed like y_test
y_pred = model_rbf.predict(X_test)
results_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)

# Both columns drawn as lines against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=results_df, ax=ax)
ax.set_title("Actual vs Predicted Values Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Value")
ax.legend()
plt.show()

In [None]:
# Predicted-vs-actual scatter for the Gaussian process; the red diagonal
# (y_test against itself) marks perfect predictions.
plt.scatter(y_test, y_pred, label='Predicted vs Actual')
plt.plot(y_test, y_test, color='r', label='Perfect Prediction Line')

plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values")
# The labels above are only rendered once a legend is requested
plt.legend()
plt.show()

In [None]:
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Set the Seaborn style
sns.set_style('whitegrid')

# The frame only has the columns 'Actual' and 'Predicted'; the timestamps are
# its index (inherited from y_test), so results_df['utc_timestamp'] would
# raise KeyError. The residuals are computed directly and ordered by date.
# sns.residplot is not used here: it fits its own regression of y on x and
# plots the residuals of that fit, which is not wanted for values that are
# already residuals.
residuals = (results_df['Actual'] - results_df['Predicted']).sort_index()

plt.figure(figsize=(10, 6))
plt.scatter(residuals.index, residuals.to_numpy())
# Zero line: points above it are under-predictions, points below over-predictions
plt.axhline(0, color='r', linestyle='--')
plt.xlabel("Date")
plt.ylabel("Residuals")
plt.title("Residuals of Predicted Energy Consumption Over Time")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()