## Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Preprocessing

In [None]:
# Location of the hourly household dataset, relative to this notebook.
csv_path = '../data/household_data_60min_singleindex.csv'

# Read the CSV with both timestamp columns parsed as dates; the UTC
# timestamp becomes the index.
df = pd.read_csv(
    csv_path,
    index_col="utc_timestamp",
    parse_dates=["utc_timestamp", "cet_cest_timestamp"],
)

# Everything that does not belong to the DE_KN_residential2 household is
# discarded (this includes the remaining cet_cest_timestamp column).
is_residential2 = df.columns.str.startswith('DE_KN_residential2')
columns_to_drop = df.columns[~is_residential2].tolist()
# Optional, currently disabled: also drop the grid import column.
#columns_to_drop.append('DE_KN_residential2_grid_import')

# Remove the unwanted columns, then rows in which every remaining value is NaN.
df = df.drop(columns=columns_to_drop).dropna(how='all')

# Report the resulting shape and show the first rows.
print(f'Data shape: {df.shape}')
df.head()

## Processing data

In [None]:
# Aggregate the hourly rows into calendar-day bins by summing each column.
# NOTE(review): the later .diff() step suggests these columns may hold
# cumulative meter readings; if so, the last reading of each day (not the
# sum of the hourly readings) is probably the intended aggregate - confirm.
df = df.resample(rule='D').sum()

In [None]:
# Pull each column of interest out of the daily frame as its own Series.
DE_KN_residential2_circulation_pump = df['DE_KN_residential2_circulation_pump']
DE_KN_residential2_dishwasher = df['DE_KN_residential2_dishwasher']
DE_KN_residential2_freezer = df['DE_KN_residential2_freezer']
DE_KN_residential2_washing_machine = df['DE_KN_residential2_washing_machine']
DE_KN_residential2_grid_import = df['DE_KN_residential2_grid_import']

# Day-over-day difference of every series, kept as a one-column DataFrame.
# diff() leaves NaN where there is no previous row (the first day); those
# NaNs are replaced with 0.
daily_usage_circulation_pump = DE_KN_residential2_circulation_pump.to_frame().diff().fillna(0)
daily_usage_dishwasher = DE_KN_residential2_dishwasher.to_frame().diff().fillna(0)
daily_usage_freezer = DE_KN_residential2_freezer.to_frame().diff().fillna(0)
daily_usage_washing_machine = DE_KN_residential2_washing_machine.to_frame().diff().fillna(0)
daily_grid_import = DE_KN_residential2_grid_import.to_frame().diff().fillna(0)

# Quick look at the circulation pump result.
print(f'Data shape: {daily_usage_circulation_pump.shape}')
daily_usage_circulation_pump.head()

In [None]:
# The single column of the circulation pump frame, used for all statistics below.
pump_delta = daily_usage_circulation_pump['DE_KN_residential2_circulation_pump']

mean = pump_delta.mean()
std_dev = pump_delta.std()

# Accepted range: within 5 standard deviations of the mean.
lower_bound = mean - 5 * std_dev
upper_bound = mean + 5 * std_dev

# Keep only the rows whose value lies inside the accepted range
# (both bounds inclusive).
df_filtered = daily_usage_circulation_pump[pump_delta.between(lower_bound, upper_bound)]


## Graphs

In [None]:
# Column total of the filtered frame.
print(df_filtered.sum())

# Line plot of the filtered circulation pump values over the date index,
# drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(df_filtered.index, df_filtered['DE_KN_residential2_circulation_pump'])

ax.set_xlabel('Czas')
ax.set_ylabel('Wartość')
ax.set_title('Wykres liniowy z kilkoma seriami danych')

fig.tight_layout()
plt.show()

## Normality

In [None]:
from scipy.stats  import normaltest

# normaltest works along axis 0 by default, so a DataFrame input yields one
# statistic and one p-value per column.
statistic, p_value = normaltest(df)

print(f'Test statistic: {statistic}')

# The null hypothesis of normaltest is that the sample comes from a normal
# distribution. A p-value BELOW the significance level therefore rejects
# normality; a larger one only means normality cannot be rejected.
# The p-values are reported per column because their sum has no statistical
# meaning.
alpha = 0.05
for column, p in zip(df.columns, np.atleast_1d(p_value)):
    verdict = 'normality rejected' if p < alpha else 'normality not rejected'
    print(f'{column}: p-value = {p:.4g} ({verdict} at alpha = {alpha})')

## Split data 

In [None]:
# Prepare features and target
# scikit-learn estimators expect a numeric 2-D feature matrix of shape
# (n_samples, n_features). The raw DatetimeIndex is 1-D and not numeric, so
# it is converted to "whole days elapsed since the first timestamp" and
# reshaped into a single-column matrix.
days_since_start = (df.index - df.index[0]).days
features = np.asarray(days_since_start).reshape(-1, 1)
target = df

# Split data for training and testing (70 % train / 30 % test, fixed seed
# so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=22)


## LinearRegression model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Build the linear regression estimator and fit it on the training split;
# fit() returns the fitted estimator itself.
model_LR = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = model_LR.predict(X_test)

# Report the mean squared error between the test targets and the predictions.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)