## Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Preprocessing

In [None]:
# Load the hourly household dataset, keeping only one household's series.
csv_path = '../data/household_data_60min_singleindex.csv'

INDEX_COLUMN = 'utc_timestamp'
COLUMN_PREFIX = 'DE_KN_residential2'

# Read only the index column and the wanted series instead of loading every
# column and dropping most of them afterwards. Only the index is parsed as a
# date: the other timestamp column was parsed and then discarded right away.
df = pd.read_csv(
    csv_path,
    usecols=lambda name: name == INDEX_COLUMN or name.startswith(COLUMN_PREFIX),
    parse_dates=[INDEX_COLUMN],
    index_col=INDEX_COLUMN,
)

# Clear data: missing readings are replaced with 0
df = df.fillna(0)

# Data shape and sample
print(f'Data shape: {df.shape}')
df.sample()

## Graphs

In [None]:
# Stacked bar chart of every series in df, drawn on ONE figure.
# Previously this cell opened three figures (an empty 30x6 one, a line plot that
# received the date formatting, and the bar chart from df.plot that received the
# labels), so the size and the month ticks never reached the bar chart.
fig, ax = plt.subplots(figsize=(30, 6))

# Bar width in days (matplotlib's date unit), taken from the index sampling step.
bar_width = (df.index[1] - df.index[0]) / pd.Timedelta(days=1) if len(df) > 1 else 1.0

# Stack the series by hand with ax.bar so the x axis stays a real date axis;
# pandas' kind='bar' places bars at categorical positions, which the mdates
# locator/formatter below cannot work with.
# NOTE(review): a single running baseline assumes non-negative values - confirm.
baseline = np.zeros(len(df))
for column in df.columns:
    heights = df[column].to_numpy()
    ax.bar(df.index, heights, width=bar_width, bottom=baseline, label=column)
    baseline = baseline + heights

# Format the x axis: one tick per month, labelled with the abbreviated month name
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))

ax.set_xlabel('Czas')
ax.set_ylabel('Wartość')
ax.set_title('Wykres kolumnowy z kilkoma seriami danych')
ax.legend(loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# Line chart: one line per column of df, plotted against the index.
line_fig, line_ax = plt.subplots(figsize=(10, 6))

for series_name in df.columns:
    line_ax.plot(df.index, df[series_name], label=series_name)

# Axis labels, title and legend
line_ax.set_xlabel('Czas')
line_ax.set_ylabel('Wartość')
line_ax.set_title('Wykres liniowy z kilkoma seriami danych')
line_ax.legend()

line_fig.tight_layout()
plt.show()

## Normality

In [None]:
from scipy.stats import normaltest

# Significance level used to decide whether normality is rejected.
ALPHA = 0.05

# normaltest runs along axis 0, so this yields one statistic and one p-value
# per column of df.
statistic, p_value = normaltest(df)

# The null hypothesis is that the sample comes from a normal distribution:
# a p-value BELOW alpha rejects normality, it does not confirm it.
# Each column is reported separately; adding p-values together has no meaning.
for column, column_statistic, column_p in zip(df.columns, statistic, p_value):
    if column_p < ALPHA:
        verdict = 'normality rejected'
    else:
        verdict = 'no evidence against normality'
    print(f'{column}: test statistic = {column_statistic:.4f}, '
          f'p-value = {column_p:.4g} -> {verdict} (alpha = {ALPHA})')

## Split data 

In [None]:
# Prepare features and target
features = df
target = df.index

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=22)


## LinearRegression model

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so it can be chained).
model_LR = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split
y_pred = model_LR.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)