In [1]:
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import calendar

In [2]:
# Column names of the loads processed throughout this notebook.
# The same strings are used to index `df` columns and as keys of the
# `min_power` / `max_power` dictionaries, so they must match those exactly
# (note the mix of underscore-separated and space-separated names).
list_of_loads = [
    "P_CabinePower_L1",
    "P L1 WS Emergency Light",
    "P L1 WS Light",
    "P L1 WS FanCoil",
    "P L1 WS FM",
    "P_HeatPump",
    "P L2 WS Emergency Light",
    "P L2 WS Light",
    "P L2 WS FanCoil",
    "P L2 WS FM",
    "P_CabineLight_L3",
    "P L3 WS Emergency Light",
    "P L3 WS Light",
    "P L3 WS FanCoil",
    "P L3 WS FM"
]

In [3]:
# Smallest accepted reading per load (inclusive). The range filter applied to
# `df` later in this notebook replaces any value below this bound with NaN.
# Units are not stated in this notebook.
# NOTE(review): "P L3 WS FM" is the only "FM" load with a non-zero lower bound
# (200, versus 0 for L1 and L2) - confirm this is intentional.
min_power = {
    "P_CabinePower_L1": 50,
    "P L1 WS Emergency Light": 150,
    "P L1 WS Light": 200,
    "P L1 WS FanCoil": 0,
    "P L1 WS FM": 0,
    "P_HeatPump": 0,
    "P L2 WS Emergency Light": 200,
    "P L2 WS Light": 200,
    "P L2 WS FanCoil": 0,
    "P L2 WS FM": 0,
    "P_CabineLight_L3": 0,
    "P L3 WS Emergency Light": 0,
    "P L3 WS Light": 0,
    "P L3 WS FanCoil": 0,
    "P L3 WS FM": 200
}

In [4]:
# Largest accepted reading per load (inclusive). The range filter applied to
# `df` later in this notebook replaces any value above this bound with NaN.
# Units are not stated in this notebook.
max_power = {
    "P_CabinePower_L1": 150,
    "P L1 WS Emergency Light": 1500,
    "P L1 WS Light": 350,
    "P L1 WS FanCoil": 200,
    "P L1 WS FM": 4000,
    "P_HeatPump": 1000,
    "P L2 WS Emergency Light": 1700,
    "P L2 WS Light": 400,
    "P L2 WS FanCoil": 200,
    "P L2 WS FM": 1300,
    "P_CabineLight_L3": 10,
    "P L3 WS Emergency Light": 1000,
    "P L3 WS Light": 350,
    "P L3 WS FanCoil": 200,
    "P L3 WS FM": 2000
}

In [14]:
# Load the dataset from a path relative to the notebook's working directory,
# using the "Time" column as the index.
dataset_path = os.path.join("Preprocessing", "Dataset", "df.csv")
df = pd.read_csv(dataset_path, index_col="Time")
# Parse the index to datetimes: the later cells rely on datetime index
# features (.dayofweek, .hour, resample, date-string .loc lookups).
df.index = pd.to_datetime(df.index)

In [15]:
# Number of rows right after loading (baseline for the later steps)
df.shape[0]

4083840

In [16]:
# Missing values per column in the freshly loaded data
df.isna().sum()

P_CabinePower_L1           0
P L1 WS Emergency Light    0
P L1 WS Light              0
P L1 WS FanCoil            0
P L1 WS FM                 0
P_HeatPump                 0
P L2 WS Emergency Light    0
P L2 WS Light              0
P L2 WS FanCoil            0
P L2 WS FM                 0
P_CabineLight_L3           0
P L3 WS Emergency Light    0
P L3 WS Light              0
P L3 WS FanCoil            0
P L3 WS FM                 0
dtype: int64

In [17]:
# Mask every reading outside its load's [min_power, max_power] interval
# (both bounds inclusive). Rows are kept; only the offending value becomes NaN.
for load in list_of_loads:
    df[load] = df[load].where(df[load].between(min_power[load], max_power[load]))

In [18]:
# Row count after the range filter: values were masked, not dropped,
# so this should equal the count right after loading
df.shape[0]

4083840

In [19]:
# Per-column count of readings rejected by the range filter (now NaN)
df.isna().sum()

P_CabinePower_L1              327
P L1 WS Emergency Light     36338
P L1 WS Light              120652
P L1 WS FanCoil                18
P L1 WS FM                  15247
P_HeatPump                   3302
P L2 WS Emergency Light     29187
P L2 WS Light              293640
P L2 WS FanCoil                 0
P L2 WS FM                 267757
P_CabineLight_L3             3341
P L3 WS Emergency Light      1142
P L3 WS Light              169762
P L3 WS FanCoil               178
P L3 WS FM                 197503
dtype: int64

In [20]:
def distance(d1, d2, fields=None):
    """Return the Euclidean distance between two records over selected fields.

    Parameters
    ----------
    d1, d2 : mapping-like (e.g. ``pandas.Series`` or ``dict``)
        Records that support ``obj[field]`` lookup and yield numeric values.
    fields : iterable, optional
        Keys to include in the distance. Defaults to the notebook-level
        ``list_of_loads`` (the set that was previously hard-coded), so
        existing two-argument calls behave as before.

    Returns
    -------
    float
        Square root of the sum of squared per-field differences. A NaN in any
        compared field makes the result NaN, and an empty ``fields`` gives 0.0.
    """
    if fields is None:
        # Resolved at call time so the default follows the notebook's list.
        fields = list_of_loads
    return np.sqrt(sum((d1[field] - d2[field]) ** 2 for field in fields))

In [21]:
# For each day of the week (pandas convention: 0 = Monday ... 6 = Sunday)
# find the calendar day whose daily mean is closest, by `distance`, to the
# overall mean of that weekday. Each slot holds (date label, distance).
df_mean_day_of_week = df.groupby(df.index.dayofweek).mean()
df_mean_day = df.resample("D").mean()
best_days = [("", np.inf)] * 7
for index, row in df_mean_day.iterrows():
    dist = distance(row, df_mean_day_of_week.loc[index.dayofweek])
    # A NaN distance compares False here, so such days are never selected.
    if dist < best_days[index.dayofweek][1]:
        best_days[index.dayofweek] = (f"{index.year}-{index.month}-{index.day}", dist)

In [22]:
# Find the calendar days that have no rows at all and fill each of them with
# a copy of the selected "best" day for the same day of the week.
start_date = df.index.min().date()
end_date = df.index.max().date()
complete_index = pd.date_range(start=start_date, end=end_date, freq="D")
# Compare against midnight-normalized timestamps rather than an array of
# Python `date` objects: this stays in datetime64 and does not depend on
# pandas implicitly promoting an object array back to datetimes.
# NOTE(review): assumes a timezone-naive index - confirm.
missing_dates = complete_index.difference(df.index.normalize())

list_of_df_missing_date = []
for miss_date in missing_dates:
    # Explicit copy so that re-labelling the index never touches `df` itself.
    df_tmp = df.loc[best_days[miss_date.weekday()][0]].copy()
    # Keep each row's time of day and move it onto the missing date
    # (vectorized equivalent of calling Timestamp.replace(year, month, day)
    # on every row). The index name is carried over explicitly.
    df_tmp.index = (
        miss_date + (df_tmp.index - df_tmp.index.normalize())
    ).rename(df.index.name)
    list_of_df_missing_date.append(df_tmp)

df = pd.concat([df] + list_of_df_missing_date).sort_index()

In [23]:
# Row count after the filler days have been appended
df.shape[0]

4383360

In [24]:
# Missing values per column after appending the filler days
# (copied days bring along their own NaN readings)
df.isna().sum()

P_CabinePower_L1              327
P L1 WS Emergency Light     36338
P L1 WS Light              120652
P L1 WS FanCoil                18
P L1 WS FM                  15247
P_HeatPump                   3302
P L2 WS Emergency Light     29594
P L2 WS Light              297826
P L2 WS FanCoil                 0
P L2 WS FM                 267757
P_CabineLight_L3             3348
P L3 WS Emergency Light      1142
P L3 WS Light              169818
P L3 WS FanCoil               178
P L3 WS FM                 232942
dtype: int64

In [25]:
# Hourly mean of every column; NaN readings are skipped by mean(), so an hour
# is NaN only when all of its readings for that column are missing.
# NOTE(review): recent pandas releases deprecate the "H" alias in favour of
# "h" - switch once the minimum supported pandas version allows it.
df_avg = df.resample("H").mean()

In [26]:
# Split the hourly means into working days (Mon-Fri) and weekend (Sat-Sun).
# Explicit copies: both subsets are modified afterwards when their gaps are
# filled, and writing into a boolean-mask selection without .copy() is what
# leads to pandas' SettingWithCopyWarning / view-versus-copy ambiguity.
df_weekday_avg = df_avg[df_avg.index.weekday < 5].copy()
df_weekend_avg = df_avg[df_avg.index.weekday >= 5].copy()

In [27]:
# Same Mon-Fri / Sat-Sun split, applied to the full-resolution data
df_tmp_weekday = df.loc[df.index.dayofweek < 5]
df_tmp_weekend = df.loc[df.index.dayofweek >= 5]

In [28]:
# Mean profile per hour of day, computed separately for the working-day
# and the weekend samples (result index = hour of day).
df_weekday_avg_hour, df_weekend_avg_hour = (
    subset.groupby(subset.index.hour).mean()
    for subset in (df_tmp_weekday, df_tmp_weekend)
)

In [29]:
def fill_empty_value(df, df_tmp, loads=None):
    """Fill missing load values with the value for the same hour of day.

    The previous row-by-row version assigned through chained indexing
    (``df.loc[index][load] = ...``), i.e. into a temporary row object. With
    pandas Copy-on-Write that write never reaches ``df``, and without it the
    outcome depends on whether the row happens to be a view. This version
    writes with a single ``.loc[mask, column]`` assignment per column, which
    always updates ``df`` and avoids iterating over rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.hour`` (a DatetimeIndex). It is modified
        in place.
    df_tmp : pandas.DataFrame
        Replacement values, indexed by hour of day, containing the same load
        columns.
    loads : iterable of str, optional
        Columns to fill. Defaults to the notebook-level ``list_of_loads``, so
        existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-site convenience.

    Raises
    ------
    KeyError
        If an hour that needs filling is not present in ``df_tmp``.
    """
    if loads is None:
        loads = list_of_loads
    hours = df.index.hour
    for load in loads:
        missing = df[load].isna().to_numpy()
        if not missing.any():
            continue
        # Look up one replacement per missing row, aligned by position.
        df.loc[missing, load] = df_tmp.loc[hours[missing], load].to_numpy()
    return df

In [30]:
# Fill the remaining NaN hours of each subset from its matching per-hour profile
df_weekday_avg, df_weekend_avg = (
    fill_empty_value(df_weekday_avg, df_weekday_avg_hour),
    fill_empty_value(df_weekend_avg, df_weekend_avg_hour),
)

In [32]:
# Recombine the working-day and weekend subsets into one frame ordered by time
df_avg_hour = pd.concat([df_weekday_avg, df_weekend_avg]).sort_index()