In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format = 'retina'

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Set plot style: STIX fonts for both regular text and math text, 12 pt base size.
# The inline figure format ('retina') is already configured in the first cell,
# so it is not repeated here.
plt.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'font.size': 12,
})

# Load cleaned data
Data is preprocessed in the [data preprocessing](./create_dataset.ipynb) notebook. This includes concatenating the data, removing outliers, missing values, and irrelevant columns, and generating relevant features from the given data.

In [None]:
# Load the preprocessed data set (path is relative to the notebook's working directory).
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
data_per_day = pd.read_pickle('../data/processed/data.pickle')

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
data_per_day.info()

In [None]:
# Parse the date-like columns (stored as object dtype) into datetime values,
# converting all of them column-wise in a single assignment.
date_cols = [
    'Zeitstempel',
    'Sicherheitsbestand wird erreicht am',
    'Meldebestand wird erreicht am',
]
data_per_day[date_cols] = data_per_day[date_cols].apply(pd.to_datetime)

In [None]:
# Preview the first rows of the loaded data.
data_per_day.head()

# Build target variable "Verbrauch" per day
The target is the oil consumption per day, i.e. the difference in "Füllstand" between the current day and the next day. For the implementation, see the [data preprocessing](./create_dataset.ipynb) notebook: the "Füllstand" difference between the previous day and the current day is calculated and then shifted one day into the past, since this difference is the consumption of the previous day.

# Concat matching historical weather data
We implemented a wrapper that retrieves historical and forecast weather data per day from an open-source API; see the [weather data](../src/api/weather.py) module. The weather data can then easily be joined to the data as an external feature.

In [None]:
# Preview the first 10 rows of the data.
data_per_day.head(10)

In [None]:
from src.api import WeatherAPI

weather_api = WeatherAPI()

# Make sure the merge key is a datetime column. This does not depend on the
# tank, so it is done once before the loop.
if data_per_day["Zeitstempel"].dtypes == 'object':
    data_per_day["Zeitstempel"] = pd.to_datetime(data_per_day["Zeitstempel"])

# Fetch the weather for every tank separately (each tank has its own location
# and date range) and collect the results, tagged with the tank ID.
weather_frames = []
for tank_id in data_per_day["Tank-ID"].unique():
    tank_rows = data_per_day[data_per_day["Tank-ID"] == tank_id]

    # get attributes
    # "Breitengrad" is German for latitude, "Längengrad" for longitude.
    latitude = tank_rows["Breitengrad"].iloc[0]
    longitude = tank_rows["Längengrad"].iloc[0]
    start_date = tank_rows["Zeitstempel"].min().strftime("%Y-%m-%d")
    end_date = tank_rows["Zeitstempel"].max().strftime("%Y-%m-%d")
    print("Tank ID", tank_id, "Start date:", start_date, "End date:", end_date)

    # get matching weather data
    weather_data = weather_api.get_data(latitude, longitude, start_date, end_date)

    # remove timezone information so 'date' is comparable with "Zeitstempel"
    weather_data['date'] = weather_data['date'].dt.tz_localize(None)

    print("Weather data for tank ID", tank_id, "shape:", weather_data.shape)
    print("Data per day for tank ID", tank_id, "shape:", tank_rows.shape)

    # `assign` returns a copy, so `weather_data` itself stays untouched.
    weather_frames.append(weather_data.assign(**{"Tank-ID": tank_id}))

# join data: one merge on (tank, day), so each row only receives the weather of
# its own tank and no duplicated "_x"/"_y" weather columns are created.
# NOTE: re-running this cell merges the weather columns a second time; reload
# `data_per_day` first if the cell has to be executed again.
if weather_frames:
    weather_all = pd.concat(weather_frames, ignore_index=True)
    data_per_day = data_per_day.merge(
        weather_all,
        left_on=["Tank-ID", "Zeitstempel"],
        right_on=["Tank-ID", "date"],
        how='left',
    )
print("Merged data shape:", data_per_day.shape)

In [None]:
# Summarize `weather_data` as left behind by the previous cell
# (the frame fetched in the last iteration of its loop).
weather_data.info()

# Prepare Train and Test Data
For forecasting, the following steps are necessary:
1. Split data into train and test data
2. Normalize data
3. (Create sequences of data)
4. Split data into X and y
5. Save data

In [None]:
# Columns that are removed before modelling ("Füllstand" is currently kept).
cols_to_drop = [
    "Sicherheitsbestand wird erreicht am",
    "Meldebestand wird erreicht am",
]  # "Füllstand"

# Drop those columns and exclude tank 5 in one chain.
# The tank-5 filter is temporary: it is meant to be removed later on, since it
# is included in the data cleaning process.
df = (
    data_per_day
    .drop(columns=cols_to_drop)
    .loc[lambda frame: frame['Tank-ID'] != 5]
)

In [None]:
# Index the rows by their timestamp ("Zeitstempel" stops being a regular column).
df = df.set_index(keys='Zeitstempel')

In [None]:
# Draw one "Füllstand" line per tank on a shared axis, plotted against the index.
fig, ax = plt.subplots(figsize=(12, 6))
for tank_id in df["Tank-ID"].unique():
    fill_level = df.loc[df["Tank-ID"] == tank_id, "Füllstand"]
    fill_level.plot(ax=ax, label=f'Tank {tank_id}')
ax.set_xlabel('Date')
ax.set_ylabel('Füllstand')
ax.set_title('"Füllstand" over time')
ax.legend()
plt.show()

In [None]:
# Hold out tank 2 completely as the test set; every other tank is training data.
is_test_tank = df['Tank-ID'] == 2
train_data, test_data = df[~is_test_tank], df[is_test_tank]

# Separate the target column "Verbrauch" (y) from the remaining columns (X).
X_train = train_data.drop(columns='Verbrauch').values
X_test = test_data.drop(columns='Verbrauch').values
y_train = train_data['Verbrauch'].values
y_test = test_data['Verbrauch'].values

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics enter the fit.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Report the shape of each prepared array.
for array_name, array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{array_name} shape:", array.shape)

# Feature Selection
* via correlation
* via feature importance


## Option 1: ARIMA model


## Option 2: Prophet model


## Option 3: Simple ML model
