# Prepare Train Data

In [1]:
from datetime import datetime, timezone, date
from datetime import timedelta
from pathlib import Path
import pandas as pd

In [2]:
# --- Consumption data locations -------------------------------------------
# All consumption inputs and the exported parquet files live in one directory.
CONSUMPTION_DATA_PATH = Path("../data/consumption")
# NOTE(review): the two NE*_Export.csv paths are not referenced by the cells
# visible in this notebook (consumption is downloaded from a URL) - confirm
# whether they are still needed.
CONSUMPTION_NE5_FILE = CONSUMPTION_DATA_PATH.joinpath("NE5_Export.csv")
CONSUMPTION_NE7_FILE = CONSUMPTION_DATA_PATH.joinpath("NE7_Export.csv")
# Export targets written by the final cell of the notebook.
TRAIN_DATA_PATH = CONSUMPTION_DATA_PATH / "final_train.parquet"
DATA_PATH_2022 = CONSUMPTION_DATA_PATH / "2022.parquet"

# --- Weather data ----------------------------------------------------------
METEO_PATH = Path("../data/meteoswiss") / "reh_nzz.csv"
# Value of the `abbr` column used to select the station's rows.
METEO_STATION = "REH"
# Column that is averaged per day into `Temperature`.
METEO_TEMP_PARAMETER = "tre200h0"

# --- Time handling ---------------------------------------------------------
# NOTE(review): TIME_FORMAT and WINDOW_DAYS are not used by the visible cells.
TIME_FORMAT = "dd.MM.yyyy"
WINDOW_DAYS = 1
# Training window: START_DATE is inclusive, END_DATE is exclusive.
START_DATE = date(year=2010, month=1, day=1)
END_DATE = date(year=2022, month=1, day=1)

## Load Energy Data

In [3]:
# Download the ewz electricity delivery export published by the city of Zurich.
consumption = pd.read_csv(
    'https://data.stadt-zuerich.ch/dataset/ewz_stromabgabe_netzebenen_stadt_zuerich/download/ewz_stromabgabe_netzebenen_stadt_zuerich.csv'
)

# Parse the timestamps as UTC and shift them to local time, so that the daily
# buckets below are cut at local midnight rather than at UTC midnight.
local_time = (
    pd.to_datetime(consumption['Timestamp'], errors='raise', utc=True)
    .dt.tz_convert('Europe/Berlin')
)

consumption = (
    consumption
    .assign(Date=local_time)
    .rename(columns={'Value_NE5': 'NE5Consumption', 'Value_NE7': 'NE7Consumption'})
)

# One row per local calendar day; only numeric columns are summed, so the raw
# timestamp columns do not appear in the result. The index is named 'Date'.
consumption = consumption.groupby(consumption['Date'].dt.date).sum(numeric_only=True)
consumption.tail()

Unnamed: 0_level_0,NE5Consumption,NE7Consumption
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-12-08,2618021.0,5651042.0
2022-12-09,2620804.0,5763255.0
2022-12-10,2206686.0,5216579.0
2022-12-11,2104165.0,4843438.0
2022-12-12,22443.87,42466.4


### Station REH

In [4]:
# Load the MeteoSwiss export and keep only the rows of the configured station.
# The explicit .copy() makes the filtered frame independent of the raw frame, so
# adding the 'Date' column below does not write into a slice
# (avoids pandas' SettingWithCopyWarning).
meteoData = pd.read_csv(METEO_PATH, encoding='iso-8859-1', sep=';')
meteoData = meteoData[meteoData.abbr == METEO_STATION].copy()

# The 'time' column is formatted as YYYYMMDDHHMM and is interpreted as UTC.
meteoData['Date'] = pd.to_datetime(meteoData['time'], format='%Y%m%d%H%M', utc=True)

# NOTE(review): the timezone conversion below is disabled, so temperatures are
# averaged per UTC calendar day, while the consumption cell groups by
# Europe/Berlin local calendar day. Enabling it would align the two daily
# buckets but changes the exported values - confirm which is intended.
#meteoData['Date'] = meteoData['Date'].dt.tz_convert('Europe/Berlin')

# Daily mean of the configured temperature column; the result is indexed by 'Date'
# and has a single 'Temperature' column.
meteoData = meteoData.groupby(meteoData.Date.dt.date).agg(Temperature = (METEO_TEMP_PARAMETER, 'mean'))

meteoData.tail()

Unnamed: 0_level_0,Temperature
Date,Unnamed: 1_level_1
2022-10-27,12.5
2022-10-28,12.583333
2022-10-29,13.066667
2022-10-30,11.270833
2022-10-31,10.4


### Station SMA
Data for station REH (Zürich-Affoltern) is not openly available. If you don't have it, you can use station SMA (Zürich-Fluntern) instead.

In [None]:
"""
meteoData = pd.concat([
    pd.read_csv('https://data.geo.admin.ch/ch.meteoschweiz.klima/nbcn-tageswerte/nbcn-daily_SMA_previous.csv', encoding='utf-8', sep=';'),
    pd.read_csv('https://data.geo.admin.ch/ch.meteoschweiz.klima/nbcn-tageswerte/nbcn-daily_SMA_current.csv', encoding='utf-8', sep=';')
])
meteoData['Date'] = pd.to_datetime(meteoData['date'], format='%Y%m%d')
meteoData['Date'] = meteoData['Date'].dt.date
meteoData = meteoData[meteoData.Date >= date(2009, 1, 1)]
meteoData.rename(columns={'tre200d0': 'Temperature'}, inplace=True)
meteoData = meteoData[['Date', 'Temperature']]
meteoData = meteoData.set_index('Date')

meteoData.tail()
"""

## Export

In [5]:
# Attach the daily temperatures to the daily consumption by matching on the
# 'Date' index (left join: every consumption day is kept), then turn the
# index back into a regular 'Date' column.
data = (
    consumption
    .join(meteoData, on="Date")
    .reset_index()
)

In [6]:
# Training set: all days from START_DATE (inclusive) up to END_DATE (exclusive).
is_train_day = (data.Date >= START_DATE) & (data.Date < END_DATE)
data.loc[is_train_day].to_parquet(TRAIN_DATA_PATH, engine='pyarrow')

# 2022 set: days from 2021-12-26 (inclusive) up to 2022-10-31 (exclusive).
is_2022_day = (data.Date >= date(2021, 12, 26)) & (data.Date < date(2022, 10, 31))
data.loc[is_2022_day].to_parquet(DATA_PATH_2022, engine='pyarrow')