In [1]:
import os
from datetime import datetime

import holidays
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [5]:
# Root directory for the project's data files, and the sub-directory that
# receives the raw heating-degree-day CSV downloads.
data_path = "../data"
folder_path = f"{data_path}/hdd"

In [2]:
# Download the latest heating degree day CSV files from the DWD open-data
# server. The URL is an HTTPS directory listing; every link ending in ".csv"
# is fetched and stored under `folder_path`.
url = "https://opendata.dwd.de/climate_environment/CDC/derived_germany/techn/daily/heating_degreedays/hdd_3807/recent/"
# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the cell forever.
request_timeout = 30
response = requests.get(url, timeout=request_timeout)

# Check if the request was successful
if response.status_code == 200:
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    os.makedirs(folder_path, exist_ok=True)

    # Iterate through the links in the HTML
    for link in tqdm.tqdm(soup.find_all("a")):
        file_name = link.get("href")
        # Anchors without an href yield None; skip those and any non-CSV link.
        if not file_name or not file_name.endswith(".csv"):
            continue

        file_response = requests.get(url + file_name, timeout=request_timeout)
        if file_response.status_code != 200:
            # Never store an error page under a .csv name; report and move on.
            print(f"Error: {file_response.status_code} ({file_name})")
            continue

        # Keep only the last path component so a link can never place the
        # file outside of `folder_path`.
        local_path = os.path.join(folder_path, os.path.basename(file_name))
        with open(local_path, "wb") as output_file:
            output_file.write(file_response.content)
else:
    print(f"Error: {response.status_code}")

100%|██████████| 78/78 [00:07<00:00, 10.64it/s]


In [6]:
# Combine every downloaded monthly CSV into one table with a per-column mean
# for each file, then write it to `average_hdd_data.csv`.

# List all files in the folder (sorted so the processing order is deterministic)
files = sorted(os.listdir(folder_path))

# Initialize an empty list to store the results
results = []
# Iterate through the files and read the CSVs
for file in files:
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(folder_path, file)
    # A dedicated name is used here so the global `df` of other cells is not clobbered.
    hdd_month = pd.read_csv(file_path, sep=";", skiprows=3)

    # The year and month are sliced from the file name, which is expected to
    # end in "YYYYMM.csv".
    year = int(file[-10:-6])
    month = int(file[-6:-4])

    # Mean of every column from the fifth onward, taken over all rows.
    # NOTE(review): this treats each of those columns as one calendar day of
    # the month - confirm against the DWD file layout.
    daily_average = hdd_month.iloc[:, 4:].mean()
    days_in_month = len(daily_average)
    dates = pd.date_range(datetime(year, month, 1), periods=days_in_month)

    df_result = pd.DataFrame({"Date": dates, "Average_HDD": daily_average})
    results.append(df_result)

# pd.concat fails with an opaque message on an empty list, so report the real cause.
if not results:
    raise ValueError(
        f"No CSV files found in {folder_path!r}; run the download cell first."
    )

# Concatenate the results into a single DataFrame
results_df = pd.concat(results, ignore_index=True)
results_df = results_df.sort_values("Date").reset_index(drop=True)
results_df.to_csv(f"{data_path}/average_hdd_data.csv", index=False)

In [7]:
# Holiday calendars already built, keyed by year, so that applying the function
# to a whole column builds each year's calendar once instead of once per row.
_DE_HOLIDAYS_BY_YEAR = {}


def is_weekend_or_holiday(date_str):
    """Return 1 if the date is a Saturday, Sunday or German holiday, else 0.

    Args:
        date_str: Anything `pd.to_datetime` can turn into a single timestamp,
            e.g. an ISO date string.

    Returns:
        int: 1 for a weekend day or a day contained in `holidays.Germany` for
        that year, otherwise 0.
    """
    date_obj = pd.to_datetime(date_str)
    # weekday() counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    if date_obj.weekday() >= 5:
        return 1

    year = date_obj.year
    if year not in _DE_HOLIDAYS_BY_YEAR:
        _DE_HOLIDAYS_BY_YEAR[year] = holidays.Germany(years=year)
    return 1 if date_obj in _DE_HOLIDAYS_BY_YEAR[year] else 0


# Load the daily NGD table, flag each row's "Date" as weekend/holiday (1) or
# not (0), and write the table back to the same CSV file.
main_data_source = f"{data_path}/daily_NGD.csv"

df = pd.read_csv(main_data_source)
df = df.assign(Is_Weekend_or_Holiday=df["Date"].map(is_weekend_or_holiday))
df.to_csv(main_data_source, index=False)