In [7]:

import pandas as pd
import numpy as np
import os, sys
import plotly.express as px
import h5py
from sklearn.ensemble import HistGradientBoostingRegressor
from scipy import stats
# pvlib imports
import pvlib
from pvlib.pvsystem import PVSystem
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS

In [8]:

# Set working directory to the git repo root.
# The move is guarded so that re-running this cell (or starting the kernel
# directly in the repo root) does not climb one level too far.
repo_name = 'net-load-forecasting'
if not os.getcwd().endswith(repo_name):
    os.chdir(r"..") # should be the git repo root directory
print("Current working directory: " + os.getcwd())
# Same suffix check as before, but driven by `repo_name` instead of a duplicated literal.
assert os.getcwd().endswith(repo_name), "Working directory is not the git repo root directory"

Current working directory: c:\Users\nik\Desktop\Berkeley_Projects\net-load-forecasting


In [9]:
# Output folder for the cleaned datasets, resolved relative to the current working directory.
save_path = os.path.join(os.getcwd(), 'data', 'clean_data')

# Create the folder only when nothing exists at that path yet, and report the creation.
directory_missing = not os.path.exists(save_path)
if directory_missing:
    os.makedirs(save_path)
    print("Created directory: " + save_path)

In [10]:
# Import custom functions
from utils.utils import *

# Data Imports & Cleaning for the Project 

## &#x2460; Energy Community Load Data - Germany

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5642902.svg)](https://doi.org/10.5281/zenodo.5642902) 

Note: Due to their size, we have not included the datasets in the repo, but the link above allows you to download them.

We have downloaded the load data for [2018,2019,2020] in [1 minute*, 15 minutes] resolution:

* 2018_data_1min.zip, 2019_data_1min.zip, 2020_data_1min.zip
* 2018_data_15min.zip, 2019_data_15min.zip, 2020_data_15min.zip,

The goal here is to import them, select the useful data, impute missing data where plausible, and aggregate to one community for both temporal resolutions.

In [None]:
# h5py extraction: build one aggregated community load series per temporal resolution.
# Result: df_load_per_resolution maps each resolution label to a single-column
# ("demand") DataFrame holding the row-wise sum over the selected households.
resolutions = ['1min', '15min']
years = [2018, 2019, 2020]

# There are many households in the neighborhood, but only the following have been determined as valid data
# (loop-invariant, so defined once up front)
columns_neighborhood = ['SFH3', 'SFH4', 'SFH5', 'SFH9', 'SFH10', 'SFH12', 'SFH16','SFH18','SFH19', 'SFH21',
                        'SFH22', 'SFH23', 'SFH27', 'SFH28', 'SFH29', 'SFH30', 'SFH31',
                        'SFH32', 'SFH36', 'SFH38']

df_load_per_resolution = {}

for resolution in resolutions:

    dfs_load = []
    for year in years:
        # Load
        filename = os.path.join("data","raw_data","households_GER","power",f"{year}_data_{resolution}.hdf5")
        dfs = {}

        # Open explicitly read-only and inside a context manager so the HDF5 handle
        # is released after each file instead of staying open for the kernel's lifetime.
        with h5py.File(filename, "r") as f:
            group_no_pv = f["NO_PV"] #Only regard those profiles that are not mixed with PV generation

            for key in group_no_pv.keys():
                # [:] reads the whole table into memory, so the data stays usable after the file is closed
                table = group_no_pv[key]["HOUSEHOLD"]["table"][:]
                # Drop rows containing NaN, index by the "index" field and keep only P_TOT
                df = pd.DataFrame(table).dropna().set_index("index")[["P_TOT"]]
                # The index values are interpreted as epoch seconds
                df.index = pd.to_datetime(df.index, unit = "s")
                dfs[key] = df

        # One column per household, named after its HDF5 group key
        df_load = pd.concat(list(dfs.values()), axis=1)
        df_load.columns = list(dfs.keys())
        dfs_load.append(df_load)


    # Stack the yearly frames on top of each other along the time axis
    df_load_final = pd.concat(dfs_load, axis = 0)

    # for later use in the notebook
    df_load_energy_community = df_load_final[columns_neighborhood].sum(axis=1).to_frame("demand") / 1e3 # convert to kW (presumably P_TOT is in W - TODO confirm)
    df_load_per_resolution[resolution] = df_load_energy_community

## &#x2461; PV Power Data - Netherlands

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6906504.svg)](https://doi.org/10.5281/zenodo.6906504)

We have downloaded the 'filtered_pv_power_measurements_ac.csv' file

Note: While metadata is available for download, the exact locations of individual PV systems are not included. Since one of the key research questions of this project is to investigate the impact of using exact locations in modeling, we received special permission to use the longitude and latitude of the PV systems. Unfortunately, we cannot share these here. However, the rest of the code is executable with the bounding-box locations provided in 'metadata.csv'.

Note: Due to their size, we have not included the datasets in the repo, but the link above allows you to download them.

In [11]:
# Path to the raw AC power measurements, resolved relative to the current working directory
pv_csv_path = os.path.join(os.getcwd(), 'data', 'raw_data', 'pv', 'filtered_pv_power_measurements_ac.csv')
# The first CSV column becomes the index and is parsed as datetimes
df_pv = pd.read_csv(pv_csv_path, index_col=0, parse_dates=True)

In [12]:
# Inspect the loaded PV measurements: a bare last expression is rendered by the notebook
# (the stored output below is the truncated view, with "..." marking elided rows/columns)
df_pv

Unnamed: 0_level_0,ID001,ID002,ID003,ID004,ID005,ID006,ID007,ID008,ID009,ID010,...,ID166,ID167,ID168,ID169,ID170,ID171,ID172,ID173,ID174,ID175
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01 00:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
2014-01-01 00:01:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
2014-01-01 00:02:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
2014-01-01 00:03:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
2014-01-01 00:04:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-12-31 23:54:00+00:00,,,0.0,0.0,0.0,,0.0,,0.0,0.0,...,,,0.0,,0.0,,,0.0,,
2017-12-31 23:55:00+00:00,,,0.0,0.0,0.0,,0.0,,0.0,0.0,...,,,0.0,,0.0,,,0.0,,
2017-12-31 23:56:00+00:00,,,0.0,0.0,0.0,,0.0,,0.0,0.0,...,,,0.0,,0.0,,,0.0,,
2017-12-31 23:57:00+00:00,,,0.0,0.0,0.0,,0.0,,0.0,0.0,...,,,0.0,,0.0,,,0.0,,


## &#x2462; Irradiance Data - Netherlands

We have included the irradiance data 'ghi_dni_dhi.csv' for the general coordinates for Utrecht (lat = 52.092876, lon = 5.104480), obtained from https://solcast.com/

This is the data that is used throughout the project.

# Data Merging & Saving 