In [1]:

import pandas as pd
import numpy as np
import os, sys
import plotly.express as px
import h5py
from sklearn.ensemble import HistGradientBoostingRegressor
from scipy import stats
# pvlib imports
import pvlib
from pvlib.pvsystem import PVSystem
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS

import zipfile

In [2]:

# Set working directory
os.chdir(r"..") # should be the git repo root directory
print("Current working directory: " + os.getcwd())
repo_name = 'net-load-forecasting'
assert os.getcwd()[-len(repo_name):] == "net-load-forecasting", "Working directory is not the git repo root directory"

Current working directory: c:\Users\nik\Desktop\Berkeley_Projects\net-load-forecasting


In [3]:
save_path = os.path.join(os.getcwd(),'data','clean_data')
if not os.path.exists(save_path):
    os.makedirs(save_path)
    print("Created directory: " + save_path)

In [4]:
# Import custom functions
from utils.utils import *

# Data Imports & Cleaning for the Project 

## &#x2460; Energy Community Load Data - Germany

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5642902.svg)](https://doi.org/10.5281/zenodo.5642902) 

Note: Due to their size we have not included the datasets in the repo, but the above link will allow you to download them, np.

We have downloaded the load data for [2018,2019,2020] in [1 minute*, 15 minutes] resolution:

* 2018_data_1min.zip, 2019_data_1min.zip, 2020_data_1min.zip
* 2018_data_15min.zip, 2019_data_15min.zip, 2020_data_15min.zip,

The goal here is to import them, select the useful data, impute missing data where plausible and aggregate to one community for both temporal resolutions

In [20]:
#h5py extraction
resolutions = [
                #'1min',
                '15min']

df_load_per_resolution = {}
store = pd.HDFStore(os.path.join(save_path, "df_load_per_resolution.h5"))

for resolution in resolutions:

    dfs_load = []
    for year in [2018]:

        with zipfile.ZipFile(f"data/raw_data/load/{year}_data_{resolution}.zip") as zip_file:

            hdf5_file = zip_file.open(f"{year}_data_{resolution}.hdf5")

            f = h5py.File(hdf5_file)
            group_no_pv = f["NO_PV"] #Only regard those profiles that are not mixed with PV generation
            dfs = {}
            community_members = [ # these are the households with reliable data for the considered duration
                        'SFH3', 'SFH4', 'SFH5', 'SFH9', 'SFH10',
                        'SFH12', 'SFH16','SFH18','SFH19', 'SFH21',
                        'SFH22', 'SFH23', 'SFH27', 'SFH28', 'SFH29',
                        'SFH30', 'SFH31','SFH32', 'SFH36', 'SFH38'
                        ]
            
            for member in community_members:
                table = f["NO_PV"][member]["HOUSEHOLD"]["table"][:]
                df = pd.DataFrame(table).dropna().set_index("index")[["P_TOT"]]
                df.index = pd.to_datetime(df.index, unit = "s")
                dfs[member] = df
                break

            df_load = pd.concat(dfs, axis=1).sum(axis=1).to_frame('total_load')
            dfs_load.append(df_load)

    df_load_total = pd.concat(dfs_load, axis=0)
    df_load_per_resolution[resolution] = df_load_total
    store.put(f'{resolution}/data', df_load_total, format='table')
store.close()   

In [21]:
df_load_per_resolution['15min']

Unnamed: 0_level_0,total_load
index,Unnamed: 1_level_1
2018-05-02 14:30:00,0.000000
2018-05-02 14:45:00,0.000000
2018-05-02 15:00:00,0.000000
2018-05-02 15:15:00,0.000000
2018-05-02 15:30:00,0.000000
...,...
2018-12-31 22:45:00,237.117112
2018-12-31 23:00:00,231.250000
2018-12-31 23:15:00,231.250000
2018-12-31 23:30:00,231.250000


In [22]:
pd.read_hdf(os.path.join(save_path, "df_load_per_resolution.h5"), key='15min')

ValueError: The file 'c:\Users\nik\Desktop\Berkeley_Projects\net-load-forecasting\data\clean_data\df_load_per_resolution.h5' is already opened, but not in read-only mode (as requested).

## &#x2461; PV Power Data - Netherlands

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6906504.svg)](https://doi.org/10.5281/zenodo.6906504)

We have downloaded the 'filtered_pv_power_measurements_ac.csv' file

Note: While meta data is available for download, exact locations of individual PV systems are not included. Since one of the key research questions of this project is to investigate the impact of using exact locations in modeling, we recieved special permission to use the longitude and latitude of PV systems. Unfortunately we cannot share these here. The rest of the code is executable with the boundary box locations provided in the 'metadata.csv', however. 

Note: Due to their size we have not included the datasets in the repo, but the above link will allow you to download them, np.

In [11]:
df_pv = pd.read_csv(os.path.join(os.getcwd(), 'data', 'raw_data', 'pv', 'filtered_pv_power_measurements_ac.csv'), index_col=0, parse_dates=True)

## &#x2462; Irradiance Data - Netherlands



# Data Merging & Saving 