# Dataset Construction: Energy Demand and Supply In Spain

### Demand Load & Forecast Cleaning

Steps taken to prepare the energy demand and energy demand forecast into one dataframe.

In [130]:
import pandas as pd
import numpy as np

In [2]:
#inspect the files to be merged and cleaned
!ls -l ./raw/demand_load_forecast

total 4848
-rw-r--r--@ 1 ns  staff  473123 18 Aug 20:03 Total Load - Day Ahead _ Actual_2015.csv
-rw-r--r--@ 1 ns  staff  474463 18 Aug 20:03 Total Load - Day Ahead _ Actual_2016.csv
-rw-r--r--@ 1 ns  staff  473173 18 Aug 20:03 Total Load - Day Ahead _ Actual_2017.csv
-rw-r--r--@ 1 ns  staff  473173 18 Aug 20:00 Total Load - Day Ahead _ Actual_2018.csv
-rw-r--r--@ 1 ns  staff  449407 29 Aug 17:59 Total Load - Day Ahead _ Actual_2019.csv


In [178]:
#load the files
path = './raw/demand_load_forecast/'
files = ['Total Load - Day Ahead _ Actual_2015.csv',
            'Total Load - Day Ahead _ Actual_2016.csv',
            'Total Load - Day Ahead _ Actual_2017.csv',
            'Total Load - Day Ahead _ Actual_2018.csv',
            'Total Load - Day Ahead _ Actual_2019.csv']

dataset = [pd.read_csv(path+file) for file in files]
data = dataset.copy()

In [179]:
#inspect the first list pandas element 
data[0].head(3)

Unnamed: 0,Time (CET),Day-ahead Total Load Forecast [MW] - BZN|ES,Actual Total Load [MW] - BZN|ES
0,01.01.2015 00:00 - 01.01.2015 01:00,26118.0,25385.0
1,01.01.2015 01:00 - 01.01.2015 02:00,24934.0,24382.0
2,01.01.2015 02:00 - 01.01.2015 03:00,23515.0,22734.0


In [180]:
def format_load_forecast_data(data):
    '''
    Input: A dataframe of Day Ahead Total Load, and Actual Load obtained from csv data obtained from the entsoe Transparency Platform.
    
    Descrption:
    Input is a 3 column dataframe consisting of text time stamps with hourly frequency. 
    - Function formats the string in order to be formatted into a datetime.
    - Appends a datetime index and drops the time strings
    
    Output: A 2 column dataframe with a DatetimeIndex
    
    '''
    
    #set column names to something simple
    data.columns = ['time', 'day_forecast',
       'actual_load']

    #set the time to the first element in the time string. 
    #So 01.01.2018 00:00 - 01.01.2018 01:00 becomes 01.01.2018 00:00
    data['time'] = data['time'].str.split('-').apply(lambda x: x[0]).str.strip()
     
    #set the time strings to datetime obejects and set index as date time
    datetimes = pd.to_datetime(data['time'], format='%d-%m-%Y %H%M', errors='ignore')
    data_ = data.set_index(pd.DatetimeIndex(datetimes))
    
    #remove extra time column with original string objects
    data_time = data_[['day_forecast', 'actual_load']]
    
    return data_time


#get the numberof input files processed
years = range(len(files))

#create a dictionary of formatted pandas dataframes where key is each year
data_formatted = {year: format_load_forecast_data(data) for year,data in zip(years, data)}

In [181]:
def combine_annual_data(dictionary):
    """
    Input: a dictionary of dataframes.
    
    Output: a single dataframe
    """
    
    all_data_list = []
    
    for key in dictionary.keys():
        all_data_list.append(dictionary[key])
        
    data_all_years = pd.concat(all_data_list)
    
    return data_all_years

data = combine_annual_data(data_formatted)

In [21]:
data.head()

Unnamed: 0_level_0,day_forecast,actual_load
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01 00:00:00,26118,25385
2015-01-01 01:00:00,24934,24382
2015-01-01 02:00:00,23515,22734
2015-01-01 03:00:00,22642,21286
2015-01-01 04:00:00,21785,20264


In [182]:
#export the data to the processed file
actual_demand_and_forecast = data['2015':'2018'].copy()

save_path = './processed/'
actual_demand_and_forecast.to_csv(save_path + 'actual-demand-forecast-demand-2015-2018.csv')

In [196]:
!ls -l ./processed

total 58880
-rw-r--r--  1 ns  staff   1262170  8 Oct 18:11 actual-demand-forecast-demand-2015-2018.csv
-rw-r--r--  1 ns  staff   4599191  8 Oct 18:12 generation-production-type-2015-2018.csv
-rw-r--r--  1 ns  staff   1164005  8 Oct 18:10 prices_2015_2018.csv
-rw-r--r--  1 ns  staff   1583537  7 Oct 18:16 solar_wind_forecasts.csv
-rw-r--r--  1 ns  staff  21046004  7 Oct 18:49 weather_2015_2019.csv


### Generation Source Cleaning



In [26]:
#inspect the files to be merged and cleaned
!ls -l ./raw/generation

total 18080
-rw-r--r--@ 1 ns  staff  1456154  7 Oct 16:26 Actual Generation per Production Type_2015-2016.csv
-rw-r--r--@ 1 ns  staff  1458242  7 Oct 15:28 Actual Generation per Production Type_2016-2017.csv
-rw-r--r--@ 1 ns  staff  1458263  7 Oct 15:28 Actual Generation per Production Type_2017-2018.csv
-rw-r--r--@ 1 ns  staff  1459284  7 Oct 15:27 Actual Generation per Production Type_2018-2019.csv
-rw-r--r--@ 1 ns  staff  1388375  7 Oct 15:27 Actual Generation per Production Type_2019-2020.csv


In [184]:
#load the files for generation
path = './raw/generation/'
files = ['Actual Generation per Production Type_2015-2016.csv',
         'Actual Generation per Production Type_2016-2017.csv',
        'Actual Generation per Production Type_2017-2018.csv',
            'Actual Generation per Production Type_2018-2019.csv',
            'Actual Generation per Production Type_2019-2020.csv']

generation_dataset = [pd.read_csv(path+file) for file in files]
data_gen = generation_dataset.copy()

In [185]:
data_gen[0].head(3)

Unnamed: 0,Area,MTU,Biomass - Actual Aggregated [MW],Fossil Brown coal/Lignite - Actual Aggregated [MW],Fossil Coal-derived gas - Actual Aggregated [MW],Fossil Gas - Actual Aggregated [MW],Fossil Hard coal - Actual Aggregated [MW],Fossil Oil - Actual Aggregated [MW],Fossil Oil shale - Actual Aggregated [MW],Fossil Peat - Actual Aggregated [MW],...,Hydro Run-of-river and poundage - Actual Aggregated [MW],Hydro Water Reservoir - Actual Aggregated [MW],Marine - Actual Aggregated [MW],Nuclear - Actual Aggregated [MW],Other - Actual Aggregated [MW],Other renewable - Actual Aggregated [MW],Solar - Actual Aggregated [MW],Waste - Actual Aggregated [MW],Wind Offshore - Actual Aggregated [MW],Wind Onshore - Actual Aggregated [MW]
0,BZN|ES,01.01.2015 00:00 - 01.01.2015 01:00 (CET),447.0,329.0,0.0,4844.0,4821.0,162.0,0.0,0.0,...,1051.0,1899.0,0.0,7096.0,43.0,73.0,49.0,196.0,0.0,6378.0
1,BZN|ES,01.01.2015 01:00 - 01.01.2015 02:00 (CET),449.0,328.0,0.0,5196.0,4755.0,158.0,0.0,0.0,...,1009.0,1658.0,0.0,7096.0,43.0,71.0,50.0,195.0,0.0,5890.0
2,BZN|ES,01.01.2015 02:00 - 01.01.2015 03:00 (CET),448.0,323.0,0.0,4857.0,4581.0,157.0,0.0,0.0,...,973.0,1371.0,0.0,7099.0,43.0,73.0,50.0,196.0,0.0,5461.0


In [186]:
data_gen[0].columns.str.lower()

Index(['area', 'mtu', 'biomass  - actual aggregated [mw]',
       'fossil brown coal/lignite  - actual aggregated [mw]',
       'fossil coal-derived gas  - actual aggregated [mw]',
       'fossil gas  - actual aggregated [mw]',
       'fossil hard coal  - actual aggregated [mw]',
       'fossil oil  - actual aggregated [mw]',
       'fossil oil shale  - actual aggregated [mw]',
       'fossil peat  - actual aggregated [mw]',
       'geothermal  - actual aggregated [mw]',
       'hydro pumped storage  - actual aggregated [mw]',
       'hydro pumped storage  - actual consumption [mw]',
       'hydro run-of-river and poundage  - actual aggregated [mw]',
       'hydro water reservoir  - actual aggregated [mw]',
       'marine  - actual aggregated [mw]', 'nuclear  - actual aggregated [mw]',
       'other  - actual aggregated [mw]',
       'other renewable  - actual aggregated [mw]',
       'solar  - actual aggregated [mw]', 'waste  - actual aggregated [mw]',
       'wind offshore  - actual 

In [187]:
data = data_gen.copy()

def format_generation_data(data):
    #set column names to something simple
    data.columns = ['area', 
                    'time', 
                    'biomass', 
                    'fossil brown coal/lignite', 
                    'fossil coal-derived gas',
                    'fossil gas',
                    'fossil hard coal',
                    'fossil oil',
                    'fossil oil shale',
                    'fossil peat',
                    'geothermal',
                    'hydro pumped storage',
                    'hydro pumped storage',
                    'hydro run-of-river and poundage',
                    'hydro water reservoir',
                    'marine', 
                    'nuclear',
                    'other',
                    'other renewable',
                    'solar', 
                    'waste',
                    'wind offshore',
                    'wind onshore']

    #set the time to the first element in the time string. 
    #So 01.01.2018 00:00 - 01.01.2018 01:00 becomes 01.01.2018 00:00
    data['time'] = data['time'].str.split('-').apply(lambda x: x[0]).str.strip()

    #set the time strings to datetime obejects and set index as date time
    datetimes = pd.to_datetime(data['time'], format='%d-%m-%Y %H%M', errors='ignore')
    data_ = data.set_index(pd.DatetimeIndex(datetimes))

    #remove area column and time string columns
    data_.drop(['area', 'time'], axis=1, inplace=True)

    return data_


#get the numberof input files processed
years = range(len(files))

#create a dictionary of formatted pandas dataframes where key is each year
data_generation = {year: format_generation_data(data) for year,data in zip(years, data_gen)}

In [189]:
#check the output
data_generation_all = combine_annual_data(data_generation)

data_generation_all.head(3)

Unnamed: 0_level_0,biomass,fossil brown coal/lignite,fossil coal-derived gas,fossil gas,fossil hard coal,fossil oil,fossil oil shale,fossil peat,geothermal,hydro pumped storage,...,hydro run-of-river and poundage,hydro water reservoir,marine,nuclear,other,other renewable,solar,waste,wind offshore,wind onshore
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,447,329,0,4844,4821,162,0,0,0,,...,1051,1899,0,7096,43,73,49,196,0,6378
2015-01-01 01:00:00,449,328,0,5196,4755,158,0,0,0,,...,1009,1658,0,7096,43,71,50,195,0,5890
2015-01-01 02:00:00,448,323,0,4857,4581,157,0,0,0,,...,973,1371,0,7099,43,73,50,196,0,5461


In [190]:
data_generation_all['2015':'2018'].to_csv(save_path + 'generation-production-type-2015-2018.csv')

In [194]:
!ls -l ./processed

total 61848
-rw-r--r--  1 ns  staff   1262170  8 Oct 18:11 actual-demand-forecast-demand-2015-2018.csv
-rw-r--r--  1 ns  staff   1518679  7 Oct 16:18 actual-demand-forecast-demand-2015-2019.csv
-rw-r--r--  1 ns  staff   4599191  8 Oct 18:12 generation-production-type-2015-2018.csv
-rw-r--r--  1 ns  staff   1164005  8 Oct 18:10 prices_2015_2018.csv
-rw-r--r--  1 ns  staff   1583537  7 Oct 18:16 solar_wind_forecasts.csv
-rw-r--r--  1 ns  staff  21046004  7 Oct 18:49 weather_2015_2019.csv


### Solar and Wind Generation Forecasts

In [68]:
!ls -l ./raw/generation_forecasts_wind_solar

total 8008
-rw-r--r--@ 1 ns  staff  816253  7 Oct 15:47 Generation Forecasts for Wind and Solar_201501010000-201601010000.csv
-rw-r--r--@ 1 ns  staff  818240  7 Oct 15:48 Generation Forecasts for Wind and Solar_201601010000-201701010000.csv
-rw-r--r--@ 1 ns  staff  816705  7 Oct 15:48 Generation Forecasts for Wind and Solar_201701010000-201801010000.csv
-rw-r--r--@ 1 ns  staff  834281  7 Oct 15:50 Generation Forecasts for Wind and Solar_201801010000-201901010000.csv
-rw-r--r--@ 1 ns  staff  804446  7 Oct 18:00 Generation Forecasts for Wind and Solar_201901010000-202001010000.csv


In [197]:
#load the files for generation
path = './raw/generation_forecasts_wind_solar/'
files = ['Generation Forecasts for Wind and Solar_201501010000-201601010000.csv',
         'Generation Forecasts for Wind and Solar_201601010000-201701010000.csv',
        'Generation Forecasts for Wind and Solar_201701010000-201801010000.csv',
            'Generation Forecasts for Wind and Solar_201801010000-201901010000.csv',
            'Generation Forecasts for Wind and Solar_201901010000-202001010000.csv']

wind_solar_forecast_dataset = [pd.read_csv(path+file) for file in files]
data_gen_forecasts = wind_solar_forecast_dataset.copy()

In [198]:
data_gen_forecasts[0].head(3)

Unnamed: 0,MTU (CET),Generation - Solar [MW] Day Ahead/ BZN|ES,Generation - Solar [MW] Intraday / BZN|ES,Generation - Solar [MW] Current / BZN|ES,Generation - Wind Offshore [MW] Day Ahead/ BZN|ES,Generation - Wind Offshore [MW] Intraday / BZN|ES,Generation - Wind Offshore [MW] Current / BZN|ES,Generation - Wind Onshore [MW] Day Ahead/ BZN|ES,Generation - Wind Onshore [MW] Intraday / BZN|ES,Generation - Wind Onshore [MW] Current / BZN|ES
0,01.01.2015 00:00 - 01.01.2015 01:00,17.0,n/e,n/e,n/e,n/e,n/e,6436.0,n/e,n/e
1,01.01.2015 01:00 - 01.01.2015 02:00,16.0,n/e,n/e,n/e,n/e,n/e,5856.0,n/e,n/e
2,01.01.2015 02:00 - 01.01.2015 03:00,8.0,n/e,n/e,n/e,n/e,n/e,5454.0,n/e,n/e


In [199]:
data_gen_forecasts[0].columns.str.lower()

Index(['mtu (cet)', 'generation - solar  [mw] day ahead/ bzn|es',
       'generation - solar  [mw] intraday / bzn|es',
       'generation - solar  [mw] current / bzn|es',
       'generation - wind offshore  [mw] day ahead/ bzn|es',
       'generation - wind offshore  [mw] intraday / bzn|es',
       'generation - wind offshore  [mw] current / bzn|es',
       'generation - wind onshore  [mw] day ahead/ bzn|es',
       'generation - wind onshore  [mw] intraday / bzn|es',
       'generation - wind onshore  [mw] current / bzn|es'],
      dtype='object')

In [200]:
data_sw_forecast = data_gen_forecasts.copy()

def format_solar_wind_forecast_data(data):
    #set column names to something simple
    data.columns = ['time', 'solar day ahead',
       'solar intraday',
       'solar current',
       'wind offshore eday ahead',
       'wind offshore intraday',
       'wind offshore current',
       'wind onshore day ahead',
       'wind onshore intraday',
       'wind onshore current']

    #set the time to the first element in the time string. 
    #So 01.01.2018 00:00 - 01.01.2018 01:00 becomes 01.01.2018 00:00
    data['time'] = data['time'].str.split('-').apply(lambda x: x[0]).str.strip()

    #set the time strings to datetime obejects and set index as date time
    datetimes = pd.to_datetime(data['time'], format='%d-%m-%Y %H%M', errors='ignore')
    data_ = data.set_index(pd.DatetimeIndex(datetimes))

    #remove area column and time string columns
    data_.drop(['time'], axis=1, inplace=True)

    return data_


#get the numberof input files processed
years = range(len(files))

#create a dictionary of formatted pandas dataframes where key is each year
data_solar_wind_forecast = {year: format_solar_wind_forecast_data(data) for year,data in zip(years, data_sw_forecast)}

In [201]:
#check the output
data_wind_solar_15_19 = combine_annual_data(data_solar_wind_forecast)

data_wind_solar_15_19.head(3)

Unnamed: 0_level_0,solar day ahead,solar intraday,solar current,wind offshore eday ahead,wind offshore intraday,wind offshore current,wind onshore day ahead,wind onshore intraday,wind onshore current
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01 00:00:00,17,n/e,n/e,n/e,n/e,n/e,6436,n/e,n/e
2015-01-01 01:00:00,16,n/e,n/e,n/e,n/e,n/e,5856,n/e,n/e
2015-01-01 02:00:00,8,n/e,n/e,n/e,n/e,n/e,5454,n/e,n/e


In [202]:
drop_cols = ['solar intraday', 
             'solar current', 
             'wind offshore intraday', 
             'wind offshore current', 
             'wind onshore intraday', 
             'wind onshore current']

data_wind_solar_15_19.drop(drop_cols, axis=1, inplace=True)
data_wind_solar_15_19.head(3)

Unnamed: 0_level_0,solar day ahead,wind offshore eday ahead,wind onshore day ahead
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-01 00:00:00,17,n/e,6436
2015-01-01 01:00:00,16,n/e,5856
2015-01-01 02:00:00,8,n/e,5454


In [203]:
data_wind_solar_15_19['2015':'2018'].to_csv(save_path + 'solar_wind_forecasts_2015-2018.csv')

In [204]:
!ls -l ./processed

total 61424
-rw-r--r--  1 ns  staff   1262170  8 Oct 18:11 actual-demand-forecast-demand-2015-2018.csv
-rw-r--r--  1 ns  staff   4599191  8 Oct 18:12 generation-production-type-2015-2018.csv
-rw-r--r--  1 ns  staff   1164005  8 Oct 18:10 prices_2015_2018.csv
-rw-r--r--  1 ns  staff   1583537  7 Oct 18:16 solar_wind_forecasts.csv
-rw-r--r--  1 ns  staff   1302335  8 Oct 18:16 solar_wind_forecasts_2015-2018.csv
-rw-r--r--  1 ns  staff  21046004  7 Oct 18:49 weather_2015_2019.csv


In [206]:
%rm ./processed/solar_wind_forecasts.csv

### Weather Source Cleaning

In [97]:
!ls -l ./raw/weather/

total 77968
-rw-r--r--  1 ns  staff  39576929 29 Aug 10:52 weather_2013_2019.csv


In [207]:
path = './raw/weather/weather_2013_2019.csv'
weather_data = pd.read_csv(path)
weather_data.head(3)

Unnamed: 0.1,Unnamed: 0,dt,dt_iso,city_id,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,0,2013-10-01 02:00:00,2013-10-01 00:00:00 +0000 UTC,2509954,Valencia,299.15,299.15,299.15,1008,61,5,290,0.0,0.0,0.0,20,801,clouds,few clouds,02n
1,1,2013-10-01 03:00:00,2013-10-01 01:00:00 +0000 UTC,2509954,Valencia,298.15,298.15,298.15,1009,65,4,250,0.0,0.0,0.0,20,801,clouds,few clouds,02n
2,2,2013-10-01 04:00:00,2013-10-01 02:00:00 +0000 UTC,2509954,Valencia,296.161,296.161,296.161,1009,71,4,269,0.0,0.0,0.0,10,800,clear,sky is clear,02


In [208]:
weather_data.head(3)

Unnamed: 0.1,Unnamed: 0,dt,dt_iso,city_id,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,0,2013-10-01 02:00:00,2013-10-01 00:00:00 +0000 UTC,2509954,Valencia,299.15,299.15,299.15,1008,61,5,290,0.0,0.0,0.0,20,801,clouds,few clouds,02n
1,1,2013-10-01 03:00:00,2013-10-01 01:00:00 +0000 UTC,2509954,Valencia,298.15,298.15,298.15,1009,65,4,250,0.0,0.0,0.0,20,801,clouds,few clouds,02n
2,2,2013-10-01 04:00:00,2013-10-01 02:00:00 +0000 UTC,2509954,Valencia,296.161,296.161,296.161,1009,71,4,269,0.0,0.0,0.0,10,800,clear,sky is clear,02


In [209]:
datetimes = pd.to_datetime(weather_data['dt'], format='%d-%m-%Y %H%M', errors='ignore')
weather_data = weather_data.set_index(pd.DatetimeIndex(datetimes))
weather_data.drop(['Unnamed: 0', 'dt', 'dt_iso', 'weather_icon', 'city_id'], axis=1, inplace=True)

In [210]:
weather_data.head(3)

Unnamed: 0_level_0,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-10-01 02:00:00,Valencia,299.15,299.15,299.15,1008,61,5,290,0.0,0.0,0.0,20,801,clouds,few clouds
2013-10-01 03:00:00,Valencia,298.15,298.15,298.15,1009,65,4,250,0.0,0.0,0.0,20,801,clouds,few clouds
2013-10-01 04:00:00,Valencia,296.161,296.161,296.161,1009,71,4,269,0.0,0.0,0.0,10,800,clear,sky is clear


In [211]:
weather_data.index.min(),weather_data.index.max() 

(Timestamp('2013-10-01 02:00:00'), Timestamp('2019-08-26 02:00:00'))

In [212]:
#slice starting from 2015
weather_data = weather_data['2015':'2018']

In [213]:
weather_data.to_csv(save_path + 'weather_2015_2018.csv')

In [215]:
!ls -l ./processed

total 53360
-rw-r--r--  1 ns  staff   1262170  8 Oct 18:11 actual-demand-forecast-demand-2015-2018.csv
-rw-r--r--  1 ns  staff   4599191  8 Oct 18:12 generation-production-type-2015-2018.csv
-rw-r--r--  1 ns  staff   1164005  8 Oct 18:10 prices_2015_2018.csv
-rw-r--r--  1 ns  staff   1302335  8 Oct 18:16 solar_wind_forecasts_2015-2018.csv
-rw-r--r--  1 ns  staff  18154423  8 Oct 18:17 weather_2015_2018.csv


In [214]:
%rm ./processed/weather_2015_2019.csv

### Day ahead Price Data

In [14]:
!ls -l ./raw/prices

total 10960
-rw-r--r--@ 1 ns  staff   402981  8 Oct 15:35 Day-ahead Prices_201501010000-201601010000.csv
-rw-r--r--@ 1 ns  staff   403832  8 Oct 15:35 Day-ahead Prices_201601010000-201701010000.csv
-rw-r--r--@ 1 ns  staff   403025  8 Oct 15:35 Day-ahead Prices_201701010000-201801010000.csv
-rw-r--r--@ 1 ns  staff   402933  8 Oct 15:35 Day-ahead Prices_201801010000-201901010000.csv
-rw-r--r--@ 1 ns  staff   395061  8 Oct 15:35 Day-ahead Prices_201901010000-202001010000.csv
-rw-r--r--@ 1 ns  staff  3590440  8 Oct 16:05 export_HourlyAveragePriceFinalSumOfComponents_2019-10-08_16_05.csv


In [228]:
path = './raw/prices/'
files = ['Day-ahead Prices_201501010000-201601010000.csv',
         'Day-ahead Prices_201601010000-201701010000.csv',
        'Day-ahead Prices_201701010000-201801010000.csv',
            'Day-ahead Prices_201801010000-201901010000.csv',
            'Day-ahead Prices_201901010000-202001010000.csv']

day_ahead_prices_dataset = [pd.read_csv(path+file) for file in files]
day_ahead_prices = day_ahead_prices_dataset.copy()

In [229]:
day_ahead_prices[0].columns.str.lower()

Index(['mtu (cet)', 'day-ahead price [eur/mwh]'], dtype='object')

In [230]:
def format_prices_data(data):
    #set column names to something simple
    data.columns = ['time', 'day-ahead price']

    #set the time to the first element in the time string. 
    #So 01.01.2018 00:00 - 01.01.2018 01:00 becomes 01.01.2018 00:00
    data['time'] = data['time'].str.split('-').apply(lambda x: x[0]).str.strip()

    #set the time strings to datetime obejects and set index as date time
    datetimes = pd.to_datetime(data['time'], format='%d-%m-%Y %H%M', errors='ignore')
    data_ = data.set_index(pd.DatetimeIndex(datetimes))

    #remove area column and time string columns
    data_.drop(['time'], axis=1, inplace=True)

    return data_


#get the numberof input files processed
years = range(len(files))

#create a dictionary of formatted pandas dataframes where key is each year
day_ahead_prices = {year: format_prices_data(data) for year,data in zip(years, day_ahead_prices)}

In [231]:
day_ahead_prices = combine_annual_data(day_ahead_prices)

print(day_ahead_prices.index.min(), day_ahead_prices.index.max())
day_ahead_prices.head(3)

2015-01-01 00:00:00 2019-12-31 23:00:00


Unnamed: 0_level_0,day-ahead price
time,Unnamed: 1_level_1
2015-01-01 00:00:00,50.1
2015-01-01 01:00:00,48.1
2015-01-01 02:00:00,47.33


In [232]:
#load the price data
intraday_prices = pd.read_csv('./raw/prices/export_HourlyAveragePriceFinalSumOfComponents_2019-10-08_16_05.csv', delimiter=';')

#remove extra text from the datetime column values
intraday_prices.datetime = intraday_prices.datetime.str.split('+').apply(lambda x: x[0]).str.replace('T', ' ')

#create a datetime index 
datetimes = pd.to_datetime(intraday_prices['datetime'], format='%d-%m-%Y %H%M', errors='ignore')

#apply the datetime index
intraday_prices = intraday_prices.set_index(pd.DatetimeIndex(datetimes))

# drop unneeded columns
intraday_prices.drop(['datetime', 'id', 'name', 'geoid', 'geoname'], axis=1, inplace=True)
#rename the index and value columns
intraday_prices.index.name = 'time'
intraday_prices.columns = ['intraday price']

#disply sample of output
intraday_prices.head(2)

Unnamed: 0_level_0,intraday price
time,Unnamed: 1_level_1
2015-01-01 00:00:00,65.41
2015-01-01 01:00:00,64.92


#### Merge the price dataframes

In [234]:
#standardize df lengths
intraday_prices_2015_2018 = intraday_prices['2015':'2018'].copy()
day_ahead_prices_2015_2018 = day_ahead_prices['2015':'2018'].copy()

#check the lenghts
len(intraday_prices_2015_2018.index), len(day_ahead_prices_2015_2018.index)

(35064, 35068)

In [235]:
#get the difference between the two and remove these rows
spring_forward_drop_times = day_ahead_prices_2015_2018.index.difference(intraday_prices_2015_2018.index)

day_ahead_prices_2015_2018.drop(spring_forward_drop_times, axis=0, inplace=True)

day_ahead_prices_2015_2018['2015-03-29']

Unnamed: 0_level_0,day-ahead price
time,Unnamed: 1_level_1
2015-03-29 00:00:00,34.5
2015-03-29 01:00:00,32.68
2015-03-29 03:00:00,28.75
2015-03-29 04:00:00,29.48
2015-03-29 05:00:00,28.75
2015-03-29 06:00:00,30.45
2015-03-29 07:00:00,29.48
2015-03-29 08:00:00,28.5
2015-03-29 09:00:00,34.0
2015-03-29 10:00:00,34.5


In [74]:
intraday_prices['2015-03-29']

Unnamed: 0_level_0,intraday price
time,Unnamed: 1_level_1
2015-03-29 00:00:00,49.53
2015-03-29 01:00:00,48.1
2015-03-29 03:00:00,44.51
2015-03-29 04:00:00,44.17
2015-03-29 05:00:00,43.2
2015-03-29 06:00:00,44.51
2015-03-29 07:00:00,43.76
2015-03-29 08:00:00,42.92
2015-03-29 09:00:00,48.78
2015-03-29 10:00:00,49.33


In [238]:
len(day_ahead_prices_2015_2018.index), len(intraday_prices_2015_2018.index)

(35064, 35064)

In [294]:
intraday_prices_2015_2018 = intraday_prices_2015_2018[~intraday_prices_2015_2018.duplicated(keep='first')]

In [295]:
intraday_prices_2015_2018.shape

(6653, 1)

In [291]:
price_all=pd.concat([intraday_prices_2015_2018,day_ahead_prices_2015_2018], join='outer', axis=1)


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [282]:
price_all.shape

(35064, 2)

In [288]:
price_all['2017-10-30']

Unnamed: 0_level_0,intraday price,day-ahead price
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-10-30 00:00:00,52.91,
2017-10-30 01:00:00,46.46,
2017-10-30 02:00:00,42.59,32.37
2017-10-30 03:00:00,39.07,
2017-10-30 04:00:00,39.33,29.83
2017-10-30 05:00:00,43.15,
2017-10-30 06:00:00,55.46,
2017-10-30 07:00:00,66.35,60.54
2017-10-30 08:00:00,68.82,
2017-10-30 09:00:00,66.46,


In [241]:
price_all.to_csv('./processed/prices_2015_2018.csv')

In [242]:
!ls -l ./processed

total 53008
-rw-r--r--  1 ns  staff   1262170  8 Oct 18:11 actual-demand-forecast-demand-2015-2018.csv
-rw-r--r--  1 ns  staff   4599191  8 Oct 18:12 generation-production-type-2015-2018.csv
-rw-r--r--  1 ns  staff    971155  8 Oct 18:30 prices_2015_2018.csv
-rw-r--r--  1 ns  staff   1302335  8 Oct 18:16 solar_wind_forecasts_2015-2018.csv
-rw-r--r--  1 ns  staff  18154423  8 Oct 18:17 weather_2015_2018.csv


### Load and Merge Datasets

In [259]:
#load each dataset as its own dataframe
actual_demand_forecast = pd.read_csv('./processed/actual-demand-forecast-demand-2015-2018.csv', index_col='time', parse_dates=True)
generation_production_type = pd.read_csv('./processed/generation-production-type-2015-2018.csv', index_col='time', parse_dates=True)
solar_wind_forecasts = pd.read_csv('./processed/solar_wind_forecasts_2015-2018.csv', index_col='time', parse_dates=True)
weather = pd.read_csv('./processed/weather_2015_2018.csv', index_col='dt', parse_dates=True)
prices = pd.read_csv('./processed/prices_2015_2018.csv', index_col='time', parse_dates=True)

In [261]:
print('actual_demand {}'.format(actual_demand_forecast.shape))
print('generation_production {}'.format(generation_production_type.shape))
print('solar_wind_forecast {}'.format(solar_wind_forecasts.shape))
print('weather {}'.format(weather.shape))
print('prices {}'.format(prices.shape))

actual_demand (35068, 2)
generation_production (35068, 21)
solar_wind_forecast (35068, 3)
weather (178396, 15)
prices (35064, 2)


In [271]:
print('Actual vs generation: {}'.format(actual_demand_forecast.equals(generation_production_type)))
print('Actual vs solar wind forecast: {}'.format(actual_demand_forecast.equals(solar_wind_forecasts)))
print('Actual vs prices: {}'.format(actual_demand_forecast.equals(prices)))
print('generation vs solar: {}'.format(generation_production_type.equals(generation_production_type)))
print('generation vs prices: {}'.format(generation_production_type.equals(solar_wind_forecasts)))
print('solar vs prices: {}'.format(solar_wind_forecasts.equals(prices)))

Actual vs generation: False
Actual vs solar wind forecast: False
Actual vs prices: False
generation vs solar: True
generation vs prices: False
solar vs prices: False


In [279]:
generation_and_solar = pd.concat([generation_production_type, solar_wind_forecasts], axis=1)
generation_and_solar.head()

Unnamed: 0_level_0,biomass,fossil brown coal/lignite,fossil coal-derived gas,fossil gas,fossil hard coal,fossil oil,fossil oil shale,fossil peat,geothermal,hydro pumped storage,...,nuclear,other,other renewable,solar,waste,wind offshore,wind onshore,solar day ahead,wind offshore eday ahead,wind onshore day ahead
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,447.0,329.0,0.0,4844.0,4821.0,162.0,0.0,0.0,0.0,,...,7096.0,43.0,73.0,49.0,196.0,0.0,6378.0,17.0,n/e,6436.0
2015-01-01 01:00:00,449.0,328.0,0.0,5196.0,4755.0,158.0,0.0,0.0,0.0,,...,7096.0,43.0,71.0,50.0,195.0,0.0,5890.0,16.0,n/e,5856.0
2015-01-01 02:00:00,448.0,323.0,0.0,4857.0,4581.0,157.0,0.0,0.0,0.0,,...,7099.0,43.0,73.0,50.0,196.0,0.0,5461.0,8.0,n/e,5454.0
2015-01-01 03:00:00,438.0,254.0,0.0,4314.0,4131.0,160.0,0.0,0.0,0.0,,...,7098.0,43.0,75.0,50.0,191.0,0.0,5238.0,2.0,n/e,5151.0
2015-01-01 04:00:00,428.0,187.0,0.0,4130.0,3840.0,156.0,0.0,0.0,0.0,,...,7097.0,43.0,74.0,42.0,189.0,0.0,4935.0,9.0,n/e,4861.0


In [262]:
actual_demand_forecast['2015-03-29']

Unnamed: 0_level_0,day_forecast,actual_load
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-03-29 00:00:00,24472.0,24029.0
2015-03-29 01:00:00,22633.0,22123.0
2015-03-29 02:00:00,,
2015-03-29 03:00:00,20770.0,20906.0
2015-03-29 04:00:00,20396.0,20313.0
2015-03-29 05:00:00,20473.0,20452.0
2015-03-29 06:00:00,20745.0,20668.0
2015-03-29 07:00:00,21601.0,21125.0
2015-03-29 08:00:00,21506.0,21317.0
2015-03-29 09:00:00,22770.0,22828.0


In [276]:
generation_and_solar['2015-03-28']

Unnamed: 0_level_0,biomass,fossil brown coal/lignite,fossil coal-derived gas,fossil gas,fossil hard coal,fossil oil,fossil oil shale,fossil peat,geothermal,hydro pumped storage,...,nuclear,other,other renewable,solar,waste,wind offshore,wind onshore,solar day ahead,wind offshore eday ahead,wind onshore day ahead
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-03-28 00:00:00,487.0,0.0,0.0,4317.0,3234.0,315.0,0.0,0.0,0.0,,...,7094.0,82.0,67.0,510.0,145.0,0.0,8018.0,356.0,n/e,8051.0
2015-03-28 01:00:00,488.0,0.0,0.0,4368.0,2570.0,297.0,0.0,0.0,0.0,,...,7095.0,81.0,66.0,508.0,145.0,0.0,7547.0,455.0,n/e,7667.0
2015-03-28 02:00:00,487.0,0.0,0.0,4149.0,2314.0,296.0,0.0,0.0,0.0,,...,7095.0,81.0,66.0,442.0,150.0,0.0,7489.0,406.0,n/e,7468.0
2015-03-28 03:00:00,485.0,0.0,0.0,4208.0,2000.0,292.0,0.0,0.0,0.0,,...,7097.0,81.0,66.0,326.0,149.0,0.0,7326.0,277.0,n/e,7409.0
2015-03-28 04:00:00,482.0,0.0,0.0,4059.0,1919.0,283.0,0.0,0.0,0.0,,...,7097.0,82.0,66.0,172.0,148.0,0.0,7184.0,122.0,n/e,7262.0
2015-03-28 05:00:00,481.0,0.0,0.0,4268.0,2097.0,277.0,0.0,0.0,0.0,,...,7097.0,82.0,67.0,45.0,146.0,0.0,6909.0,24.0,n/e,7062.0
2015-03-28 06:00:00,479.0,0.0,0.0,4088.0,2242.0,276.0,0.0,0.0,0.0,,...,7098.0,81.0,65.0,42.0,142.0,0.0,6835.0,29.0,n/e,6803.0
2015-03-28 07:00:00,487.0,0.0,0.0,4127.0,2128.0,277.0,0.0,0.0,0.0,,...,7097.0,80.0,66.0,218.0,146.0,0.0,6821.0,260.0,n/e,6864.0
2015-03-28 08:00:00,487.0,0.0,0.0,3896.0,2246.0,279.0,0.0,0.0,0.0,,...,7100.0,80.0,63.0,1349.0,147.0,0.0,6403.0,1231.0,n/e,6498.0
2015-03-28 09:00:00,487.0,0.0,0.0,3520.0,2255.0,280.0,0.0,0.0,0.0,,...,7099.0,81.0,63.0,3231.0,143.0,0.0,6195.0,3073.0,n/e,6111.0


In [280]:
demand_generation_solar = pd.concat([generation_and_solar, actual_demand_forecast], axis=1, join='outer')
print(demand_generation_solar.shape)
demand_generation_solar.head(3)

(35068, 26)


Unnamed: 0_level_0,biomass,fossil brown coal/lignite,fossil coal-derived gas,fossil gas,fossil hard coal,fossil oil,fossil oil shale,fossil peat,geothermal,hydro pumped storage,...,other renewable,solar,waste,wind offshore,wind onshore,solar day ahead,wind offshore eday ahead,wind onshore day ahead,day_forecast,actual_load
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,447.0,329.0,0.0,4844.0,4821.0,162.0,0.0,0.0,0.0,,...,73.0,49.0,196.0,0.0,6378.0,17.0,n/e,6436.0,26118.0,25385.0
2015-01-01 01:00:00,449.0,328.0,0.0,5196.0,4755.0,158.0,0.0,0.0,0.0,,...,71.0,50.0,195.0,0.0,5890.0,16.0,n/e,5856.0,24934.0,24382.0
2015-01-01 02:00:00,448.0,323.0,0.0,4857.0,4581.0,157.0,0.0,0.0,0.0,,...,73.0,50.0,196.0,0.0,5461.0,8.0,n/e,5454.0,23515.0,22734.0


(35068, 26)

In [265]:
prices['2015-03-28']

Unnamed: 0_level_0,intraday price,day-ahead price
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-03-28 00:00:00,59.79,49.59
2015-03-28 01:00:00,54.35,43.41
2015-03-28 02:00:00,50.77,
2015-03-28 03:00:00,48.12,36.44
2015-03-28 04:00:00,46.08,
2015-03-28 05:00:00,46.54,
2015-03-28 06:00:00,46.58,
2015-03-28 07:00:00,48.86,37.69
2015-03-28 08:00:00,58.4,
2015-03-28 09:00:00,61.86,51.18


In [263]:
#drop the extra rows in the actual demand, generation demand, solar wind forecasts
spring_forward_drop_times = prices.index.difference(actual_demand_forecast.index)
spring_forward_drop_times
#actual_demand_forecast.drop(spring_forward_drop_times, axis=0)

DatetimeIndex([], dtype='datetime64[ns]', name='time', freq=None)

In [219]:
merge_dfs = [actual_demand_forecast, generation_production_type, solar_wind_forecasts, prices]

pd.concat(merge_dfs)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,actual_load,biomass,city_name,clouds_all,day-ahead price,day_forecast,fossil brown coal/lignite,fossil coal-derived gas,fossil gas,fossil hard coal,...,waste,weather_description,weather_id,weather_main,wind offshore,wind offshore eday ahead,wind onshore,wind onshore day ahead,wind_deg,wind_speed
2015-01-01 00:00:00,25385.0,,,,,26118.0,,,,,...,,,,,,,,,,
2015-01-01 01:00:00,24382.0,,,,,24934.0,,,,,...,,,,,,,,,,
2015-01-01 02:00:00,22734.0,,,,,23515.0,,,,,...,,,,,,,,,,
2015-01-01 03:00:00,21286.0,,,,,22642.0,,,,,...,,,,,,,,,,
2015-01-01 04:00:00,20264.0,,,,,21785.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-10-08 19:00:00,,,,,,,,,,,...,,,,,,,,,,
2019-10-08 20:00:00,,,,,,,,,,,...,,,,,,,,,,
2019-10-08 21:00:00,,,,,,,,,,,...,,,,,,,,,,
2019-10-08 22:00:00,,,,,50.25,,,,,,...,,,,,,,,,,
