In [6]:
import pandas as pd
from datetime import datetime, timedelta
import pytz
import dateutil

from data import data_utils

# One half-hour step. The YYYYMMDDSP date parser adds (SP - 1) of these to the
# start of the parsed day (SP presumably means settlement period - confirm).
half_hour = timedelta(minutes=30)

Combine all CSV files

In [7]:
def parse_settlement_stamp(stamp):
    """Turn a 'YYYYMMDDSP' string into a UTC-aware timestamp.

    The first eight characters are parsed as the calendar date and localised
    to UTC; the remaining characters are read as a 1-based integer period
    number, and (period - 1) half-hours are added to the start of that day.
    """
    day_start = pd.to_datetime(stamp[:8]).tz_localize('UTC')
    return day_start + (int(stamp[8:]) - 1) * half_hour


def parse_offset_stamp(stamp):
    """Parse a date string and convert the result to UTC.

    ``tz_convert`` only accepts tz-aware values, so the input strings are
    expected to carry their own UTC offset.
    """
    return pd.to_datetime(stamp).tz_convert("utc")


# --- Elexon files: first column holds the YYYYMMDDSP stamp and becomes the index.
elexon_index = dict(index_col=0, parse_dates=True, date_parser=parse_settlement_stamp)

# TODO: check if this is a day-ahead feature
margin_imb = pd.read_csv(
    "raw_data/elexon_margin_imbalance.csv",
    usecols=[1, 2, 3],
    dtype={'index': str, 'DA_margin': int, 'DA_imb': int},
    **elexon_index,
)
bin_dinorwig = pd.read_csv(
    "raw_data/elexon_bin_dinorwig.csv",
    dtype={'Unnamed: 0': str, 'dino_bin': int},
    **elexon_index,
)
elexon_data = pd.read_csv(
    "raw_data/elexon_data.csv",
    dtype={'Unnamed: 0': str},
    **elexon_index,
)
offers = pd.read_csv(
    "raw_data/elexon_offers.csv",
    dtype={'Unnamed: 0': str},
    **elexon_index,
).rename(columns={"Offers": "offers"})

# --- ENTSO-E France files: first column holds a date string that is converted to UTC.
# Only the first 26281 data rows of each file are read; the single value
# column arrives named "0" and is renamed to something descriptive.
entsoe_options = dict(nrows=26281, index_col=0, parse_dates=True, date_parser=parse_offset_stamp)

price_france = pd.read_csv(
    "raw_data/entsoe_france_prices.csv", **entsoe_options
).rename(columns={"0": "price_france"})
gen_france = pd.read_csv(
    "raw_data/entsoe_france_generation_forecast.csv", **entsoe_options
).rename(columns={"0": "gen_france"})
load_france = pd.read_csv(
    "raw_data/entsoe_france_load_forecast.csv", **entsoe_options
).rename(columns={"0": "load_france"})

# Leave the notebook-level name `date_parser` bound to the string-date parser,
# exactly as it was at the end of this cell before, in case another cell uses it.
date_parser = parse_offset_stamp

In [8]:
# Start from the Elexon frame and join each remaining source onto it by index,
# one at a time (DataFrame.join defaults to a left join, so the Elexon index is kept).
df = elexon_data
for other in (margin_imb, bin_dinorwig, price_france, gen_france, load_france, offers):
    df = df.join(other)

# Carry the last seen value forward into any cells the joins left empty.
df = df.ffill()

In [9]:
# Count the missing values left in each column of the combined frame.
df.isna().sum()

Ren_R           0
APXP            0
APXV            0
Rene            0
TSDF            0
NIV             0
Im_Pr           0
In_gen          0
DRM             0
LOLP            0
DA_margin       0
DA_imb          0
dino_bin        0
price_france    0
gen_france      0
load_france     0
offers          0
dtype: int64

In [10]:
# Display the combined frame.
df

Unnamed: 0,Ren_R,APXP,APXV,Rene,TSDF,NIV,Im_Pr,In_gen,DRM,LOLP,DA_margin,DA_imb,dino_bin,price_france,gen_france,load_france,offers
2016-01-01 00:00:00+00:00,0.599948,31.10,6342.2,7511.619,28400.0,-253.3504,29.60000,2756.0,21738.455,0.0,31830.0,708.0,0.0,22.39,62205.0,56550.0,59.0
2016-01-01 00:30:00+00:00,0.562381,31.10,6342.2,7511.619,28600.0,55.8867,48.13640,2688.0,22010.775,0.0,31411.0,607.0,0.0,22.39,62205.0,56550.0,120.0
2016-01-01 01:00:00+00:00,0.538274,37.58,6109.5,6802.152,28994.0,239.7857,49.02361,2742.0,21193.720,0.0,30873.0,292.0,1.0,20.59,60615.0,56150.0,160.0
2016-01-01 01:30:00+00:00,0.528385,37.58,6109.5,6802.152,28033.0,18.1805,46.00000,2694.0,21154.959,0.0,31608.0,724.0,0.0,20.59,60615.0,56150.0,150.0
2016-01-01 02:00:00+00:00,0.519047,36.53,5927.6,5948.614,27012.0,89.0833,48.50000,2554.0,20805.762,0.0,32475.0,1052.0,1.0,16.81,56584.0,52600.0,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 21:30:00+00:00,0.702924,47.32,6256.4,11252.438,29156.0,-880.3353,40.10000,1724.0,19790.707,0.0,33528.0,592.0,0.0,49.80,66179.0,62950.0,73.0
2018-12-31 22:00:00+00:00,0.703644,40.81,6763.0,11384.837,28315.0,-562.3883,40.00000,-96.0,19500.754,0.0,32590.0,-422.0,1.0,44.26,65849.0,62950.0,98.0
2018-12-31 22:30:00+00:00,0.702286,40.81,6763.0,11384.837,27881.0,-601.8658,40.00000,-128.0,19362.640,0.0,32721.0,-342.0,0.0,44.26,65849.0,62950.0,84.5
2018-12-31 23:00:00+00:00,0.723989,52.23,4444.4,11570.606,27188.0,-694.6639,40.00000,162.0,18932.154,0.0,31738.0,26.0,0.0,51.00,64931.0,62950.0,107.0


Generate lagged offer features

In [11]:
# Lag (in rows) used for each lagged copy of "offers", keyed by the prefix
# that is handed to data_utils.shift for that copy.
lag_by_prefix = {"prev_sp": 4, "prev_day": 48, "prev_week": 336}
shifts = list(lag_by_prefix.values())

# NOTE(review): df is reassigned on every pass, so re-running this cell feeds
# the already-augmented frame back into data_utils.shift.
for prefix, lag in lag_by_prefix.items():
    df = data_utils.shift(df, ["offers"], lag, prefix)

In [12]:
# Replace every remaining NaN with the mean of its own column.
df = df.fillna(df.mean())

In [13]:
# List the column names of the frame at this point.
df.columns

Index(['Ren_R', 'APXP', 'APXV', 'Rene', 'TSDF', 'NIV', 'Im_Pr', 'In_gen',
       'DRM', 'LOLP', 'DA_margin', 'DA_imb', 'dino_bin', 'price_france',
       'gen_france', 'load_france', 'offers', 'prev_sp_offers',
       'prev_day_offers', 'prev_week_offers'],
      dtype='object')

In [15]:
# Display the frame again to inspect its current contents.
df

Unnamed: 0,Ren_R,APXP,APXV,Rene,TSDF,NIV,Im_Pr,In_gen,DRM,LOLP,DA_margin,DA_imb,dino_bin,price_france,gen_france,load_france,offers,prev_sp_offers,prev_day_offers,prev_week_offers
2016-01-01 00:00:00+00:00,0.599948,31.10,6342.2,7511.619,28400.0,-253.3504,29.60000,2756.0,21738.455,0.0,31830.0,708.0,0.0,22.39,62205.0,56550.0,59.0,119.811026,119.833245,119.906374
2016-01-01 00:30:00+00:00,0.562381,31.10,6342.2,7511.619,28600.0,55.8867,48.13640,2688.0,22010.775,0.0,31411.0,607.0,0.0,22.39,62205.0,56550.0,120.0,119.811026,119.833245,119.906374
2016-01-01 01:00:00+00:00,0.538274,37.58,6109.5,6802.152,28994.0,239.7857,49.02361,2742.0,21193.720,0.0,30873.0,292.0,1.0,20.59,60615.0,56150.0,160.0,119.811026,119.833245,119.906374
2016-01-01 01:30:00+00:00,0.528385,37.58,6109.5,6802.152,28033.0,18.1805,46.00000,2694.0,21154.959,0.0,31608.0,724.0,0.0,20.59,60615.0,56150.0,150.0,119.811026,119.833245,119.906374
2016-01-01 02:00:00+00:00,0.519047,36.53,5927.6,5948.614,27012.0,89.0833,48.50000,2554.0,20805.762,0.0,32475.0,1052.0,1.0,16.81,56584.0,52600.0,140.0,59.000000,119.833245,119.906374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 21:30:00+00:00,0.702924,47.32,6256.4,11252.438,29156.0,-880.3353,40.10000,1724.0,19790.707,0.0,33528.0,592.0,0.0,49.80,66179.0,62950.0,73.0,106.000000,78.000000,100.000000
2018-12-31 22:00:00+00:00,0.703644,40.81,6763.0,11384.837,28315.0,-562.3883,40.00000,-96.0,19500.754,0.0,32590.0,-422.0,1.0,44.26,65849.0,62950.0,98.0,106.000000,97.000000,100.000000
2018-12-31 22:30:00+00:00,0.702286,40.81,6763.0,11384.837,27881.0,-601.8658,40.00000,-128.0,19362.640,0.0,32721.0,-342.0,0.0,44.26,65849.0,62950.0,84.5,79.950000,78.000000,97.000000
2018-12-31 23:00:00+00:00,0.723989,52.23,4444.4,11570.606,27188.0,-694.6639,40.00000,162.0,18932.154,0.0,31738.0,26.0,0.0,51.00,64931.0,62950.0,107.0,73.000000,97.000000,100.000000


In [14]:
# CHECK DATA TIMINGS AND MAKE CORRECT SHIFTS
# CHECK CORRELATIONS 
# CHECK DATA DISTRIBUTIONS

In [None]:

# NOTE(review): `data` is not assigned in any visible cell of this notebook;
# it must exist before this cell can run.

# Columns pulled three rows earlier: after shift(-3) each row holds the value
# that was recorded three rows further down.
lead_columns = [
    'Offers',
    'PrevDay',
    'PrevWeek',
    'APXP',
    'Rene',
    'TSDF',
    'LOLP',
    'DA_margin',
    'DA_imb',
    'wind_peak_bin',
    'daily_exchange_rate',
    'DA_price_france',
    'DA_imb_France',
]
for column in lead_columns:
    data[column] = data[column].shift(-3)

# Columns pushed one row later: after shift(1) each row holds the value of the
# row directly above it.
lag_columns = [
    'Ren_R',
    'NIV',
    'Im_Pr',
    'In_gen',
    'ratio_offers_vol',
    'ratio_bids_vol',
    'dino_bin',
]
for column in lag_columns:
    data[column] = data[column].shift(1)

# =============================================================================
# Missing values
# =============================================================================

# Percentage of rows that are NaN in each column.
# isna().mean() divides the NaN count by the total number of rows. Dividing by
# data.count() instead would use only the non-missing rows as the denominator,
# which is not a percentage of the column and blows up to inf for an all-NaN
# column.
missing_perc = data.isna().mean() * 100

print('Missing values (%):\n {}'.format(missing_perc))


# =============================================================================
# CORR MATRIX - 2
# =============================================================================
# Pairwise correlations between the columns of `data` in its current state
# (DataFrame.corr with its default arguments).
corr_matrix_2 = data.corr()

# =============================================================================
# DELTA CORR MATRIX - changes in both correlation matrices
# =============================================================================
# NOTE(review): corr_matrix_1 is not assigned in any visible cell; presumably a
# correlation matrix computed at an earlier stage - confirm before running.
delta_matrix = corr_matrix_1 - corr_matrix_2

# =============================================================================
# SAVE data set 1 
# =============================================================================
# Write the current `data` frame to Data_set_1.csv in the working directory.
data.to_csv('Data_set_1.csv')

# =============================================================================
# PLOTTING
# =============================================================================
# Histogram each of the columns listed below (described by the author as the
# non-binary ones) to see how its values are distributed, then save the figure.
nonbinary_columns = [
    'Ren_R', 'APXP', 'Rene', 'TSDF', 'NIV', 'Im_Pr', 'In_gen',
    'ratio_offers_vol', 'ratio_bids_vol', 'DA_margin', 'DA_imb',
    'daily_exchange_rate', 'DA_price_france', 'DA_imb_France', 'Offers',
]
data_nonbin = data.loc[:, nonbinary_columns]
data_nonbin.hist(bins=100, figsize=(30, 15))
plt.savefig('Distribution_plots.png')

# A scatter matrix of the full frame was also tried but judged too messy to read:
# scatter_matrix(data, figsize = (20, 15))

# =============================================================================
# PLOT CORRELATION ANALYSIS RESULTS
# =============================================================================

# Last row of the correlation matrix without its final entry: the correlation
# of the last column of `data` with every other column.
values = corr_matrix_2.iloc[-1, :-1]

fontsize = 15

# One tick label per bar, in plotting order.
# NOTE(review): these are positional - they must match the column order of
# corr_matrix_2, which cannot be checked from this cell.
labels = [
    'Renewable Ratio',
    'Market Price',
    'Renewable Generation',
    'Transmission Demand',
    'Net Imbalance',
    'Imbalance Price',
    'Interconnectors',
    'Loss of Load Probability',
    'Previous Day SP',
    'Previous Week SP',
    'Grid Margin',
    'Grid Imbalance',
    'Wind Peak Time',
    'Exchange Rate',
    'France Market Price',
    'Dinorwig Plant Presence',
    'Ratio of Offers',
    'Ratio of Bids',
    'France Imbalance',
]

# Bars sit at x = 1 .. number of labels.
n_bars = len(labels)
bar_positions = np.arange(1, n_bars + 1)

fig = plt.figure(figsize=(15, 7))
plt.bar(bar_positions, values.abs(), edgecolor='black', linewidth=1.2)

ax = plt.gca()
ax.set_facecolor('aliceblue')

# Dashed horizontal guide at a correlation magnitude of 0.05, spanning all bars.
guide_x = np.arange(0.5, n_bars + 1.5)
guide_y = np.full(n_bars + 1, 0.05)
plt.plot(guide_x, guide_y, linewidth=0.8, linestyle='dashed', color='black')

plt.ylabel('Correlation value', fontsize=fontsize)
plt.xticks(bar_positions, labels, rotation=80, fontsize=fontsize)
plt.yticks(fontsize=fontsize)
plt.ylim(0, 0.4)
plt.xlim(0.5, n_bars + 0.5)
plt.title('Correlation Analysis\n', fontsize=fontsize + 2)
plt.tight_layout()
plt.savefig('Correlation_analysis_w_output.png')


# =============================================================================
# Data set TOO BIG - filter out features with coeff lower than 0.05 w/ output
# =============================================================================

# Columns removed from `data`, in removal order. The reasons are the author's:
# the wind-peak flag is very uncorrelated with all features, the rest are
# little correlated with Offers.
low_correlation_columns = [
    'wind_peak_bin',
    'daily_exchange_rate',
    'NIV',
    'DA_imb',
    'ratio_bids_vol',
]
for column in low_correlation_columns:
    data.drop(column, axis=1, inplace=True)

# Correlation matrix of the reduced feature set.
corr_matrix_3 = data.corr()

# =============================================================================
# Plot the last Correlation matrix
# =============================================================================

# Annotated heatmap of corr_matrix_3: two-decimal values, blue colour scale.
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    annot_kws={"size": 14},
    linewidths=.5,
    cmap="Blues",
)
fig, ax = plt.subplots(figsize=(13, 10))
sn.heatmap(corr_matrix_3, ax=ax, **heatmap_style)
ax.tick_params(axis="both", labelsize=14)
fig.show()
fig.savefig("Final_Correlation_Matrix.png", bbox_inches='tight')

# =============================================================================
# Save data set
# =============================================================================
# Write the reduced frame to disk under its own file name.
data.to_csv('Data_set_1_smaller_(1).csv')