In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import urllib.request as url
import os
import sys
sys.path.append('..')
from visualization.visualize import visualize_nan
from data_processing.data_import import Elmy_import, download_read_csv

# Data Import

## ELMY Data

In [23]:
def count_days_between_dates(start_date, end_date):
    """Return the number of calendar days covered by two dates, both ends included.

    Only the leading ``YYYY-MM-DD`` part of each argument is used, so the time
    of day and any UTC offset are ignored.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Either a string starting with ``YYYY-MM-DD`` or any object whose
        ``str()`` starts with it (``datetime``, ``date``, ``pandas.Timestamp``).

    Returns
    -------
    int
        ``(end - start).days + 1``; the same day given twice yields 1.

    Raises
    ------
    ValueError
        If the first ten characters are not a valid ``YYYY-MM-DD`` date.
    """
    def _calendar_day(value):
        # str() keeps plain strings untouched and lets datetime-like objects through
        return datetime.strptime(str(value)[:10], '%Y-%m-%d')

    return (_calendar_day(end_date) - _calendar_day(start_date)).days + 1

In [24]:
# Load the raw challenge files with the project helper `Elmy_import`
# (imported from data_processing.data_import). Target files are read with target=True.
# NOTE(review): the previews displayed further down show a DELIVERY_START index and
# 'Date (UTC)' / 'Date' columns — presumably built inside Elmy_import; confirm there.
X_train_raw = Elmy_import('../data/raw/X_train.csv')
y_train_raw = Elmy_import('../data/raw/y_train.csv', target=True)
X_test_raw = Elmy_import('../data/raw/X_test.csv')
# NOTE(review): y_random is presumably a placeholder target for the test period — confirm.
y_random_raw = Elmy_import('../data/raw/y_random.csv', target=True)

In [25]:
# Save the original indexes: each merge below resets the index, and these
# copies are used to re-attach it afterwards.
X_train_index = X_train_raw.index
X_test_index = X_test_raw.index

# Shapes and covered period of the training set
print(f'X_train shape: {X_train_raw.shape}')
print(f'y_train shape: {y_train_raw.shape}')
print(f'[training] number of days: {count_days_between_dates(X_train_index[0], X_train_index[-1])} - first date: {X_train_index[0]} - last date: {X_train_index[-1]}\n')

# Shapes and covered period of the test set (label is "[test]", not "[training]")
print(f'X_test shape: {X_test_raw.shape}')
print(f'y_random shape: {y_random_raw.shape}')
print(f'[test] number of days: {count_days_between_dates(X_test_index[0], X_test_index[-1])} - first date: {X_test_index[0]} - last date: {X_test_index[-1]}')

# Preview of the training features and target
display(X_train_raw.head())
display(y_train_raw.head())

X_train shape: (10605, 11)
y_train shape: (10605, 1)
[training] number of days: 453 - first date: 2022-01-01 02:00:00+01:00 - last date: 2023-03-29 23:00:00+02:00

X_test shape: (4942, 11)
y_random shape: (4942, 1)
[training] number of days: 206 - first date: 2023-04-02 00:00:00+02:00 - last date: 2023-10-24 23:00:00+02:00


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),Date
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,2022-01-01
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,2022-01-01
2022-01-01 04:00:00+01:00,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.0,,2022-01-01 03:00:00+00:00,2022-01-01
2022-01-01 05:00:00+01:00,44779.0,3386.0,11487.0,44118.0,3447.0,0.0,36.127588,0.0,,2022-01-01 04:00:00+00:00,2022-01-01
2022-01-01 06:00:00+01:00,45284.0,3386.0,11487.0,44118.0,3679.0,0.0,30.983023,0.0,,2022-01-01 05:00:00+00:00,2022-01-01


Unnamed: 0_level_0,spot_id_delta
DELIVERY_START,Unnamed: 1_level_1
2022-01-01 02:00:00+01:00,-36.87477
2022-01-01 03:00:00+01:00,-12.643588
2022-01-01 04:00:00+01:00,-1.950193
2022-01-01 05:00:00+01:00,1.938272
2022-01-01 06:00:00+01:00,0.199907


## ODRE (Open Data Réseaux Énergies)

In [26]:
# Daily temperatures per department
temperature_quotidienne_departementale = download_read_csv('../data/external/temperature-quotidienne-departementale.csv')
display(temperature_quotidienne_departementale.head(2))

# One row per day: average each temperature column across departments
temperature_quotidienne = (
    temperature_quotidienne_departementale
    .groupby('Date')[['TMin (°C)', 'TMax (°C)', 'TMoy (°C)']]
    .mean()
)
temperature_quotidienne.index = pd.to_datetime(temperature_quotidienne.index, format='%Y-%m-%d')
display(temperature_quotidienne.head(2))

# Left-join the daily averages onto the raw train / test sets,
# then put the saved index back in place
X_train = (
    X_train_raw
    .merge(temperature_quotidienne, how='left', on='Date')
    .set_axis(X_train_index, axis=0)
)
X_test = (
    X_test_raw
    .merge(temperature_quotidienne, how='left', on='Date')
    .set_axis(X_test_index, axis=0)
)
display(X_train.head(2))

# Free the intermediate datasets
del temperature_quotidienne_departementale, temperature_quotidienne

Unnamed: 0,Date,Code INSEE département,Département,TMin (°C),TMax (°C),TMoy (°C)
0,2018-09-01,30,Gard,11.03,22.17,16.6
1,2018-09-01,33,Gironde,11.97,27.9,19.93


Unnamed: 0_level_0,TMin (°C),TMax (°C),TMoy (°C)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,5.844167,10.404167,8.124167
2018-01-02,5.530938,13.406562,9.46875


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),Date,TMin (°C),TMax (°C),TMoy (°C)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,2022-01-01,5.046667,14.886979,9.966771
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,2022-01-01,5.046667,14.886979,9.966771


In [27]:
# pic-journalier-consommation is very similar to
# X_train.groupby('Date').max()['Consommation brute électricité (MW) - RTE'] (correlation is 0.999377).
# Furthermore, using such a feature clearly introduces a look-ahead bias.
# NOTE(review): the guard below is hard-coded to True, so the feature is merged regardless.
if True:
    # Daily peak of gross consumption, with 'Date' parsed as a datetime
    pic_journalier_consommation_brute = download_read_csv(
        '../data/external/pic-journalier-consommation-brute.csv'
    ).assign(Date=lambda frame: pd.to_datetime(frame.Date, format='%Y-%m-%d'))
    display(pic_journalier_consommation_brute.head(2))

    # Left-join the daily peak onto the train / test sets and restore the saved index
    X_train = (
        X_train
        .merge(pic_journalier_consommation_brute, how='left', on='Date')
        .set_axis(X_train_index, axis=0)
    )
    X_test = (
        X_test
        .merge(pic_journalier_consommation_brute, how='left', on='Date')
        .set_axis(X_test_index, axis=0)
    )
    display(X_train.head(2))

    # Free the intermediate dataset
    del pic_journalier_consommation_brute

Unnamed: 0,Date,Pic journalier consommation (MW),Température moyenne (°C),Température référence (°C)
0,2012-01-01,59610.0,11.7,4.6
1,2012-01-04,78337.0,8.1,4.6


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),Date,TMin (°C),TMax (°C),TMoy (°C),Pic journalier consommation (MW),Température moyenne (°C),Température référence (°C)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,2022-01-01,5.046667,14.886979,9.966771,55828.0,9.7,4.6
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,2022-01-01,5.046667,14.886979,9.966771,55828.0,9.7,4.6


In [28]:
# Using such a feature clearly introduces a look-ahead bias.
# NOTE(review): the guard below is hard-coded to True, so the feature is merged regardless.
if True:
    # Daily extrema of commercial flows: drop the temperature column and parse 'Date'
    extremas_quotidiens_flux_commerciaux = (
        download_read_csv('../data/external/extremas-quotidiens-flux-commerciaux.csv')
        .drop(columns="Temperature moy (°C)")
        .assign(Date=lambda frame: pd.to_datetime(frame.Date, format='%Y-%m-%d'))
    )
    display(extremas_quotidiens_flux_commerciaux.head(2))

    # Left-join the commercial flows onto the train / test sets and restore the saved index
    X_train = (
        X_train
        .merge(extremas_quotidiens_flux_commerciaux, how='left', on='Date')
        .set_axis(X_train_index, axis=0)
    )
    X_test = (
        X_test
        .merge(extremas_quotidiens_flux_commerciaux, how='left', on='Date')
        .set_axis(X_test_index, axis=0)
    )
    display(X_train.head(2))

    # Free the intermediate dataset
    del extremas_quotidiens_flux_commerciaux

Unnamed: 0,Date,Solde min (MW),Solde max (MW)
0,2013-01-04,4131.0,6798.0
1,2013-01-05,4508.0,9969.0


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),Date,TMin (°C),TMax (°C),TMoy (°C),Pic journalier consommation (MW),Température moyenne (°C),Température référence (°C),Solde min (MW),Solde max (MW)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,2022-01-01,5.046667,14.886979,9.966771,55828.0,9.7,4.6,817.0,11247.0
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,2022-01-01,5.046667,14.886979,9.966771,55828.0,9.7,4.6,817.0,11247.0


In [29]:
# Wind and solar production curves (rows with missing values are discarded)
production_mensuelle_energie_eolienne = download_read_csv('../data/external/courbes-de-production-mensuelles-eolien-solaire-complement-de-remuneration.csv').dropna()

# Build the UTC merge key from the local date and hour, read as Europe/Paris wall time.
# Both DST edge cases become NaT instead of raising: ambiguous times (autumn change)
# and non-existent times (spring change).
production_mensuelle_energie_eolienne['Date (UTC)'] = pd.to_datetime(
    pd.to_datetime(
        production_mensuelle_energie_eolienne.Date + '-' + production_mensuelle_energie_eolienne.Heure
    ).dt.tz_localize('Europe/Paris', ambiguous='NaT', nonexistent='NaT'),
    utc=True,
)

# Rows without a usable key cannot be merged; the local date/hour columns are no longer needed
production_mensuelle_energie_eolienne = (
    production_mensuelle_energie_eolienne
    .dropna(subset=['Date (UTC)'])
    .drop(columns=['Heure', 'Date'])
)

# Left-join the production curves onto the train / test sets and restore the saved index
X_train = X_train.merge(production_mensuelle_energie_eolienne, on='Date (UTC)', how='left')
X_train.index = X_train_index
X_test = X_test.merge(production_mensuelle_energie_eolienne, on='Date (UTC)', how='left')
X_test.index = X_test_index

display(X_train.head(2))

# Free the intermediate dataset
del production_mensuelle_energie_eolienne

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),...,TMin (°C),TMax (°C),TMoy (°C),Pic journalier consommation (MW),Température moyenne (°C),Température référence (°C),Solde min (MW),Solde max (MW),prod_eolienne_MWh,prod_solaire_MWh
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,...,5.046667,14.886979,9.966771,55828.0,9.7,4.6,817.0,11247.0,3187.0,1.0
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,...,5.046667,14.886979,9.966771,55828.0,9.7,4.6,817.0,11247.0,3390.0,1.0


In [30]:
# Gross consumption (rows with missing values are discarded)
consumption = download_read_csv('../data/external/consommation-quotidienne-brute.csv').dropna()
consumption['Date (UTC)'] = pd.to_datetime(consumption['Date - Heure'], utc=True)

# Do not include 'Consommation brute totale (MW)' as it is the sum of
# 'Consommation brute gaz totale' and 'Consommation brute électricité'.
# Keep one row per timestamp; built without inplace=True so that no column-subset
# frame is mutated (avoids SettingWithCopyWarning).
consumption = (
    consumption[['Date (UTC)', 'Consommation brute gaz totale (MW PCS 0°C)', 'Consommation brute électricité (MW) - RTE']]
    .drop_duplicates(subset='Date (UTC)', keep='first')
)

display(consumption.head(2))

# Left-join the gross consumption onto the train / test sets and restore the saved index
X_train = X_train.merge(consumption, on='Date (UTC)', how='left')
X_train.index = X_train_index
X_test = X_test.merge(consumption, on='Date (UTC)', how='left')
X_test.index = X_test_index

display(X_train.head(2))

# Free the intermediate dataset
del consumption

Unnamed: 0,Date (UTC),Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE
1,2023-11-30 22:00:00+00:00,67839.0,61153
3,2023-11-30 21:00:00+00:00,72563.0,61421


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),...,TMoy (°C),Pic journalier consommation (MW),Température moyenne (°C),Température référence (°C),Solde min (MW),Solde max (MW),prod_eolienne_MWh,prod_solaire_MWh,Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,...,9.966771,55828.0,9.7,4.6,817.0,11247.0,3187.0,1.0,45584.0,51279.0
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,...,9.966771,55828.0,9.7,4.6,817.0,11247.0,3390.0,1.0,46671.0,48288.0


In [31]:
# Regional wind speed and solar radiation, with a UTC timestamp key
regional_solar_and_wind_power = download_read_csv('../data/external/rayonnement-solaire-vitesse-vent-tri-horaires-regionaux.csv')
regional_solar_and_wind_power['Date (UTC)'] = pd.to_datetime(regional_solar_and_wind_power.Date, utc=True)

# One row per timestamp: average both measurements across regions
solar_and_wind_power = (
    regional_solar_and_wind_power
    .groupby('Date (UTC)')[['Vitesse du vent à 100m (m/s)', 'Rayonnement solaire global (W/m2)']]
    .mean()
)
display(solar_and_wind_power.head(2))

# Left-join the wind speed / solar radiation averages onto the train / test sets
# and restore the saved index.
# NOTE(review): the source looks 3-hourly (see file name), so the hours in between
# get NaN after the merge — confirm whether that is intended.
X_train = (
    X_train
    .merge(solar_and_wind_power, how='left', on='Date (UTC)')
    .set_axis(X_train_index, axis=0)
)
X_test = (
    X_test
    .merge(solar_and_wind_power, how='left', on='Date (UTC)')
    .set_axis(X_test_index, axis=0)
)
display(X_train.head(2))

# Free the intermediate datasets
del regional_solar_and_wind_power, solar_and_wind_power

Unnamed: 0_level_0,Vitesse du vent à 100m (m/s),Rayonnement solaire global (W/m2)
Date (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-01 03:00:00+00:00,8.39,0.0
2016-01-01 06:00:00+00:00,8.38,0.0


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),...,Température moyenne (°C),Température référence (°C),Solde min (MW),Solde max (MW),prod_eolienne_MWh,prod_solaire_MWh,Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE,Vitesse du vent à 100m (m/s),Rayonnement solaire global (W/m2)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,...,9.7,4.6,817.0,11247.0,3187.0,1.0,45584.0,51279.0,,
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,...,9.7,4.6,817.0,11247.0,3390.0,1.0,46671.0,48288.0,,


In [32]:
# Hourly biomethane production: one row per date, one column per hour.
# Index by date, keep only the hour columns and turn their labels into HH:MM:SS.
prod_nat_gaz_horaire = (
    download_read_csv('../data/external/prod-nat-gaz-horaire-def.csv')
    .set_index('Date')
    .loc[:, '00_00_00':'23_00_00']
    .rename(columns=lambda label: label.replace('_', ':'))
)

# Wide to long: one row per (date, hour), then build a naive timestamp from both parts
melted_df = prod_nat_gaz_horaire.reset_index().melt(
    id_vars='Date',
    var_name='Hour',
    value_name='Production horaire de biométhane (MWh - 0°C PCS)',
)
melted_df['Date_and_hour'] = pd.to_datetime(melted_df['Date'].astype(str) + ' ' + melted_df['Hour'])

def localize_date(x: pd.Timestamp):
    """Attach the Europe/Paris timezone to a naive timestamp.

    Wall-clock times that are ambiguous (autumn DST change) or that do not
    exist (spring DST change) are returned as ``pd.NaT`` instead of raising.
    Any other error is propagated rather than silently turned into ``NaT``.

    Parameters
    ----------
    x : pd.Timestamp
        Timezone-naive timestamp, read as Europe/Paris local time.

    Returns
    -------
    pd.Timestamp or NaT
        The timezone-aware timestamp, or ``pd.NaT`` for DST edge cases.

    Raises
    ------
    TypeError
        If ``x`` is already timezone-aware.
    """
    return x.tz_localize('Europe/Paris', ambiguous='NaT', nonexistent='NaT')

# Localize each timestamp with localize_date, convert to UTC, drop incomplete rows
# and sum the production per UTC timestamp
melted_df = (
    melted_df
    .assign(**{
        'Date (UTC)': pd.to_datetime(melted_df['Date_and_hour'].apply(localize_date), utc=True)
    })
    [['Date (UTC)', 'Production horaire de biométhane (MWh - 0°C PCS)']]
    .dropna()
    .groupby('Date (UTC)')[['Production horaire de biométhane (MWh - 0°C PCS)']]
    .sum()
)

# Left-join the biomethane production onto the train / test sets and restore the saved index
X_train = (
    X_train
    .merge(melted_df, how='left', on='Date (UTC)')
    .set_axis(X_train_index, axis=0)
)
X_test = (
    X_test
    .merge(melted_df, how='left', on='Date (UTC)')
    .set_axis(X_test_index, axis=0)
)
display(X_train.head(3))

# Free the intermediate datasets
del melted_df, prod_nat_gaz_horaire

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,Date (UTC),...,Température référence (°C),Solde min (MW),Solde max (MW),prod_eolienne_MWh,prod_solaire_MWh,Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE,Vitesse du vent à 100m (m/s),Rayonnement solaire global (W/m2),Production horaire de biométhane (MWh - 0°C PCS)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,,2022-01-01 01:00:00+00:00,...,4.6,817.0,11247.0,3187.0,1.0,45584.0,51279.0,,,128.201328
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,,2022-01-01 02:00:00+00:00,...,4.6,817.0,11247.0,3390.0,1.0,46671.0,48288.0,,,124.860509
2022-01-01 04:00:00+01:00,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.0,,2022-01-01 03:00:00+00:00,...,4.6,817.0,11247.0,3558.0,1.0,48808.0,46282.0,7.754615,0.0,127.018472


In [33]:
# Final column layout, grouped by theme
col_order = [
    # timestamps
    'Date (UTC)',
    'Date',
    # original challenge features
    'load_forecast',
    'coal_power_available',
    'gas_power_available',
    'nucelear_power_available',
    'wind_power_forecasts_average',
    'solar_power_forecasts_average',
    'wind_power_forecasts_std',
    'solar_power_forecasts_std',
    'predicted_spot_price',
    # temperatures
    'TMin (°C)',
    'TMax (°C)',
    'TMoy (°C)',
    'Température moyenne (°C)',
    'Température référence (°C)',
    # production
    'prod_eolienne_MWh',
    'prod_solaire_MWh',
    'Production horaire de biométhane (MWh - 0°C PCS)',
    # consumption
    'Consommation brute gaz totale (MW PCS 0°C)',
    'Consommation brute électricité (MW) - RTE',
    'Pic journalier consommation (MW)',
    # commercial flows
    'Solde max (MW)',
    'Solde min (MW)',
    # weather
    'Vitesse du vent à 100m (m/s)',
    'Rayonnement solaire global (W/m2)',
]

# Columns of X_train that the layout above does not list (expected: none)
print(X_train.columns.difference(col_order))

Index([], dtype='object')


In [34]:
# Apply the column layout defined in col_order to both sets, then preview them
X_train = X_train.loc[:, col_order]
X_test = X_test.loc[:, col_order]
display(X_train.head(6), X_test.head(6))

Unnamed: 0_level_0,Date (UTC),Date,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,...,prod_eolienne_MWh,prod_solaire_MWh,Production horaire de biométhane (MWh - 0°C PCS),Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE,Pic journalier consommation (MW),Solde max (MW),Solde min (MW),Vitesse du vent à 100m (m/s),Rayonnement solaire global (W/m2)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 02:00:00+01:00,2022-01-01 01:00:00+00:00,2022-01-01,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.0,...,3187.0,1.0,128.201328,45584.0,51279.0,55828.0,11247.0,817.0,,
2022-01-01 03:00:00+01:00,2022-01-01 02:00:00+00:00,2022-01-01,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.0,...,3390.0,1.0,124.860509,46671.0,48288.0,55828.0,11247.0,817.0,,
2022-01-01 04:00:00+01:00,2022-01-01 03:00:00+00:00,2022-01-01,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.0,...,3558.0,1.0,127.018472,48808.0,46282.0,55828.0,11247.0,817.0,7.754615,0.0
2022-01-01 05:00:00+01:00,2022-01-01 04:00:00+00:00,2022-01-01,44779.0,3386.0,11487.0,44118.0,3447.0,0.0,36.127588,0.0,...,3776.0,2.0,121.807973,53658.0,46059.0,55828.0,11247.0,817.0,,
2022-01-01 06:00:00+01:00,2022-01-01 05:00:00+00:00,2022-01-01,45284.0,3386.0,11487.0,44118.0,3679.0,0.0,30.983023,0.0,...,3888.0,2.0,135.582119,62758.0,46588.0,55828.0,11247.0,817.0,,
2022-01-01 07:00:00+01:00,2022-01-01 06:00:00+00:00,2022-01-01,45648.0,3386.0,11487.0,44118.0,3902.0,0.0,27.624052,0.0,...,4380.0,3.0,133.258418,62893.0,47269.0,55828.0,11247.0,817.0,7.582308,0.0


Unnamed: 0_level_0,Date (UTC),Date,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,...,prod_eolienne_MWh,prod_solaire_MWh,Production horaire de biométhane (MWh - 0°C PCS),Consommation brute gaz totale (MW PCS 0°C),Consommation brute électricité (MW) - RTE,Pic journalier consommation (MW),Solde max (MW),Solde min (MW),Vitesse du vent à 100m (m/s),Rayonnement solaire global (W/m2)
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-04-02 00:00:00+02:00,2023-04-01 22:00:00+00:00,2023-04-02,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.0,...,8397.0,0.0,163.299167,40526.0,48277,53544.0,8571.0,-580.0,,
2023-04-02 01:00:00+02:00,2023-04-01 23:00:00+00:00,2023-04-02,44084.0,3386.0,10902.0,36705.0,6469.0,0.0,54.262133,0.0,...,8789.0,0.0,166.810006,40903.0,46025,53544.0,8571.0,-580.0,,
2023-04-02 02:00:00+02:00,2023-04-02 00:00:00+00:00,2023-04-02,43281.0,3386.0,10902.0,36705.0,6511.0,0.0,78.105928,0.0,...,8919.0,0.0,169.712048,41822.0,45482,53544.0,8571.0,-580.0,7.002308,0.0
2023-04-02 03:00:00+02:00,2023-04-02 01:00:00+00:00,2023-04-02,40825.0,3386.0,10902.0,36705.0,6628.0,0.0,78.187557,0.0,...,8956.0,0.0,168.154928,43582.0,42803,53544.0,8571.0,-580.0,,
2023-04-02 04:00:00+02:00,2023-04-02 02:00:00+00:00,2023-04-02,39181.0,3386.0,10902.0,36705.0,6700.0,0.0,96.765484,0.0,...,8862.0,0.0,166.209406,47576.0,41014,53544.0,8571.0,-580.0,,
2023-04-02 05:00:00+02:00,2023-04-02 03:00:00+00:00,2023-04-02,38928.0,3386.0,10902.0,36705.0,6750.0,0.0,133.267741,0.0,...,8982.0,0.0,155.185281,55588.0,40992,53544.0,8571.0,-580.0,6.315385,0.004615


In [35]:
# Write the enriched feature sets to disk. exist_ok=True creates the folder
# only when needed, without a separate (racy) existence check.
os.makedirs('../data/processed', exist_ok=True)
X_train.to_csv('../data/processed/X_train_full.csv')
X_test.to_csv('../data/processed/X_test_full.csv')