# Imports

In [19]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import numpy as np
from statsmodels.formula.api import ols
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import math
import random
from sklearn.model_selection import train_test_split
import logging

# Choix du projet

In [2]:
#Choisir le projet
project = 'ESPS'

# Récupération des données ERA5

In [3]:
S3_bucket = 'edfred-edfre-sbx-eu-west-1-solar-radiation-data'
S3_era5_folder = r'EtudeWindIndex/ERA5'

In [4]:
#Lecture de l'export horraire de ERA5
file_era5 = 'ERA5_' + project
S3_era5 = f's3://{S3_bucket}/{S3_era5_folder}/{file_era5}.csv'
era5 = pd.read_csv(S3_era5, sep=';')

In [5]:
#Visualisation
era5.sample(5)

Unnamed: 0,time,d2m,t2m,sp,ws100,wd100,ws10,wd10,E100,rh,density,E100_cor
190946,2021-10-13 02:00:00,279.97736,280.0895,96876.23,4.184271,358.0,1.901751,355.0,97.822592,99.43924,1.204899,96.217439
87656,2009-12-31 08:00:00,279.7407,281.10236,94336.74,7.01195,198.0,3.402876,186.0,584.644657,93.19168,1.169087,557.959441
104649,2011-12-09 09:00:00,279.4549,281.5478,96198.61,11.569339,221.0,6.890117,219.0,2372.166338,89.53552,1.190274,2304.921112
189967,2021-09-02 07:00:00,283.28552,286.05396,97208.94,3.729085,104.0,1.879926,103.0,56.139541,86.15784,1.183828,54.252699
34302,2003-11-30 06:00:00,273.99814,277.55856,95833.0,6.24904,176.0,3.244287,155.0,409.781084,82.197876,1.202793,402.352419


In [6]:
#On converti le timestamp
era5['year'] = era5.time.map(lambda date: int(date[:4]))
era5['month'] = era5.time.map(lambda date: int(date[5:7]))
era5['day'] = era5.time.map(lambda date: int(date[8:10]))
era5['hour'] = era5.time.map(lambda date: int(date[11:13]))

In [7]:
#On garde les informations utiles
era5.drop(['time','d2m','t2m','sp','wd100','ws10','wd10','rh','density','E100_cor'], axis=1, inplace=True)
era5.rename(columns={"E100": "energy", "ws100":"windspeed_era5"}, inplace=True)
era5 = era5[['year','month','day','hour','energy','windspeed_era5']]

In [8]:
#Visualisation
era5.sample(5)

Unnamed: 0,year,month,day,hour,energy,windspeed_era5
114894,2013,2,8,6,0.0,2.686011
120412,2013,9,26,4,114.13865,4.328661
63728,2007,4,9,8,0.0,2.16012
15961,2001,10,27,1,522.561146,6.743689
114996,2013,2,12,12,16.302567,3.211722


# Récupération des donées 10 min

In [9]:
#Lecture de l'export horraire turbine
S3_bucket = 'edfred-edfre-sbx-eu-west-1-solar-radiation-data'
S3_data_folder = r'EtudeWindIndex/Real'
file_data = '10min_' + project
S3_data = f's3://{S3_bucket}/{S3_data_folder}/{file_data}.csv'
data = pd.read_csv(S3_data, sep=';')

In [10]:
#Visualisation
data.sample(5)

Unnamed: 0,project,turbine,year,month,day,hour,minute,active_power_avg,wind_speed_avg
1983960,ESPS,3,2020,9,15,12,0,205.937,4.62285
2722783,ESPS,4,2020,8,6,15,20,0.0,1.37104
1022364,ESPS,1,2020,4,20,22,40,2713.85,10.5315
1308905,ESPS,1,2021,5,23,22,10,2898.61,10.8238
4115100,ESPS,4,2021,9,23,10,0,40.5665,3.45231


In [11]:
data[(data.year==2020)&(data.month==11)&(data.day==27)&(data.hour==0)].wind_speed_avg.mean()

4.797932000000003

# Synthèse des données utiles

In [13]:
#On fusionne les données météo et de production
df = pd.merge(era5, data, how='left', on=['year','month','day','hour'])

In [14]:
filter_data = True

In [15]:
#Filtrage des données manquantes ou éronnées
if filter_data :
    df= df[~((df.wind_speed_avg.isnull()) | (df.wind_speed_avg == 0))]

#Visualisation
df.sample(5)

Unnamed: 0,year,month,day,hour,energy,windspeed_era5,project,turbine,minute,active_power_avg,wind_speed_avg
1787079,2021,1,5,6,0.0,1.922915,ESPS,1.0,10.0,637.626,6.17148
1393618,2020,11,5,13,197.895741,5.04844,ESPS,4.0,50.0,2604.37,10.2875
3359220,2021,7,10,6,0.0,1.601989,ESPS,2.0,0.0,0.0,2.47481
3779142,2021,8,17,1,0.0,2.778846,ESPS,3.0,20.0,84.122,3.42813
5007670,2021,11,26,7,0.0,1.730498,ESPS,4.0,20.0,1111.59,7.77985


# Construction du dataframe de régression horraire

In [None]:
#Calcul des valeurs horaires
logging.basicConfig(format=' %(asctime)s -  %(levelname)s -  %(message)s', handlers = [logging.StreamHandler()])
logging.getLogger().setLevel(logging.INFO)

hourly = pd.DataFrame(columns=['year','month','day','hour','windspeed_data','windspeed_era5'])

for year in df.year.unique().tolist() :
    logging.info("year : {}".format(year))
    for month in df[df.year==year].month.unique().tolist() :
        logging.info("month : {}".format(month))
        for day in df[(df.year==year)&(df.month==month)].day.unique().tolist() :
            for hour in df[(df.year==year)&(df.month==month)&(df.day==day)].hour.unique().tolist() :
                
                mask_data = (df.year==year) & (df.month==month) & (df.day==day) & (df.hour==hour)
                windspeed_data = df.loc[mask_data].wind_speed_avg.mean()
                
                mask_era5 = (era5.year==year) & (era5.month==month) & (era5.day==day) & (era5.hour==hour)
                try :
                    windspeed_era5 = float(era5.loc[mask_era5].windspeed_era5)
                except :
                    windspeed_era5 = 0
                
                hourly = hourly.append({'year':year,'month':month,'day':day,'hour':hour,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

INFO:root:year : 2020
INFO:root:month : 1
INFO:root:month : 2
INFO:root:month : 3
INFO:root:month : 4
INFO:root:month : 5
INFO:root:month : 6
INFO:root:month : 7
INFO:root:month : 8
INFO:root:month : 9


In [26]:
hourly.year = hourly.year.apply(int)
hourly.month = hourly.month.apply(int)
hourly.day = hourly.day.apply(int)
hourly.hour = hourly.hour.apply(int)
hourly.sample(5)

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
10846,2021,3,28,18,5.763467,"186186 3.934382 Name: windspeed_era5, dtype..."
6033,2020,9,8,9,1.149172,"181353 1.401612 Name: windspeed_era5, dtype..."
15003,2021,9,18,7,3.229638,"190351 1.965344 Name: windspeed_era5, dtype..."
11598,2021,4,29,10,4.302785,"186946 6.800597 Name: windspeed_era5, dtype..."
11799,2021,5,7,19,2.704615,"187147 2.150003 Name: windspeed_era5, dtype..."


In [30]:
hourly.windspeed_era5

0        175320    3.456653
Name: windspeed_era5, dtype...
1        175321    3.275481
Name: windspeed_era5, dtype...
2        175322    2.995463
Name: windspeed_era5, dtype...
3        175323    2.939084
Name: windspeed_era5, dtype...
4        175324    3.038922
Name: windspeed_era5, dtype...
                               ...                        
16767    192115    9.858381
Name: windspeed_era5, dtype...
16768    192116    9.686021
Name: windspeed_era5, dtype...
16769    192117    9.686999
Name: windspeed_era5, dtype...
16770    192118    9.731694
Name: windspeed_era5, dtype...
16771    192119    10.273952
Name: windspeed_era5, dtyp...
Name: windspeed_era5, Length: 16772, dtype: object

In [None]:
#Sauvergarde des données dans SageMaker
hourly.to_csv('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/hourly.csv', sep=';')

# Création des trainset et testset

In [None]:
#pd.options.mode.chained_assignment = None
trainset, testset = train_test_split(hourly, test_size=0.33, random_state=42, stratify=hourly.loc[:,['year','month','day']])

In [None]:
#Visualisation
trainset.sample(5)

In [None]:
#Visualisation
testset.sample(5)

# Régression horraire

In [None]:
#définition du calcul de l'erreur (root minimal square error)
def rmse_calc(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [None]:
plots = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/plots_7.pdf')

In [None]:
params = []

with PdfPages(plots) as pdf:
    #On récupère le vecteur de données
    X = trainset
        
    #Calcul des outliers
    m = ols('windspeed_data ~ windspeed_era5',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('windspeed_data ~ windspeed_era5', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"windspeed_era5"]),X.loc[mask,"windspeed_data"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.windspeed_era5,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[mask,]) 
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.windspeed_era5, y=m.predict(X.windspeed_era5), color='r', alpha=0.3)
    sns.lineplot(x=X.windspeed_era5, y=m_clean.predict(X.windspeed_era5))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()

# Construction du dataframe de régression journalière

In [None]:
#Calcul des valeurs horaires
daily = pd.DataFrame(columns=['year','month','day','windspeed_data','windspeed_era5'])

for year in df.year.unique().tolist() :
    for month in df[df.year==year].month.unique().tolist() :
        for day in df[(df.year==year)&(df.month==month)].day.unique().tolist() :
                
                mask_data = (df.year==year) & (df.month==month) & (df.day==day)
                windspeed_data = df.loc[mask_data].wind_speed_avg.mean()
                
                mask_era5 = (era5.year==year) & (era5.month==month) & (era5.day==day)
                try :
                    windspeed_era5 = era5.loc[mask_era5].windspeed_era5.mean()
                except :
                    windspeed_era5 = 0
                
                daily = daily.append({'year':year,'month':month,'day':day,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [None]:
daily.year = daily.year.apply(int)
daily.month = daily.month.apply(int)
daily.day = daily.day.apply(int)
daily.sample(5)

# Création des trainset et testset

In [None]:
#pd.options.mode.chained_assignment = None
trainset, testset = train_test_split(hourly, test_size=0.33, random_state=42, stratify=hourly.loc[:,['year','month','day']])

In [None]:
#Visualisation
trainset.sample(5)

In [None]:
#Visualisation
testset.sample(5)

# Régression journalière

In [None]:
plot_daily = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/plots_day_m7.pdf')

In [None]:
params = []

with PdfPages(plot_daily) as pdf:
    #On récupère le vecteur de données
    X = trainset
        
    #Calcul des outliers
    m = ols('windspeed_data ~ windspeed_era5',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('windspeed_data ~ windspeed_era5', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"windspeed_era5"]),X.loc[mask,"windspeed_data"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.windspeed_era5,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[mask,]) 
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.windspeed_era5, y=m.predict(X.windspeed_era5), color='r', alpha=0.3)
    sns.lineplot(x=X.windspeed_era5, y=m_clean.predict(X.windspeed_era5))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()

# Régression mensuelle

In [None]:
#On crée un trainset journalier
train_monthly = pd.DataFrame(columns=['year','month','windspeed_data','windspeed_era5'])
for year in train_hourly.year.unique().tolist() :
    for month in train_hourly.month.unique().tolist() :
        for day in train_hourly.day.unique().tolist() :
            mask = (train_hourly.year==year) & (train_hourly.month==month)
            windspeed_data = train_hourly.loc[mask].windspeed_data.mean()
            windspeed_era5 = train_hourly.loc[mask].windspeed_era5.mean()
            train_monthly = train_monthly.append({'year':year,'month':month,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [None]:
#On  converti dans les dates en entiers
train_monthly.year = train_monthly.year.apply(int)
train_monthly.month = train_monthly.month.apply(int)

In [None]:
#Visualisation
train_monthly.dropna(inplace=True)
train_monthly.sample(5)

In [None]:
#On crée un testset mensuel
test_monthly = pd.DataFrame(columns=['year','month','windspeed_data','windspeed_era5'])
for year in test_hourly.year.unique().tolist() :
    for month in test_hourly.month.unique().tolist() :
        for day in test_hourly.day.unique().tolist() :
            mask = (test_hourly.year==year) & (test_hourly.month==month) & (test_hourly.day==day)
            windspeed_data = test_hourly.loc[mask].windspeed_data.mean()
            windspeed_era5 = test_hourly.loc[mask].windspeed_era5.mean()
            test_monthly = test_monthly.append({'year':year,'month':month,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [None]:
#On  converti dans les dates en entiers
test_monthly.year = test_monthly.year.apply(int)
test_monthly.month = test_monthly.month.apply(int)

In [None]:
#Visualisation
test_monthly.dropna(inplace=True)
test_monthly.sample(5)

In [None]:
plot_monthly = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/plots_month_m7.pdf')

In [None]:
params = []

with PdfPages(plot_monthly) as pdf:
    #On récupère le vecteur de données
    X = train_monthly
        
    #Calcul des outliers
    m = ols('windspeed_data ~ windspeed_era5',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('windspeed_data ~ windspeed_era5', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"windspeed_era5"]),X.loc[mask,"windspeed_data"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.windspeed_era5,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[mask,]) 
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.windspeed_era5, y=m.predict(X.windspeed_era5), color='r', alpha=0.3)
    sns.lineplot(x=X.windspeed_era5, y=m_clean.predict(X.windspeed_era5))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()