# Imports

In [59]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import numpy as np
from statsmodels.formula.api import ols
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import math

# Création du dataframe ERA5

In [60]:
S3_bucket = 'edfred-edfre-sbx-eu-west-1-solar-radiation-data'
S3_era5_folder = r'EtudeWindIndex/ERA5'

In [61]:
#Choisir le projet
project = 'AMEL'

In [62]:
#Lecture de l'export horraire de ERA5
file = 'ERA5_' + project
S3_era5 = f's3://{S3_bucket}/{S3_era5_folder}/{file}.csv'
era5 = pd.read_csv(S3_era5, sep=';')

In [63]:
#Visualisation
era5.sample(5)

Unnamed: 0,time,d2m,t2m,sp,ws100,wd100,ws10,wd10,E100,rh,density,E100_cor
103979,2011-11-11 11:00:00,277.89478,281.87912,100099.57,2.64837,99.0,1.971885,98.0,0.0,80.07828,1.237085,0.0
143561,2016-05-17 17:00:00,276.7926,288.91675,99583.37,1.576423,310.0,1.098395,311.0,0.0,39.379272,1.200727,0.0
113777,2012-12-23 17:00:00,283.11725,285.7584,99277.5,10.405106,216.0,5.924946,212.0,1886.221204,86.79428,1.21027,1863.539993
58421,2006-08-31 05:00:00,281.9549,282.4117,100394.8,3.838355,208.0,1.752514,180.0,64.553318,97.71591,1.238394,65.259137
54841,2006-04-04 01:00:00,277.07352,278.77045,99790.86,4.848799,216.0,2.108058,194.0,172.914299,91.51535,1.247023,176.022911


In [64]:
#On converti le timestamp
era5['year'] = era5.time.map(lambda date: int(date[:4]))
era5['month'] = era5.time.map(lambda date: int(date[5:7]))
era5['day'] = era5.time.map(lambda date: int(date[8:10]))
era5['hour'] = era5.time.map(lambda date: int(date[11:13]))

In [65]:
#On garde les informations utiles
era5.drop(['time','d2m','t2m','sp','ws100','wd100','ws10','wd10','rh','density','E100_cor'], axis=1, inplace=True)
era5.rename(columns={"E100": "energy"}, inplace=True)
era5 = era5[['year','month','day','hour','energy']]

In [66]:
#Visualisation
era5.sample(5)

Unnamed: 0,year,month,day,hour,energy
74774,2008,7,12,14,142.945377
116238,2013,4,5,6,591.883235
84328,2009,8,14,16,0.0
49402,2005,8,20,10,12.675013
109843,2012,7,12,19,149.264026


# Création des trainset et testset

In [67]:
trainset_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Train_Test/hourly/trainset.csv')
testset_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Train_Test/hourly/testset.csv')

In [68]:
#Lecture de l'export horraire de ERA5
trainset = pd.read_csv(trainset_path, sep=';')
testset = pd.read_csv(testset_path, sep=';')

In [69]:
#Visualisation
trainset.dropna(inplace=True)
trainset.sample(5)

Unnamed: 0,project,turbine,year,month,day,hour,minute,active_power_avg,wind_speed_avg
268235,AUQB,7,2021,5,24,19,30,834.0,9.1
45550,AUQB,10,2021,10,10,8,50,0.0,3.5
473328,AUQB,10,2021,7,21,2,30,453.0,8.0
493034,AUQB,6,2021,3,5,8,10,0.0,2.4
680795,AUQB,9,2021,11,6,5,50,414.0,7.2


In [70]:
#Visualisation
testset.dropna(inplace=True)
testset.sample(5)

Unnamed: 0,project,turbine,year,month,day,hour,minute,active_power_avg,wind_speed_avg
105695,AUQB,6,2021,6,3,12,20,356.0,7.4
243293,AUQB,9,2021,2,2,4,50,933.0,9.6
370111,AUQB,10,2021,9,15,3,40,355.0,7.0
194324,AUQB,7,2021,9,13,9,0,0.0,6.3
299343,AUQB,9,2021,7,8,9,50,1594.0,12.0


In [13]:
#On garde l'energy du parc entier pour chaque heure
test_set = pd.DataFrame(columns=['year','month','day','hour','active_power', 'windspeed'])
for year in testset.year.unique().tolist() :
    for month in testset.month.unique().tolist() :
        for day in testset.day.unique().tolist() :
            for hour in testset.hour.unique().tolist() :
                mask = (testset.year==year) & (testset.month==month) & (testset.day==day) & (testset.hour==hour)
                active_power = testset.loc[mask].active_power_avg.sum()
                windspeed = testset.loc[mask].wind_speed_avg.mean()
                test_set = test_set.append({'year':year,'month':month,'day':day,'hour':hour,'active_power':active_power,'windspeed':windspeed},ignore_index=True)

In [14]:
#On  converti dans les dates en entiers
test_set.year = test_set.year.apply(int)
test_set.month = test_set.month.apply(int)
test_set.day = test_set.day.apply(int)
test_set.hour = test_set.hour.apply(int)

In [15]:
#Visualisation
test_set.sample(5)

Unnamed: 0,year,month,day,hour,active_power,windspeed
7783,2021,6,10,6,35.490909,3.981818
6483,2021,3,30,2,0.0,3.103846
5234,2021,10,4,7,-0.090909,7.072727
6939,2021,2,5,2,22.55,2.925
4430,2021,7,22,8,372.510204,7.285714


In [16]:
train_set = pd.DataFrame(columns=['year','month','day','hour','active_power', 'windspeed'])
for year in trainset.year.unique().tolist() :
    for month in trainset.month.unique().tolist() :
        for day in trainset.day.unique().tolist() :
            for hour in trainset.hour.unique().tolist() :
                mask = (trainset.year==year) & (trainset.month==month) & (trainset.day==day) & (trainset.hour==hour)
                active_power = trainset.loc[mask].active_power_avg.mean()
                windspeed = trainset.loc[mask].wind_speed_avg.mean()
                train_set = train_set.append({'year':year,'month':month,'day':day,'hour':hour,'active_power':active_power,'windspeed':windspeed},ignore_index=True)

In [17]:
#On  converti dans les dates en entiers
train_set.year = train_set.year.apply(int)
train_set.month = train_set.month.apply(int)
train_set.day = train_set.day.apply(int)
train_set.hour = train_set.hour.apply(int)

In [18]:
#Visualisation
train_set.sample(5)

Unnamed: 0,year,month,day,hour,active_power,windspeed
5149,2021,10,7,9,0.0,6.268041
1638,2021,9,13,20,0.0,5.293855
5117,2021,10,2,11,-0.112299,4.552941
7996,2021,2,24,0,4.534884,3.234884
446,2021,8,17,6,-0.049689,8.887578


# Synthèse des données utiles

In [19]:
#On fusionne les données météo et de production
#On ne garde que les données qui existent dans le trainset
train = pd.merge(era5.loc[era5.year==2021], train_set, how='left', on=['year','month','day','hour'])
#inner sans mask
mask = train['windspeed'].isnull()
train = train.loc[~mask]

#Tri des données par projet et date
train.sort_values(by=['year','month','day','hour'], inplace=True)

#On modifie l'index pour retrouver plus facilement les données
train.set_index(['year','month','day','hour'], inplace=True)

#Visualisation
train.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,energy,active_power,windspeed
year,month,day,hour,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021,8,5,15,347.347137,446.631902,7.738037
2021,11,23,20,278.856439,1688.406417,12.45615
2021,1,29,14,1789.861489,1252.5,10.885714
2021,4,29,13,690.46076,-0.506494,1.416883
2021,7,14,4,227.423025,387.061224,7.209524


In [20]:
#On fusionne les données météo et de production
#On ne garde que les données qui existent dans le trainset
test = pd.merge(era5.loc[era5.year==2021], test_set, how='left', on=['year','month','day','hour'])
#Vérifier le merge
mask = test['windspeed'].isnull()
train = test.loc[~mask]

#Tri des données par projet et date
test.sort_values(by=['year','month','day','hour'], inplace=True)

#On modifie l'index pour retrouver plus facilement les données
test.set_index(['year','month','day','hour'], inplace=True)

#Visualisation
test.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,energy,active_power,windspeed
year,month,day,hour,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021,10,30,18,548.247253,723.989796,8.644898
2021,9,19,0,139.857464,1079.025316,11.046835
2021,6,1,22,313.897986,-0.310345,4.282759
2021,10,17,21,0.0,0.0,1.527083
2021,7,9,1,131.597595,,


# Régression

In [21]:
#définition du calcul de l'erreur (root minimal square error)
def rmse_calc(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [22]:
plots = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_5/plots_m5.pdf')

In [23]:
params = []

with PdfPages(plots) as pdf:
    #On récupère le vecteur de données
    X = train
        
    #Calcul des outliers
    m = ols('active_power ~ energy',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('active_power ~ energy', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"energy"]),X.loc[mask,"active_power"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.energy,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='energy', y='active_power',data=X.loc[mask,]) 
    sns.scatterplot(x='energy', y='active_power',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.energy, y=m.predict(X.energy), color='r', alpha=0.3)
    sns.lineplot(x=X.energy, y=m_clean.predict(X.energy))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()

# Regression journalière

In [24]:
train = train.reset_index()

In [25]:
#On crée un trainset journalier
train_daily = pd.DataFrame(columns=['year','month','day','energy','active_power', 'windspeed'])
for year in train.year.unique().tolist() :
    for month in train.month.unique().tolist() :
        for day in train.day.unique().tolist() :
            mask = (train.year==year) & (train.month==month) & (train.day==day)
            energy = train.loc[mask].energy.mean()
            active_power = train.loc[mask].active_power.mean()
            windspeed = train.loc[mask].windspeed.mean()
            train_daily = train_daily.append({'year':year,'month':month,'day':day,'energy':energy,'active_power':active_power,'windspeed':windspeed},ignore_index=True)

In [26]:
#On  converti dans les dates en entiers
train_daily.year = train_daily.year.apply(int)
train_daily.month = train_daily.month.apply(int)
train_daily.day = train_daily.day.apply(int)

In [27]:
#Visualisation
train_daily.dropna(inplace=True)
train_daily.sample(5)

Unnamed: 0,year,month,day,energy,active_power,windspeed
241,2021,8,25,214.798227,14.846732,3.832172
78,2021,3,17,56.89194,1377.306317,11.625741
102,2021,4,10,611.729269,515.291183,7.666315
168,2021,6,14,74.753294,147.824109,5.168478
325,2021,11,16,0.0,387.64855,6.509393


In [28]:
test = test.reset_index()

In [29]:
#On crée un testset journalier
test_daily = pd.DataFrame(columns=['year','month','day','active_power','energy', 'windspeed'])
for year in test.year.unique().tolist() :
    for month in test.month.unique().tolist() :
        for day in test.day.unique().tolist() :
            mask = (test.year==year) & (test.month==month) & (test.day==day)
            energy = test.loc[mask].energy.mean()
            active_power = test.loc[mask].active_power.sum()
            windspeed = test.loc[mask].windspeed.mean()
            test_daily = test_daily.append({'year':year,'month':month,'day':day,'energy':energy,'active_power':active_power,'windspeed':windspeed},ignore_index=True)

In [30]:
#On  converti dans les dates en entiers
test_daily.year = test_daily.year.apply(int)
test_daily.month = test_daily.month.apply(int)
test_daily.day = test_daily.day.apply(int)

In [31]:
#Visualisation
test_daily.dropna(inplace=True)
test_daily.sample(5)

Unnamed: 0,year,month,day,active_power,energy,windspeed
216,2021,7,31,756.169211,156.438218,8.986736
157,2021,6,3,181.803475,42.802387,5.873759
52,2021,2,22,1589.432836,25.415649,12.420455
56,2021,2,26,155.303435,208.98786,4.149718
258,2021,9,11,85.297997,69.089061,4.980846


In [32]:
plot_daily = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_5/plots_day_m5.pdf')

In [33]:
params = []

with PdfPages(plot_daily) as pdf:
    #On récupère le vecteur de données
    X = train_daily
        
    #Calcul des outliers
    m = ols('active_power ~ energy',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('active_power ~ energy', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"energy"]),X.loc[mask,"active_power"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.energy,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='energy', y='active_power',data=X.loc[mask,]) 
    sns.scatterplot(x='energy', y='active_power',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.energy, y=m.predict(X.energy), color='r', alpha=0.3)
    sns.lineplot(x=X.energy, y=m_clean.predict(X.energy))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()

# Régression mensuelle

In [34]:
#On crée un trainset journalier
train_monthly = pd.DataFrame(columns=['year','month','energy','active_power', 'windspeed'])
for year in train.year.unique().tolist() :
    for month in train.month.unique().tolist() :
        for day in train.day.unique().tolist() :
            mask = (train.year==year) & (train.month==month)
            energy = train.loc[mask].energy.mean()
            active_power = train.loc[mask].active_power.mean()
            windspeed = train.loc[mask].windspeed.mean()
            train_monthly = train_monthly.append({'year':year,'month':month,'energy':energy,'active_power':active_power,'windspeed':windspeed},ignore_index=True)

In [35]:
#On  converti dans les dates en entiers
train_monthly.year = train_monthly.year.apply(int)
train_monthly.month = train_monthly.month.apply(int)

In [36]:
#Visualisation
train_monthly.dropna(inplace=True)
train_monthly.sample(5)

Unnamed: 0,year,month,energy,active_power,windspeed
336,2021,11,288.520321,497.043677,7.556042
316,2021,11,288.520321,497.043677,7.556042
42,2021,2,388.097487,422.254865,6.864625
22,2021,1,563.1897,574.01759,8.223691
231,2021,8,168.480772,177.283345,6.115512


In [37]:
#On crée un testset journalier
test_monthly = pd.DataFrame(columns=['year','month','active_power','energy', 'windspeed'])
for year in test.year.unique().tolist() :
    for month in test.month.unique().tolist() :
        for day in test.day.unique().tolist() :
            mask = (test.year==year) & (test.month==month) & (test.day==day)
            energy = test.loc[mask].energy.mean()
            active_power = test.loc[mask].active_power.mean()
            windspeed = test.loc[mask].windspeed.mean()
            test_monthly = test_monthly.append({'year':year,'month':month,'energy':energy,'active_power':active_power,'windspeed':windspeed},ignore_index=True)

In [38]:
#On  converti dans les dates en entiers
test_monthly.year = test_monthly.year.apply(int)
test_monthly.month = test_monthly.month.apply(int)

In [39]:
#Visualisation
test_monthly.dropna(inplace=True)
test_monthly.sample(5)

Unnamed: 0,year,month,active_power,energy,windspeed
170,2021,6,110.723767,13.372587,5.404887
84,2021,3,63.834899,0.023322,4.162586
164,2021,6,54.800724,14.81807,4.154521
53,2021,2,456.391558,148.567821,7.008538
277,2021,9,52.446519,51.671301,6.755584


In [40]:
plot_monthly = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_5/plots_month_m5.pdf')

In [41]:
params = []

with PdfPages(plot_monthly) as pdf:
    #On récupère le vecteur de données
    X = train_monthly
        
    #Calcul des outliers
    m = ols('active_power ~ energy',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('active_power ~ energy', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"energy"]),X.loc[mask,"active_power"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.energy,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='energy', y='active_power',data=X.loc[mask,]) 
    sns.scatterplot(x='energy', y='active_power',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.energy, y=m.predict(X.energy), color='r', alpha=0.3)
    sns.lineplot(x=X.energy, y=m_clean.predict(X.energy))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()