# Imports

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import numpy as np
from statsmodels.formula.api import ols
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import math
import random
from sklearn.model_selection import train_test_split

# Choix du projet

In [2]:
#Choisir le projet
project = 'ESPS'

# Récupération des données ERA5

In [3]:
S3_bucket = 'edfred-edfre-sbx-eu-west-1-solar-radiation-data'
S3_era5_folder = r'EtudeWindIndex/ERA5'

In [4]:
#Lecture de l'export horraire de ERA5
file = 'ERA5_' + project
S3_era5 = f's3://{S3_bucket}/{S3_era5_folder}/{file}.csv'
era5 = pd.read_csv(S3_era5, sep=';')

In [5]:
#Visualisation
era5.sample(5)

Unnamed: 0,time,d2m,t2m,sp,ws100,wd100,ws10,wd10,E100,rh,density,E100_cor
110396,2012-08-04 20:00:00,284.96805,292.47858,96051.016,2.76102,45.0,1.956532,71.0,0.0,62.447357,1.144032,0.0
129351,2014-10-03 15:00:00,284.5456,293.03815,96968.59,3.45685,79.0,2.355711,74.0,35.177429,57.53723,1.152756,33.102843
106224,2012-02-13 00:00:00,257.0446,260.2281,97346.28,0.630605,132.0,0.971444,130.0,0.0,84.08249,1.303153,0.0
179883,2020-07-09 03:00:00,284.53528,287.29935,96634.984,2.887149,200.0,1.539518,159.0,0.0,86.17966,1.171737,0.0
171909,2019-08-11 21:00:00,284.65445,287.28375,96623.08,0.397837,193.0,0.365554,180.0,0.0,86.853485,1.171656,0.0


In [6]:
#On converti le timestamp
era5['year'] = era5.time.map(lambda date: int(date[:4]))
era5['month'] = era5.time.map(lambda date: int(date[5:7]))
era5['day'] = era5.time.map(lambda date: int(date[8:10]))
era5['hour'] = era5.time.map(lambda date: int(date[11:13]))

In [7]:
#On garde les informations utiles
era5.drop(['time','d2m','t2m','sp','wd100','ws10','wd10','rh','density','E100_cor'], axis=1, inplace=True)
era5.rename(columns={"E100": "energy", "ws100":"windspeed_era5"}, inplace=True)
era5 = era5[['year','month','day','hour','energy','windspeed_era5']]

In [8]:
#Visualisation
era5.sample(5)

Unnamed: 0,year,month,day,hour,energy,windspeed_era5
174550,2019,11,29,22,0.0,1.433306
70348,2008,1,10,4,130.029879,4.469291
100021,2011,5,30,13,23.306345,3.30268
7069,2000,10,21,13,0.0,0.772802
17744,2002,1,9,8,0.0,1.812202


# Récupération des donées 10 min

In [9]:
#Lecture de l'export horraire turbine
S3_bucket = 'edfred-edfre-sbx-eu-west-1-solar-radiation-data'
S3_real_folder = r'EtudeWindIndex/Real'
file = '10min_' + project
S3_real = f's3://{S3_bucket}/{S3_real_folder}/{file}.csv'
real_set = pd.read_csv(S3_real, sep=';')

In [10]:
#Visualisation
real_set.sample(5)

Unnamed: 0,project,turbine,year,month,day,hour,minute,active_power_avg,wind_speed_avg
4240401,ESPS,3,2021,7,15,10,40,1520.15,7.87712
2715000,ESPS,5,2020,7,26,20,0,-0.097867,2.69395
4116558,ESPS,3,2021,9,25,10,30,24.3657,3.13375
3553622,ESPS,4,2021,3,5,14,0,1266.83,7.42393
3591663,ESPS,5,2021,4,27,10,0,239.567,4.74059


In [127]:
real_set[(real_set.year==2020)&(real_set.month==11)&(real_set.day==27)&(real_set.hour==0)].wind_speed_avg.mean()

4.797932000000003

In [None]:
#repartir d'ici et recalculer les moyennes horaires de windspeed_data qui sont fausses

# Création des trainset et testset

In [19]:
Dic_turbines = {}
list_num_turbine = sorted(real_set.turbine.unique().tolist())

for num in list_num_turbine :
    
    pd.options.mode.chained_assignment = None
    
    tts = real_set[real_set.turbine==num]
    trainset, testset = train_test_split(tts, test_size=0.33, random_state=42, stratify=tts.loc[:,['year','month','day','hour']])
    trainset.sort_values(by=['year','month','day','hour'], inplace=True)
    testset.sort_values(by=['year','month','day','hour'], inplace=True)
    
    key_train  = 'trainset'+str(num)
    key_test = 'testset'+str(num)
    Dic_turbines[key_train] = trainset
    Dic_turbines[key_test] = testset

In [20]:
#Visualisation
Dic_turbines[random.choice(['trainset','testset'])+str(random.choice(list_num_turbine))].sample(5)

Unnamed: 0,project,turbine,year,month,day,hour,minute,active_power_avg,wind_speed_avg
3686812,ESPS,5,2021,9,6,13,40,412.376,5.55729
2351621,ESPS,5,2020,8,9,3,20,0.0,3.07536
2246085,ESPS,5,2021,9,14,13,30,288.775,5.0926
4864947,ESPS,5,2021,10,27,20,50,387.365,5.17706
3242235,ESPS,5,2021,3,29,2,30,451.339,5.89425


# Synthèse des données utiles

In [104]:
#On choisit une turbine
num_turbine = random.choice(list_num_turbine)

In [105]:
key_train  = 'trainset'+str(num_turbine)
key_test = 'testset'+str(num_turbine)
trainset_turbine = Dic_turbines[key_train]
testset_turbine = Dic_turbines[key_test]

In [106]:
#On fusionne les données météo et de production
train = pd.merge(era5, trainset_turbine, how='left', on=['year','month','day','hour'])

In [107]:
#Filtrage des données manquantes ou éronnées *
train = train[~((train.wind_speed_avg.isnull()) | (train.wind_speed_avg == 0))]
#Visualisation
train.sample(5)

Unnamed: 0,year,month,day,hour,energy,windspeed_era5,project,turbine,minute,active_power_avg,wind_speed_avg
251505,2020,7,23,8,0.0,2.057331,ESPS,2.0,50.0,0.0,1.84915
230430,2020,6,20,12,0.0,2.938889,ESPS,2.0,40.0,72.4639,3.6263
436163,2021,2,24,19,27.18722,3.353081,ESPS,2.0,10.0,991.007,7.45215
377143,2020,12,21,3,161.870479,4.751066,ESPS,2.0,10.0,2153.84,9.00865
307257,2020,10,1,6,222.540493,5.199635,ESPS,2.0,50.0,2701.74,10.1427


In [116]:
#On fusionne les données météo et de production
test = pd.merge(era5, testset_turbine, how='left', on=['year','month','day','hour'])

In [117]:
#Filtrage des données manquantes ou éronnées *
test = test[~((test.wind_speed_avg.isnull()) | (test.wind_speed_avg == 0))]
#Visualisation
test.sample(5)

Unnamed: 0,year,month,day,hour,energy,windspeed_era5,project,turbine,minute,active_power_avg,wind_speed_avg
396248,2021,7,22,15,87.198822,4.090255,ESPS,2.0,30.0,369.643,5.24355
468396,2021,10,24,14,0.0,2.331199,ESPS,2.0,50.0,633.626,6.04942
207665,2020,7,6,6,139.980606,4.55735,ESPS,2.0,10.0,417.872,5.24076
324444,2021,4,1,13,40.214065,3.522261,ESPS,2.0,0.0,683.751,6.32069
289978,2021,1,21,23,2226.808173,11.205033,ESPS,2.0,50.0,3585.52,11.6152


# Construction du dataframe de régression horraire

In [118]:
#Calcul des valeurs horaires *
train_hourly = pd.DataFrame(columns=['year','month','day','hour','windspeed_data','windspeed_era5'])
for year in train.year.unique().tolist() :
    for month in train[train.year==year].month.unique().tolist() :
        for day in train[(train.year==year)&(train.month==month)].day.unique().tolist() :
            for hour in train[(train.year==year)&(train.month==month)&(train.day==day)].hour.unique().tolist() :
                
                mask_data = (train.year==year) & (train.month==month) & (train.day==day) & (train.hour==hour)
                windspeed_data = train.loc[mask_data].wind_speed_avg.unique().mean()
                
                mask_era5 = (era5.year==year) & (era5.month==month) & (era5.day==day) & (era5.hour==hour)
                try :
                    windspeed_era5 = float(era5.loc[mask_era5].windspeed_era5)
                except :
                    windspeed_era5 = 0
                
                train_hourly = train_hourly.append({'year':year,'month':month,'day':day,'hour':hour,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [119]:
#Filtrage des données manquantes ou éronnées *
train_hourly = train_hourly[~((train_hourly.windspeed_data.isnull())|(train_hourly.windspeed_era5 == 0))]
#Visualisation
train_hourly.sample(5)

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
14891,2021.0,9.0,14.0,4.0,5.671295,4.565192
9403,2021.0,1.0,27.0,4.0,7.165912,4.70097
15350,2021.0,10.0,3.0,7.0,11.0193,6.219103
14242,2021.0,8.0,18.0,3.0,5.50715,3.781715
16307,2021.0,11.0,12.0,4.0,2.168762,0.69236


In [120]:
#Filtrage des données manquantes ou éronnées *
train_hourly = train_hourly[~((train_hourly.windspeed_data.isnull())|(train_hourly.windspeed_data == 0))]
train_hourly = train_hourly[~((train_hourly.windspeed_era5.isnull())|(train_hourly.windspeed_era5 == 0))]

#Visualisation
train_hourly.sample(5)

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
12336,2021.0,5.0,30.0,17.0,5.879693,6.362517
11595,2021.0,4.0,29.0,20.0,4.832913,5.164174
13468,2021.0,7.0,16.0,21.0,2.071508,3.952537
11266,2021.0,4.0,16.0,3.0,8.432582,4.508228
8204,2020.0,12.0,7.0,22.0,6.993253,3.004766


In [121]:
#Calcul de l'écart entre les données de vent météo et mesurées
lim=3.5 #écart > lim
sum(abs(train_hourly.windspeed_data - train_hourly.windspeed_era5)>lim)/len(train_hourly)

0.33522286532609347

In [122]:
train_hourly[abs((train_hourly.windspeed_data - train_hourly.windspeed_era5))>3.5].sample(10)

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
15346,2021.0,10.0,3.0,3.0,11.414483,6.877852
13913,2021.0,8.0,4.0,10.0,4.489635,0.365147
11218,2021.0,4.0,14.0,3.0,7.359227,3.112798
10315,2021.0,3.0,7.0,0.0,6.765373,2.503718
12609,2021.0,6.0,11.0,2.0,6.041292,0.753298
1099,2020.0,2.0,15.0,21.0,14.01455,8.711417
8475,2020.0,12.0,19.0,12.0,9.741603,4.441974
2207,2020.0,4.0,2.0,1.0,5.857233,0.165548
1116,2020.0,2.0,16.0,14.0,18.16274,10.669844
900,2020.0,2.0,7.0,14.0,7.24944,1.126753


In [123]:
train_hourly[(train_hourly.year==2020)&(train_hourly.month==11)&(train_hourly.day==27)]

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
7942,2020.0,11.0,27.0,0.0,4.263402,5.01903
7943,2020.0,11.0,27.0,1.0,3.784682,5.198243
7944,2020.0,11.0,27.0,2.0,4.302277,5.043698
7945,2020.0,11.0,27.0,3.0,3.592222,4.559973
7946,2020.0,11.0,27.0,4.0,3.093613,4.064987
7947,2020.0,11.0,27.0,5.0,3.664655,3.888199
7948,2020.0,11.0,27.0,6.0,3.124637,3.910433
7949,2020.0,11.0,27.0,7.0,3.056938,3.909505
7950,2020.0,11.0,27.0,8.0,3.477017,3.851302
7951,2020.0,11.0,27.0,9.0,2.459475,3.63037


In [33]:
#Calcul des valeurs horaires *
test_hourly = pd.DataFrame(columns=['year','month','day','hour','windspeed_data','windspeed_era5'])
for year in test.year.unique().tolist() :
    for month in test.month.unique().tolist() :
        for day in test.day.unique().tolist() :
            for hour in test.hour.unique().tolist() :
                
                mask_data = (test.year==year) & (test.month==month) & (test.day==day) & (test.hour==hour)
                windspeed_data = test.loc[mask_data].wind_speed_avg.mean()
                active_power = test.loc[mask_data].active_power_avg.mean()
                
                mask_era5 = (era5.year==year) & (era5.month==month) & (era5.day==day) & (era5.hour==hour)
                try :
                    energy = era5.loc[mask_era5].energy.iloc[0]
                    windspeed_era5 = float(era5.loc[mask_era5].windspeed_era5)
                except :
                    energy = 0
                    windspeed_era5 = 0
                
                test_hourly = test_hourly.append({'year':year,'month':month,'day':day,'hour':hour,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [34]:
#Filtrage des données manquantes ou éronnées *
test_hourly = test_hourly[~((test_hourly.windspeed_data.isnull())|(test_hourly.windspeed_era5 == 0))]
#Visualisation
test_hourly.sample(5)

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
7722,2021.0,11.0,12.0,18.0,6.585487,2.565752
787,2021.0,2.0,2.0,19.0,10.348876,9.494203
1833,2021.0,3.0,15.0,9.0,9.648277,7.046842
5252,2021.0,8.0,2.0,20.0,2.098112,1.332835
4278,2021.0,6.0,24.0,6.0,5.964879,1.360114


In [35]:
#Filtrage des données manquantes ou éronnées *
test_hourly = test_hourly[~((test_hourly.windspeed_data.isnull())|(test_hourly.windspeed_data == 0))]
test_hourly = test_hourly[~((test_hourly.windspeed_era5.isnull())|(test_hourly.windspeed_era5 == 0))]

#Visualisation
train_hourly.sample(5)

Unnamed: 0,year,month,day,hour,windspeed_data,windspeed_era5
671,2021.0,1.0,28.0,23.0,9.751768,8.144357
1111,2021.0,2.0,16.0,12.0,6.475092,4.495192
5267,2021.0,8.0,9.0,22.0,5.856495,5.33527
3276,2021.0,5.0,18.0,22.0,5.860248,3.552444
1691,2021.0,3.0,13.0,13.0,15.584833,14.349929


In [66]:
#Calcul de l'écart entre les données de vent météo et mesurées
lim=3.5 #écart > lim
sum(abs(test_hourly.windspeed_data - test_hourly.windspeed_era5)>lim)/len(test_hourly)

0.3157167963846347

# Régression

In [37]:
#définition du calcul de l'erreur (root minimal square error)
def rmse_calc(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [40]:
plots = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/plots_7.pdf')

In [41]:
params = []

with PdfPages(plots) as pdf:
    #On récupère le vecteur de données
    X = train_hourly
        
    #Calcul des outliers
    m = ols('windspeed_data ~ windspeed_era5',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('windspeed_data ~ windspeed_era5', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"windspeed_era5"]),X.loc[mask,"windspeed_data"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.windspeed_era5,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[mask,]) 
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.windspeed_era5, y=m.predict(X.windspeed_era5), color='r', alpha=0.3)
    sns.lineplot(x=X.windspeed_era5, y=m_clean.predict(X.windspeed_era5))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()

# Regression journalière

In [42]:
#On crée un trainset journalier
train_daily = pd.DataFrame(columns=['year','month','day', 'windspeed_data','windspeed_era5'])
for year in train_hourly.year.unique().tolist() :
    for month in train_hourly.month.unique().tolist() :
        for day in train_hourly.day.unique().tolist() :
            mask = (train_hourly.year==year) & (train_hourly.month==month) & (train_hourly.day==day)
            windspeed_data = train_hourly.loc[mask].windspeed_data.mean()
            windspeed_era5 = train_hourly.loc[mask].windspeed_era5.mean()
            train_daily = train_daily.append({'year':year,'month':month,'day':day,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [43]:
#On  converti dans les dates en entiers
train_daily.year = train_daily.year.apply(int)
train_daily.month = train_daily.month.apply(int)
train_daily.day = train_daily.day.apply(int)

In [44]:
#Visualisation
train_daily.dropna(inplace=True)
train_daily.sample(5)

Unnamed: 0,year,month,day,windspeed_data,windspeed_era5
17,2021,1,18,6.405827,5.128481
231,2021,8,15,6.30675,2.568916
174,2021,6,20,4.803425,3.681158
105,2021,4,13,6.787502,3.867172
128,2021,5,5,8.14062,7.049925


In [45]:
test = test.reset_index()

In [49]:
#On crée un trainset journalier
test_daily = pd.DataFrame(columns=['year','month','day','windspeed_data','windspeed_era5'])
for year in test_hourly.year.unique().tolist() :
    for month in test_hourly.month.unique().tolist() :
        for day in test_hourly.day.unique().tolist() :
            mask = (test_hourly.year==year) & (test_hourly.month==month) & (test_hourly.day==day)
            windspeed_data = test_hourly.loc[mask].windspeed_data.mean()
            windspeed_era5 = test_hourly.loc[mask].windspeed_era5.mean()
            test_daily = test_daily.append({'year':year,'month':month,'day':day,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [50]:
#On  converti dans les dates en entiers
test_daily.year = test_daily.year.apply(int)
test_daily.month = test_daily.month.apply(int)
test_daily.day = test_daily.day.apply(int)

In [51]:
#Visualisation
test_daily.dropna(inplace=True)
test_daily.sample(5)

Unnamed: 0,year,month,day,windspeed_data,windspeed_era5
21,2021,1,22,5.702821,8.288568
88,2021,3,27,7.551421,6.859766
216,2021,7,31,5.181902,3.182146
71,2021,3,10,8.787576,3.905288
115,2021,4,23,9.152554,3.417931


In [52]:
plot_daily = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/plots_day_m7.pdf')

In [53]:
params = []

with PdfPages(plot_daily) as pdf:
    #On récupère le vecteur de données
    X = train_daily
        
    #Calcul des outliers
    m = ols('windspeed_data ~ windspeed_era5',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('windspeed_data ~ windspeed_era5', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"windspeed_era5"]),X.loc[mask,"windspeed_data"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.windspeed_era5,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[mask,]) 
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.windspeed_era5, y=m.predict(X.windspeed_era5), color='r', alpha=0.3)
    sns.lineplot(x=X.windspeed_era5, y=m_clean.predict(X.windspeed_era5))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()

# Régression mensuelle

In [54]:
#On crée un trainset journalier
train_monthly = pd.DataFrame(columns=['year','month','windspeed_data','windspeed_era5'])
for year in train_hourly.year.unique().tolist() :
    for month in train_hourly.month.unique().tolist() :
        for day in train_hourly.day.unique().tolist() :
            mask = (train_hourly.year==year) & (train_hourly.month==month)
            windspeed_data = train_hourly.loc[mask].windspeed_data.mean()
            windspeed_era5 = train_hourly.loc[mask].windspeed_era5.mean()
            train_monthly = train_monthly.append({'year':year,'month':month,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [55]:
#On  converti dans les dates en entiers
train_monthly.year = train_monthly.year.apply(int)
train_monthly.month = train_monthly.month.apply(int)

In [56]:
#Visualisation
train_monthly.dropna(inplace=True)
train_monthly.sample(5)

Unnamed: 0,year,month,windspeed_data,windspeed_era5
30,2021,1,6.847007,5.173482
193,2021,7,6.241546,3.475684
116,2021,4,6.406096,4.020484
329,2021,11,5.804395,3.837535
194,2021,7,6.241546,3.475684


In [57]:
#On crée un testset mensuel
test_monthly = pd.DataFrame(columns=['year','month','windspeed_data','windspeed_era5'])
for year in test_hourly.year.unique().tolist() :
    for month in test_hourly.month.unique().tolist() :
        for day in test_hourly.day.unique().tolist() :
            mask = (test_hourly.year==year) & (test_hourly.month==month) & (test_hourly.day==day)
            windspeed_data = test_hourly.loc[mask].windspeed_data.mean()
            windspeed_era5 = test_hourly.loc[mask].windspeed_era5.mean()
            test_monthly = test_monthly.append({'year':year,'month':month,'windspeed_data':windspeed_data,'windspeed_era5':windspeed_era5},ignore_index=True)

In [58]:
#On  converti dans les dates en entiers
test_monthly.year = test_monthly.year.apply(int)
test_monthly.month = test_monthly.month.apply(int)

In [59]:
#Visualisation
test_monthly.dropna(inplace=True)
test_monthly.sample(5)

Unnamed: 0,year,month,windspeed_data,windspeed_era5
301,2021,10,4.214287,3.286425
294,2021,10,4.004319,1.960189
217,2021,8,3.306237,3.209843
133,2021,5,8.139401,3.471477
146,2021,5,8.565748,5.336631


In [60]:
plot_monthly = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_7/plots_month_m7.pdf')

In [63]:
params = []

with PdfPages(plot_monthly) as pdf:
    #On récupère le vecteur de données
    X = train_monthly
        
    #Calcul des outliers
    m = ols('windspeed_data ~ windspeed_era5',X).fit()
    infl = m.get_influence()
    sm_fr = infl.summary_frame()
    cooks_d_tresh = 4 / len(X)
    mask = sm_fr.cooks_d < cooks_d_tresh
    #Régression sans les outliers
    m_clean = ols('windspeed_data ~ windspeed_era5', X.loc[mask,]).fit()

    #On récupère les paramètres à afficher
    rmse = round(rmse_calc(m_clean.predict(X.loc[mask,"windspeed_era5"]),X.loc[mask,"windspeed_data"]),0)
    intercept = round(m_clean.params.Intercept,0)
    slope = round(m_clean.params.windspeed_era5,2)
    rsquare = round(m_clean.rsquared, 2)
    #On sauvegarde les paramètres
    params.append([project, slope, intercept, rmse, rsquare])
        
    #On trace le graphique
    fig = plt.figure(figsize=(12, 5))
    plt.suptitle(project,fontsize=16, fontweight="bold")
    plt.subplot(1, 2, 1)
    plt.title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[mask,]) 
    sns.scatterplot(x='windspeed_era5', y='windspeed_data',data=X.loc[~mask,], color='r')
    sns.lineplot(x=X.windspeed_era5, y=m.predict(X.windspeed_era5), color='r', alpha=0.3)
    sns.lineplot(x=X.windspeed_era5, y=m_clean.predict(X.windspeed_era5))
    plt.xticks(color='w')
    plt.yticks(color='w')
    #Turn off tick labels
    ax = plt.gca()
    ax.axes.xaxis.set_ticks([])
    ax.axes.yaxis.set_ticks([])
    #On sauvegarde le graphique
    pdf.savefig(fig)

    plt.close()