# Imports

In [2]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import numpy as np
from statsmodels.formula.api import ols
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import math

# Création du dataframe ERA5

In [3]:
# Path to the cleaned monthly ERA5 CSV export
ERA5_monthly_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/ERA5/ERA5_monthly/Clean/ERA5_monthly.csv')

In [4]:
# Read the monthly ERA5 export (semicolon-separated CSV)
era5 = pd.read_csv(ERA5_monthly_path, sep=';')

# Preview
era5.head()

Unnamed: 0,project_code,period,wind_speed_100m,direction_100m,temperature_2m,pressure,air_density,relative_humidity,precipitation,snowfall,solar_radiation,energy_100m,energy_100m_density_corr,date
0,AUQB,2000-01,6.2,316.0,7.0,100925.0,1.26,78.0,0.029,0.001,58756.0,86972.0,88915.0,2000-01-01
1,AMEL,2000-01,6.2,218.0,2.0,99538.0,1.26,91.0,0.052,0.003,28493.0,74058.0,75129.0,2000-01-01
2,BOUS,2000-01,5.9,204.0,2.0,99510.0,1.26,90.0,0.047,0.001,26949.0,68181.0,68956.0,2000-01-01
3,BRIY,2000-01,6.6,204.0,4.0,101149.0,1.27,91.0,0.04,0.003,26518.0,89995.0,92293.0,2000-01-01
4,CLIT,2000-01,9.2,208.0,8.0,102479.0,1.27,83.0,0.041,0.0,31961.0,170361.0,175841.0,2000-01-01


In [14]:
# Build the list of distinct project codes present in the ERA5 data
list_projects = pd.DataFrame({'project' : era5.project_code.unique().tolist()})

# Save the list as a semicolon-separated CSV, without the row index
list_projects_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/list_projects.csv')
list_projects.to_csv(list_projects_path,sep=';',index=False)

# Preview: must be the last expression of the cell, otherwise the notebook never renders it
list_projects.head()

# Création du dataframe Windga

In [4]:
# Paths to the Windga train and test CSV files
Windga_train_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Train_Test/windga_train.csv')
Windga_test_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Train_Test/windga_test.csv')

In [5]:
# Read the monthly Windga train and test sets (semicolon-separated CSV)
windga_train = pd.read_csv(Windga_train_path, sep=';')
windga_test = pd.read_csv(Windga_test_path, sep=';')

# Preview
windga_train.head()

Unnamed: 0,Nom centrale,PERIODE,MOIS,ANNEE,Dispo contrat realisee,Dispo technique realisee,Dispo energetique realisee,Production realisee,project_code,date,dispo_consolidated,prod_100p
0,Montlouby,01/12/2013,12,2013,0.979,,,2785295.0,MTL1,2013-12-01,0.979,2845041.0
1,SC - Puech Negre,01/01/2014,1,2014,0.98368,0.80376,0.96124,1812033.0,SAPN,2014-01-01,0.96124,1885099.0
2,Longue Epine,01/09/2019,9,2019,0.9575,0.95446,0.95182,1524427.0,LEPI,2019-09-01,0.95182,1601592.0
3,La Heroudiere,01/06/2021,6,2021,0.95051,0.75378,0.69297,391859.0,HERO,2021-06-01,0.69297,565477.6
4,Vanault,01/03/2018,3,2018,0.93266,,,1390367.0,VANA,2018-03-01,0.93266,1490754.0


In [6]:
# Preview of the Windga test set
windga_test.head()

Unnamed: 0,Nom centrale,PERIODE,MOIS,ANNEE,Dispo contrat realisee,Dispo technique realisee,Dispo energetique realisee,Production realisee,project_code,date,dispo_consolidated,prod_100p
0,Repowering Petit Canal,01/07/2021,7,2021,0.97596,0.7155,0.7104,1574549.0,PCR1,2021-07-01,0.7104,2216426.0
1,4 Bornes - Aumelas,01/12/2016,12,2016,0.92527,0.87059,0.93415,1247950.0,AUQB,2016-12-01,0.93415,1335920.0
2,Lou Paou 12MW,01/06/2018,6,2018,0.89905,0.81636,0.946,1314583.0,LOPV,2018-06-01,0.946,1389623.0
3,Plat des Graniers - Corbieres,01/01/2021,1,2021,0.98818,0.98818,0.99403,3063994.0,PLGR,2021-01-01,0.99403,3082396.0
4,Heninel,01/11/2018,11,2018,0.99915,0.99915,,1387786.0,HENI,2018-11-01,0.99915,1388967.0


# Synthèse des données utiles dans train set et test set

In [7]:
# Columns kept in the simplified train set (before renaming)
train_columns = ['project_code', 'ANNEE', 'MOIS', 'wind_speed_100m', 'prod_100p']

# Merge ERA5 with the Windga train set, then in one chain:
#  - keep only rows that have a production value (prod_100p) after the merge
#  - sort by project and date
#  - drop months with low availability (dispo_consolidated <= 0.6)
#  - keep the useful columns, rename year/month and cast them to integers
#  - index by (project_code, year, month) to make look-ups easier
train_set = (
    pd.merge(era5, windga_train, how='left', on=['project_code', 'date'])
    .loc[lambda df: df['prod_100p'].notnull()]
    .sort_values(by=['project_code', 'date'])
    .loc[lambda df: df['dispo_consolidated'] > 0.6]
    .loc[:, train_columns]
    .rename(columns={'ANNEE': 'year', 'MOIS': 'month'})
    .astype({'year': 'int64', 'month': 'int64'})
    .set_index(['project_code', 'year', 'month'])
)

# Preview
train_set.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed_100m,prod_100p
project_code,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1
LERO,2019,6,5.3,369451.5
BARB,2019,3,6.5,2286851.0
PDFE,2013,11,6.3,1712635.0
POUZ,2016,7,6.2,1140267.0
PEMO,2019,3,6.4,1758954.0


In [8]:
# Columns kept in the simplified test set (before renaming)
test_columns = ['project_code', 'ANNEE', 'MOIS', 'wind_speed_100m', 'prod_100p']

# Merge ERA5 with the Windga test set, then in one chain:
#  - keep only rows that have a production value (prod_100p) after the merge
#  - sort by project and date
#  - keep the useful columns (no availability filter is applied here),
#    rename year/month and cast them to integers
#  - index by (project_code, year, month) to make look-ups easier
test_set = (
    pd.merge(era5, windga_test, how='left', on=['project_code', 'date'])
    .loc[lambda df: df['prod_100p'].notnull()]
    .sort_values(by=['project_code', 'date'])
    .loc[:, test_columns]
    .rename(columns={'ANNEE': 'year', 'MOIS': 'month'})
    .astype({'year': 'int64', 'month': 'int64'})
    .set_index(['project_code', 'year', 'month'])
)

# Preview
test_set.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wind_speed_100m,prod_100p
project_code,year,month,Unnamed: 3_level_1,Unnamed: 4_level_1
NOUR,2018,9,5.6,1048235.0
STSI,2008,11,7.2,1823212.0
"MAG1, MAG3",2015,6,6.2,2373976.0
LUCO,2010,6,6.4,2585366.0
VEUL,2017,10,8.0,2362491.0


# Régression des vitesses de vent

In [9]:
# Error metric: root mean square error (RMSE)
def rmse_calc(predictions, targets):
    """Return the root mean square error between predictions and targets.

    Both arguments must support element-wise subtraction and expose a
    ``.mean()`` method on the result (numpy arrays, numpy scalars,
    pandas Series).
    """
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

In [10]:
# Output PDF that collects one regression figure per project
plots = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_3/plots_m3.pdf')

In [None]:
# One [project, slope, intercept, rmse, rsquare] entry per fitted project
params = []

with PdfPages(plots) as pdf:

    for project in train_set.index.get_level_values('project_code').unique():

        # Monthly observations (wind speed, production) of this project
        X = train_set.loc[project]

        # Skip projects with three or fewer monthly observations
        if len(X) <= 3:
            continue

        # First fit on every point, used to flag influential observations
        m = ols('prod_100p ~ wind_speed_100m', X).fit()
        infl = m.get_influence()
        sm_fr = infl.summary_frame()
        # Points whose Cook's distance reaches 4/n are treated as outliers
        cooks_d_tresh = 4 / len(X)
        mask = sm_fr.cooks_d < cooks_d_tresh
        inliers = X.loc[mask]
        outliers = X.loc[~mask]

        # Second fit, without the outliers
        m_clean = ols('prod_100p ~ wind_speed_100m', inliers).fit()

        # Statistics to report (the RMSE is computed on the inliers only)
        rmse = round(rmse_calc(m_clean.predict(inliers["wind_speed_100m"]), inliers["prod_100p"]), 0)
        intercept = round(m_clean.params.Intercept, 0)
        slope = round(m_clean.params.wind_speed_100m, 2)
        rsquare = round(m_clean.rsquared, 2)
        params.append([project, slope, intercept, rmse, rsquare])

        # Figure: the plot occupies the left half of a 12x5 canvas
        fig = plt.figure(figsize=(12, 5))
        fig.suptitle(project, fontsize=16, fontweight="bold")
        ax = fig.add_subplot(1, 2, 1)
        ax.set_title(f" R²={rsquare}\n rmse = {round(rmse/1000,0)} MWh\n  y = {slope} x + {intercept}", size=12, fontweight="bold")
        # Inliers in the default colour, outliers in red
        sns.scatterplot(x='wind_speed_100m', y='prod_100p', data=inliers, ax=ax)
        sns.scatterplot(x='wind_speed_100m', y='prod_100p', data=outliers, color='r', ax=ax)
        # Faded red line: fit on all points; default-colour line: fit without outliers
        sns.lineplot(x=X.wind_speed_100m, y=m.predict(X.wind_speed_100m), color='r', alpha=0.3, ax=ax)
        sns.lineplot(x=X.wind_speed_100m, y=m_clean.predict(X.wind_speed_100m), ax=ax)
        # Hide ticks and tick labels on both axes
        ax.set_xticks([])
        ax.set_yticks([])

        # Add the page to the PDF, then release the figure immediately so that
        # open figures do not accumulate across projects
        pdf.savefig(fig)
        plt.close(fig)

# Création du dataframe de statistiques de régression

In [12]:
# Name the saved statistics and index the resulting table by project
col = ['project', 'slope', 'intercept', 'rmse', 'rsquare']
params = pd.DataFrame(params, columns=col).set_index('project')
params.head()

Unnamed: 0_level_0,slope,intercept,rmse,rsquare
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALLA,730900.13,-1418682.0,297260.0,0.83
AMEL,689031.52,-2074164.0,148269.0,0.95
AUCO,783734.1,-2297247.0,266217.0,0.86
AUM3,901360.04,-2401743.0,268215.0,0.89
AUQB,602034.67,-1568160.0,219705.0,0.87


In [13]:
# Output CSV for the per-project regression statistics
stat_m3 = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_3/stat_m3.csv')

In [14]:
# Save the regression statistics (semicolon-separated CSV)
params.to_csv(stat_m3, sep=';')

# Création du dataframe d'erreur de prédiction

In [15]:
# Empty table that the following cells fill with one prediction-error row per project, year and month
rmse_projects = pd.DataFrame(columns = ['project', 'year', 'month','train','rmse_windspeed'])

In [16]:
# Prediction error of the per-project regression on the test set.
# Rows are collected in a list and added to rmse_projects in one go:
# growing a DataFrame row by row is quadratic, and DataFrame.append
# no longer exists in pandas >= 2.0.
test_records = []

for project, project_rows in test_set.groupby(level='project_code', sort=False):

    # Projects without a fitted regression have no entry in params: skip them
    if project not in params.index:
        continue
    s, i = params.loc[project, ['slope', 'intercept']]

    for (year, month), rows in project_rows.groupby(level=['year', 'month'], sort=False):
        # Production predicted from the monthly wind speed
        y = s * rows.wind_speed_100m + i
        # Error between predicted and actual production for that month
        rmse = rmse_calc(y, rows.prod_100p)
        test_records.append({'project': project, 'year': year, 'month': month, 'train': 0, 'rmse_windspeed': rmse})

test_rows = pd.DataFrame(test_records, columns=['project', 'year', 'month', 'train', 'rmse_windspeed'])
# Skip empty operands so that concat never has to reconcile dtypes with an empty frame
if rmse_projects.empty:
    rmse_projects = test_rows
elif not test_rows.empty:
    rmse_projects = pd.concat([rmse_projects, test_rows], ignore_index=True)

In [17]:
# Preview of the prediction error for a few random rows.
# NOTE(review): the original comment gave the unit as MWh, but the plotting cell
# divides the RMSE by 1000 before labelling it MWh — confirm the unit of rmse_windspeed.
rmse_projects.sample(5)

Unnamed: 0,project,year,month,train,rmse_windspeed
3260,RAM1,2021,10,0,188755.920663
4248,VANA,2018,2,0,36291.570119
1680,LANE,2012,6,0,24321.275472
3021,PLEU,2014,2,0,371796.875124
3212,QVA3,2021,10,0,373183.715565


In [None]:
# Prediction error of the per-project regression on the train set.
# Rows are collected in a list and added to rmse_projects in one go:
# growing a DataFrame row by row is quadratic, and DataFrame.append
# no longer exists in pandas >= 2.0.
train_records = []

for project, project_rows in train_set.groupby(level='project_code', sort=False):

    # Projects without a fitted regression have no entry in params: skip them
    if project not in params.index:
        continue
    s, i = params.loc[project, ['slope', 'intercept']]

    for (year, month), rows in project_rows.groupby(level=['year', 'month'], sort=False):
        # Production predicted from the monthly wind speed of the train set
        y = s * rows.wind_speed_100m + i
        # Error between predicted and actual production for that month
        rmse = rmse_calc(y, rows.prod_100p)
        train_records.append({'project': project, 'year': year, 'month': month, 'train': 1, 'rmse_windspeed': rmse})

train_rows = pd.DataFrame(train_records, columns=['project', 'year', 'month', 'train', 'rmse_windspeed'])
# Skip empty operands so that concat never has to reconcile dtypes with an empty frame
if rmse_projects.empty:
    rmse_projects = train_rows
elif not train_rows.empty:
    rmse_projects = pd.concat([rmse_projects, train_rows], ignore_index=True)

In [None]:
# Index the error table by project and period (year, month)
rmse_projects = rmse_projects.set_index(['project', 'year', 'month'])

In [None]:
# Preview of the prediction error for a few random rows.
# NOTE(review): the original comment gave the unit as MWh, but the plotting cell
# divides the RMSE by 1000 before labelling it MWh — confirm the unit of rmse_windspeed.
rmse_projects.sample(5)

In [None]:
# Output CSV for the per-project, per-month prediction errors
rmse_m3 = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Models/Model_3/rmse_m3.csv')

In [None]:
# Save the prediction errors (semicolon-separated CSV)
rmse_projects.to_csv(rmse_m3, sep=';')