In [1]:
pip install xarray

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pymysql sqlalchemy-redshift pykeepass

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [27]:
pip install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
     |████████████████████████████████| 3.0 MB 28.6 MB/s            
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
import boto3
import xarray as xr #to read netcdf
import pandas as pd
import os
import logging
import numpy as np
from math import pi
import datetime
from dateutil.relativedelta import relativedelta
from pathlib import Path
from pykeepass import PyKeePass
import getpass
from datetime import datetime
import sqlalchemy
import matplotlib.pyplot as plt
import seaborn as sns
import boto3 #Save in S3
from sqlalchemy import create_engine

In [5]:
# Logging configuration for the notebook: records go to a stream handler with a
# "timestamp - level - message" layout, and the root logger shows INFO and above.
_root_logger = logging.getLogger()
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format=' %(asctime)s -  %(levelname)s -  %(message)s',
)
_root_logger.setLevel(logging.INFO)

In [6]:
# S3 locations shared by the cells below.
S3_bucket_name = "edfred-edfre-sbx-eu-west-1-solar-radiation-data"  # bucket holding inputs and outputs
S3_CSV_FOLD_ERA5 = "EtudeWindIndex/ERA5"  # key prefix of the per-project ERA5 CSV exports
S3_CSV_FOLD_DATA = "EtudeWindIndex/Real"  # key prefix of the per-project 10min_<code>.csv files

In [7]:
# Load the project list (';'-separated CSV stored on S3), indexed by project code.
S3_project_url = 's3://{}/ERA5/config/ERA5_project_list.csv'.format(S3_bucket_name)
projects = pd.read_csv(
    S3_project_url,
    sep=';',
    index_col='project_code',
)

 2022-01-31 12:46:38,532 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-31 12:46:38,533 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-31 12:46:38,538 -  INFO -  We detected language [('English', 1.0), ('Indonesian', 1.0), ('Simple English', 1.0)] using ascii
 2022-01-31 12:46:38,539 -  INFO -  ascii is most likely the one. Stopping the process.
 2022-01-31 12:46:38,546 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-31 12:46:38,547 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-31 12:46:38,551 -  INFO -  We detected language [('German', 0.8333), ('Hungarian', 0.8333), ('Slovak', 0.8333), ('English', 0.75), ('Dutch', 0.75), ('Italian', 0.75), ('Swedish', 0.75), ('Norwegian', 0.75), ('Czech', 0.75), ('Indonesian', 0.75), ('Danish', 0.75), ('Polish', 0.6667), ('Finnish', 0.6667), ('Slovene', 0.6667), ('Turkish', 0.5833), ('Vietnamese', 0.5), ('Lithu

In [8]:
# Preview the first five projects of the list.
projects.head(n=5)

Unnamed: 0_level_0,latitude,longitude
project_code,Unnamed: 1_level_1,Unnamed: 2_level_1
AUQB,43.6,3.622
AMEL,48.83,6.46
BOUS,49.19,6.52
BRIY,49.75,3.4
CLIT,49.66,-1.37


In [22]:
# Period covered by the ERA5 extraction, given as 'YYYY-MM' month labels.
# NOTE(review): start_month / end_month are assigned again in the
# "Periode à extraire" cell further down; execution order decides which
# values are active when a cell runs.
start_month = "2000-01"
end_month = "2021-11"

# Project codes processed by the loops of this notebook.
List_projects = ["CDBO", "AMEL", "ESPS"]

In [10]:
# Load the power curve (CSV on S3), indexed by its 'windspeed' column.
S3_pc_url = 's3://{}/ERA5/config/power_curve_V90-3.0MW.csv'.format(S3_bucket_name)
power_curve = pd.read_csv(
    S3_pc_url,
    index_col='windspeed',
)

In [None]:
# For every selected project: read the monthly ERA5 NetCDF files from S3, keep
# the grid point nearest to the project, derive wind / energy indicators and
# export the resulting time series as a ';'-separated CSV (locally and on S3).
months_range = pd.date_range(start=start_month, end=end_month, freq='MS')

# The NetCDF files live in the same bucket as the rest of the notebook data;
# one bucket handle is reused for every download.
bucket_name = S3_bucket_name
era5_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

# NOTE(review): every monthly file is downloaded once per project. Looping over
# months first and extracting all projects from each file would avoid the
# repeated downloads.
for position, code in enumerate(projects.loc[List_projects].index, start=1):

    logging.info("project : {}".format(code))

    # Project coordinates, looked up once per project.
    # Fix: the longitude was previously taken from the latitude column, which
    # selected the wrong grid point.
    project_latitude = projects.loc[code].latitude
    project_longitude = projects.loc[code].longitude

    # The empty template comes first so that the concatenated frame keeps
    # this column order.
    era5_template = pd.DataFrame(columns=['time','u100','v100','u10','v10','d2m','t2m','sf','sp','msdwswrf','tp'])
    monthly_frames = [era5_template.set_index('time')]

    for date in months_range:
        # Build the S3 key of the monthly file (ERA5_france_YYYY-MM.nc).
        input_name_nc = 'ERA5_france_' + str(date)[0:7] + '.nc'
        S3_origin = f'ERA5/netcdf/france/{input_name_nc}'
        body = era5_bucket.Object(S3_origin).get()['Body'].read()

        # Extract the nearest grid point; the dataset is closed as soon as the
        # values have been copied into a DataFrame.
        with xr.open_dataset(body) as netcdf:
            df_ERA5 = netcdf.sel(
                latitude=project_latitude,
                longitude=project_longitude,
                method="nearest",
            ).to_dataframe()

        monthly_frames.append(df_ERA5.drop(columns=['longitude', 'latitude']))

    # Single concatenation instead of growing the frame month after month.
    projects_ERA5 = pd.concat(monthly_frames)

    # Wind speed (vector norm) and direction from the u/v components.
    # Direction is atan2(u, v) in degrees shifted by 180 and rounded to the
    # nearest degree — presumably the "wind coming from" convention; confirm.
    projects_ERA5['ws100'] = (projects_ERA5['u100']**2 + projects_ERA5['v100']**2)**0.5
    projects_ERA5['wd100'] = round(np.arctan2(projects_ERA5['u100'], projects_ERA5['v100'])*180/pi + 180,0)
    projects_ERA5['ws10'] = (projects_ERA5['u10']**2 + projects_ERA5['v10']**2)**0.5
    projects_ERA5['wd10'] = round(np.arctan2(projects_ERA5['u10'], projects_ERA5['v10'])*180/pi + 180,0)

    # Energy obtained by interpolating the power curve at the ws100 wind speed.
    projects_ERA5['E100'] = np.interp(projects_ERA5['ws100'], power_curve.index, power_curve['power'])

    # Relative humidity from the t2m - d2m spread (looks like the usual
    # "5 % per degree" approximation — confirm).
    projects_ERA5['rh'] = 100 - 5 * (projects_ERA5['t2m'] - projects_ERA5['d2m'])

    # Air density as sp / (R * t2m); 287.058 looks like the specific gas
    # constant of dry air and 1.225 like the reference density used to scale
    # the energy — confirm both.
    projects_ERA5['density'] = projects_ERA5['sp'] /  ( 287.058 * projects_ERA5['t2m'])
    projects_ERA5['E100_cor'] = projects_ERA5['E100']*projects_ERA5['density']/1.225

    # Drop the raw columns that are not exported.
    projects_ERA5 = projects_ERA5.drop(columns=['u100', 'v100', 'u10', 'v10', 'sf', 'msdwswrf', 'tp'])

    # Local save.
    ERA5_hourly_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/ERA5/ERA5_hourly/Clean/ERA5_'+code+'.csv')
    projects_ERA5.to_csv(ERA5_hourly_path, index=True, sep=';')

    # Save on S3.
    outfile = 'ERA5_'+code+'.csv'
    projects_ERA5.to_csv(f's3://{S3_bucket_name}/{S3_CSV_FOLD_ERA5}/{outfile}', index=True, sep=';')

    logging.info("Complété à : "+str(position/len(List_projects)*100)+'%')

 2022-01-31 14:59:40,118 -  INFO -  project : CDBO
 2022-01-31 15:02:29,451 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-31 15:02:29,452 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-31 15:02:29,454 -  INFO -  We detected language [('Indonesian', 1.0), ('Simple English', 1.0), ('English', 0.9524)] using ascii
 2022-01-31 15:02:29,457 -  INFO -  ascii is most likely the one. Stopping the process.
 2022-01-31 15:02:29,463 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-31 15:02:29,464 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-31 15:02:29,464 -  INFO -  We detected language [('German', 0.8333), ('Hungarian', 0.8333), ('Slovak', 0.8333), ('English', 0.75), ('Dutch', 0.75), ('Italian', 0.75), ('Swedish', 0.75), ('Norwegian', 0.75), ('Czech', 0.75), ('Indonesian', 0.75), ('Danish', 0.75), ('Polish', 0.6667), ('Finnish', 0.6667), ('Slovene', 0.66

# Accès aux données Keepass

In [11]:
# Open the KeePass database holding the connection credentials.
Configuration_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Windga/WindGa_hourly/Configuration')
KEEPASS_FILE = Configuration_path/'RDL.kdbx'

logging.basicConfig(format=' %(asctime)s -  %(levelname)s -  %(message)s', handlers = [logging.StreamHandler()])
logging.getLogger().setLevel(logging.INFO)

# The master password is typed interactively so it is never stored in the notebook.
print('Provide password to get acces to Keepass file:')
password = getpass.getpass()

try:
    kp = PyKeePass(KEEPASS_FILE, password=password)
    logging.info('Keepass data loaded')
except Exception:
    # Log the failure and stop here: continuing would leave `kp` undefined, or
    # bound to a database opened by a previous run of this cell.
    logging.exception('Cannot open the keepass file:')
    raise

Provide password to get acces to Keepass file:


 ·········


 2022-01-31 12:47:05,280 -  INFO -  Keepass data loaded


# Connexion à la base de données

In [12]:
def connexion_setup(kp_con_name='SBX-RDS'):
    """Create a SQLAlchemy engine from a KeePass entry.

    The entry is looked up by title in the module-level ``kp`` database. Its
    user name and password are inserted right after the ``//`` of the entry
    URL to form the connection string.

    Parameters
    ----------
    kp_con_name : str
        Title of the KeePass entry to use.

    Returns
    -------
    The engine returned by ``sqlalchemy.create_engine``.

    Raises
    ------
    ValueError
        If no entry has this title, or if the entry URL contains no ``//``.
    """
    credential = kp.find_entries(title=kp_con_name, first=True)
    if credential is None:
        raise ValueError(f'No KeePass entry titled {kp_con_name!r}')

    # Split on the first '//' only, so the rest of the URL is kept whole.
    scheme, separator, remainder = credential.url.partition('//')
    if not separator:
        raise ValueError(f"URL of KeePass entry {kp_con_name!r} contains no '//'")

    # NOTE(review): no '@' is inserted between the password and the rest of
    # the URL, so the stored URL presumably carries it itself
    # (e.g. 'dialect://@host:port/db') — confirm.
    # NOTE(review): the password is not URL-escaped; characters such as '@',
    # '/' or ':' in it would corrupt the connection string.
    con = scheme + '//' + \
    credential.username +':'+ \
    credential.password + \
    remainder

    # Never write the clear-text password to the logs.
    logging.debug(f'connexion string: {scheme}//{credential.username}:***{remainder}')

    con_engine = sqlalchemy.create_engine(con)

    return con_engine

In [13]:
# Read the mapping file between IEC 61400 attribute names and business descriptions.
VAR_MAPPING_FILE = Configuration_path/'10min_variables_iec_to_std_names_mapping.csv'
df = pd.read_csv(VAR_MAPPING_FILE)

# Lookup dictionaries in both directions.
iec2std = df.set_index('iec_attribute61400')['business_description'].to_dict()
std2iec = df.set_index('business_description')['iec_attribute61400'].to_dict()

# Keep the rows of the mapping that describe the features of interest.
# `isin` builds the boolean mask directly instead of OR-ing a one-element list
# with a Series.
List_features = ['active_power_avg','wind_speed_avg']
mask_features = df.business_description.isin(List_features)
# NOTE(review): despite its name this frame holds the rows for List_features
# (active power and wind speed).
temperature_features = df.loc[mask_features,:]

# Comma-separated list of the selected IEC attribute names. Joining the values
# keeps them intact, whereas stripping quotes and brackets from the list repr
# altered names that contain such characters.
features_selection_str = ', '.join(map(str, temperature_features.iec_attribute61400))

# Extraction de données

In [14]:
# Period to extract, bounds given as 'YYYY-MM' month labels.
start_month = '2020-01'
end_month = '2021-11'

# One timestamp per month start over the period, plus the matching 'YYYY-MM' labels.
months_range = pd.date_range(start_month, end_month, freq='MS')
months_list = [month_start.strftime('%Y-%m') for month_start in months_range]

# Récupération des données

In [15]:
# S3 key prefix of the ERA5 CSV exports.
S3_era5_folder = "EtudeWindIndex/ERA5"

In [26]:
def _hourly_windspeed_comparison(data, era5):
    """Build the hourly comparison table between turbine and ERA5 wind speeds.

    Parameters
    ----------
    data : pandas.DataFrame
        Turbine records with 'year', 'month', 'day', 'hour' and
        'wind_speed_avg' columns.
    era5 : pandas.DataFrame
        ERA5 records with 'year', 'month', 'day', 'hour' and
        'windspeed_era5' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month, day, hour) present in ``data``, in order of
        first appearance, with columns 'year', 'month', 'day', 'hour',
        'windspeed_data' and 'windspeed_era5'.
        'windspeed_data' is the mean of the distinct 'wind_speed_avg' values of
        the hour. 'windspeed_era5' is the ERA5 value of that hour, or 0 when
        ERA5 does not have exactly one row for it.
    """
    keys = ['year', 'month', 'day', 'hour']

    # Mean of the distinct wind speeds recorded during each hour.
    hourly_data = (
        data.groupby(keys, sort=False)['wind_speed_avg']
        .agg(lambda speeds: speeds.unique().mean())
        .rename('windspeed_data')
        .reset_index()
    )

    # Hours with several ERA5 rows are treated like missing hours, so every
    # row of a duplicated key is discarded before the merge.
    era5_single = era5.drop_duplicates(subset=keys, keep=False)[keys + ['windspeed_era5']]

    merged = hourly_data.merge(era5_single, on=keys, how='left', indicator=True)
    # NOTE(review): 0 is indistinguishable from a real calm hour; NaN would be
    # a safer marker, but 0 is kept so the exported files do not change.
    merged.loc[merged['_merge'] == 'left_only', 'windspeed_era5'] = 0

    return merged[keys + ['windspeed_data', 'windspeed_era5']]


for project in List_projects :

    logging.info("project : {}".format(project))

    logging.info("Lecture des données era5")

    # Read the hourly ERA5 export of the project.
    FILE_ERA5 = 'ERA5_' + project
    S3_era5 = f's3://{S3_bucket_name}/{S3_CSV_FOLD_ERA5}/{FILE_ERA5}.csv'
    era5_raw = pd.read_csv(S3_era5, sep=';')

    # Split the 'YYYY-MM-DD HH:...' timestamp text into integer date parts and
    # keep only the 100 m wind speed.
    era5 = pd.DataFrame({
        'year': era5_raw['time'].str[0:4].astype(int),
        'month': era5_raw['time'].str[5:7].astype(int),
        'day': era5_raw['time'].str[8:10].astype(int),
        'hour': era5_raw['time'].str[11:13].astype(int),
        'windspeed_era5': era5_raw['ws100'],
    })

    logging.info("Lecture des données turbine")

    # Read the turbine export of the project.
    FILE_DATA = '10min_' + project
    S3_real = f's3://{S3_bucket_name}/{S3_CSV_FOLD_DATA}/{FILE_DATA}.csv'
    data = pd.read_csv(S3_real, sep=';')

    logging.info("Construction du fichier de comparaison")

    # Hourly values, computed in one pass instead of rescanning the whole
    # frame for every hour.
    comp_ws = _hourly_windspeed_comparison(data, era5)

    # Save on the notebook instance.
    comp_ws_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/comp_ws_'+project+'.csv')
    comp_ws.to_csv(comp_ws_path, index=False, sep=';')

 2022-01-31 13:34:05,221 -  INFO -  project : CDBO
 2022-01-31 13:34:05,222 -  INFO -  Lecture des données era5
 2022-01-31 13:34:05,236 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-31 13:34:05,242 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-31 13:34:05,244 -  INFO -  We detected language [('English', 0.9545), ('Indonesian', 0.9545), ('Simple English', 0.9545)] using ascii
 2022-01-31 13:34:05,245 -  INFO -  ascii is most likely the one. Stopping the process.
 2022-01-31 13:34:05,254 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-31 13:34:05,255 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-31 13:34:05,256 -  INFO -  We detected language [('German', 0.8333), ('Hungarian', 0.8333), ('Slovak', 0.8333), ('English', 0.75), ('Dutch', 0.75), ('Italian', 0.75), ('Swedish', 0.75), ('Norwegian', 0.75), ('Czech', 0.75), ('Indonesian', 0.75), ('Danish