# ERA5
Extraction of ERA5 NetCDF files

## 0 Set-up

In [1]:
# installation

In [2]:
pip install pykeepass cdsapi

Note: you may need to restart the kernel to use updated packages.


In [3]:
# lib
import pandas as pd
import numpy as np
import logging
import os
# from datetime import datetime
from datetime import date
from dateutil.relativedelta import relativedelta

import getpass                   # to get password input directly from user
from pykeepass import PyKeePass  # credential
import boto3                     # save in S3
import cdsapi                    # Era5 Api
import urllib3                   # to disable some warnings: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
import certifi
from time import time

In [4]:
# logger set-up
# Records are written through a StreamHandler as "<time> - <level> - <message>",
# and the root logger lets everything from INFO upwards through.
LOG_FORMAT = ' %(asctime)s -  %(levelname)s -  %(message)s'
logging.basicConfig(handlers=[logging.StreamHandler()], format=LOG_FORMAT)
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)


# Credential
# Path of the KeePass database opened below with the password typed by the user.
KEEPASS_FILE = '/home/ec2-user/SageMaker/config/RDL.kdbx'
print('Provide password to get access to Keepass file:')
password = getpass.getpass()  # read without echoing the typed characters

# Open the database. A failure is logged with its traceback and then re-raised:
# the cells that follow use `kp`, so carrying on without it would only fail
# later with a misleading NameError. Catching Exception (rather than a bare
# `except:`) also lets KeyboardInterrupt stop the cell as expected.
try:
    kp = PyKeePass(KEEPASS_FILE, password=password)
except Exception:
    logging.exception('Cannot open the keepass file:')
    raise
else:
    logging.info('Keepass data loaded')

Provide password to get access to Keepass file:


 ·········


 2022-01-10 08:32:08,480 -  INFO -  Keepass data loaded


In [5]:
# ERA5 API connection

# Attempt to silence urllib3's SSL warnings.
# NOTE(review): this PoolManager is not handed to the cdsapi client created below,
# so it takes no part in the client's requests; it is kept unchanged here.
http = urllib3.PoolManager(
     cert_reqs='CERT_REQUIRED',
     ca_certs=certifi.where())
urllib3.disable_warnings()

# Look up the CDS credentials in the KeePass database and create a Client instance.
credential = kp.find_entries(title='Era5', first=True)
if credential is None:
    # With first=True the lookup yields None when nothing matches; stop here with a
    # clear message instead of an AttributeError on `credential.password`.
    raise LookupError("No KeePass entry titled 'Era5' was found")
c = cdsapi.Client(key=credential.password,
                  url=credential.url)

## 1 Data extraction

In [6]:
# read boxes to extract
# One row per area (indexed by area_name); the north, west, south and east columns
# hold the bounds of the box to request for that area.
boxes_csv_uri = 's3://edfred-edfre-sbx-eu-west-1-solar-radiation-data/ERA5/config/area_boxes_boundaries.csv'
boxes_df = pd.read_csv(boxes_csv_uri, index_col='area_name')
boxes_df.head()

 2022-01-10 08:32:08,930 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-10 08:32:08,932 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-10 08:32:08,941 -  INFO -  We detected language [('English', 1.0), ('Indonesian', 1.0), ('Simple English', 1.0)] using ascii
 2022-01-10 08:32:08,942 -  INFO -  ascii is most likely the one. Stopping the process.
 2022-01-10 08:32:08,961 -  INFO -  ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
 2022-01-10 08:32:08,962 -  INFO -  ascii should target any language(s) of ['Latin Based']
 2022-01-10 08:32:08,965 -  INFO -  We detected language [('German', 0.8333), ('Hungarian', 0.8333), ('Slovak', 0.8333), ('English', 0.75), ('Dutch', 0.75), ('Italian', 0.75), ('Swedish', 0.75), ('Norwegian', 0.75), ('Czech', 0.75), ('Indonesian', 0.75), ('Danish', 0.75), ('Polish', 0.6667), ('Finnish', 0.6667), ('Slovene', 0.6667), ('Turkish', 0.5833), ('Vietnamese', 0.5), ('Lithu

Unnamed: 0_level_0,north,west,south,east
area_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
france,51.0,-4.5,42.5,8.0
corsica,43.25,8.5,41.25,9.75
guadeloupe,16.75,-62.0,15.75,-60.75
reunion,-20.75,55.0,-21.5,56.0


In [7]:
# example: the box of a single area as a plain list, in the column order of boxes_df
boxes_df.loc['france'].tolist()

[51.0, -4.5, 42.5, 8.0]

In [None]:
# Monthly loops over the 3 last months:

# param
# Every area of the boxes table is processed
# (a single one could be selected with boxes_df.index[0] instead).
areas = boxes_df.index.tolist()

# old manual set-up
# start_date_s = '2021-03'
# end_date_s = '2021-04'  # not included

# new automatic set-up
# The window covers the three complete calendar months before the current one.
start_date_s = date.today() + relativedelta(months=-3)
end_date_s = date.today()   # not included: the current (incomplete) month is left out

# ERA5 single-level variables requested for every area and month.
variables = ['100m_u_component_of_wind', '100m_v_component_of_wind', '10m_u_component_of_wind',
            '10m_v_component_of_wind', '2m_dewpoint_temperature', '2m_temperature', 'snowfall', 
             'surface_pressure', 'mean_surface_downward_short_wave_radiation_flux', 'total_precipitation',]
# S3 bucket that receives the downloaded files.
bucket_name = 'edfred-edfre-sbx-eu-west-1-solar-radiation-data'

# One timestamp (first day of the month) per month to extract, from the month of
# start_date_s up to, but excluding, the month of end_date_s.
# Both bounds are snapped to the first day of their month before the range is built:
# with a month-end frequency, running on the last day of a month would also select
# the current, still incomplete month (four months instead of three).
first_month = pd.Timestamp(start_date_s).replace(day=1)
excluded_month = pd.Timestamp(end_date_s).replace(day=1)
date_range = pd.date_range(first_month, excluded_month, freq="MS")[:-1]

# Group the months to download by year: {"YYYY": ["MM", ...]},
# with the month number zero-padded to two digits.
date_to_extract = {}
for stamp in date_range:
    year_key = str(stamp.year)
    month_code = f"{stamp.month:02d}"
    date_to_extract.setdefault(year_key, []).append(month_code)


# Request parameters that are the same for every area and month.
# Day numbers 01..31 are requested for every month, whatever its length.
all_days = [f"{day:02d}" for day in range(1, 32)]
# Every full hour of the day, 00:00..23:00.
all_hours = [f"{hour:02d}:00" for hour in range(24)]

# A single S3 bucket handle is reused for all uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

# For each area and each month: download one NetCDF file from the CDS,
# upload it to S3, then delete the local copy.
for area in areas:
    # Box of the area as a plain list, in the column order of boxes_df.
    area_box = boxes_df.loc[area, :].to_list()

    for year, months in date_to_extract.items():
        for month in months:

            logging.info("Request for {} {}-{}".format(area, year, month))
            outfile_name_nc = "ERA5_{}_{}-{}.nc".format(area, year, month)
            S3_destination = f'ERA5/netcdf/{area}/{outfile_name_nc}'

            try:
                t0 = time()
                c.retrieve(
                    'reanalysis-era5-single-levels',
                    {
                        'product_type': 'reanalysis',
                        'format': 'netcdf',
                        'area': area_box,
                        'variable': variables,
                        'year': [year],
                        'month': [month],
                        'day': all_days,
                        'time': all_hours,
                    },
                    outfile_name_nc
                )

                logging.info("{}:{} downloaded in {:.2f} minutes".format(year, month, (time() - t0)/60))

                # move result file to S3
                s3_bucket.Object(S3_destination).upload_file(outfile_name_nc)
            finally:
                # Remove the local file even when the download or the upload raised,
                # so that no complete or partial .nc file is left behind on disk.
                if os.path.exists(outfile_name_nc):
                    os.remove(outfile_name_nc)

 2022-01-10 08:32:09,197 -  INFO -  Request for france 2021-10
 2022-01-10 08:32:09,276 -  INFO -  Welcome to the CDS
 2022-01-10 08:32:09,277 -  INFO -  Sending request to https://cds.climate.copernicus.eu/api/v2/resources/reanalysis-era5-single-levels
 2022-01-10 08:32:09,331 -  INFO -  Request is completed
 2022-01-10 08:32:09,332 -  INFO -  Downloading https://download-0005.copernicus-climate.eu/cache-compute-0005/cache/data6/adaptor.mars.internal-1641575420.9584136-21802-2-d227032c-e05b-40fa-80c7-b2d9f0786fa5.nc to ERA5_france_2021-10.nc (25.3M)
 2022-01-10 08:32:11,537 -  INFO -  Download rate 11.5M/s
 2022-01-10 08:32:11,560 -  INFO -  2021:10 downloaded in 0.04 minutes
 2022-01-10 08:32:12,241 -  INFO -  Request for france 2021-11
 2022-01-10 08:32:12,263 -  INFO -  Welcome to the CDS
 2022-01-10 08:32:12,264 -  INFO -  Sending request to https://cds.climate.copernicus.eu/api/v2/resources/reanalysis-era5-single-levels
 2022-01-10 08:32:12,306 -  INFO -  Downloading https://down