# Imports

In [1]:
pip install pymysql sqlalchemy-redshift pykeepass

Note: you may need to restart the kernel to use updated packages.


In [2]:
import getpass
import logging
import os
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

import boto3 #Save in S3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from pykeepass import PyKeePass

# Accès aux données Keepass

In [3]:
# Keepass credentials + logger set-up.
# The Keepass database opened here is exposed as `kp`; later cells look up
# connection entries (URL, username, password) through it.
Configuration_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Windga/WindGa_hourly/Configuration')
KEEPASS_FILE = Configuration_path/'RDL.kdbx'

# basicConfig is a no-op when the root logger already has handlers, so the
# level is also set explicitly on the root logger.
logging.basicConfig(format=' %(asctime)s -  %(levelname)s -  %(message)s', handlers = [logging.StreamHandler()])
logging.getLogger().setLevel(logging.INFO)

# The password is typed interactively and never stored in the notebook.
print('Provide password to get acces to Keepass file:')
password = getpass.getpass()

try:
    kp = PyKeePass(KEEPASS_FILE, password=password)
    logging.info('Keepass data loaded')
except Exception:
    # Log the failure, then re-raise: without `kp` every later cell would
    # fail with an unrelated NameError, hiding the real cause.
    logging.exception('Cannot open the keepass file:')
    raise

Provide password to get acces to Keepass file:


 ·········


 2022-01-24 15:58:48,244 -  INFO -  Keepass data loaded


In [4]:
# TEMP: debugging check - displays the URL stored in the 'PWU-RSH' Keepass entry.
# NOTE(review): the rendered output exposes the warehouse host name; consider
# removing this cell (or clearing its output) before sharing the notebook.
kp.find_entries(title='PWU-RSH', first=True).url

'redshift+psycopg2://@rsh-eu-west-1a-exposure-warehouse-enduser.cs3rrvwot6nc.eu-west-1.redshift.amazonaws.com:5439/dwh'

# Connexion à la base de données

In [5]:
def connexion_setup(kp_con_name='SBX-RDS'):
    """Build a SQLAlchemy engine from a Keepass entry.

    The entry titled ``kp_con_name`` is looked up in the notebook-level ``kp``
    Keepass database. Its username and password are inserted into the entry
    URL right after the ``scheme://`` part, and the resulting connection
    string is handed to ``sqlalchemy.create_engine``.

    Parameters
    ----------
    kp_con_name : str
        Title of the Keepass entry holding the URL, username and password.

    Returns
    -------
    The engine returned by ``sqlalchemy.create_engine``.

    Raises
    ------
    ValueError
        If no Keepass entry has the requested title, or if the entry URL has
        no ``//`` separator to insert the credentials after.
    """
    credential = kp.find_entries(title=kp_con_name, first=True)
    if credential is None:
        raise ValueError(f'No Keepass entry titled {kp_con_name!r}')

    # Split on the first '//' only, so the rest of the URL is kept verbatim.
    scheme, separator, remainder = credential.url.partition('//')
    if not separator:
        raise ValueError(f'Keepass entry {kp_con_name!r} has no "//" in its URL')

    # Percent-encode the credentials: characters such as '@', '/' or ':' in a
    # password would otherwise be parsed as URL delimiters.
    user = quote(credential.username, safe='')
    secret = quote(credential.password, safe='')

    # The stored URL is expected to carry the '@' that separates credentials
    # from the host (e.g. 'dialect+driver://@host:port/db').
    con = f'{scheme}//{user}:{secret}{remainder}'

    # Never write the password to the log, even at debug level.
    logging.debug(f'connexion string: {scheme}//{user}:***{remainder}')

    con_engine = sqlalchemy.create_engine(con)

    return con_engine

In [6]:
# Read the mapping file between IEC 61400 attribute names and the standard
# (business) variable names.
VAR_MAPPING_FILE = Configuration_path/'10min_variables_iec_to_std_names_mapping.csv'
df = pd.read_csv(VAR_MAPPING_FILE)

# Mapping dicts, one per direction
iec2std = df.set_index('iec_attribute61400')['business_description'].to_dict()
std2iec = df.set_index('business_description')['iec_attribute61400'].to_dict()

# Select the variables to extract.
List_features = ['active_power_avg','wind_speed_avg']
mask_features = df.business_description.isin(List_features)
# NOTE(review): the name says "temperature" but the rows selected are the
# ones listed in List_features; name kept because later cells may use it.
temperature_features = df.loc[mask_features,:]

# A feature absent from the mapping would silently vanish from the SELECT
# list built below, so fail early instead.
missing_features = sorted(set(List_features) - set(temperature_features.business_description))
if missing_features:
    raise ValueError(f'Features missing from {VAR_MAPPING_FILE}: {missing_features}')

# Comma-separated IEC column names, ready to be placed in a SELECT clause
features_selection_str = ', '.join(map(str, temperature_features.iec_attribute61400))

# Display
features_selection_str

'wmet4_horwdspd_mag_f, wtur4_w_mag_f'

In [7]:
# Run parameters
PROJECT =  'ESPS'    # project code used to filter the SQL queries
MONTH = '2021-10'    # string: 'YYYY-MM'

# Extraction de données

In [8]:
# Turbines (WTGs) of this project, read from WDM.
# The project code is passed as a bound parameter rather than formatted into
# the SQL text.
query = '''SELECT project_code, iec_eqpt_code, wt_neighbor_01, power_curve_code
            FROM wdm.adm_eqpt_wt 
            WHERE project_code = :project ;
            '''

with connexion_setup('SBX-WDM').connect() as conn:
    wtg_eqpt_codes = pd.read_sql_query(sql=sqlalchemy.text(query),
                                       con=conn,
                                       params={'project': PROJECT})

# Equipment codes of the project's turbines
WTGS = tuple(wtg_eqpt_codes.iec_eqpt_code)

# Display
WTGS

('ESPS-ECP001-TUR001',
 'ESPS-ECP001-TUR002',
 'ESPS-ECP001-TUR003',
 'ESPS-ECP001-TUR004',
 'ESPS-ECP001-TUR005')

In [9]:
# 10-minute data extraction for the month given by MONTH ('YYYY-MM').

# Extraction window: [first day of MONTH, first day of the following month).
# The end is derived with a calendar offset so that year roll-over and
# zero-padding are handled for every month.
month_start = pd.Timestamp(f'{MONTH}-01')
FROM_DATE = month_start.strftime('%Y-%m-%d %H:%M:%S')                              # included
TO_DATE = (month_start + pd.DateOffset(months=1)).strftime('%Y-%m-%d %H:%M:%S')  # not included

# One table per day; date_range includes TO_DATE, hence DATES[:-1] below.
DATES = [d.strftime('%Y%m%d') for d in pd.date_range(FROM_DATE, TO_DATE)]
_10min_tables = [f'eu_data.tur_10m_{date}_q' for date in DATES[:-1]]

# One SELECT per daily table, glued with UNION and ordered by timestamp.
# Table and column names cannot be bound parameters, so they are formatted
# in; the project code is bound (:project).
daily_selects = [f'''SELECT  asset_id,
                                ts,
                                {features_selection_str}
                        FROM {table} WHERE project = :project AND tech_source = 'PIOEM' \n'''
                 for table in _10min_tables]
query = "UNION\n".join(daily_selects) + "ORDER BY ts ;"

with connexion_setup('PWU-RSH').connect() as conn:
    _10min = pd.read_sql_query(sql=sqlalchemy.text(query),
                               con=conn,
                               params={'project': PROJECT})

# IEC column names -> standard names, then index by (turbine, timestamp)
_10min = _10min.rename(columns=iec2std).set_index(['asset_id', 'ts'])

# re-order columns alphabetically
col_selection = _10min.columns.sort_values()
_10min = _10min.loc[:,col_selection]
_10min.head()

  """)


Unnamed: 0_level_0,Unnamed: 1_level_0,active_power_avg,wind_speed_avg
asset_id,ts,Unnamed: 2_level_1,Unnamed: 3_level_1
ESPS-ECP001-TUR001,2021-10-01,2001.43,9.49306
ESPS-ECP001-TUR003,2021-10-01,762.507,5.27715
ESPS-ECP001-TUR002,2021-10-01,2369.03,10.1053
ESPS-ECP001-TUR005,2021-10-01,2532.69,11.0437
ESPS-ECP001-TUR004,2021-10-01,1213.97,7.17237
