# Installations

In [1]:
pip install pymysql sqlalchemy-redshift pykeepass

Collecting pymysql
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
     |████████████████████████████████| 43 kB 4.0 MB/s             
[?25hCollecting sqlalchemy-redshift
  Downloading sqlalchemy_redshift-0.8.9-py2.py3-none-any.whl (36 kB)
Collecting pykeepass
  Downloading pykeepass-4.0.1.tar.gz (48 kB)
     |████████████████████████████████| 48 kB 1.7 MB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting construct==2.10.54
  Downloading construct-2.10.54.tar.gz (55 kB)
     |████████████████████████████████| 55 kB 755 kB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting pycryptodomex>=3.6.2
  Downloading pycryptodomex-3.13.0-cp35-abi3-manylinux2010_x86_64.whl (2.0 MB)
     |████████████████████████████████| 2.0 MB 42.8 MB/s            
Building wheels for collected packages: pykeepass, construct
  Building wheel for pykeepass (setup.py) ... [?25ldone
[?25h  Created wheel for pykeepass: filename=pykeepass-4.0.1-py3-none-

In [2]:
from pykeepass import PyKeePass
import logging
import getpass
import os
from datetime import datetime
import pandas as pd
import numpy as np
import sqlalchemy
import matplotlib.pyplot as plt
import seaborn as sns
import boto3 #Save in S3
from pathlib import Path

In [3]:
# Destination of the exported CSV files on S3: bucket name, then key prefix.
S3_bucket_name = "edfred-edfre-sbx-eu-west-1-solar-radiation-data"
S3_CSV_FOLD = "EtudeWindIndex/Real"

# Accès aux données Keepass

In [5]:
# Load the connection credentials stored in the Keepass database.
Configuration_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Windga/WindGa_hourly/Configuration')
KEEPASS_FILE = Configuration_path/'RDL.kdbx'

# Send log records to the notebook output, from INFO level upwards.
logging.basicConfig(format=' %(asctime)s -  %(levelname)s -  %(message)s', handlers = [logging.StreamHandler()])
logging.getLogger().setLevel(logging.INFO)

# The master password is typed interactively and never stored in the notebook.
print('Provide password to get acces to Keepass file:')
password = getpass.getpass()

try:
    kp = PyKeePass(KEEPASS_FILE, password=password)
    logging.info('Keepass data loaded')
except Exception:
    # Log the traceback, then stop: every later cell needs `kp`, and carrying on
    # would either fail with a NameError or silently reuse a stale `kp` left
    # over from a previous run of this cell.
    logging.exception('Cannot open the keepass file:')
    raise

Provide password to get acces to Keepass file:


 ·········


 2022-01-28 08:16:52,502 -  INFO -  Keepass data loaded


In [6]:
# TEMP: debugging aid, displays the URL stored in the 'PWU-RSH' Keepass entry.
_pwu_rsh_entry = kp.find_entries(title='PWU-RSH', first=True)
_pwu_rsh_entry.url

'redshift+psycopg2://@rsh-eu-west-1a-exposure-warehouse-enduser.cs3rrvwot6nc.eu-west-1.redshift.amazonaws.com:5439/dwh'

# Connexion à la base de données

In [7]:
def connexion_setup(kp_con_name='SBX-RDS'):
    """Create a SQLAlchemy engine from the Keepass entry titled `kp_con_name`.

    The entry's URL is split at its first '//' and the entry's username and
    password are inserted right after it as 'username:password'. The stored URL
    is therefore expected to hold everything else itself, including the '@'
    before the host (e.g. 'dialect+driver://@host:port/db').

    Parameters
    ----------
    kp_con_name : str
        Title of the Keepass entry to read from the global `kp` database.

    Returns
    -------
    sqlalchemy.engine.Engine

    Raises
    ------
    ValueError
        If no entry has that title, or if its URL contains no '//'.
    """
    from urllib.parse import quote

    credential = kp.find_entries(title=kp_con_name, first=True)
    if credential is None:
        raise ValueError(f'No Keepass entry found with title {kp_con_name!r}')

    # partition() splits on the first '//' only, so a later '//' in the URL is
    # kept instead of silently truncating the connection string.
    scheme, separator, remainder = credential.url.partition('//')
    if not separator:
        raise ValueError(f"URL of Keepass entry {kp_con_name!r} has no '//' separator")

    # Percent-encode the credentials so that characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    # NOTE(review): assumes Keepass stores the raw (not already encoded) values.
    username = quote(credential.username or '', safe='')
    secret = quote(credential.password or '', safe='')

    con = f'{scheme}//{username}:{secret}{remainder}'
    # Never write the password to the logs, even at debug level.
    logging.debug(f'connexion string: {scheme}//{username}:***{remainder}')

    con_engine = sqlalchemy.create_engine(con)

    return con_engine

In [8]:
# Read the mapping file between IEC 61400 attribute names and standard names.
VAR_MAPPING_FILE = Configuration_path/'10min_variables_iec_to_std_names_mapping.csv'
df = pd.read_csv(VAR_MAPPING_FILE)

# Lookup dictionaries in both directions.
name_pairs = df.loc[:, ['iec_attribute61400', 'business_description']]
iec2std = name_pairs.set_index('iec_attribute61400')['business_description'].to_dict()
std2iec = name_pairs.set_index('business_description')['iec_attribute61400'].to_dict()

# Keep only the mapping rows of the features to extract.
List_features = ['active_power_avg','wind_speed_avg']
mask_features = df.business_description.isin(List_features)
# NOTE: despite its name, this frame holds the rows of List_features
# (the name is kept as-is in case other cells refer to it).
temperature_features = df.loc[mask_features,:]

missing_features = sorted(set(List_features) - set(temperature_features.business_description))
if missing_features:
    logging.warning(f'Features missing from the mapping file: {missing_features}')

# Comma-separated IEC column names, ready to be inserted in a SELECT clause.
features_selection_str = ', '.join(map(str, temperature_features.iec_attribute61400))

# Display the selection
features_selection_str

'wmet4_horwdspd_mag_f, wtur4_w_mag_f'

# Extraction de données

In [9]:
# Extraction parameters: project code and first/last month as 'YYYY-MM'.
start_project = "AUQB"
start_month, end_month = "2020-01", "2021-11"

In [13]:
# One 'YYYY-MM' label per month start between start_month and end_month.
months_range = pd.date_range(start=start_month, end=end_month, freq='MS')
months_list = months_range.strftime('%Y-%m').tolist()

# Retrieve the list of projects
projects_file = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/list_projects.csv')
List_projects = pd.read_csv(projects_file, sep=';')['project'].tolist()

In [None]:
def _month_bounds(month):
    """Return (from_date, to_date) for a 'YYYY-MM' month label.

    from_date is the first day of the month (included) and to_date the first
    day of the following month (not included), both as 'YYYY-MM-DD HH:MM:SS'.
    """
    month_start = pd.Timestamp(f'{month}-01')
    next_month_start = month_start + pd.DateOffset(months=1)
    fmt = '%Y-%m-%d %H:%M:%S'
    return month_start.strftime(fmt), next_month_start.strftime(fmt)


def _build_10min_query(project, from_date, to_date, columns_sql):
    """Build the UNION query over the daily 10-minute tables of one period.

    One SELECT is emitted per day from from_date (included) to to_date
    (not included), each reading the table `eu_data.tur_10m_<YYYYMMDD>_q`.
    """
    # date_range includes both ends; drop the last day (to_date is excluded).
    days = pd.date_range(from_date, to_date)[:-1]
    selects = [
        f'''SELECT  asset_id,
            ts,
            {columns_sql}
            FROM eu_data.tur_10m_{day.strftime('%Y%m%d')}_q WHERE project = '{project}' AND tech_source = 'PIOEM' \n'''
        for day in days
    ]
    return "UNION\n".join(selects) + "ORDER BY ts ;"


# Create each engine once instead of once per project / per month.
wdm_engine = connexion_setup('SBX-WDM')
rsh_engine = connexion_setup('PWU-RSH')

try:
    for PROJECT in List_projects :

        logging.info("project : {}".format(PROJECT))

        # Turbines of this project, from WDM
        query = f'''SELECT project_code, iec_eqpt_code, wt_neighbor_01, power_curve_code
        FROM wdm.adm_eqpt_wt 
        WHERE project_code = '{PROJECT}' ;
        '''

        with wdm_engine.connect() as conn:
            wtg_eqpt_codes = pd.read_sql_query(sql=query, con=conn, params={})

        # NOTE(review): WTGS is not used below; kept in case later cells rely on it.
        WTGS = tuple(wtg_eqpt_codes.iec_eqpt_code)

        # 10 minutes data extraction, one query per month
        monthly_frames = []
        for MONTH in months_list :
            # FROM_DATE is included, TO_DATE (first day of next month) is not.
            FROM_DATE, TO_DATE = _month_bounds(MONTH)
            query = _build_10min_query(PROJECT, FROM_DATE, TO_DATE, features_selection_str)

            with rsh_engine.connect() as conn:
                _10min = pd.read_sql_query(sql=query, con=conn)

            # Standard column names, in alphabetical order
            _10min = _10min.rename(columns=iec2std)
            _10min = _10min.loc[:, _10min.columns.sort_values()]

            monthly_frames.append(_10min)

        # Single concat; the empty template fixes the expected columns and their
        # order even when no month returned any row.
        empty_template = pd.DataFrame(columns=['asset_id','ts'] + List_features)
        _10min_project = pd.concat([empty_template] + monthly_frames)

        # Split the timestamp into its components
        ts = pd.to_datetime(_10min_project['ts'])
        _10min_project['year'] = ts.dt.year
        _10min_project['month'] = ts.dt.month
        _10min_project['day'] = ts.dt.day
        _10min_project['hour'] = ts.dt.hour
        _10min_project['minute'] = ts.dt.minute

        # Project name = first 4 characters of asset_id, turbine number = last 3
        asset_id = _10min_project['asset_id']
        _10min_project['project'] = asset_id.str[0:4]
        _10min_project['turbine'] = asset_id.str[-3:].astype(int)

        # Keep the useful information only, indexed for easier lookups
        _10min_project = (
            _10min_project
            .drop(columns=['asset_id','ts'])
            .set_index(['project','turbine','year','month','day','hour','minute'])
        )

        outfile = '10min_'+PROJECT+'.csv'

        # Save on the notebook instance. The index holds project, turbine and
        # timestamp, so it must be written (index=True), as for the S3 copy.
        _10min_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/Windga/WindGa_hourly/Clean/10min_'+PROJECT+'.csv')
        _10min_path.parent.mkdir(parents=True, exist_ok=True)
        _10min_project.to_csv(_10min_path, index=True, sep=';')

        # Save on S3
        _10min_project.to_csv(f's3://{S3_bucket_name}/{S3_CSV_FOLD}/{outfile}', index=True, sep=';')
finally:
    # Release the pooled database connections, even if the extraction failed.
    wdm_engine.dispose()
    rsh_engine.dispose()