# Imports

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

# Création du dataframe Windga

In [2]:
# Locations of the two raw WindGa input files: the production/performance
# report and the "GA Sol" file.
WindGa_Prod_path, WindGa_Sol_path = map(
    Path,
    (
        '/home/ec2-user/SageMaker/EtudeWindIndex/Data/WindGa_monthly/Raw/WINDGA_V_RPT_GA_CALCUL_PROD_PERF.csv',
        '/home/ec2-user/SageMaker/EtudeWindIndex/Data/WindGa_monthly/Raw/WINDGA_GA Sol.csv',
    ),
)

In [3]:
# Columns of the WINDGA Prod file that are kept for the rest of the notebook.
selection = [
    'Nom centrale',
    'PERIODE',
    'MOIS',
    'ANNEE',
    'Dispo contrat realisee',
    'Dispo technique realisee',
    'Dispo energetique realisee',
    'Production realisee',
    'Code centrale',
]

# Load the WINDGA Prod file (semicolon-separated CSV).
ga_prod = pd.read_csv(WindGa_Prod_path, sep=';', low_memory=False)

# Keep only the wind plants, i.e. rows whose plant code starts with 'E'
# (codes starting with 'S' would be solar). Rows with a missing code are
# excluded via na=False.
mask = ga_prod["Code centrale"].str.startswith('E', na=False)
ga_prod_f = ga_prod.loc[mask, selection]

In [4]:
# Only the two columns needed to map a plant code to its project code.
selection = ['Code Centrale', 'Code PI']

# Load the WINDGA Sol file (semicolon-separated CSV).
ga_sol = pd.read_csv(WindGa_Sol_path, sep=';', low_memory=False)

# Keep only the wind plants: plant code starting with 'E'
# (rows with a missing code are excluded via na=False).
mask = ga_sol["Code Centrale"].str.startswith('E', na=False)
ga_sol_f = ga_sol.loc[mask, selection]

In [5]:
# Join the two WINDGA tables on the plant code (the key is spelled
# 'Code centrale' on the left and 'Code Centrale' on the right), then drop
# both key columns and expose 'Code PI' under the name 'project_code'.
windga = (
    pd.merge(
        ga_prod_f,
        ga_sol_f,
        left_on='Code centrale',
        right_on='Code Centrale',
    )
    .drop(columns=['Code centrale', 'Code Centrale'])
    .rename(columns={"Code PI": "project_code"})
)

# Add a proper datetime column parsed from PERIODE (day/month/year strings).
windga['date'] = pd.to_datetime(windga['PERIODE'], format='%d/%m/%Y')

In [6]:
# Preview the first five rows of the merged WindGa table.
windga.head(n=5)

Unnamed: 0,Nom centrale,PERIODE,MOIS,ANNEE,Dispo contrat realisee,Dispo technique realisee,Dispo energetique realisee,Production realisee,project_code,date
0,Amelecourt,01/11/2009,11,2009,9576,,,3332664,AMEL,2009-11-01
1,Amelecourt,01/10/2016,10,2016,97175,91491.0,89533.0,1435415,AMEL,2016-10-01
2,Amelecourt,01/04/2012,4,2012,97,,,2025488,AMEL,2012-04-01
3,Amelecourt,01/03/2019,3,2019,98926,9882.0,98794.0,3913817,AMEL,2019-03-01
4,Amelecourt,01/09/2021,9,2021,97811,96088.0,95953.0,1019362,AMEL,2021-09-01


In [7]:
# Save the merged table as a semicolon-separated CSV, without the index,
# next to the raw input files.
Windga_raw_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/WindGa_monthly/Raw/Windga_raw.csv')
windga.to_csv(path_or_buf=Windga_raw_path, sep=';', index=False)

# Nettoyage des données

In [8]:
# Convert the WindGa measurement columns to floats.
# The values are read as text with ',' as the decimal separator, so the comma
# is replaced by '.' before casting.
numeric_columns = [
    'Production realisee',
    'Dispo energetique realisee',
    'Dispo technique realisee',
    'Dispo contrat realisee',
]

for column in numeric_columns:
    if pd.api.types.is_numeric_dtype(windga[column]):
        # Already numeric: either this cell was re-run, or pandas parsed the
        # column as numbers at read time (e.g. a column with no text values).
        # Numeric columns have no .str accessor, so only normalise the dtype.
        windga[column] = windga[column].astype(float)
    else:
        # regex=False: ',' is a literal character, not a pattern.
        windga[column] = (
            windga[column]
            .str.replace(',', '.', regex=False)
            .astype(float)
        )

In [9]:
# Remove two plants that no longer belong to us.
mask = windga['Nom centrale'].isin(['SC - Cabreirens', 'SC - Calsigas'])
windga = windga.loc[~mask, :]

# Remove the rows whose production is missing (NaN).
# .copy() makes the result an independent frame, so the .loc assignments
# below modify it directly instead of a slice of the previous frame
# (avoids SettingWithCopyWarning).
mask = windga['Production realisee'].isnull()
windga = windga.loc[~mask, :].copy()

# Blank out impossible availability values: anything above 1 becomes NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for column in [
    'Dispo energetique realisee',
    'Dispo technique realisee',
    'Dispo contrat realisee',
]:
    mask = windga[column] > 1
    windga.loc[mask, column] = np.nan

# Build the consolidated availability column with the priority order:
# Dispo consolidated = PBA_Energetique > TBA_technique > Contract
# Start from the energetic availability, then fill the rows that are still
# missing or equal to 0 with the next source in the list.
windga['dispo_consolidated'] = windga['Dispo energetique realisee']

for fallback_column in ['Dispo technique realisee', 'Dispo contrat realisee']:
    mask = windga['dispo_consolidated'].isnull() | (windga['dispo_consolidated'] == 0)
    windga.loc[mask, 'dispo_consolidated'] = windga.loc[mask, fallback_column]

In [10]:
# Build the "production at 100% availability" column:
# prod_100p = Production realisee / dispo_consolidated.
# Rows whose consolidated availability is missing or 0 stay NaN, so no
# division by zero is performed.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
windga['prod_100p'] = np.nan
mask = windga['dispo_consolidated'].notna() & (windga['dispo_consolidated'] != 0)
windga.loc[mask, 'prod_100p'] = (
    windga.loc[mask, 'Production realisee'] / windga.loc[mask, 'dispo_consolidated']
)

In [11]:
# Save the cleaned table as a semicolon-separated CSV, without the index.
Windga_path = Path('/home/ec2-user/SageMaker/EtudeWindIndex/Data/WindGa_monthly/Clean/Windga.csv')

# Nothing earlier in the notebook reads from or creates the Clean folder, so
# make sure it exists; to_csv would otherwise fail on a fresh data directory.
Windga_path.parent.mkdir(parents=True, exist_ok=True)

windga.to_csv(Windga_path, index=False, sep=';')