# Area selection based on Wind Spead

All automatic stations in Northeast region of Brazil will be analysed

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import distance


import os

%load_ext autoreload
%autoreload 2
from data_preprocessing import initial_data_preprocessing

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
def extract_wind_speed_statistics(df: pd.DataFrame) -> dict:
    '''
    Extracts relevant information about the wind speed from the automatic stations.
    
    Parameters
    ----------
    df: Preprocessed dataframe containing the historic information about the wind speed and direction.
    
    Returns 
    ----------
    wind_statistics: dict containing the mean, median, std, minimum date available for the wind speed, null quantity, dataset size and non null dataset size.

    '''

    wind_statistics = {
        'WIND_SPEED_mean': df['WIND_SPEED_ms'].mean(),
        'WIND_SPEED_120m_mean': df['WIND_SPEED_120m_ms'].mean(),
        'WIND_SPEED_median': df['WIND_SPEED_ms'].median(),
        'WIND_SPEED_120m_median': df['WIND_SPEED_120m_ms'].median(),
        'WIND_SPEED_std': df['WIND_SPEED_ms'].std(),
        'WIND_SPEED_120m_std': df['WIND_SPEED_120m_ms'].std(),
        'MINIMUM_DATE': df['DATE_MEASUREMENT'].min(),
        'DATASET_SIZE': df['WIND_SPEED_ms'].shape[0],
        'NON_NULL_DATASET_SIZE': df.dropna().shape[0],
        'NON_NULL_RATIO': df.dropna().shape[0] / df['WIND_SPEED_ms'].shape[0],
        'NULL_CELLS_QUANTITY': df.isna().sum().sum(),
    }
    return wind_statistics

In [34]:
df_catalog_automatic_stations = pd.read_excel('./../../data/automatic_stations_catalog.xlsx')[[
                                                        'DC_NOME','SG_ESTADO','CD_SITUACAO',
                                                        'VL_LATITUDE','VL_LONGITUDE','VL_ALTITUDE',
                                                        'DT_INICIO_OPERACAO','CD_ESTACAO']]

df_catalog_automatic_stations = df_catalog_automatic_stations.query("CD_SITUACAO != 'Pane'")

northeast_stations = os.listdir('./../../data/raw_data')
# northeast_stations = [x.split('.')[0].replace('dados_','') for x in northeast_stations]
df_northeast_stations = pd.DataFrame(data=northeast_stations,columns=['STATION_FILE_NAME'])
df_northeast_stations['CD_ESTACAO'] = [x.split('.')[0].replace('dados_','').split('_H_')[0] for x in northeast_stations]
df_northeast_stations = df_northeast_stations.merge(df_catalog_automatic_stations,on='CD_ESTACAO')

In [36]:
wind_station_statistics = []
for ii,station in df_northeast_stations.iterrows():
    df = pd.read_csv("./../../data/raw_data/" + station['STATION_FILE_NAME'],header=9,sep=';')
    print(f"Analysing {station['CD_ESTACAO']} - {station['DC_NOME']} station \n")
    df = initial_data_preprocessing(df,minimum_date='2018-01-01',maximum_date='2021-01-01') #Filtering from 2018 to 2021 since it is the data available in Conde station
    wind_statistics = extract_wind_speed_statistics(df)
    wind_statistics['CD_ESTACAO'] = station['CD_ESTACAO']
    wind_statistics['NOME'] = station['DC_NOME']
    wind_statistics['SG_ESTADO'] = station['SG_ESTADO']
    wind_statistics['LATITUDE'] = station['VL_LATITUDE']
    wind_statistics['LONGITUDE'] = station['VL_LONGITUDE']
    wind_station_statistics.append(wind_statistics)
df_wind_statistics = pd.DataFrame(wind_station_statistics)

Analysing A424 - IRECE station 

Dataset initial size:  (90072, 23)
Dataset final size:  (26327, 23)
Analysing A311 - FLORIANO station 

Dataset initial size:  (90072, 23)
Dataset final size:  (18239, 23)
Analysing A432 - BURITIRAMA station 

Dataset initial size:  (90072, 23)
Dataset final size:  (18473, 23)
Analysing A307 - PETROLINA station 

Dataset initial size:  (90072, 23)
Dataset final size:  (23982, 23)
Analysing A408 - ITABERABA station 

Dataset initial size:  (90072, 23)
Dataset final size:  (26328, 23)
Analysing A203 - SAO LUIS station 

Dataset initial size:  (90072, 23)
Dataset final size:  (25587, 23)
Analysing A421 - BREJO GRANDE station 

Dataset initial size:  (90072, 23)
Dataset final size:  (22109, 23)
Analysing A345 - SAO RAIMUNDO NONATO station 

Dataset initial size:  (90072, 23)
Dataset final size:  (21276, 23)
Analysing A438 - MARAU station 

Dataset initial size:  (90072, 23)
Dataset final size:  (26315, 23)
Analysing A407 - ITIRUCU station 

Dataset initial 

In [37]:
df_wind_statistics = df_wind_statistics[['CD_ESTACAO', 'NOME', 'SG_ESTADO', 'WIND_SPEED_mean', 'WIND_SPEED_120m_mean', 'WIND_SPEED_median', 
                                         'WIND_SPEED_120m_median', 'WIND_SPEED_std', 'WIND_SPEED_120m_std', 'MINIMUM_DATE', 'DATASET_SIZE', 
                                         'NON_NULL_RATIO', 'NON_NULL_DATASET_SIZE', 'NULL_CELLS_QUANTITY', 'LATITUDE', 'LONGITUDE']]

In [38]:
df_wind_statistics.sort_values(by=['WIND_SPEED_median'],ascending=[False]).query("NON_NULL_RATIO >=0.75").head(10)

Unnamed: 0,CD_ESTACAO,NOME,SG_ESTADO,WIND_SPEED_mean,WIND_SPEED_120m_mean,WIND_SPEED_median,WIND_SPEED_120m_median,WIND_SPEED_std,WIND_SPEED_120m_std,MINIMUM_DATE,DATASET_SIZE,NON_NULL_RATIO,NON_NULL_DATASET_SIZE,NULL_CELLS_QUANTITY,LATITUDE,LONGITUDE
78,A304,NATAL,RN,3.86984,6.03841,3.8,5.929433,1.468671,2.291681,2018-01-01,25680,0.902726,23182,2530,-5.837222,-35.208056
77,A431,CONDE,BA,3.551669,5.541944,3.7,5.773395,1.595726,2.489934,2018-01-01,26179,0.943772,24707,4425,-12.035833,-37.683889
3,A307,PETROLINA,PE,3.420309,5.336971,3.5,5.46132,1.259292,1.96497,2018-01-01,23982,0.988992,23718,652,-9.388323,-40.523262
51,A426,GUANAMBI,BA,3.640764,5.680965,3.4,5.305282,1.953206,3.047739,2018-01-01,24968,0.941886,23517,1497,-14.208056,-42.749722
52,A313,CAMPINA GRANDE,PB,3.272909,5.106973,3.3,5.149244,1.331662,2.077894,2018-01-01,24001,0.970543,23294,715,-7.225574,-35.904831
45,A330,PAULISTANA,PI,3.128074,4.880974,3.2,4.993207,1.765922,2.755504,2018-01-01,25191,0.998531,25154,149,-8.132288,-41.142945
36,A325,QUIXERAMOBIM,CE,3.245157,5.063669,3.2,4.993207,1.931509,3.013882,2018-01-01,16752,0.988181,16554,852,-5.174444,-39.289444
20,A436,QUEIMADAS,BA,3.007046,4.692126,2.9,4.525094,1.529744,2.386978,2018-01-01,22296,0.905454,20188,8727,-10.984722,-39.616944
72,A371,PIRANHAS,AL,2.918835,4.554483,2.9,4.525094,1.860368,2.902875,2018-01-01,20207,0.978374,19770,1256,-9.622222,-37.767222
44,A442,EUCLIDES DA CUNHA,BA,2.89994,4.525,2.8,4.369056,1.315845,2.053214,2018-01-01,21968,0.795657,17479,22750,-10.537233,-38.996605


In [39]:
df_wind_statistics.to_csv("./../../data/automatic_stations_wind_statistics.csv",index=False)

# Conclusions

Station of Conde seems adequate. The city is not a capital and there is good amount of available data, since late september 2017.