# Area selection based on Wind Speed

All automatic stations in the Northeast region of Brazil will be analysed.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import distance


import os

%load_ext autoreload
%autoreload 2
from data_preprocessing import initial_data_preprocessing

# Raise the pandas display limits (rows, columns, line width) used when frames are shown.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)


In [73]:
def extract_wind_speed_statistics(df: pd.DataFrame) -> dict:
    '''
    Summarises the wind speed records of one automatic station.

    Parameters
    ----------
    df: Preprocessed dataframe holding the columns 'WIND_SPEED_ms',
        'WIND_SPEED_120m_ms' and 'DATE_MEASUREMENT'.

    Returns
    ----------
    wind_statistics: dict with the mean, median and std of both wind speed columns,
        the smallest 'DATE_MEASUREMENT' value (MINIMUM_DATE), the number of rows
        (DATASET_SIZE), the number of rows without any missing value in any column
        (NON_NULL_DATASET_SIZE) and the total number of missing cells over all
        columns (NULL_CELLS_QUANTITY).

    '''
    speed = df['WIND_SPEED_ms']
    speed_120m = df['WIND_SPEED_120m_ms']

    # Keys are inserted in the same order a DataFrame built from these records
    # will use for its columns.
    wind_statistics = {}
    wind_statistics['WIND_SPEED_mean'] = speed.mean()
    wind_statistics['WIND_SPEED_120m_mean'] = speed_120m.mean()
    wind_statistics['WIND_SPEED_median'] = speed.median()
    wind_statistics['WIND_SPEED_120m_median'] = speed_120m.median()
    wind_statistics['WIND_SPEED_std'] = speed.std()
    wind_statistics['WIND_SPEED_120m_std'] = speed_120m.std()
    wind_statistics['MINIMUM_DATE'] = df['DATE_MEASUREMENT'].min()
    wind_statistics['DATASET_SIZE'] = len(speed)
    # Row-wise completeness considers every column of df, not only the wind ones.
    wind_statistics['NON_NULL_DATASET_SIZE'] = len(df.dropna())
    wind_statistics['NULL_CELLS_QUANTITY'] = df.isna().sum().sum()
    return wind_statistics

In [74]:
# Station catalog, reduced to the metadata columns used in this notebook and
# without the stations whose CD_SITUACAO is 'Pane'.
df_catalog_automatic_stations = (
    pd.read_excel('./../../data/automatic_stations_catalog.xlsx')[[
        'DC_NOME', 'SG_ESTADO', 'CD_SITUACAO',
        'VL_LATITUDE', 'VL_LONGITUDE', 'VL_ALTITUDE',
        'DT_INICIO_OPERACAO', 'CD_ESTACAO',
    ]]
    .query("CD_SITUACAO != 'Pane'")
)

# One row per entry of the raw data folder. The station code is the file name up to
# its first '.', with 'dados_' removed, truncated at the first '_H_'. Entries whose
# code has no match in the catalog are dropped by the (default inner) merge.
northeast_stations = os.listdir('./../../data/raw_data')
df_northeast_stations = (
    pd.DataFrame(data=northeast_stations, columns=['STATION_FILE_NAME'])
    .assign(CD_ESTACAO=[
        file_name.split('.')[0].replace('dados_', '').split('_H_')[0]
        for file_name in northeast_stations
    ])
    .merge(df_catalog_automatic_stations, on='CD_ESTACAO')
)

In [75]:
# Build one statistics record per station file, then assemble them into a single frame.
wind_station_statistics = []
for station_index, station in df_northeast_stations.iterrows():
    df_station_raw = pd.read_csv(
        "./../../data/raw_data/" + station['STATION_FILE_NAME'],
        header=9,
        sep=';',
    )
    print(f"Analysing {station['CD_ESTACAO']} - {station['DC_NOME']} station")
    df_station = initial_data_preprocessing(df_station_raw)

    station_record = extract_wind_speed_statistics(df_station)
    # Attach the catalog metadata that identifies and locates the station.
    station_record.update({
        'CD_ESTACAO': station['CD_ESTACAO'],
        'DC_NOME': station['DC_NOME'],
        'SG_ESTADO': station['SG_ESTADO'],
        'VL_LATITUDE': station['VL_LATITUDE'],
        'VL_LONGITUDE': station['VL_LONGITUDE'],
    })
    wind_station_statistics.append(station_record)

df_wind_statistics = pd.DataFrame(wind_station_statistics)


Analysing A424 - IRECE station
Dataset initial size:  (90072, 23)
Dataset final size:  (39948, 23)
Analysing A311 - FLORIANO station
Dataset initial size:  (90072, 23)
Dataset final size:  (25485, 23)
Analysing A432 - BURITIRAMA station
Dataset initial size:  (90072, 23)
Dataset final size:  (21182, 23)
Analysing A307 - PETROLINA station
Dataset initial size:  (90072, 23)
Dataset final size:  (37610, 23)
Analysing A408 - ITABERABA station
Dataset initial size:  (90072, 23)
Dataset final size:  (39952, 23)
Analysing A203 - SAO LUIS station
Dataset initial size:  (90072, 23)
Dataset final size:  (39065, 23)
Analysing A421 - BREJO GRANDE station
Dataset initial size:  (90072, 23)
Dataset final size:  (24879, 23)
Analysing A345 - SAO RAIMUNDO NONATO station
Dataset initial size:  (90072, 23)
Dataset final size:  (28411, 23)
Analysing A438 - MARAU station
Dataset initial size:  (90072, 23)
Dataset final size:  (32246, 23)
Analysing A407 - ITIRUCU station
Dataset initial size:  (90072, 23)
D

In [81]:
# Put the station identification first, then the statistics, then the coordinates.
df_wind_statistics = df_wind_statistics[[
    'CD_ESTACAO', 'DC_NOME', 'SG_ESTADO',
    'WIND_SPEED_mean', 'WIND_SPEED_120m_mean',
    'WIND_SPEED_median', 'WIND_SPEED_120m_median',
    'WIND_SPEED_std', 'WIND_SPEED_120m_std',
    'MINIMUM_DATE', 'DATASET_SIZE', 'NON_NULL_DATASET_SIZE', 'NULL_CELLS_QUANTITY',
    'VL_LATITUDE', 'VL_LONGITUDE',
]]

In [85]:
# Ten stations with the highest median wind speed.
(
    df_wind_statistics
    .sort_values('WIND_SPEED_median', ascending=False)
    .head(10)
)

Unnamed: 0,CD_ESTACAO,DC_NOME,SG_ESTADO,WIND_SPEED_mean,WIND_SPEED_120m_mean,WIND_SPEED_median,WIND_SPEED_120m_median,WIND_SPEED_std,WIND_SPEED_120m_std,MINIMUM_DATE,DATASET_SIZE,NON_NULL_DATASET_SIZE,NULL_CELLS_QUANTITY,VL_LATITUDE,VL_LONGITUDE
78,A304,NATAL,RN,4.021618,6.275241,4.0,6.241508,1.444373,2.253767,2017-09-19,33208,30633,2751,-5.837222,-35.208056
77,A431,CONDE,BA,3.533714,5.513927,3.7,5.773395,1.582298,2.468981,2017-09-27,32114,30054,7038,-12.035833,-37.683889
24,A310,AREIA,PB,3.518874,5.490771,3.7,5.773395,1.627065,2.538835,2017-09-19,37869,20625,18238,-6.975451,-35.718128
52,A313,CAMPINA GRANDE,PB,3.367111,5.253964,3.4,5.305282,1.334303,2.082015,2017-09-19,37627,36802,867,-7.225574,-35.904831
3,A307,PETROLINA,PE,3.175936,4.955658,3.3,5.149244,1.460493,2.278919,2017-09-19,37610,37337,661,-9.388323,-40.523262
28,A309,ARCO VERDE,PE,3.173074,4.951192,3.3,5.149244,1.476289,2.303568,2017-09-19,28451,19771,9191,-8.433611,-37.055556
36,A325,QUIXERAMOBIM,CE,3.343962,5.217842,3.3,5.149244,1.836147,2.865081,2017-09-19,20970,20673,1502,-5.174444,-39.289444
51,A426,GUANAMBI,BA,3.551618,5.541863,3.3,5.149244,1.953853,3.048748,2017-09-19,35509,33065,2742,-14.208056,-42.749722
58,A423,REMANSO,BA,3.31549,5.173414,3.2,4.993207,1.827869,2.852164,2017-09-19,34171,23208,11186,-9.625556,-42.077222
45,A330,PAULISTANA,PI,3.17387,4.952434,3.2,4.993207,1.755176,2.738737,2017-09-19,33245,33153,244,-8.132288,-41.142945


# Conclusions

The Conde (BA) station seems adequate: the city is not a state capital, and a good amount of data is available, starting in late September 2017.