# Joining and Analysing data from all stations

In [1]:
import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
from data_preprocessing import *

# Raise pandas' display limits (rows, columns and console width) for this notebook.
for option_name, option_value in {
    'display.max_rows': 100,
    'display.max_columns': 1000,
    'display.width': 1000,
}.items():
    pd.set_option(option_name, option_value)

In [2]:
# Per-station metadata table; the CD_ESTACAO column holds the station code and
# LATITUDE / LONGITUDE / ALTITUDE are copied from it onto every row of that station.
df_station_statistics = pd.read_csv("./../../data/automatic_stations_wind_statistics.csv")
station_codes = ['A431','A458', 'A409', 'A401', 'A413', 'A406', 'A442', 'A450', 'A434', 'A436']
station_names = ['CONDE','RIBEIRA DO AMPARO', 'ARACAJU', 'SALVADOR', 'FEIRA DE SANTANA', 'CRUZ DAS ALMAS', 'EUCLIDES DA CUNHA',
                 'JEREMOABO', 'AMARGOSA', 'QUEIMADAS']
# zip() silently stops at the shorter list, so fail loudly if the two lists drift apart.
assert len(station_codes) == len(station_names), "station_codes and station_names must have the same length"

# Collect one preprocessed frame per station and concatenate once at the end.
station_frames = []
for station_code, station_name in zip(station_codes, station_names):
    # Locate the raw file by station code. Sorting makes the choice deterministic
    # if more than one file matches (glob returns matches in arbitrary order).
    matching_files = sorted(glob.glob(f'./../../data/raw/*{station_code}*'))
    if not matching_files:
        raise FileNotFoundError(
            f"No raw data file matching '*{station_code}*' found in ./../../data/raw/ (station {station_name})"
        )
    station_file_name = matching_files[0]

    # The raw files are ';'-separated and the column header sits on row index 9.
    df_raw = pd.read_csv(station_file_name, header=9, sep=';')
    df = initial_data_preprocessing(df_raw, minimum_date='2018-01-01', maximum_date='2021-01-01')

    # Look the station up in the metadata table once instead of once per attribute.
    station_rows = df_station_statistics.query("CD_ESTACAO == @station_code")
    if station_rows.empty:
        raise KeyError(f"Station code {station_code} not found in df_station_statistics['CD_ESTACAO']")

    df['NAME'] = station_name
    df['CODE'] = station_code
    df['LATITUDE'] = station_rows['LATITUDE'].iloc[0]
    df['LONGITUDE'] = station_rows['LONGITUDE'].iloc[0]
    df['ALTITUDE'] = station_rows['ALTITUDE'].iloc[0]
    station_frames.append(df)

# Stack all stations into one frame (each station's original row index is kept as-is).
df_stations = pd.concat(station_frames)

Dataset initial size:  (90072, 23)
Dataset final size:  (26179, 25)
Dataset initial size:  (31200, 23)
Dataset final size:  (16432, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (26328, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (26309, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (24295, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (21906, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (21968, 25)
Dataset initial size:  (58416, 23)
Dataset final size:  (26261, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (25194, 25)
Dataset initial size:  (90072, 23)
Dataset final size:  (22296, 25)


In [3]:
# Columns kept in the exported dataset, in output order.
export_columns = [
    'TOTAL_PRECIPITATION_mm',
    'ATM_PRESSURE_mB',
    'ATM_PRESSURE_SEA_LEVEL_mB',
    'MAX_ATM_PRESSURE_PREV_HOUR_mB',
    'MIN_ATM_PRESSURE_PREV_HOUR_mB',
    'GLOBAL_RADIATION_Kjm2',
    'AIR_TEMPERATURE_DRY_BULB_Celsius',
    'DEW_POINT_TEMPERATURE_Celsius',
    'MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius',
    'DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius',
    'MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage',
    'RELATIVE_HUMIDITY_percentage',
    'WIND_DIRECTION_degrees',
    'WIND_MAX_GUNS_ms',
    'WIND_SPEED_ms',
    'WIND_SPEED_120m_ms',
    'LATITUDE',
    'LONGITUDE',
    'ALTITUDE',
    'YEAR',
    'MONTH',
    'DAY',
    'HOUR',
    'DATETIME',
    'NAME',
    'CODE',
]

# Keep only those columns (a missing one raises KeyError), then write the
# combined table to disk without the row index.
df_stations = df_stations[export_columns]
df_stations.to_csv(
    "./../../data/preprocessed/wind_data_conde_and_adjacent_stations.csv",
    index=False,
)

In [59]:
# Non-null wind direction / wind speed observations per year, across all stations.
wind_columns = ['WIND_DIRECTION_degrees', 'WIND_SPEED_ms']
df_stations.groupby("YEAR")[wind_columns].count()

Unnamed: 0_level_0,WIND_DIRECTION_degrees,WIND_SPEED_ms
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,75356,75362
2019,84178,84179
2020,77069,77075
2021,182,182


In [96]:
# Preview the first three rows of the combined station table.
df_stations.iloc[:3]

Unnamed: 0,TOTAL_PRECIPITATION_mm,ATM_PRESSURE_mB,ATM_PRESSURE_SEA_LEVEL_mB,MAX_ATM_PRESSURE_PREV_HOUR_mB,MIN_ATM_PRESSURE_PREV_HOUR_mB,GLOBAL_RADIATION_Kjm2,AIR_TEMPERATURE_DRY_BULB_Celsius,DEW_POINT_TEMPERATURE_Celsius,MAX_TEMPERATURE_PREV_HOUR_Celsius,MIN_TEMPERATURE_PREV_HOUR_Celsius,DEW_POINT_MAX_TEMPERATURE_PREV_HOUR_Celsius,DEW_POINT_MIN_TEMPERATURE_PREV_HOUR_Celsius,MAX_RELATIVE_HUMIDITY_PREV_HOUR_percentage,MIN_RELATIVE_HUMIDITY_PREV_HOUR_percentage,RELATIVE_HUMIDITY_percentage,WIND_DIRECTION_degrees,WIND_MAX_GUNS_ms,WIND_SPEED_ms,WIND_SPEED_120m_ms,YEAR,MONTH,DAY,HOUR,DATETIME,NAME,CODE,LATITUDE,LONGITUDE,ALTITUDE
0,0.0,1009.6,,1009.6,1009.3,-3.54,26.1,23.5,26.3,26.0,23.6,23.3,86.0,84.0,86.0,81.0,7.4,3.9,6.085471,2018,1,1,0,2018-01-01 00:00:00,CONDE,A431,-12.035833,-37.683889,31.9
1,0.0,1009.7,1013.559272,1009.8,1009.6,-3.54,26.1,23.3,26.1,26.0,23.7,23.3,87.0,85.0,85.0,80.0,8.1,4.3,6.709622,2018,1,1,1,2018-01-01 01:00:00,CONDE,A431,-12.035833,-37.683889,31.9
2,0.0,1009.7,1013.560564,1009.9,1009.7,-3.54,26.0,23.4,26.1,26.0,23.5,23.3,86.0,85.0,85.0,89.0,9.3,5.6,8.738112,2018,1,1,2,2018-01-01 02:00:00,CONDE,A431,-12.035833,-37.683889,31.9


In [93]:
# (rows, columns) of the records belonging to the RIBEIRA DO AMPARO station.
ribeira_rows = df_stations.query("NAME == 'RIBEIRA DO AMPARO'")
ribeira_rows.shape

(16432, 29)

In [98]:
# Re-index the RIBEIRA DO AMPARO rows onto a regular hourly grid. asfreq() adds an
# all-NaN row for every hour that has no record, so comparing this shape with the
# un-resampled one shows how many hourly slots are missing for the station.
# The frequency is passed as a Timedelta instead of the '1H' string alias, which is
# deprecated (FutureWarning) since pandas 2.2 in favour of lowercase 'h'; a Timedelta
# behaves the same on older and newer pandas versions.
(
    df_stations
    .query("NAME == 'RIBEIRA DO AMPARO'")
    .set_index('DATETIME')
    .resample(pd.Timedelta(hours=1))
    .asfreq()
    .shape
)

(19875, 28)