# INMET METEOROLOGICAL STATIONS RECORDS - DATA CLEANING

#### Import modules and libraries

In [None]:
import os, json
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns; sns.set()
from IPython.display import clear_output as co

---
# Preprocessing & Data Cleaning

#### Load dataset - INMET meteorological stations' hourly records

In [None]:
# Read the raw INMET station records CSV into a DataFrame.
inmet = pd.read_csv(
    '../../../Dados/Desafio COR-Rio IV/Meio Ambiente Clima/Meteorologia_INMET.csv'
)

# Preview the first five rows.
inmet.head(n=5)

#### Columns per type

In [None]:
# Columns treated as record identifiers rather than measurements.
id_cols = ['primary_key', 'id_estacao', 'data', 'horario', 'data_particao']
# Every remaining column is treated as a measurement. Iterating over
# inmet.columns (instead of taking a set difference) keeps the file's column
# order, so the result is identical on every run.
float_cols = [col for col in inmet.columns if col not in id_cols]

#### Average duplicates

In [None]:
# One row of identifier columns per primary key (the first occurrence is kept).
keys_columns = inmet[id_cols].drop_duplicates('primary_key')
# Average the measurements of records that share a primary key.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops text
# columns silently and raises a TypeError when asked to average them.
numeric_columns_averaged = inmet.groupby('primary_key').mean(numeric_only=True)

# Attach the averaged measurements back to the identifier columns by primary key.
INMET = keys_columns.join(numeric_columns_averaged, how='left', on='primary_key')

# Row counts: raw records, de-duplicated records, distinct primary keys.
len(inmet), len(INMET), len(inmet['primary_key'].unique())

#### Set datetime index

In [None]:
# Combine the date and time text columns into one timestamp per record and
# use it as the frame's index (modifies INMET in place).
record_timestamps = pd.to_datetime(INMET['data'] + ' ' + INMET['horario'])
INMET.set_index(record_timestamps, inplace=True)

#### Join stations data across horizontal axis

In [None]:
def concat_groups(data, key='id_estacao', cols=None, how='outer'):
    """Spread per-group records side by side into one wide DataFrame.

    The rows of ``data`` are split by the distinct values of ``data[key]``.
    For each group the ``cols`` columns are selected, every column name gets
    the suffix ``' - <group>'``, and the pieces are concatenated along the
    column axis, aligned on the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format records. Within each group the index must be unique,
        otherwise pandas cannot align the pieces.
    key : str
        Name of the column whose values define the groups.
    cols : list of str, optional
        Columns to carry over for every group. When omitted, the module-level
        ``float_cols`` list is looked up at call time.
    how : {'outer', 'inner'}
        Index join passed to ``pandas.concat``: ``'outer'`` keeps every index
        value seen in any group, ``'inner'`` only those present in all groups.

    Returns
    -------
    pandas.DataFrame
        One column per (column, group) pair, named ``'<column> - <group>'``.
        An empty frame is returned when ``data`` has no usable groups.
    """
    if cols is None:
        # Resolved here rather than in the signature so the function can be
        # defined before (or independently of) the notebook-level list.
        cols = float_cols
    cols = list(cols)

    group_of_row = data[key]
    # Rows with a missing key cannot be attributed to any group; skip them.
    group_values = group_of_row.dropna().unique()

    dfs = [
        # str() via the f-string keeps non-string keys (e.g. ints) working.
        data.loc[group_of_row == group, cols].add_suffix(f' - {group}')
        for group in group_values
    ]
    if not dfs:
        # pandas.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(index=data.index[:0])

    # axis must be passed by keyword: positional use was removed in pandas 2.0.
    return pd.concat(dfs, axis=1, join=how)

# Build the wide table: one set of measurement columns per station,
# keeping every timestamp seen for any station (outer join).
inmet_flat = concat_groups(
    INMET,
    key='id_estacao',
    cols=float_cols,
    how='outer',
)

inmet_flat.shape  # Same as unique index values size

#### Insert empty rows for missing timestamps by reindexing to hourly frequency

In [None]:
# Reindex to a regular hourly grid; hours absent from the data become rows
# of NaN. The lowercase 'h' alias is used because the uppercase 'H' spelling
# is deprecated since pandas 2.2.
inmet_flat = inmet_flat.asfreq('h')

inmet_flat.shape  # Same as hourly index range size

#### Drop empty columns

In [None]:
# Remove columns that contain no values at all (modifies inmet_flat in place).
# axis is passed by keyword: the positional form was removed in pandas 2.0.
inmet_flat.dropna(axis=1, how='all', inplace=True)

inmet_flat.shape

### Save & reload

#### Save stations clean data

In [None]:
# inmet_flat.to_csv('Dados/Clean/INMET.csv', index=True)

#### Reload & preprocessing

In [None]:
# Load the cleaned table, using the first CSV column as the index.
inmet = pd.read_csv('Dados/Clean/INMET.csv', index_col=0)
# Parse the index values into datetimes.
inmet.index = pd.to_datetime(inmet.index)

inmet.shape

### Extra: Missing values left

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 3))

# isna().mean() yields a 0-1 fraction; scale by 100 so the plotted values
# agree with the 'Missing (%)' axis labels.
missing_per_column = inmet.isna().mean().mul(100).sort_values()
missing_per_column.plot(xticks=[], title='Missing values per column sorted', ax=axs[0])
axs[0].set(ylabel='Missing (%)', xlabel='Column')

# Same measure per row; the index is reset so rows are plotted by rank
# rather than by timestamp.
missing_per_row = inmet.isna().mean(axis=1).mul(100).sort_values().reset_index(drop=True)
missing_per_row.plot(xticks=[], title='Missing values per row sorted', ax=axs[1])
axs[1].set(ylabel='Missing (%)', xlabel='Row')

plt.show()