# INMET METEOROLOGICAL STATIONS - PREPROCESSING & FEATURE ENGINEERING

#### Import modules and libraries

In [1]:
import os, json
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns; sns.set()
from IPython.display import clear_output as co

#### Load data

In [None]:
# Read the cleaned INMET station records; the first CSV column becomes the index.
inmet = pd.read_csv('Dados/Clean/INMET.csv', index_col=0)
# Parse the index labels into timestamps.
inmet.index = pd.to_datetime(inmet.index)

inmet.shape

### Planning

Model architecture options:
1. Binary labels for (any) incident occurrence in time intervals - Done
1. Binary labels for incident occurrence in time intervals for each incident group - Done
1. Labels for each incident group (+ no incident) in time intervals
4. Shift labels to predict future occurrence a given number of hours in advance
    1. Shift labels one, two, three positions to predict risk for the next one, two, three hours (separate model for each hour step).
6. Reorder column values as first, second, ... closest stations
7. Predict incident quantity per day, week and month, in city, neighborhoods and clusters.
8. Predict time until next event (for city and event groups)

Feature Engineering:
1. Time series index (make it a time series model)
1. Trimester, month, week, day of the year.
2. Accumulated values over the last one, two, three... records (hours).

Note: Filter records by an arbitrary rule to reduce the number of samples and mitigate class imbalance.

---
# 1. Stations' records labeled by water bag event occurrence

### Transform functions

In [4]:
def intervals_intersect_interval(values_min, values_max, time_min, time_max):
    """Flag which [values_min, values_max] intervals overlap the window [time_min, time_max].

    An interval is considered outside the window when it ends at or before
    ``time_min`` or starts at or after ``time_max``; the result is the
    element-wise negation of that condition, so intervals that only touch a
    window edge are not flagged.

    Parameters
    ----------
    values_min, values_max : array-like (e.g. pandas Series or NumPy array)
        Start and end of each candidate interval.
    time_min, time_max : scalar comparable with the interval bounds
        Start and end of the reference window.

    Returns
    -------
    For array-like inputs, a boolean mask with one entry per candidate interval.
    """
    ends_before_window = values_max <= time_min
    starts_after_window = values_min >= time_max
    # Deliberately kept as "not (before or after)": rewriting it with
    # De Morgan would change the result for missing (NaN/NaT) bounds.
    outside_window = ends_before_window | starts_after_window
    return ~outside_window

def timeSequenceEventLabels(time_serie, target, start, end, groups, report=1000):
    """Label each period between consecutive timestamps by the events that overlap it.

    Consecutive entries of ``time_serie`` are used as the edges of a period.
    For every period, the rows of ``target`` whose [start, end] interval
    overlaps it (as decided by ``intervals_intersect_interval``) are collected.

    Parameters
    ----------
    time_serie : pandas DatetimeIndex
        Period edges; ``len(time_serie) - 1`` periods are evaluated.
    target : pandas DataFrame
        One row per event.
    start, end : column labels
        Columns of ``target`` holding each event's start and end.
    groups : column label
        Column of ``target`` holding each event's group.
    report : int or None, default 1000
        Print a progress message every ``report`` periods (clearing the
        previous cell output first). ``0`` or ``None`` disables progress output.

    Returns
    -------
    pandas DataFrame
        Indexed by ``time_serie[:-1]`` with columns ``'label'`` (1 if any
        event overlaps the period, else 0), ``'events'`` (number of
        overlapping events), ``'event ids'`` (their index labels in
        ``target``) and ``'event groups'`` (their ``groups`` values).
    """
    # Clamp at zero so an empty series reports 0 periods rather than -1.
    n_periods = max(time_serie.shape[0] - 1, 0)
    # The event bounds do not change between periods, so look them up once.
    event_start, event_end = target[start], target[end]
    period_label = []
    for i in range(n_periods):
        # A falsy `report` skips progress output; previously `report=0`
        # raised ZeroDivisionError on the modulo below.
        if report and (i+1) % report == 0:
            co(wait=True); print(f'{i+1}/{n_periods} bin edges evaluated...')
        periods_intersect = intervals_intersect_interval(event_start, event_end, time_serie[i], time_serie[i+1])
        target_bin = target[periods_intersect]
        n_events = len(target_bin)
        period_label.append([
            int(n_events > 0),            # label
            n_events,                     # events
            target_bin.index.tolist(),    # event ids
            target_bin[groups].tolist(),  # event groups
        ])

    print(f'Done! {n_periods} periods evaluated.')
    return pd.DataFrame(period_label, columns=['label', 'events', 'event ids', 'event groups'], index=time_serie[:-1])

### Water bag events

#### Join clusters data to water bag events collection

In [6]:
# NOTE(review): `waterbags` and `clusters` are not created in the visible cells;
# they must already be loaded in the kernel — confirm the loading cells exist.
events = (
    waterbags[['EVENTO_INICIO', 'EVENTO_FIM']]
    .apply(pd.to_datetime)                     # parse event start / end as datetimes
    .join(clusters['sublabel'], how='inner')   # keep only events that have a cluster sublabel
)

display(events.head(4), events.shape)

Unnamed: 0_level_0,EVENTO_INICIO,EVENTO_FIM,sublabel
EVENTO_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1258,2015-09-12 20:12:00,2015-09-13 03:39:00,12
1259,2015-09-12 22:13:00,2015-09-13 01:21:00,-1
1260,2015-09-12 22:15:00,2015-09-13 01:20:00,-1
1261,2015-09-12 22:16:00,2015-09-13 01:20:00,-1


(2983, 3)

### Label stations' records by water bag event occurrence

In [7]:
# Label every period between consecutive station timestamps with the
# water bag events (and their cluster sublabels) that overlap it.
event_ts = timeSequenceEventLabels(
    time_serie=inmet.index,
    target=events,
    start='EVENTO_INICIO', end='EVENTO_FIM',
    groups='sublabel',
    report=1000,
)

108000/108863 bin edges evaluated...
Done! 108863 periods evaluated.


#### Save labeled time series

In [8]:
# Uncomment to persist the labeled time series; the next cell reloads it from this path.
# event_ts.to_csv('Dados/Transform/waterbag_timeserie_inmet.csv', index=True)

#### Reload & preprocess

In [9]:
event_ts = pd.read_csv('Dados/Transform/waterbag_timeserie_inmet.csv', index_col=0)

# The list-valued columns are read back from the CSV as text; decode them into Python lists
event_ts = event_ts.assign(**{
    column: event_ts[column].map(json.loads)
    for column in ('event groups', 'event ids')
})

# Parse the index labels into timestamps
event_ts.index = pd.to_datetime(event_ts.index)

display(event_ts.sample(5), event_ts.shape)

Unnamed: 0,label,events,event ids,event groups
2010-08-19 09:00:00,0,0,[],[]
2012-04-07 18:00:00,0,0,[],[]
2015-08-09 07:00:00,0,0,[],[]
2018-04-25 06:00:00,0,0,[],[]
2021-10-04 21:00:00,0,0,[],[]


(108863, 4)

---
# 3. Stations' records labeled per cluster group

### Time series of event occurrence per group

In [10]:
# Collect every cluster label that appears in at least one period.
# Flattening with a comprehension avoids the repeated list concatenation
# that `Series.sum()` performs on a column of lists.
groups = np.unique([group for labels in event_ts['event groups'] for group in labels])
# One 0/1 indicator series per cluster label: 1 where the period's event groups contain it.
groups_labels = [event_ts['event groups'].map(lambda labels: int((group in labels))).rename(group) for group in groups]
# `axis` is passed by keyword: the positional form is deprecated and rejected by pandas >= 2.0.
group_ts = pd.concat(groups_labels, axis=1)

group_ts.shape

(108863, 62)

#### Save time series labeled per group

In [11]:
# Uncomment to persist the per-group time series; the next cell reloads it from this path.
# group_ts.to_csv('Dados/Transform/waterbag_clusters_timeserie_inmet.csv', index=True)

#### Reload & preprocess

In [12]:
# Reload
group_ts = pd.read_csv('Dados/Transform/waterbag_clusters_timeserie_inmet.csv', index_col=0)
group_ts.set_index(pd.to_datetime(group_ts.index), inplace=True)

display(group_ts.sample(5), group_ts.shape)

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,54,56,57,58,59,60,61,63,65,67
2018-12-09 08:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-02-25 23:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-05-02 14:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-05-31 01:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010-06-22 13:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(108863, 62)