# INMET DATA PREPROCESSING

#### Import modules and libraries

In [1]:
import os, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns; sns.set()
from IPython.display import clear_output as co

#### Load INMET meteorological stations data

In [3]:
# Load the cleaned INMET station records; the first CSV column becomes the index.
inmet = pd.read_csv('Dados/Clean/INMET.csv', index_col=0)

# Parse the index into datetimes so records can be compared against event timestamps,
# then show the frame's shape as the cell output.
inmet.set_index(pd.to_datetime(inmet.index), inplace=True); inmet.shape

(108864, 65)

#### Load water bag catalog and incident clusters

In [4]:
# Read the water bag catalog and the incident clusters, each indexed by its first CSV column.
bolsoes, clusters = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('Dados/water_bag_catalog_google.csv', 'Dados/water_bag_clusters.csv')
)

# Cell output: (rows, columns) of each table.
bolsoes.shape, clusters.shape

((3140, 20), (2983, 20))

# 1. Preprocessing

Model architecture options:
1. Binary labels for (any) incident occurrence in time intervals - Done
1. Binary labels for incident occurrence in time intervals for each incident group - Done
1. Labels for each incident group (+ no incident) in time intervals
4. Shift labels to predict future occurrence a given number of hours ahead
    1. Shift labels one, two, three positions to predict risk for the next one, two, three hours (separate model for each hour step).
6. Reorder column values by station proximity (closest station first, second closest, ...)
7. Predict incident quantity per day, week and month, in city, neighborhoods and clusters.
8. Predict time until next event (for city and event groups)

Additional data:
1. Time series index (make it a time series model)
1. Trimester, month, week, day of the year.
2. Accumulated values over the last one, two, three... records (hours).

Obs: Filter records by an arbitrary rule to reduce the number of samples and mitigate class imbalance

### Transform functions

In [4]:
def intervals_intersect_interval(values_min, values_max, time_min, time_max):
    """Flag which intervals [values_min, values_max] overlap the window (time_min, time_max).

    An interval is treated as outside the window when it ends at or before
    `time_min`, or starts at or after `time_max`; intervals that only touch a
    window edge therefore do not count as intersecting.

    Parameters
    ----------
    values_min, values_max : scalar or array-like (e.g. pandas Series)
        Interval starts and ends. When both are Series they are combined
        element-wise.
    time_min, time_max : scalar
        Lower and upper edge of the window.

    Returns
    -------
    Boolean result with the same container type as the comparisons produce
    (a boolean Series for Series inputs, a NumPy boolean for scalar inputs).

    Notes
    -----
    A comparison that evaluates to False for a missing bound (as pandas does
    for NaT/NaN) leaves that interval flagged as intersecting, because the
    result is the negation of the "outside" test.
    """
    outside = (values_max <= time_min) | (values_min >= time_max)
    # np.logical_not instead of `~`: on a plain Python bool, `~` is integer
    # bitwise inversion (-1 / -2, both truthy), so scalar inputs would always
    # look "intersecting". For Series/arrays the result is unchanged.
    return np.logical_not(outside)

def timeSequenceEventLabels(target, time_serie, start, end, label_col, report=1000):
    """Label each consecutive pair of timestamps by the events that overlap it.

    Every adjacent pair `time_serie[i]`, `time_serie[i+1]` is treated as one
    bin. For each bin, the rows of `target` whose [start, end] interval
    intersects the bin (per `intervals_intersect_interval`) are collected.

    Parameters
    ----------
    target : pandas.DataFrame
        Events table; must contain the `start`, `end` and `label_col` columns.
    time_serie : pandas.DatetimeIndex
        Ordered bin edges. N edges produce N-1 bins.
    start, end : str
        Names of the columns holding each event's start and end time.
    label_col : str
        Name of the column whose values are listed per bin as event groups.
    report : int, default 1000
        Print a progress message every `report` bins. A falsy value (0 or
        None) disables progress reporting.

    Returns
    -------
    pandas.DataFrame
        One row per bin, indexed by the bin's left edge (`time_serie[:-1]`),
        with columns 'label' (1 if any event overlaps, else 0), 'events'
        (number of overlapping events), 'event ids' (index values of those
        events) and 'event groups' (their `label_col` values).
    """
    n_edges = time_serie.shape[0]
    # An empty or single-edge series has no bins; never report a negative count.
    n_periods = max(n_edges - 1, 0)

    # The event start/end columns do not change between bins, so look them up once.
    event_starts, event_ends = target[start], target[end]

    period_label = []
    for i in range(n_periods):
        # `report` is checked for truthiness first so report=0 does not divide by zero.
        if report and (i + 1) % report == 0:
            # `co` is IPython's clear_output (imported at the top of the notebook):
            # the progress line is overwritten instead of accumulating.
            co(wait=True); print(f'{i+1}/{n_edges} bin edges evaluated...')
        periods_intersect = intervals_intersect_interval(
            event_starts, event_ends, time_serie[i], time_serie[i+1]
        )
        target_bin = target[periods_intersect]
        n_events = len(target_bin)
        period_label.append([
            1 if n_events else 0,            # binary label: any event in this bin
            n_events,
            target_bin.index.tolist(),       # event ids
            target_bin[label_col].tolist(),  # event groups
        ])

    print(f'Done! {n_periods} periods evaluated.')
    return pd.DataFrame(period_label, columns=['label', 'events', 'event ids', 'event groups'], index=time_serie[:-1])

### Create events collection
   0. Event id
   1. Start time
   2. End time
   3. Group

In [7]:
# Build the events table: parsed start/end timestamps from the catalog, joined on the
# index with each event's cluster 'sublabel'. The inner join keeps only events that
# appear in both tables.
events = (
    bolsoes[['EVENTO_INICIO', 'EVENTO_FIM']]
    .apply(pd.to_datetime)
    .join(clusters['sublabel'], how='inner')
)

# Preview the first rows, then the table's shape.
display(events.head(4), events.shape)

Unnamed: 0_level_0,EVENTO_INICIO,EVENTO_FIM,sublabel
EVENTO_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1258,2015-09-12 20:12:00,2015-09-13 03:39:00,12
1259,2015-09-12 22:13:00,2015-09-13 01:21:00,-1
1260,2015-09-12 22:15:00,2015-09-13 01:20:00,-1
1261,2015-09-12 22:16:00,2015-09-13 01:20:00,-1


(2983, 3)

### Drop outlier events (Optional)

In [6]:
# Optional filter, disabled by default: uncommenting the next line drops every event whose
# 'sublabel' equals -1 (the section title treats these as outliers — TODO confirm what -1 means).
# events = events[events['sublabel'] != -1]#.drop_duplicates().set_index('EVENTO_INICIO')

# Shape of the events table; unchanged while the filter above stays commented out.
events.shape

(2983, 3)

### Label time records by events presence

In [60]:
# Column names in `events` consumed by timeSequenceEventLabels.
label_col = 'sublabel'
start, end = 'EVENTO_INICIO', 'EVENTO_FIM'
report = 1000  # print a progress message every `report` bins

# The INMET record timestamps are the bin edges: each consecutive pair of timestamps
# becomes one labeled period. `inmet` is the frame loaded at the top of the notebook.
event_timeserie = timeSequenceEventLabels(events, inmet.index, start, end, label_col, report=report)

108000/108864 bin edges evaluated...


### Save and reload labeled time series

In [22]:
# Saving step (disabled): writes the labels held in `event_timeserie` to disk.
# NOTE(review): this path starts with '../' while the read below does not — confirm which is intended.
# event_timeserie.to_csv('../Dados/Transform/waterbag_timeserie_inmet.csv', index=True)

# Reload the stored labels, using the first CSV column as the index.
# NOTE(review): 'event ids' / 'event groups' held Python lists when computed; after a CSV
# round trip read_csv returns them as text unless parsed — confirm downstream handling.
event_ts = pd.read_csv('Dados/Transform/waterbag_timeserie_inmet.csv', index_col=0)

# Replace the index with its parsed datetime version, then preview five random rows.
event_ts.index = pd.to_datetime(event_ts.index)
event_ts.sample(5)

Unnamed: 0,label,events,event ids,event groups
2022-02-15 16:00:00,0,0,[],[]
2015-05-09 14:00:00,0,0,[],[]
2017-07-24 11:00:00,0,0,[],[]
2019-02-07 05:00:00,1,11,"[39326, 39327, 39331, 39334, 39337, 39344, 393...","[23, 0, 3, -1, 12, -1, 34, -1, 10, 15, -1]"
2013-06-13 17:00:00,0,0,[],[]


In [None]:
2. 