# WATER BAG PREDICTION PER CRITICAL REGION

### BASED ON INMET METEOROLOGICAL STATIONS DATA

---
#### Change project root directory

In [1]:
# Move the working directory up one level before the project imports run.
# The explicit `%cd` line magic is used instead of a bare `cd`: bare magics
# depend on IPython's automagic setting and only work in single-line cells.
# NOTE(review): the path is relative, so re-running this cell moves up one
# more level each time — run it once per kernel session.
%cd ..

C:\Users\luisr\Desktop\Repositories\Data Science Projects\Hackaton COR IV - Centro de Operações do RJ\ACELERAÇÃO


#### Import modules and libraries

In [2]:
import os, json, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns; sns.set()
from IPython.display import clear_output as co

#### Time series transformation & binary classification pipeline

from Modulos.timeserie_transform import TimeseriesTransformPipeline
from Modulos.imbalanced_selection import groupConsecutiveFlags, MinorityGroupSplitUndersample
from Modulos.imbalanced_classification import ClassificationPipeline, classesGroupRecall

#### Preprocessing and machine learning modules

from sklearn.preprocessing import MinMaxScaler as mms
from Modulos.cv_samplers import GroupUnderSampleSplit, print_cls_cnt

from sklearn.utils import all_estimators
from sklearn.metrics import classification_report as cr

from imblearn import ensemble
from imblearn.metrics import classification_report_imbalanced as cri

# Classifier collection: dict built from the (name, class) pairs returned by
# scikit-learn's `all_estimators`. The filter is passed by keyword so it is
# unambiguous which parameter it binds to.
classifiers = dict(all_estimators(type_filter='classifier'))

# Specific classification models (classes, not instances)

gbc = classifiers['GradientBoostingClassifier']
brfc = ensemble.BalancedRandomForestClassifier

#### Metrics and scoring functions

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, precision_score

# Class-specific scorers: one recall scorer and one precision scorer per
# class label, each built with that label as `pos_label`.
recall_0, recall_1 = (
    make_scorer(recall_score, pos_label=label) for label in (0, 1)
)
precision_0, precision_1 = (
    make_scorer(precision_score, pos_label=label) for label in (0, 1)
)

# Scoring table: result name -> built-in scorer name or scorer object.
scoring = dict([
    ('accuracy', 'accuracy'),
    ('recall', 'recall'),
    ('precision', 'precision'),
    ('recall-0', recall_0),
    ('recall-1', recall_1),
    ('precision-0', precision_0),
    ('precision-1', precision_1),
])



---
## Load & Preprocess Data

In [4]:
from Modulos.waterbags import waterbag_project

# Build the project object through the project-local `waterbag_project`
# helper; the optional time/INMET/alerta feature groups are switched off.
project = waterbag_project(
    time_serie='clusters', freq='upsample', load_waterbags=True,
    time_features=False, inmet_features=False, alerta_features=False,
)

data = project.data
group_ts = project.time_serie
waterbags = project.waterbags

# Keep records from June 2018 onward. The slice start label is inclusive, so
# June itself is kept (the previous comment said "before july", which did not
# match the '2018-06' bound). `.loc` makes the row-label slicing explicit.
# NOTE(review): assumes both objects are indexed by datetime — confirm.
data = data.loc['2018-06':]
group_ts = group_ts.loc['2018-06':]

---

# Data Transformation

In [5]:
# Time-related feature names. In this view they are only referenced by the
# commented-out `label_encode` option of the transformation call further down.
time_features = (
    'year month day hour minute date time '
    'index dayofyear weekofyear weekday quarter'
).split()

### Define feature dataset and target variables

In [7]:
# Identifier (as a string) of the water bag group to model.
top_grp = '1'

# Lookup table: group id (`sublabel`) -> `main_route` name.
routes = (
    waterbags.groupby(['sublabel', 'main_route']).first()
    .index.to_frame()
    .set_index('sublabel')
    .to_dict()['main_route']
)
print(f'Train and test sets for water bag group: {routes[int(top_grp)]}. (id: {top_grp})')

# Target series: the column of the group time series for the selected group.
Y = group_ts[str(top_grp)].copy()

# group_min_time = Y[Y==1].index.min() # Group specific minimum time value
# NOTE(review): `group_min_time` is not passed to the pipeline below (the
# argument is commented out), so it currently has no effect in this cell.
group_min_time = '2018-06'

# Scaled feature matrix with missing values left as they are.
X = TimeseriesTransformPipeline(
    data, # group_min_time,
    cut=-1, drop_empty_cols=True,
#     label_encode=time_features,
    scale=True, interpolate=None, fillna=None
)

# Same matrix with gaps handled ('nearest' interpolation, 'min' fill).
Xf = TimeseriesTransformPipeline(
    X, interpolate='nearest', fillna='min'
)

# Align the target with the rows kept by the transformation pipeline.
Y = Y.loc[X.index]

groups = groupConsecutiveFlags(ts=Y)

# Class balance summary. `axis` is passed by keyword (positional `axis` is not
# accepted by recent pandas) and the normalized column gets its own label
# instead of a second 'Class Count'.
display(pd.concat(
    [
        Y.value_counts().to_frame('Class Count'),
        Y.value_counts(normalize=True).to_frame('Class Proportion'),
    ],
    axis=1,
))

Train and test sets for water bag group: Rua do Catete. (id: 1)
Initial data: (142867, 230)
Drop empty columns:  (142866, 217)
Initial data: (142866, 217)


Unnamed: 0,Class Count,Class Count.1
0.0,142083,0.994519
1.0,783,0.005481


---
# Baseline classification model

In [31]:
# Main parameters definition
seed = 0

# Split settings handed to ClassificationPipeline in the fitting cell.
n_splits = 5
train_size, test_size = 0.66, 0.33
train_prct = test_prct = None

# Model definition: gradient boosting classifier with 100 estimators.
# NOTE(review): the recorded training log for this model ends with a very
# large train loss at iteration 100 — worth checking before relying on it.
model = gbc(
    n_estimators=100,
    random_state=seed,
    verbose=1,
)

### Fit baseline model

In [32]:
### Instantiate classification pipeline object
# `n_splits` is taken from the parameters cell instead of repeating the
# literal 5, so changing the parameter there actually takes effect here.
CP = ClassificationPipeline(
    n_splits=n_splits,
    train_size=train_size, train_prct=train_prct,
    test_size=test_size, test_prct=test_prct,
    shuffle=True, random_state=seed,
)

# Model & Evaluate
score, cls_cnt = CP.binary(
    Xf, Y, model,
    groups=groups, strategy='GroupShuffleSplit',
    return_cls_cnt=True, store=True
)

# Cast the class-count index to strings before joining it to the score table.
# NOTE(review): presumably so the labels match the score index — confirm.
cls_cnt.index = cls_cnt.index.astype('str')
# `axis` is passed by keyword; recent pandas rejects it as a positional argument.
display(pd.concat([score, cls_cnt['Train set']], axis=1).fillna(''))

      Iter       Train Loss   Remaining Time 
         1           0.0352            3.17m
         2           0.0325            3.02m
         3           0.0302            2.93m
         4           0.0280            2.87m
         5           0.0262            2.82m
         6           0.0249            2.78m
         7           0.0240            2.75m
         8           0.0230            2.71m
         9           0.0222            2.68m
        10           0.0216            2.64m
        20           0.0155            2.35m
        30           0.0117            2.07m
        40           0.0086            1.87m
        50           0.0071            1.67m
        60           0.0058            1.36m
        70           0.0051            1.03m
        80           0.0044           40.84s
        90           0.0064           20.25s
       100 335839869723141128323072.0000            0.00s


Unnamed: 0,precision,recall,f1-score,support,Train set
0.0,0.996649,0.999365,0.998005,48813.0,93270.0
1.0,0.761538,0.376426,0.503817,263.0,514.0
accuracy,0.996027,0.996027,0.996027,0.996027,
macro avg,0.879094,0.687895,0.750911,49076.0,
weighted avg,0.995389,0.996027,0.995357,49076.0,


### Save finalized model

In [35]:
import pickle

# Serialize the model stored on the pipeline object. The context manager
# closes the file handle even if pickling fails; the previous bare open()
# call never closed it explicitly.
with open('ModeloBolsões.sav', 'wb') as model_file:
    pickle.dump(CP.model, model_file)

### Save model deployment transformation info

#### Transformation steps:
1. Scale all columns based on 'X' sample min and max values
2. Fill missing values with sample min values

In [53]:
# Re-run the transformation with only `cut` and `drop_empty_cols` set (no
# scale/interpolate/fillna arguments, so the pipeline defaults apply).
X_raw = TimeseriesTransformPipeline(data, cut=-1, drop_empty_cols=True)

# One row per feature column holding its 'min' and 'max' from describe();
# the table is written to CSV for use at deployment time.
X_info = X_raw.describe().loc[['min', 'max']].transpose()
X_info.to_csv('InformaçõesDeploy.csv', index=True)

display(X_info.head(), X_info.shape)

Initial data: (142867, 230)
Drop empty columns:  (142866, 217)


Unnamed: 0,min,max
acumulado_chuva_1_h - A602,0.0,97.2
pressao_minima - A602,999.3,1030.7
temperatura - A602,11.2,38.9
pressao_maxima - A602,999.4,1031.0
pressao - A602,999.3,1030.9


(217, 2)