# Waterbag-RJ Model API - Scheduled Real-Time Models Prediction

In [None]:
cd ../../../../Apps/Python/bolsao-api

## Polygons time series 'Models' class

In [2]:
from warnings import filterwarnings as fws; fws('ignore')
import os, pandas as pd, numpy as np, pickle
from modules.time_features import load_time_features, now_info
from modules.stations import alertario_live, inmet_live

# features settings

alertario_features = [
    'acumulado_chuva_15_min', 'acumulado_chuva_1_h', 'acumulado_chuva_4_h',
    'acumulado_chuva_24_h', 'acumulado_chuva_96_h',
]

inmet_features = [
    'acumulado_chuva_1_h', 'pressao', 'pressao_maxima',
    'pressao_minima', 'radiacao_global', 'rajada_vento_max',
    'temperatura', 'temperatura_maxima', 'temperatura_minima',
    'temperatura_orvalho', 'temperatura_orvalho_maximo',
    'temperatura_orvalho_minimo', 'velocidade_vento'
]

# Load multiple models

def load_models(keys, path, file_fmt='{}.pickle'):
    models = {}
    for model_id in keys:
        path_model = path + str(model_id) + '/'
        if os.path.exists(path_model):
            models[model_id] = pickle.load(open(path_model + file_fmt.format(model_id), 'rb'))
    return models

# Load multiple label encoders

def load_encoders(path):
    models = {}
    for file in os.listdir(path):
        model_id, ext = file.split('.')
        if ext == 'pickle':
            models[model_id] = pickle.load(open(path + file, 'rb'))
    return models

class Models:
    
    def __init__(self,
        models, encoders, names, deploy_info,
        fill='mean', report_ignored=False
    ):
        self.models=models; self.encoders=encoders; self.names=names; self.deploy_info=deploy_info;
        self.fill=fill; self.report_ignored=report_ignored
        if type(models) is str:
            self.models = load_models(names.index, models)
        if type(encoders) is str:
            self.encoders = load_encoders(encoders)

    def get_features(self, now):
        time_flat = load_time_features(now, self.encoders)
        inmet_flat = self.flat_features(inmet_live(), 'id_estacao', inmet_features)
        alertario_flat = self.flat_features(alertario_live(), 'id_estacao', alertario_features)
        return pd.concat([df for df in [time_flat, inmet_flat, alertario_flat] if len(df)])

    def flat_features(self, data, id_col, features):
        ids = data[id_col].unique()
        return pd.concat([data.set_index(id_col).loc[_id, features].add_suffix(f' - {_id}') for _id in ids], axis=0)

    # Features formatting
    def formatted_features(self, features):

        # Handle missing features - fill with the mean
        missing_features = list(set(self.deploy_info.index).difference(features.index))
        if len(missing_features):
            print(f'Missing features ({len(missing_cols)}):', missing_cols, '\n')
            for name in missing_features:
                features[name] = self.deploy_info.loc[name, self.fill]

        # Handle extra features (ignore)
        ignored_cols = list(set(features.index).difference(self.deploy_info.index))
        if len(ignored_cols) and self.report_ignored:
            print(f'Ignored features ({len(ignored_cols)}):', ignored_cols, '\n')

        # Reorder features to match model input format
        features = features.loc[self.deploy_info.index]

        # Fill missing values - with the mean
        features.replace('', np.nan, inplace=True)
        na_msk = features.isna()
        if na_msk.sum():
            features[na_msk] = self.deploy_info.loc[na_msk, self.fill]

        # Reshape and return
        return features.to_frame().T

    # Calibrate predicted probabilities function
    def calibrate(self, prob, threshold=0.5):
        if prob < threshold:
            return 0.5 * prob / threshold
        else:
            return 0.5 + 0.5 * (prob - threshold) / (1 - threshold)

    # Multi model prediction
    def multi_model_prediction(self, features, time_info):
        predictions = []
        for model_id, model in self.models.items():
            yprob = model.predict_proba(features)[0][1]
            # yprob_cal = self.calibrate(yprob, model['metadata']['threshold'])
            yconf = abs(0.5 - yprob) / 0.5
            label = int(yprob >= 0.5)
            ### Prediction record
            predictions.append({
                'timestamp': time_info['now'],
                'date': time_info['today'],
                'time': time_info['time'],
                'cluster_id': model_id,
                'cluster': self.names[model_id],
                'range': '1h',
                'probability': round(yprob, 6),
                'confidence': round(yconf, 6),
                'label': label,
            })
        return predictions        
    
    # Make and post predictions task
    def predict(self, as_datetime=False):
        # ---
        now, today, time = now_info()
        print(f'Scheduled prediction starting. Date: {today}, Time: {time}.')
        # ---
        features = self.get_features(now)
        features = self.formatted_features(features)
        # ---
        time_info = {'now': now, 'today': today, 'time': time}
        predictions = self.multi_model_prediction(features, time_info)
        # ---
        now, today, time = now_info()
        print(f'Scheduled prediction: Success. Date: {today}, Time: {time}.')
        # ---
        if not as_datetime:
            for pred in prediction:
                pred['timestamp'] = pred['timestamp'].strftime('%Y-%m-%d %X')
        # ---
        return predictions

### Models instance parameters settings

In [19]:
# ---
# Deployment resources

path_info = 'static/models/feature_info.csv'
path_clusters = 'static/clusters/clusters_micro.csv'

deploy_info = pd.read_csv(path_info, index_col=0)
clusters = pd.read_csv(path_clusters, index_col=0)['main_route']

### Models instance

In [20]:
model = Models(
    'models/', 'encoders/', clusters, deploy_info,
    fill='mean', report_ignored=False,
)

### Models prediction

In [21]:
prediction = model.predict()

pd.DataFrame(prediction).head()

Scheduled prediction starting. Date: 2022-12-30, Time: 20:29:17.
Scheduled prediction: Success. Date: 2022-12-30, Time: 20:29:23.


Unnamed: 0,timestamp,date,time,cluster_id,cluster,range,probability,confidence,label
0,2022-12-30 20:29:17.337867-03:00,2022-12-30,20:29:17,0,Avenida Armando Lombardi,1h,0.027737,0.944525,0
1,2022-12-30 20:29:17.337867-03:00,2022-12-30,20:29:17,1,Rua do Catete,1h,0.000652,0.998695,0
2,2022-12-30 20:29:17.337867-03:00,2022-12-30,20:29:17,2,Rua Tonelero,1h,0.004486,0.991028,0
3,2022-12-30 20:29:17.337867-03:00,2022-12-30,20:29:17,3,Avenida Epitácio Pessoa,1h,0.000744,0.998512,0
4,2022-12-30 20:29:17.337867-03:00,2022-12-30,20:29:17,4,Avenida Ministro Ivan Lins,1h,0.02881,0.942379,0


### Post predictions to mongo database

In [22]:
from modules.mongo import post_to_mongo

post_to_mongo(model.predict(), 'Waterbag', 'Prediction')

Scheduled prediction starting. Date: 2022-12-30, Time: 20:29:25.
Scheduled prediction: Success. Date: 2022-12-30, Time: 20:29:28.


'SUCCESS'