# Waterbag Model API Deployment Test Environment

### Utility functions

In [1]:
# pip install --upgrade google-cloud-bigquery
import os, json, pandas as pd, numpy as np, requests, pickle, pymongo
from datetime import datetime, timezone
from sklearn.preprocessing import MinMaxScaler as mms
from google.cloud import bigquery
from google.oauth2 import service_account
from datetime import datetime
import pytz; tz_br = pytz.timezone('Brazil/East')
datetime.now(tz_br).isoformat()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


'2022-10-17T10:51:44.442451-03:00'

#### Flat stations' observations

In [2]:
row_map = lambda row: row[1].add_suffix(' - ' + row[0])

def flat_observations(data):
    return pd.concat(list(map(row_map, data.iterrows())))

#### Calibrate predicted probability

In [3]:
def calibrate(prob, threshold=0.5):
    if prob < threshold:
        return 0.5 * prob / threshold
    else:
        return 0.5 + 0.5 * (prob - threshold) / (1 - threshold)

---
# Model deployment information

In [12]:
info_path = 'feature_info.csv'
deploy_info = pd.read_csv(info_path, index_col=0)

from alerta_deploy import alerta_feature_name_map, alerta_station_name_id_map

---
# Inmet bigquery request - python client library

In [196]:
project_id = 'pluvia-360323'
google_credentials = '../../../../Apps/Python/bolsao-api/credentials/pluvia-360323-eae2907a9c98.json'
credentials = service_account.Credentials.from_service_account_file(google_credentials)

query = '''
SELECT * FROM `datario.meio_ambiente_clima.meteorologia_inmet`
WHERE data_particao >= "{}"
ORDER BY data_particao DESC, horario DESC
'''

def inmet_bigquery_request():
    yesterday = (datetime.now(tz_br) - pd.offsets.Day()).date().isoformat()
    client = bigquery.Client(credentials=credentials)
    query_job = client.query(query.format(yesterday))
    inmet = pd.DataFrame(list(map(dict, query_job.result())))

    ### Inmet data preprocessing
    key_cols = ['primary_key', 'data_particao', 'horario']
    # Last available record per station
    last_records = inmet.groupby(['id_estacao']).first().drop(key_cols, axis=1)

    # Flat stations' readings
    return flat_observations(last_records)

# Inmet bigquery request - python client library
inmet_flat = inmet_bigquery_request(); inmet_flat.head()

pressao - A602                       1019.4
pressao_minima - A602                1019.3
pressao_maxima - A602                1019.5
temperatura_orvalho - A602             18.8
temperatura_orvalho_minimo - A602      18.5
dtype: float64

In [202]:
inmet_flat

pressao - A602                       1019.4
pressao_minima - A602                1019.3
pressao_maxima - A602                1019.5
temperatura_orvalho - A602             18.8
temperatura_orvalho_minimo - A602      18.5
temperatura_orvalho_maximo - A602      19.0
umidade - A602                         73.0
umidade_minima - A602                  68.0
umidade_maxima - A602                  73.0
temperatura - A602                     24.0
temperatura_minima - A602              24.0
temperatura_maxima - A602              25.1
rajada_vento_max - A602                 9.9
direcao_vento - A602                  102.0
velocidade_vento - A602                 5.0
radiacao_global - A602                853.6
acumulado_chuva_1_h - A602              0.0
pressao - A621                       1017.4
pressao_minima - A621                1016.9
pressao_maxima - A621                1017.4
temperatura_orvalho - A621             18.8
temperatura_orvalho_minimo - A621      18.5
temperatura_orvalho_maximo - A62

---
# Alerta-Rio API request

In [18]:
def alertario_api_request():
    AlertaAPI = r'http://websempre.rio.rj.gov.br/json/chuvas'
    alerta = pd.DataFrame(requests.get(AlertaAPI).json()['objects'])

    # Alerta-Rio data preprocessing
    alerta = pd.DataFrame(
        alerta['data'].tolist(),
        index=alerta['name'].map(alerta_station_name_id_map).astype('str')
    ).rename(columns=alerta_feature_name_map)

    # Flat stations observations
    return flat_observations(alerta)

# ---
# Alerta-Rio API request
alerta_flat = alertario_api_request(); alerta_flat.head()

m05 - 1                        0.0
acumulado_chuva_15_min - 1     0.0
mes - 1                       12.4
acumulado_chuva_96_h - 1       3.2
acumulado_chuva_24_h - 1       3.2
dtype: float64

---
# Feature transformation

In [213]:
def load_models(keys, path, file_fmt):
    models = {}
    for model_id in keys:
        path_model = path + str(model_id) + '/'
        if os.path.exists(path_model):
            models[model_id] = pickle.load(open(path_model + file_fmt.format(model_id), 'rb'))
    return models

def load_encoders(path):
    models = {}
    for file in os.listdir(path):
        model_id, ext = file.split('.')
        if ext == 'pickle':
            models[model_id] = pickle.load(open(path + file, 'rb'))
    return models

time_keys = [
    'month', 'day', 'hour', 'minute', 'time',
    'dayofyear', 'weekofyear', 'weekday', 'quarter'
]

def load_time_features(now, encoders):
    
    ts = pd.DatetimeIndex([now]).floor('15Min')
    values = [
        ts.month, ts.day, ts.hour, ts.minute, ts.time,
        ts.dayofyear, ts.weekofyear, ts.weekday, ts.quarter
    ]
    time_features = pd.DataFrame(np.array(values).T, index=ts, columns=time_keys)

    # data type conversion for label encoding
    float_cols = [key for key in time_keys if key != 'time']
    time_features[float_cols] = time_features[float_cols].astype('float')
    time_features['time'] = time_features['time'].astype(str)

    # Label encoding
    for col in time_features.columns:
        time_features[col] = encoders[col].transform(time_features[col])
    return time_features.iloc[0]

### Load models and encoders

In [214]:
path_clusters = '../Dados/Clusters/clusters_micro.csv'
clusters = pd.read_csv(path_clusters, index_col=0)['main_route']

### Load classification model for each cluster

models = load_models(clusters.index, 'Modelos/', file_fmt='{}.pickle')
encoders = load_encoders('Encoders/')

### Time features

In [None]:
now = datetime.now()
today = now.date().isoformat()
time = now.time().isoformat()[:8]

time_flat = load_time_features(now, encoders); time_flat

In [10]:
now.isoformat()

'2022-10-16T18:24:36.937044'

### Combine and transform observations from different sources

In [198]:
features = pd.concat([time_flat, inmet_flat, alerta_flat])

# Handle missing features
missing_cols = list(set(deploy_info.index).difference(features.index))
ignored_cols = list(set(features.index).difference(deploy_info.index))

if len(missing_cols) or len(ignored_cols):
    print(f'Missing features ({len(missing_cols)}):', missing_cols, '\n')
    print(f'Ignored features ({len(ignored_cols)}):', ignored_cols, '\n')
    for col in missing_cols:
        features[col] = deploy_info.loc[col, 'mean']

#### Reorder readings to match model input format
features = features.loc[deploy_info.index].to_frame().T
print('Features shape:', features.shape)

#### Fill missing values with the mean
na_msk = features.loc[0].isna()
features.loc[0, na_msk] = deploy_info['mean'][na_msk]

Missing features (28): ['velocidade_vento - A652', 'pressao - A652', 'rajada_vento_max - A654', 'temperatura_orvalho_maximo - A652', 'acumulado_chuva_1_h - A654', 'pressao - A654', 'temperatura_maxima - A652', 'pressao_maxima - A652', 'pressao_maxima - A654', 'radiacao_global - A654', 'acumulado_chuva_1_h - A652', 'temperatura_orvalho_maximo - A654', 'pressao_minima - A652', 'pressao_minima - A654', 'year', 'temperatura_orvalho_minimo - A652', 'rajada_vento_max - A652', 'date', 'temperatura - A652', 'radiacao_global - A652', 'temperatura_minima - A654', 'temperatura_orvalho - A654', 'temperatura_maxima - A654', 'temperatura_orvalho - A652', 'temperatura_minima - A652', 'velocidade_vento - A654', 'temperatura - A654', 'temperatura_orvalho_minimo - A654'] 

Ignored features (144): ['h02 - 30', 'm05 - 13', 'umidade_minima - A602', 'm05 - 19', 'h02 - 12', 'h03 - 1', 'h03 - 9', 'm05 - 4', 'm05 - 12', 'h02 - 29', 'h03 - 7', 'h03 - 25', 'mes - 9', 'h03 - 33', 'm05 - 20', 'h02 - 22', 'h03 - 16

---
# Multiple model prediction

In [168]:
def multi_model_prediction(models, features, time_info, names):
    
    predictions = []
    for model_id, model in models.items():
        ### Model output transformation
        yprob = model.predict_proba(features)[0][1]
#         yprob_cal = calibrate(yprob, model['metadata']['threshold'])
        yconf = abs(0.5 - yprob) / 0.5
        label = int(yprob >= 0.5)
        ### Prediction record
        predictions.append({
            'timestamp': time_info['now'],
            'date': time_info['today'],
            'time': time_info['time'],
            'cluster_id': model_id,
            'cluster': names[model_id],
            'range': '1h',
            'probability': round(yprob, 6),
            'confidence': round(yconf, 6),
            'label': label,
        })
    return predictions        

### Prediction record

In [169]:
now = datetime.now()
today = now.date().isoformat()
time = now.time().isoformat()[:8]
time_info = {'now': now, 'today': today, 'time': time}

predictions = multi_model_prediction(models, features, time_info, names=clusters); predictions

[{'timestamp': datetime.datetime(2022, 10, 15, 7, 56, 2, 725342),
  'date': '2022-10-15',
  'time': '07:56:02',
  'cluster_id': 0,
  'cluster': 'Avenida Armando Lombardi',
  'range': '1h',
  'probability': 0.01209,
  'confidence': 0.97582,
  'label': 0},
 {'timestamp': datetime.datetime(2022, 10, 15, 7, 56, 2, 725342),
  'date': '2022-10-15',
  'time': '07:56:02',
  'cluster_id': 1,
  'cluster': 'Rua do Catete',
  'range': '1h',
  'probability': 0.001561,
  'confidence': 0.996877,
  'label': 0},
 {'timestamp': datetime.datetime(2022, 10, 15, 7, 56, 2, 725342),
  'date': '2022-10-15',
  'time': '07:56:02',
  'cluster_id': 2,
  'cluster': 'Rua Tonelero',
  'range': '1h',
  'probability': 0.000638,
  'confidence': 0.998724,
  'label': 0},
 {'timestamp': datetime.datetime(2022, 10, 15, 7, 56, 2, 725342),
  'date': '2022-10-15',
  'time': '07:56:02',
  'cluster_id': 3,
  'cluster': 'Avenida Epitácio Pessoa',
  'range': '1h',
  'probability': 0.008609,
  'confidence': 0.982782,
  'label': 0}

---
# Save prediction to mongo database

In [867]:
conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=10000)

insert_result = client.Waterbag.Prediction.insert_many(predictions)
insert_result.inserted_ids

---
# API Endpoints - Retrieve predictions

In [70]:
def to_str_id(obj):
    obj['_id'] = str(obj['_id'])
    return obj

### Mongo client instance

In [65]:
conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"

## Predictions collection endpoint

#### Endpoint params description:

1. Documents will be filtered by the params matching fields in the documents.
2. Params can have multiple values divided by comma, i.e '/predictions?cluster_id=0,1,2'
3. Optional parameters:
    1. sort -> field to sort by. Default: 'timestamp'
    2. sort_order -> '1' or '-1'. Default: '-1'
    3. limit -> integer greater than 1. Default: None
    
#### Request url examples:

1. /predictions?cluster_id=0,1,2&date=2022-09-27
2. /predictions?cluster_id=0&sort=timestamp&sort_order=-1
3. /predictions?cluster_id=0&limit=100

In [233]:
from sklearn.model_selection import cross_validate, KFold

In [226]:
def url_param_processing(query):
    
    if 'limit' not in query.keys():
        limit = None            # Default limit
    else:
        limit = int(query['limit'])
        del query['limit']
    
    if 'sort' not in query.keys():
        sort_by = 'timestamp'   # Default sorting
        sort_order = -1
    else:
        sort_by = query['sort']
        del query['sort']
        if 'sort_order' not in query.keys():
            sort_order = -1     # Default sort order
        else:
            sort_order = int(query['sort_order'])
            del query['sort_order']
            
    query_spread = {key: {'$in': str(value).split(',')} for key, value in query.items()}    
    return query_spread, sort_by, sort_order, limit

def prediction_records(query):
    query_spread, sort_by, sort_order, limit = url_param_processing(query)
    print(
        'Endpoint Request: /predictions. Query Params:', query_spread,
        ' URL Params:', {'sort': sort_by, 'sort_order': sort_order, 'limit': limit}
    )
    client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=15000)
    docs = client.Waterbag.Prediction.find(query_spread).sort([(sort_by, sort_order)])
    if limit is not None:
        docs = docs.limit(limit)
    return list(map(to_str_id, docs)) # prediction object list

### Example query request

In [231]:
# query = request.args.to_dict() # Flask request url args
query = {'date': '2022-09-27', 'sort': 'cluster_id' , 'sort_order': '-1', 'limit': '3'}

prediction_records(query)

Endpoint Request: /predictions. Query Params: {'date': {'$in': ['2022-09-27']}}  URL Params: {'sort': 'cluster_id', 'sort_order': -1, 'limit': 3}


[{'_id': '6333205abf7dfe3425cd5d8c',
  'timestamp': datetime.datetime(2022, 9, 27, 16, 10),
  'date': '2022-09-27',
  'time': '13:10:00',
  'cluster_id': '4',
  'cluster': 'Avenida Ministro Ivan Lins',
  'range': '1h',
  'probability': 0.008267,
  'confidence': 0.983465,
  'label': '0'},
 {'_id': '63331f2ebf7dfe3425cd5d86',
  'timestamp': datetime.datetime(2022, 9, 27, 16, 5, 0, 1000),
  'date': '2022-09-27',
  'time': '13:05:00',
  'cluster_id': '4',
  'cluster': 'Avenida Ministro Ivan Lins',
  'range': '1h',
  'probability': 0.008267,
  'confidence': 0.983465,
  'label': '0'},
 {'_id': '63331e02bf7dfe3425cd5d80',
  'timestamp': datetime.datetime(2022, 9, 27, 16, 0),
  'date': '2022-09-27',
  'time': '13:00:00',
  'cluster_id': '4',
  'cluster': 'Avenida Ministro Ivan Lins',
  'range': '1h',
  'probability': 0.008267,
  'confidence': 0.983465,
  'label': '0'}]

## Last timestamp predictions

#### /predict endpoint

In [129]:
def last_prediction_record(limit=500):
    
    now = datetime.now(tz_br)
    today = now.date().isoformat()
    time = now.time().isoformat()[:8]
    yesterday = (now - pd.offsets.Day()).date().isoformat()
    
    sort_by = [('timestamp', -1), ('cluster_id', 1)]
    last_24h = {
        "$or": [{
            "date": today
        }, {
            '$and': [{'date': yesterday}, {'time': {'$gte': time}}]
        }]
    }
    
    ### Consult prediction database latest record
    client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=15000)
    first_docs = client.Waterbag.Prediction.find(last_24h).sort(sort_by).limit(limit)
    first_docs = pd.DataFrame(list(map(to_str_id, first_docs))) # prediction object list
    docs_clusters = first_docs.groupby('cluster_id', as_index=False).first()
    
    return list(docs_clusters.T.to_dict().values())

last_prediction_record(limit=500)[:3]

[{'cluster_id': 0,
  '_id': '633311214d7183838dc5561c',
  'timestamp': Timestamp('2022-09-27 15:05:00'),
  'date': '2022-09-27',
  'time': '12:05:00',
  'cluster': 'Avenida Armando Lombardi',
  'range': '1h',
  'probability': 0.010816,
  'confidence': 0.978369,
  'label': 0},
 {'cluster_id': 1,
  '_id': '633311214d7183838dc5561d',
  'timestamp': Timestamp('2022-09-27 15:05:00'),
  'date': '2022-09-27',
  'time': '12:05:00',
  'cluster': 'Rua do Catete',
  'range': '1h',
  'probability': 0.003329,
  'confidence': 0.993342,
  'label': 0},
 {'cluster_id': 2,
  '_id': '633311214d7183838dc5561e',
  'timestamp': Timestamp('2022-09-27 15:05:00'),
  'date': '2022-09-27',
  'time': '12:05:00',
  'cluster': 'Rua Tonelero',
  'range': '1h',
  'probability': 0.004038,
  'confidence': 0.991925,
  'label': 0}]

---
# Extra: Test predict endpoint

In [96]:
api_url = 'https://bolsoes-api.herokuapp.com'

print(requests.get(api_url + '/predict').text)

{"_id":"632c32223e18981a9a002da8","cluster":"Catete","confidence":0.9841241118565575,"date":"2022-09-22","probability":0.00793794407172125,"range":"1h","time":"07:00:00","timestamp":"Thu, 22 Sep 2022 10:00:00 GMT"}



## Extra: Clean database

In [62]:
conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=15000)

delete_res = client.Waterbag.Prediction.delete_many({})

In [63]:
delete_res.raw_result

{'n': 1267,
 'electionId': ObjectId('7fffffff000000000000000c'),
 'opTime': {'ts': Timestamp(1664233178, 1269), 't': 12},
 'ok': 1.0,
 '$clusterTime': {'clusterTime': Timestamp(1664233178, 1269),
  'signature': {'hash': b'\xd7\x806c"x\xed\xd9\x0bm\x04\xb6w\xfa\xd2\xddu\xb6\xf3\n',
   'keyId': 7088356184993824773}},
 'operationTime': Timestamp(1664233178, 1269)}