# Waterbag Model API Deployment Test Environment

### Utility functions

In [9]:
# pip install --upgrade google-cloud-bigquery
import os, json, pandas as pd, numpy as np, requests, pickle, pymongo
from datetime import datetime, timezone
from sklearn.preprocessing import MinMaxScaler as mms
from google.cloud import bigquery
from google.oauth2 import service_account
from datetime import datetime
import pytz; tz_br = pytz.timezone('Brazil/East')
datetime.now(tz_br).isoformat()

'2022-09-26T18:12:52.120368-03:00'

#### Flat stations' observations

In [28]:
row_map = lambda row: row[1].add_suffix(' - ' + row[0])

def flat_observations(data):
    return pd.concat(list(map(row_map, data.iterrows())))

#### Calibrate predicted probability

In [44]:
def calibrate(prob, threshold=0.5):
    if prob < threshold:
        return 0.5 * prob / threshold
    else:
        return 0.5 + 0.5 * (prob - threshold) / (1 - threshold)

---
# Model deployment information

In [30]:
info_path = 'deploy_info.csv'
deploy_info = pd.read_csv(info_path, index_col=0)

from alerta_deploy import alerta_feature_name_map, alerta_station_name_id_map

---
# Inmet bigquery request - python client library

In [27]:
project_id = 'pluvia-360323'
google_credentials = '../../../../Apps/Servers/bolsao-api/pluvia-360323-cba05f315c7e.json'
credentials = service_account.Credentials.from_service_account_file(google_credentials)

query = '''
SELECT * FROM `datario.meio_ambiente_clima.meteorologia_inmet`
WHERE data_particao >= "{}"
ORDER BY data_particao DESC, horario DESC
'''

def inmet_bigquery_request():
    yesterday = (datetime.now(tz_br) - pd.offsets.Day()).date().isoformat()
    client = bigquery.Client(credentials=credentials)
    query_job = client.query(query.format(yesterday))
    inmet = pd.DataFrame(list(map(dict, query_job.result())))

    ### Inmet data preprocessing
    key_cols = ['primary_key', 'data_particao', 'horario']
    # Last available record per station
    last_records = inmet.groupby(['id_estacao']).first().drop(key_cols, axis=1)

    # Flat stations' readings
    return flat_observations(last_records)

# Inmet bigquery request - python client library
inmet_flat = inmet_bigquery_request()

---
# Alerta-Rio API request

In [31]:
def alertario_api_request():
    AlertaAPI = r'http://websempre.rio.rj.gov.br/json/chuvas'
    alerta = pd.DataFrame(requests.get(AlertaAPI).json()['objects'])

    # Alerta-Rio data preprocessing
    alerta = pd.DataFrame(
        alerta['data'].tolist(),
        index=alerta['name'].map(alerta_station_name_id_map).astype('str')
    ).rename(columns=alerta_feature_name_map)

    # Flat stations observations
    return flat_observations(alerta)

# ---
# Alerta-Rio API request
alerta_flat = alertario_api_request()

---
# Feature transformation

### Combine and transform observations from both sources

In [32]:
#### Reorder readings to match model input format
features = pd.concat([inmet_flat, alerta_flat]).loc[deploy_info.index].to_frame().T
print(features.shape)

#### Min max scale transformation
scaler = mms().fit(deploy_info.T)
features.loc[features.index] = scaler.transform(features)

#### Fill missing values with variable minimum
na_msk = features.loc[0].isna()
features.loc[0, na_msk] = deploy_info['min'][na_msk]

(1, 217)


---
# Multiple model prediction

In [56]:
def load_models(clusters, path_models, file_model):
    models = {}
    for model_id in clusters.index:
        model_name = clusters.loc[model_id]
        path_model = path_models + model_name + '-' + str(model_id) + '/'
        if os.path.exists(path_model):
            models[model_id] = {
                'name': model_name,
                'model': pickle.load(open(path_model + file_model.format(model_id), 'rb')),
                'metadata': json.load(open(path_model + 'metadata.json', 'r'))
            }
    return models

path_clusters = '../Dados/Clusters/clusters_micro.csv'
clusters = pd.read_csv(path_clusters, index_col=0)['main_route']

### Load classification model for each cluster
path_models = 'Modelos/'
file_model = 'model_{}.pickle'

models = load_models(clusters, path_models, file_model)

### Prediction record

In [59]:
def multi_model_prediction(models, features, time_info):
    
    predictions = []
    for model_id, model in models.items():
        ### Model output transformation
        yprob = model['model'].predict_proba(features)[0][1]
#         yprob_cal = calibrate(yprob, model['metadata']['threshold'])
        yconf = abs(0.5 - yprob) / 0.5
        label = int(yprob >= 0.5)
        ### Prediction record
        predictions.append({
            'timestamp': time_info['now'],
            'date': time_info['today'],
            'time': time_info['time'],
            'cluster_id': model_id,
            'cluster': model['name'],
            'range': '1h',
            'probability': round(yprob, 6),
            'confidence': round(yconf, 6),
            'label': label,
        })
    return predictions
        

now = datetime.now()
today = now.date().isoformat()
time = now.time().isoformat()[:8]

time_info = {'now': now, 'today': today, 'time': time}

predictions = multi_model_prediction(models, features, time_info)

In [60]:
predictions

[{'timestamp': datetime.datetime(2022, 9, 26, 18, 50, 41, 788350),
  'date': '2022-09-26',
  'time': '18:50:41',
  'cluster_id': 0,
  'cluster': 'Avenida Armando Lombardi',
  'range': '1h',
  'probability': 0.011933,
  'confidence': 0.976134,
  'label': 0},
 {'timestamp': datetime.datetime(2022, 9, 26, 18, 50, 41, 788350),
  'date': '2022-09-26',
  'time': '18:50:41',
  'cluster_id': 1,
  'cluster': 'Rua do Catete',
  'range': '1h',
  'probability': 0.003329,
  'confidence': 0.993342,
  'label': 0}]

---
# Save prediction to mongo database

In [867]:
conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=10000)

insert_result = client.Waterbag.Prediction.insert_many(predictions)
insert_result.inserted_ids

---
# API Endpoints - Retrieve predictions

In [70]:
def to_str_id(obj):
    obj['_id'] = str(obj['_id'])
    return obj

### Mongo client instance

In [65]:
conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"

## Predictions collection endpoint

#### Endpoint params description:

1. Documents will be filtered by the params matching fields in the documents.
2. Params can have multiple values divided by comma, i.e '/predictions?cluster_id=0,1,2'
3. Optional parameters:
    1. sort -> field to sort by. Default: 'timestamp'
    2. sort_order -> '1' or '-1'. Default: '-1'
    3. limit -> integer greater than 1. Default: None
    
#### Request url examples:

1. /predictions?cluster_id=0,1,2&date=2022-09-27
2. /predictions?cluster_id=0&sort=timestamp&sort_order=-1
3. /predictions?cluster_id=0&limit=100

In [233]:
from sklearn.model_selection import cross_validate, KFold

In [226]:
def url_param_processing(query):
    
    if 'limit' not in query.keys():
        limit = None            # Default limit
    else:
        limit = int(query['limit'])
        del query['limit']
    
    if 'sort' not in query.keys():
        sort_by = 'timestamp'   # Default sorting
        sort_order = -1
    else:
        sort_by = query['sort']
        del query['sort']
        if 'sort_order' not in query.keys():
            sort_order = -1     # Default sort order
        else:
            sort_order = int(query['sort_order'])
            del query['sort_order']
            
    query_spread = {key: {'$in': str(value).split(',')} for key, value in query.items()}    
    return query_spread, sort_by, sort_order, limit

def prediction_records(query):
    query_spread, sort_by, sort_order, limit = url_param_processing(query)
    print(
        'Endpoint Request: /predictions. Query Params:', query_spread,
        ' URL Params:', {'sort': sort_by, 'sort_order': sort_order, 'limit': limit}
    )
    client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=15000)
    docs = client.Waterbag.Prediction.find(query_spread).sort([(sort_by, sort_order)])
    if limit is not None:
        docs = docs.limit(limit)
    return list(map(to_str_id, docs)) # prediction object list

### Example query request

In [231]:
# query = request.args.to_dict() # Flask request url args
query = {'date': '2022-09-27', 'sort': 'cluster_id' , 'sort_order': '-1', 'limit': '3'}

prediction_records(query)

Endpoint Request: /predictions. Query Params: {'date': {'$in': ['2022-09-27']}}  URL Params: {'sort': 'cluster_id', 'sort_order': -1, 'limit': 3}


[{'_id': '6333205abf7dfe3425cd5d8c',
  'timestamp': datetime.datetime(2022, 9, 27, 16, 10),
  'date': '2022-09-27',
  'time': '13:10:00',
  'cluster_id': '4',
  'cluster': 'Avenida Ministro Ivan Lins',
  'range': '1h',
  'probability': 0.008267,
  'confidence': 0.983465,
  'label': '0'},
 {'_id': '63331f2ebf7dfe3425cd5d86',
  'timestamp': datetime.datetime(2022, 9, 27, 16, 5, 0, 1000),
  'date': '2022-09-27',
  'time': '13:05:00',
  'cluster_id': '4',
  'cluster': 'Avenida Ministro Ivan Lins',
  'range': '1h',
  'probability': 0.008267,
  'confidence': 0.983465,
  'label': '0'},
 {'_id': '63331e02bf7dfe3425cd5d80',
  'timestamp': datetime.datetime(2022, 9, 27, 16, 0),
  'date': '2022-09-27',
  'time': '13:00:00',
  'cluster_id': '4',
  'cluster': 'Avenida Ministro Ivan Lins',
  'range': '1h',
  'probability': 0.008267,
  'confidence': 0.983465,
  'label': '0'}]

## Last timestamp predictions

#### /predict endpoint

In [129]:
def last_prediction_record(limit=500):
    
    now = datetime.now(tz_br)
    today = now.date().isoformat()
    time = now.time().isoformat()[:8]
    yesterday = (now - pd.offsets.Day()).date().isoformat()
    
    sort_by = [('timestamp', -1), ('cluster_id', 1)]
    last_24h = {
        "$or": [{
            "date": today
        }, {
            '$and': [{'date': yesterday}, {'time': {'$gte': time}}]
        }]
    }
    
    ### Consult prediction database latest record
    client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=15000)
    first_docs = client.Waterbag.Prediction.find(last_24h).sort(sort_by).limit(limit)
    first_docs = pd.DataFrame(list(map(to_str_id, first_docs))) # prediction object list
    docs_clusters = first_docs.groupby('cluster_id', as_index=False).first()
    
    return list(docs_clusters.T.to_dict().values())

last_prediction_record(limit=500)[:3]

[{'cluster_id': 0,
  '_id': '633311214d7183838dc5561c',
  'timestamp': Timestamp('2022-09-27 15:05:00'),
  'date': '2022-09-27',
  'time': '12:05:00',
  'cluster': 'Avenida Armando Lombardi',
  'range': '1h',
  'probability': 0.010816,
  'confidence': 0.978369,
  'label': 0},
 {'cluster_id': 1,
  '_id': '633311214d7183838dc5561d',
  'timestamp': Timestamp('2022-09-27 15:05:00'),
  'date': '2022-09-27',
  'time': '12:05:00',
  'cluster': 'Rua do Catete',
  'range': '1h',
  'probability': 0.003329,
  'confidence': 0.993342,
  'label': 0},
 {'cluster_id': 2,
  '_id': '633311214d7183838dc5561e',
  'timestamp': Timestamp('2022-09-27 15:05:00'),
  'date': '2022-09-27',
  'time': '12:05:00',
  'cluster': 'Rua Tonelero',
  'range': '1h',
  'probability': 0.004038,
  'confidence': 0.991925,
  'label': 0}]

---
# Extra: Test predict endpoint

In [96]:
api_url = 'https://bolsoes-api.herokuapp.com'

print(requests.get(api_url + '/predict').text)

{"_id":"632c32223e18981a9a002da8","cluster":"Catete","confidence":0.9841241118565575,"date":"2022-09-22","probability":0.00793794407172125,"range":"1h","time":"07:00:00","timestamp":"Thu, 22 Sep 2022 10:00:00 GMT"}



## Extra: Clean database

In [62]:
conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"
client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=15000)

delete_res = client.Waterbag.Prediction.delete_many({})

In [63]:
delete_res.raw_result

{'n': 1267,
 'electionId': ObjectId('7fffffff000000000000000c'),
 'opTime': {'ts': Timestamp(1664233178, 1269), 't': 12},
 'ok': 1.0,
 '$clusterTime': {'clusterTime': Timestamp(1664233178, 1269),
  'signature': {'hash': b'\xd7\x806c"x\xed\xd9\x0bm\x04\xb6w\xfa\xd2\xddu\xb6\xf3\n',
   'keyId': 7088356184993824773}},
 'operationTime': Timestamp(1664233178, 1269)}