In [21]:
from experiment.api import mlflow as mlflow_api
from experiment.utils import transformation

import numpy as np
import pandas as pd
import pathlib

# import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# Directory the kernel is currently running in; the data and config paths used
# later in this notebook are resolved relative to its parent directory.
working_dir = pathlib.Path.cwd()


In [23]:
# Create the project's MLFlow wrapper and call its clean-up routine.
# NOTE(review): what `clean(gc=True)` actually removes is not visible here. The
# recorded output is a `kill: usage ...` message, which suggests a `kill`
# command was invoked without a PID — verify in experiment.api.mlflow.
mlflow = mlflow_api.MLFlow()
mlflow.clean(gc=True)

kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]


In [24]:
# Load the cleaned annotations CSV from ../data/output (relative to the
# working directory).
clean_annotations = pd.read_csv(
    working_dir.parent / "data" / "output" / "clean_annotations.csv"
)

# NOTE(review): the comment here used to say "skip the last report", but
# `[-1:]` does the opposite: it keeps ONLY the last row (a list with at most
# one element). If the intent really is to drop the last report, the slice
# should be `[:-1]` — confirm which behavior is wanted.
sentences = clean_annotations["relevant_text"].to_list()[-1:]
clean_labels = clean_annotations["classifications"].to_list()[-1:]

In [25]:
# Run the project's sentence-cleaning pipeline over the selected sentences.
# NOTE(review): the exact steps live in experiment.utils.transformation; the
# recorded output shows the NLTK stopwords package being checked, so stop-word
# handling is presumably involved — confirm.
clean_sentences = transformation.sentence_cleaning_pipeline(sentences)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gokasci/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Location of the NLP experiment's JSON model settings, resolved relative to
# the parent of the working directory.
model_config_path = working_dir.parent.joinpath(
    "data", "input", "model_config", "nlp_experiment.json"
)

# Load the settings through the project's MLFlow wrapper. Later cells read the
# "vocab_size", "oov_token", "max_input_length", "padding_type" and
# "trunc_type" entries from the result.
model_config = mlflow.get_model_config(model_config_path)

In [27]:
# Start the server through the project's MLFlow wrapper.
# NOTE(review): the recorded logs show gunicorn listening on
# http://127.0.0.1:9999, presumably started by this call — confirm.
mlflow.run_server()

In [28]:
# Look up the best run for "NLP Experiments" (presumably the experiment name)
# ranked by "val_accuracy". Per the recorded output the result is a dict with
# 'run_id' and 'metrics' keys, so despite its name `best_metric` describes a
# run, not just a score.
best_metric = mlflow.get_best_run_by_metric("NLP Experiments", "val_accuracy")
best_metric

[2023-08-19 22:18:43 +0200] [27739] [INFO] Starting gunicorn 21.2.0
[2023-08-19 22:18:43 +0200] [27739] [INFO] Listening at: http://127.0.0.1:9999 (27739)
[2023-08-19 22:18:43 +0200] [27739] [INFO] Using worker: sync
[2023-08-19 22:18:43 +0200] [27741] [INFO] Booting worker with pid: 27741
[2023-08-19 22:18:43 +0200] [27742] [INFO] Booting worker with pid: 27742
[2023-08-19 22:18:43 +0200] [27743] [INFO] Booting worker with pid: 27743
[2023-08-19 22:18:43 +0200] [27744] [INFO] Booting worker with pid: 27744


{'run_id': 'ec7b2b48613d43418fd0f4c965a7d8b3', 'metrics': 0.6363636255264282}

In [29]:
# Serve the model belonging to the best run's id via the project's MLFlow
# wrapper. The recorded logs show an MLflow pyfunc scoring server listening on
# http://127.0.0.1:1234.
mlflow.serve_model(best_metric["run_id"])

In [30]:
# Keras tokenizer configured from the model config: `num_words` caps the
# vocabulary at "vocab_size", and "oov_token" stands in for words outside it.
tokenizer = Tokenizer(
    num_words=model_config["vocab_size"], oov_token=model_config["oov_token"]
)

In [31]:
# predict a sentence
# NOTE(review): the tokenizer is fitted here on the very sentences being
# predicted, so the word index is derived from the inference text itself. The
# training-time tokenizer is not visible in this notebook; confirm these ids
# line up with the vocabulary the served model was trained on.
tokenizer.fit_on_texts(clean_sentences)
clean_sequences = tokenizer.texts_to_sequences(clean_sentences)

# Pad/truncate every sequence to the configured input length. pad_sequences
# already returns a 2-D NumPy array, so no extra np.array() copy is needed.
clean_sequences_padded = pad_sequences(
    clean_sequences,
    maxlen=model_config["max_input_length"],
    padding=model_config["padding_type"],
    truncating=model_config["trunc_type"],
)
# predictions = model.predict(clean_sequences_padded)
# predictions = [np.argmax(prediction) for prediction in predictions]

1. Emergency
2. Normal
3. Non Emergency [Doctor]
4. Non Emergency [No Doctor]

In [32]:
# def get_prediction_from_server(row, mlflow: mlflow_api.MLFlow):
#     """
#     This function receives the response from the machine learning server.
#     """

#     row = row[:-1]
#     data = {"dataframe_split": {"columns": list(X_train.columns), "data": [row]}}
#     response = mlflow.get_predictions(data)

#     return response.json()

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00,  5.91it/s]
2023/08/19 22:18:53 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 10.73it/s]   
2023/08/19 22:18:54 INFO mlflow.pyfunc.backend: === Running command 'exec gunicorn --timeout=60 -b 127.0.0.1:1234 -w 1 ${GUNICORN_CMD_ARGS} -- mlflow.pyfunc.scoring_server.wsgi:app'
[2023-08-19 22:18:54 +0200] [27767] [INFO] Starting gunicorn 21.2.0
[2023-08-19 22:18:54 +0200] [27767] [INFO] Listening at: http://127.0.0.1:1234 (27767)
[2023-08-19 22:18:54 +0200] [27767] [INFO] Using worker: sync
[2023-08-19 22:18:54 +0200] [27768] [INFO] Booting worker with pid: 27768
