In [1]:
from experiment.api import mlflow as mlflow_api
from experiment.utils import transformation

import pandas as pd
import numpy as np
import pathlib
import json
import collections

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Project wrapper around MLflow; later cells use it to read/update the model
# config, start the tracking server and log the experiment run.
mlflow = mlflow_api.MLFlow()

# clean the environment without garbage collection
# NOTE(review): the recorded output of this cell is a `kill: usage ...` message,
# which suggests clean() invoked `kill` without a pid — confirm in mlflow_api.
mlflow.clean()

kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]


In [3]:
# Run the tracking server in the background.
# The recorded output shows gunicorn listening at http://0.0.0.0:9999 with
# four sync workers.
# NOTE(review): 0.0.0.0 binds every network interface — confirm this is intended.
mlflow.run_server()

[2023-08-22 15:13:25 +0200] [48481] [INFO] Starting gunicorn 21.2.0
[2023-08-22 15:13:25 +0200] [48481] [INFO] Listening at: http://0.0.0.0:9999 (48481)
[2023-08-22 15:13:25 +0200] [48481] [INFO] Using worker: sync
[2023-08-22 15:13:25 +0200] [48482] [INFO] Booting worker with pid: 48482
[2023-08-22 15:13:25 +0200] [48483] [INFO] Booting worker with pid: 48483
[2023-08-22 15:13:25 +0200] [48484] [INFO] Booting worker with pid: 48484
[2023-08-22 15:13:25 +0200] [48485] [INFO] Booting worker with pid: 48485


In [None]:
# Resolve the experiment's configuration file below the project root, then
# load it through the MLflow helper.
model_config_dir = transformation.get_project_root() / "data" / "input" / "model_config"
model_config_path = model_config_dir / "nlp_experiment.json"

model_config = mlflow.get_model_config(model_config_path)

In [None]:
# Load the cleaned annotations CSV from the project's data/output folder.
annotations_csv = (
    transformation.get_project_root() / "data" / "output" / "clean_annotations.csv"
)
clean_annotations = pd.read_csv(annotations_csv)

# skip the last report: keep every row except the final one
sentences = clean_annotations["relevant_text"].iloc[:-1].to_list()
clean_labels = clean_annotations["classifications"].iloc[:-1].to_list()

In [None]:
# Class distribution of the annotations.
# NOTE(review): this counts the full "classifications" column, i.e. it still
# includes the last report that is dropped from `clean_labels` — confirm intended.
classification_values = clean_annotations["classifications"].to_list()
item_counts = collections.Counter(classification_values)

# Class legend:
# 1. Emergency
# 2. Normal
# 3. Non Emergency [Doctor]
# 4. Non Emergency [No Doctor]
for item, count in item_counts.items():
    print(f"Item {item} occurs {count} times in the list.")

In [None]:
# Preview the first five raw sentences (before the cleaning pipeline runs).
sentences[:5]

In [None]:
# Clean the raw sentences, then measure the longest cleaned sentence; that
# length is used later as the padding length and the embedding input length.
clean_sentences = transformation.sentence_cleaning_pipeline(sentences)
max_input_length = transformation.find_longest_sentence_length(clean_sentences)

# Write the measured length back to the model configuration file.
config_update = {"max_input_length": max_input_length}
mlflow.set_model_config(model_config_path, config_update)

In [None]:
# Report how many cleaned sentences there are, then preview the first five.
clean_sentence_total = len(clean_sentences)
print(f"clean sentences: {clean_sentence_total}")

clean_sentences[:5]

In [None]:
# # manually test the word lemmatizer
# import simplemma
# word = "hemisferde"
# simplemma.lemmatize(word, lang="tr").lower()

In [None]:
# Sequential train/test split: the first `training_percent` percent of the
# samples go to training, the remainder to testing.
# NOTE(review): the data is not shuffled before splitting — confirm the CSV
# row order is not grouped by class.
# NOTE(review): assumes clean_sentences and clean_labels have the same length
# and order — verify the cleaning pipeline never drops sentences.
sample_total = len(clean_sentences)
training_size = int(sample_total * model_config["training_percent"] / 100)

training_sentences, testing_sentences = (
    clean_sentences[:training_size],
    clean_sentences[training_size:],
)
training_labels, testing_labels = (
    clean_labels[:training_size],
    clean_labels[training_size:],
)

In [None]:
# Build the vocabulary from the training sentences only, so the test split
# does not influence the word index.
tokenizer = Tokenizer(
    oov_token=model_config["oov_token"],
    num_words=model_config["vocab_size"],
)
tokenizer.fit_on_texts(training_sentences)

# Both splits are padded/truncated with exactly the same settings.
padding_options = {
    "maxlen": max_input_length,
    "padding": model_config["padding_type"],
    "truncating": model_config["trunc_type"],
}

# Convert each sentence into a sequence of word indices.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Fixed-length inputs and labels as NumPy arrays, ready for model.fit.
training_padded = np.array(pad_sequences(training_sequences, **padding_options))
testing_padded = np.array(pad_sequences(testing_sequences, **padding_options))
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [None]:
# Persist the fitted tokenizer as JSON; the run-logging cell attaches this
# file to the run as an extra artifact.
tokenizer_path = transformation.get_project_root() / "tmp" / "data" / "tokenizer.json"

# A fresh checkout may not contain tmp/data yet; create it so open() does not
# fail with FileNotFoundError.
pathlib.Path(tokenizer_path).parent.mkdir(parents=True, exist_ok=True)

# Pin the encoding so the written file does not depend on the platform default.
with open(tokenizer_path, "w", encoding="utf-8") as file:
    file.write(tokenizer.to_json())

In [None]:
# training_padded

In [None]:
# Feed-forward text classifier: embedding -> global average pooling ->
# stack of ReLU dense layers -> 4-way softmax.
# NOTE(review): sparse_categorical_crossentropy with 4 output units expects
# integer labels in 0..3, while the class legend in this notebook is numbered
# 1-4 — confirm how "classifications" is encoded.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=model_config["vocab_size"],
    output_dim=model_config["embedding_dim"],
    input_length=max_input_length,
)
hidden_layers = [
    tf.keras.layers.Dense(units, activation="relu") for units in (64, 64, 64, 32)
]
output_layer = tf.keras.layers.Dense(4, activation="softmax")

model = tf.keras.Sequential(
    [
        embedding_layer,
        tf.keras.layers.GlobalAveragePooling1D(),
        *hidden_layers,
        output_layer,
    ]
)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train the classifier; the test split doubles as validation data, so
# `history` carries both training and validation metrics per epoch.
history = model.fit(
    training_padded,
    training_labels,
    epochs=model_config["num_epochs"],
    batch_size=model_config["batch_size"],
    validation_data=(testing_padded, testing_labels),
    # 0 is the documented "silent" value; the previous `None` is not a
    # documented option and only suppressed output because it is falsy.
    verbose=0,
)

In [None]:
# Plot the "accuracy" series of the training history with the project helper.
# NOTE(review): presumably draws training and validation curves — confirm in
# transformation.plot_graphs.
transformation.plot_graphs(history, "accuracy")

In [None]:
# Plot the "loss" series of the training history with the project helper.
# NOTE(review): presumably draws training and validation curves — confirm in
# transformation.plot_graphs.
transformation.plot_graphs(history, "loss")

In [None]:
# log a model run
log_dict = {
    # model_config was loaded before max_input_length was computed, so the
    # length actually used for padding and the embedding is recorded explicitly.
    # NOTE(review): confirm whether set_model_config also refreshes this dict;
    # if it does, the override is a harmless no-op.
    "params": {**model_config, "max_input_length": max_input_length},
    # Final-epoch value of every tracked training/validation series.
    "metrics": {
        "accuracy": history.history["accuracy"][-1],
        "loss": history.history["loss"][-1],
        "val_loss": history.history["val_loss"][-1],
        "val_accuracy": history.history["val_accuracy"][-1],
    },
}

# Attach the tokenizer JSON written earlier so the run can be reproduced with
# the same vocabulary.
extra_artifacts = {
    "tokenizer": {
        "local_path": tokenizer_path,
        "save_path": "data",
    }
}

# NOTE(review): the run/registered names say "RNN", but the model built in this
# notebook has no recurrent layer — confirm the naming.
run_id = mlflow.log_experiment_run(
    model=model,
    experiment_name="NLP Experiments",
    # Plain string: the previous f-string had no placeholders.
    run_name="RNN: first_run",
    log_dict=log_dict,
    registered_model_name="rnn_experiments",
    extra_artifacts=extra_artifacts,
    tags={"model": "deep_learning"},
)