# Prepare environment

In [None]:
# NOTE(review): `!` hands this line to a shell, so the activation presumably only
# affects that short-lived shell process and not the running kernel — confirm, and
# if so start Jupyter from the activated virtualenv (or select its kernel) instead.
!source .venv/bin/activate

# Imports

In [None]:
import influxdb_client, os, time
from influxdb_client import InfluxDBClient, Point, WritePrecision
from influxdb_client.client.write_api import SYNCHRONOUS
import numpy as np
import pandas as pd
import mlflow
from mlflow.models import infer_signature, set_signature
import mlflow.keras
import yaml
from pickle import dump
import math
import os
import shutil
import matplotlib.pyplot as plt 
from datetime import datetime, timedelta

def _redact_secrets(value, sensitive_keys=("token", "password", "secret")):
    """Return a copy of a nested configuration with credential values masked.

    Dictionaries and lists are walked recursively; any dictionary entry whose key
    is listed in ``sensitive_keys`` has its value replaced by ``"***"``. Every
    other value is returned unchanged.
    """
    if isinstance(value, dict):
        return {
            key: "***" if key in sensitive_keys else _redact_secrets(item, sensitive_keys)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [_redact_secrets(item, sensitive_keys) for item in value]
    return value


# Load the notebook configuration. The rest of the notebook reads the
# "influx_db", "data", "features", "numeric_features", "targets" and "mlflow"
# entries from it.
with open('Presences_keras_random_forest.yaml', 'r') as file:
    variables = yaml.safe_load(file)

# Echo the configuration for traceability, but never print the InfluxDB token:
# cell output is stored inside the notebook file and is easily shared by accident.
print(f"{_redact_secrets(variables)}")


# Load data

In [None]:
# Connect to InfluxDB with the connection settings from the YAML configuration.
client = influxdb_client.InfluxDBClient(
    url=variables["influx_db"]["url"],
    token=variables["influx_db"]["token"],
    org=variables["influx_db"]["org"],
    # NOTE(review): TLS certificate verification is disabled — confirm this is intended.
    verify_ssl=False,
    timeout=180000)

query_api = client.query_api()

# |> range(start: 2024-02-20T17:00:00Z)
start_time = "2024-02-20T17:00:00Z"
# The query window opens five days before `start_time`; rows at or before
# `start_time` are filtered out again when the data is prepared.
query_start = datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%SZ") - timedelta(days=5)

# Flux template for a single entity. "entity_name" and the empty range start are
# placeholders that get substituted below.
base_query = """
        from(bucket: "homeassistant")
            |> range(start: )
            |> filter(fn: (r) => r["entity_id"] == "entity_name")
            |> filter(fn: (r) => r["_field"] == "value")
            |> fill(usePrevious: true)
            |> drop(columns: ["result", "table", "_start", "_stop", "_field", "source","domain","_measurement", "friendly_name"])
            |> pivot(rowKey: ["_time"], columnKey: ["entity_id"], valueColumn: "_value")
            |> yield(name: "last")"""
base_query = base_query.replace("start: ", f'start: {query_start.strftime("%Y-%m-%dT%H:%M:%SZ")}')

# The two variants of the drop() stage: the default keeps "entity_id" (used as the
# pivot column key), the field variant keeps "_field" instead.
_DROP_FOR_ENTITY_ID = 'drop(columns: ["result", "table", "_start", "_stop", "_field", "source","domain","_measurement", "friendly_name"])'
_DROP_FOR_FIELD = 'drop(columns: ["result", "table", "_start", "_stop", "entity_id", "source","domain","_measurement", "friendly_name"])'


def build_entity_query(template, entity_id, field="", aggregate=""):
    """Specialise the Flux ``template`` for one configured entity.

    Args:
        template: Flux query text containing the ``entity_name`` placeholder.
        entity_id: Value substituted for the first ``entity_name`` occurrence.
        field: Optional ``_field`` to select instead of ``"value"``. When given,
            the pivot uses ``_field`` as column key, so the resulting column is
            named after the field rather than the entity id.
        aggregate: Optional replacement for the ``last`` aggregate function.

    Returns:
        The query string with all substitutions applied.
    """
    query = template.replace("entity_name", entity_id, 1)
    if field:
        query = query.replace('r["_field"] == "value"', f'r["_field"] == "{field}"')
        query = query.replace('columnKey: ["entity_id"]', 'columnKey: ["_field"]')
        query = query.replace(_DROP_FOR_ENTITY_ID, _DROP_FOR_FIELD)
    if aggregate:
        # NOTE(review): the template above contains no 'fn: last,' text, so this
        # substitution currently changes nothing — confirm whether an
        # aggregateWindow stage is missing from the template.
        query = query.replace('fn: last,', f'fn: {aggregate},')
    return query


# Each entry of variables["data"] has the form "entity_id[:field[:extra]]".
full_df = None
# Default columns for entities that returned no data before the first frame arrived.
pending_defaults = {}
for entity in variables["data"]:
    print(entity)
    parts = entity.split(":")
    entity_id = parts[0]
    # Empty parts ("sensor.x:" or "sensor.x::mean") count as "not specified".
    field = parts[1] if len(parts) > 1 else ""
    aggregate = parts[2] if len(parts) > 2 else ""
    query = build_entity_query(base_query, entity_id, field, aggregate)
    # print(query)
    df = query_api.query_data_frame(query, org=variables["influx_db"]["org"])
    print(df.head())
    try:
        df = df.set_index('_time').drop(["result", "table"], axis=1)
    except KeyError:
        # The frame lacks the expected columns, i.e. the entity returned no rows.
        # Fall back to a constant column: the third part of the spec when one is
        # present, NaN otherwise.
        column = field or entity_id
        print(f"{column} was not found")
        default = parts[2] if len(parts) > 2 else np.nan
        if full_df is None:
            pending_defaults[column] = default
        else:
            full_df[column] = default
        continue
    if full_df is None:
        full_df = df.copy()
        for column, default in pending_defaults.items():
            full_df[column] = default
        pending_defaults.clear()
    else:
        full_df = full_df.join(df, on="_time", how='outer')

if full_df is None:
    raise RuntimeError("None of the configured entities returned any data")
full_df.head()

# Prepare data

In [None]:
# Quick look at the raw joined frame before any cleaning.
print(full_df.shape)
print(full_df.head())
feature_names = variables["features"]

# Interpolate the gaps of the numeric columns, then forward-fill whatever is
# still missing in any column.
for column in variables["numeric_features"]:
    full_df[column] = full_df[column].interpolate()
full_df = full_df.ffill()

# Combined status column: the sum of the "in_bed" and "presence" columns.
full_df["home_status"] = full_df["in_bed"] + full_df["presence"]

# Keep only rows where "ha_started" equals 1, turn the index into a column and
# discard everything up to `start_time`.
started_mask = full_df['ha_started'] == 1
full_df = full_df[started_mask].reset_index()
full_df = full_df[full_df['_time'] > start_time]

# Modelling frame: configured features plus targets, without missing values and
# without rows that repeat an already-seen feature combination.
new_df = full_df[variables["features"] + variables["targets"]].copy()
print(new_df.head())
new_df = (
    new_df
    .dropna()
    .reset_index()
    .drop_duplicates(feature_names, ignore_index=True)
)
print(new_df.dtypes)


print(new_df.shape)

target = new_df[variables["targets"]]
numeric_features = new_df[feature_names]
numeric_features.head(20)



# MLflow setup

In [None]:
# Point MLflow at the tracking server named in the YAML configuration and select
# the experiment used by the runs below. The URL comes from a single place so the
# environment variable and the in-process setting cannot disagree.
mlflow_url = str(variables["mlflow"]["url"])
# NOTE(review): this turns on MLflow's insecure-TLS mode — confirm it is needed.
os.environ["MLFLOW_TRACKING_INSECURE_TLS"] = "true"
os.environ["MLFLOW_TRACKING_URI"] = mlflow_url
mlflow.set_tracking_uri(uri=mlflow_url)
mlflow.set_experiment("Presence detection Keras.")


# Data transformation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    numeric_features,
    target,
    random_state=42,
    test_size=0.2,
)
# Shapes of the training and test feature frames, one per line.
print(X_train.shape, X_test.shape, sep="\n")


# Model, pipeline setup and fit

In [None]:
# Imports live at the top of the cell so that an import failure cannot leave a
# half-started MLflow run behind.
import tensorflow_decision_forests as tfdf
from sklearn.metrics import accuracy_score, log_loss, precision_score,recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import make_scorer, classification_report

# NOTE(review): of the constants below only VALIDATION_RATIO is passed to the
# model in this cell; the others are not referenced here.
# Maximum number of decision trees. The effective number of trained trees can be smaller if early stopping is enabled.
NUM_TREES = 250
# Minimum number of examples in a node.
MIN_EXAMPLES = 6
# Maximum depth of the tree. max_depth=1 means that all trees will be roots.
MAX_DEPTH = 5
# Ratio of the dataset (sampling without replacement) used to train individual trees for the random sampling method.
SUBSAMPLE = 0.65
# Control the sampling of the datasets used to train individual trees.
SAMPLING_METHOD = "RANDOM"
# Ratio of the training dataset used to monitor the training. Require to be >0 if early stopping is enabled.
VALIDATION_RATIO = 0.2

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single thread in that case.
cpu_count = os.cpu_count() or 1
n_jobs = int(math.floor((cpu_count)))

# The tracking URI and experiment are configured in the MLflow setup cell; they
# are not set again here because changing them inside an already active run has
# no effect on that run.
with mlflow.start_run() as run:

    mlflow.autolog()

    # Create a Random Search tuner with 50 trials and automatic hp configuration.
    tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True, trial_num_threads=16)

    model = tfdf.keras.GradientBoostedTreesModel(
        tuner=tuner,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
        verbose=2,
        num_threads=n_jobs,
    )

    # NOTE(review): precision_score is the scikit-learn function, not a Keras
    # metric — confirm it is usable here before re-enabling model.evaluate().
    model.compile(metrics=["accuracy", precision_score])

    # define the search
    history = model.fit(
        X_train.values,
        y_train.values,
        # callbacks=[mlflow.keras.MLflowCallback()],
    )
    # accuracy, f1_score, _ = model.evaluate(X_test.values, y_test.values)


    # print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    # mlflow.log_metric("Test accuracy", accuracy)

    #model = model.export_model()
    # summarize the loaded model
    model.summary()
    # The model is logged without a signature; the signature is inferred and
    # attached to the logged model in the cell that follows this one.
    mlflow.tensorflow.log_model(
        model, 'model',
        extra_pip_requirements=["tensorflow-decision-forests"]
    )

    # Predict on the same array representation the model was trained on.
    probabilities = model.predict(X_test.values)
    if probabilities.shape[1] == 1:
        # A single output column holds the probability of the positive class;
        # argmax over one column would always give 0, so threshold instead.
        # Assumes the two classes are labelled 0 and 1 — TODO confirm.
        pred = (probabilities[:, 0] > 0.5).astype(int)
    else:
        # One column per class: take the most probable class index.
        pred = np.argmax(probabilities, axis=1)
    class_report = classification_report(y_test, pred, output_dict=True)
    print(class_report)

    # from_predictions expects (y_true, y_pred); with normalize='true' each row
    # (true class) sums to 1.
    ConfusionMatrixDisplay.from_predictions(
        y_test.values,
        pred,
        cmap=plt.cm.Blues,
        normalize='true',
    )
    plt.savefig("test_confusion_matrix.png")

    inspector = model.make_inspector()
    print(inspector.evaluation())
    print(inspector.variable_importances())
    mlflow.log_metrics(inspector.evaluation().to_dict())

    # "accuracy" is a plain float in the report while every other entry is a dict
    # of metrics, so it is taken out before the loop and logged under its own
    # name (the inspector metrics above may already use the key "accuracy").
    test_accuracy = class_report.pop("accuracy", None)
    if test_accuracy is not None:
        mlflow.log_metric("test_accuracy", test_accuracy)
    for class_or_avg, metrics_dict in class_report.items():
        for metric, value in metrics_dict.items():
            mlflow.log_metric(class_or_avg + '_' + metric,value)
    



In [None]:
# Look up the run that was just recorded; its ids locate the artifact folder.
run_dict = mlflow.last_active_run().to_dictionary()
print(run_dict)
run_info = run_dict['info']

# Copy the confusion-matrix image straight into the run's artifact directory.
artifact_dir = f"/mnt/nfs/mlflow/{run_info['experiment_id']}/{run_info['run_id']}/artifacts/"
shutil.copy("test_confusion_matrix.png", artifact_dir)

# Infer the model signature from the full feature frame and attach it to the
# model that was logged under this run.
signature = infer_signature(numeric_features, model.predict(numeric_features))
set_signature(f"runs:/{run_info['run_id']}/model", signature)

In [None]:
# Display the tuning logs.
tuning_logs = model.make_inspector().tuning_logs()
print(tuning_logs.head())
plt.figure(figsize=(10, 5))
plt.plot(tuning_logs["score"], label="current trial")
plt.plot(tuning_logs["score"].cummin(), label="best trial")
plt.xlabel("Tuning step")
plt.ylabel("Tuning score")
plt.legend()
plt.savefig("tuning_steps.png")
shutil.copy("tuning_steps.png", f"/mnt/nfs/mlflow/{run_dict['info']['experiment_id']}/{run_dict['info']['run_id']}/artifacts/")


In [None]:
# Confusion matrix of the test-set predictions as raw counts (no normalisation).
# from_predictions expects the true labels first and the predictions second, so
# that rows are the true classes and columns the predicted classes.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test.values,
    y_pred=pred,
    cmap=plt.cm.Blues,
    # normalize='true',
)

In [None]:
# If model log fails run this:
# import tensorflow
# tensorflow.saved_model.save(
#     model,
#     f"/mnt/nfs/mlflow/{run_dict['info']['experiment_id']}/{run_dict['info']['run_id']}/artifacts/tfdf_model",
# )
