# Prepare environment

In [None]:
# NOTE(review): `!` runs this command in a temporary subshell, so the activation
# does not carry over to the running kernel. Start Jupyter from the activated
# .venv (or select its kernel) instead — TODO confirm this line is still needed.
!source .venv/bin/activate

# Imports

In [None]:
import influxdb_client, os, time
from influxdb_client import InfluxDBClient, Point, WritePrecision
from influxdb_client.client.write_api import SYNCHRONOUS
import numpy as np
import pandas as pd
import mlflow
from mlflow.models import infer_signature, set_signature
import mlflow.keras
import yaml
from pickle import dump
import math
import os
import shutil
import matplotlib.pyplot as plt 
from datetime import datetime, timedelta

# Load the notebook configuration. Later cells read the keys "influx_db"
# (url, token, org), "data", "features", "numeric_features", "targets" and
# "mlflow" (url) from this mapping.
with open('Presences_keras.yaml', 'r') as file:
    variables = yaml.safe_load(file)

# Key fragments that mark a configuration entry as a credential.
_SECRET_KEY_HINTS = ("token", "password", "secret", "api_key")


def _redacted(value):
    """Return a copy of ``value`` with credential-looking mapping entries masked.

    Nested dicts and lists are walked recursively; every other value is
    returned unchanged.
    """
    if isinstance(value, dict):
        return {
            key: "***" if any(hint in str(key).lower() for hint in _SECRET_KEY_HINTS)
            else _redacted(item)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [_redacted(item) for item in value]
    return value


# Show the configuration without writing the InfluxDB token (or any other
# secret) into the saved notebook output.
print(f"{_redacted(variables)}")


# Load data

In [None]:
# Connect to InfluxDB with the settings from the YAML configuration.
# NOTE(review): verify_ssl=False turns off TLS certificate verification —
# confirm this is intended for the target server.
client = influxdb_client.InfluxDBClient(
    url=variables["influx_db"]["url"],
    token=variables["influx_db"]["token"],
    org=variables["influx_db"]["org"],
    verify_ssl=False,
    timeout=180000)

query_api = client.query_api()

# |> range(start: 2024-02-20T17:00:00Z)
start_time = "2024-03-23T00:00:00Z"
# Query from five days before `start_time`; a later cell keeps only rows after
# `start_time` (presumably the extra days seed the forward-fill — confirm).
query_start = datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%SZ") - timedelta(days=5) 

# Flux template: "entity_name" and the empty range start are substituted below.
base_query = """
        from(bucket: "homeassistant")
            |> range(start: )
            |> filter(fn: (r) => r["entity_id"] == "entity_name")
            |> filter(fn: (r) => r["_field"] == "value")
            |> fill(usePrevious: true)
            |> drop(columns: ["result", "table", "_start", "_stop", "_field", "source","domain","_measurement", "friendly_name"])
            |> pivot(rowKey: ["_time"], columnKey: ["entity_id"], valueColumn: "_value")
            |> yield(name: "last")"""
base_query = base_query.replace("start: ", f'start: {datetime.strftime(query_start, "%Y-%m-%dT%H:%M:%SZ")}')

full_df = None          # merged frame; created by the first entity that returns rows
pending_defaults = {}   # placeholder columns requested before any data was loaded
for entity in variables["data"]:
    print(entity)
    # Each entry has the form "entity_id[:field[:extra]]". The third part is used
    # both for the `fn:` substitution below and as the fill value when the entity
    # cannot be loaded.
    entity = entity.split(":")
    field = entity[1] if len(entity) > 1 else ""
    query = base_query.replace("entity_name", entity[0], 1)
    # Compare the strings themselves: the previous `{entity[1]} != ""` compared a
    # one-element set with a string and was therefore always True.
    if field != "":
        # A specific field was requested: filter on it, pivot by field name and
        # drop `entity_id` instead of `_field`.
        query = query.replace('r["_field"] == "value"', f'r["_field"] == "{field}"')
        query = query.replace('columnKey: ["entity_id"]', 'columnKey: ["_field"]')
        query = query.replace(
            'drop(columns: ["result", "table", "_start", "_stop", "_field", "source","domain","_measurement", "friendly_name"])',
            'drop(columns: ["result", "table", "_start", "_stop", "entity_id", "source","domain","_measurement", "friendly_name"])'
        )
    if len(entity) > 2 and entity[2] != "":
        # NOTE(review): base_query contains no 'fn: last,' text, so this
        # replacement currently leaves the query unchanged.
        query = query.replace('fn: last,', f'fn: {entity[2]},')
    # print(query)
    df = query_api.query_data_frame(query, org=variables["influx_db"]["org"])
    print(df.head())
    try:
        df.set_index('_time', inplace=True)
        df.drop(["result", "table"], axis=1, inplace=True)
        if full_df is None:
            full_df = df.copy()
        else:
            full_df = full_df.join(df,on="_time", how='outer')
    except KeyError:
        # The result lacks the expected columns (e.g. nothing was returned):
        # add a placeholder column so later cells still find it. Without a
        # field the pivot names the column after the entity id.
        column = field if field != "" else entity[0]
        print(f"{column} was not found")
        default = entity[2] if len(entity) > 2 else np.nan
        if full_df is None:
            # No frame to attach the column to yet; add it once data exists.
            pending_defaults[column] = default
        else:
            full_df[column] = default

if full_df is None:
    raise RuntimeError("None of the configured entities returned data from InfluxDB")
for column, default in pending_defaults.items():
    full_df[column] = default
full_df.head()

# Prepare data

In [None]:
# Quick look at the merged frame before any cleaning.
print(full_df.shape)
print(full_df.head())
feature_names = variables["features"]

# Interpolate gaps in every numeric feature column.
for numeric_column in variables["numeric_features"]:
    full_df[numeric_column] = full_df[numeric_column].interpolate()

# Forward-fill what is still missing, derive the combined status column, keep
# only rows where `ha_started` equals 1 and rows later than `start_time`.
full_df = full_df.ffill()
full_df["home_status"] = full_df["in_bed"] + full_df["presence"]
full_df = full_df.loc[full_df["ha_started"] == 1].reset_index()
full_df = full_df.loc[full_df["_time"] > start_time]

# Restrict to the configured feature and target columns.
selected_columns = variables["features"] + variables["targets"]
new_df = full_df[selected_columns].copy()
print(new_df.head())

# Drop incomplete rows, then rows whose feature values repeat an earlier row.
new_df = (
    new_df
    .dropna()
    .reset_index()
    .drop_duplicates(subset=feature_names, ignore_index=True)
)
print(new_df.dtypes)


print(new_df.shape)

# Split the cleaned frame into model targets and model inputs.
target = new_df[variables["targets"]]
numeric_features = new_df[feature_names]
numeric_features.head(20)



# MLflow setup

In [None]:
# Point MLflow at the tracking server named in the YAML configuration. The
# environment variable and the in-process setting use the same value so they
# cannot disagree; previously a hard-coded address was set first and then
# immediately overridden by the configured one.
tracking_uri = str(variables["mlflow"]["url"])
# NOTE(review): this flag relaxes TLS checks for tracking requests — confirm it
# is still required for the configured server.
os.environ["MLFLOW_TRACKING_INSECURE_TLS"] = "true"
os.environ["MLFLOW_TRACKING_URI"] = tracking_uri
mlflow.set_tracking_uri(uri=tracking_uri)
mlflow.set_experiment("Presence detection generalization keras.")

# Enable automatic logging for the training performed in later cells.
mlflow.autolog()

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Flatten, Conv1D, MaxPooling1D, Normalization, Dropout, Reshape, Conv2D, MaxPooling2D, Input
from tensorflow.keras.metrics import F1Score, Recall, Precision


# Train/test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(
    numeric_features,
    target,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report the size of both feature splits.
for feature_split in (X_train, X_test):
    print(feature_split.shape)

# Model, pipeline setup and fit

In [None]:
# Define input shape: one flat vector of features per sample
input_shape = (X_train.values.shape[1],) 
# One sigmoid output per target column, taken from the data instead of a
# hard-coded count.
n_targets = y_train.values.shape[1]

# Scalar statistics computed over all training feature values together (not
# per feature); they are handed to the Normalization layer below.
mean = np.mean(X_train.values)
variance = np.var(X_train.values)
print(f"mean: {mean}")
print(f"variance: {variance}")

# Define the model
model = Sequential()
model.add(Input(shape=input_shape))
model.add(Normalization(mean=mean, variance=variance, axis=None))
# The reshape needs exactly 3 * 6 = 18 input features.
model.add(Reshape((3,6)))
model.add(LSTM(512, return_sequences=True))
# Tag the run once, matching the recurrent layer built above. Previously a
# "CNN" tag was also queued asynchronously for the same key.
mlflow.set_tag("type", "RNN", synchronous=False)
# Alternative convolutional front-end (disabled); swap it for the
# Reshape/LSTM pair above and adjust the tag when experimenting:
# model.add(Reshape((3,3,2)))
# model.add(Conv2D(256, (1,1), activation='relu'))
# model.add(MaxPooling2D(1,1))
# model.add(Conv2D(256, (1,1), activation='relu'))
# model.add(MaxPooling2D(3,3))
# mlflow.set_tag("type", "CNN", synchronous=False)

# Dense classification head with dropout after each hidden layer.
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(n_targets, activation='sigmoid'))
model.summary()

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'binary_crossentropy'])


In [None]:
# Training hyper-parameters
batch_size, epochs = 25, 125

# Fit on the training split; 20% of it is set aside by Keras for validation.
history = model.fit(
    X_train.values,
    y_train.values,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# Keep the run's metadata for the cells that attach artifacts to it later.
run_dict = mlflow.last_active_run().to_dictionary()
print(run_dict)

# Close the run now that training has finished.
mlflow.end_run()

In [None]:
# Fetch the metadata of the most recent run again so this cell can be run on
# its own.
run_dict = mlflow.last_active_run().to_dictionary()
print(run_dict)

# Infer the signature from the feature frame and the model's predictions on
# it, then attach it to the model stored under that run.
example_outputs = model.predict(numeric_features)
signature = infer_signature(numeric_features, example_outputs)
logged_model_uri = f"runs:/{run_dict['info']['run_id']}/model"
set_signature(logged_model_uri, signature)

In [None]:
import matplotlib.pyplot as plt
# list all data in history
print(history.history.keys())

# One figure per tracked metric, training curve against validation curve.
# The `val_*` series come from the validation split used during fitting, not
# from the held-out test set, so the legend says "validation".
for metric_name in ("accuracy", "loss"):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric_name], label="train")
    ax.plot(history.history[f"val_{metric_name}"], label="validation")
    ax.set(title=f"model {metric_name}", xlabel="epoch", ylabel=metric_name)
    ax.legend(loc="upper left")
    plt.show()

# Make sure no MLflow run is left active after plotting.
mlflow.end_run()

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay, jaccard_score
# Probability above which a label counts as predicted.
DECISION_THRESHOLD = 0.7

metrics_dict = {}
y_pred = model.predict(X_test)
print(y_pred.shape)
print(y_pred[0])
# Binarise every prediction in one vectorised step (was a per-row loop).
y_pred = (y_pred > DECISION_THRESHOLD).astype(int)

print(y_pred[0])

print(y_pred.shape)

# Macro-averaged and per-label Jaccard scores on the held-out test split.
metrics_dict["jaccard score macro"] = jaccard_score(y_test.values, y_pred, average='macro')
print(metrics_dict["jaccard score macro"] )

jaccard_score_array = jaccard_score(y_test.values, y_pred, average=None)
print(jaccard_score_array)
# One 2x2 matrix per label, laid out as [[tn, fp], [fn, tp]].
cm = multilabel_confusion_matrix(y_test, y_pred)

run_id = run_dict['info']['run_id']
# Upload figures through the MLflow API so they land wherever the run's
# artifact store is, instead of copying into a hard-coded filesystem path.
artifact_client = mlflow.MlflowClient()

cm_dict = {}
jaccard_score_dict = {}
# `target_name` (not `target`) so the loop does not overwrite the `target`
# DataFrame that the train/test split cell depends on.
for idx, target_name in enumerate(variables["targets"]):
    cm_dict[target_name] = cm[idx]
    jaccard_score_dict[target_name] = jaccard_score_array[idx]
    cm_disp = ConfusionMatrixDisplay(cm_dict[target_name])
    cm_disp.plot(cmap=plt.cm.Blues,)
    plt.title(f"{target_name}")
    figure_path = f"test_confusion_matrix_{target_name}.png"
    plt.savefig(figure_path)
    artifact_client.log_artifact(run_id, figure_path)
    # Render this figure now so figures do not pile up across the loop.
    plt.show()

print(cm)
mlflow.log_metrics(metrics_dict, run_id=run_id)

# log_table needs column-oriented, one-dimensional data, so the per-target
# matrices and scores are flattened into one row per target.
# NOTE(review): MLflow is expected to require a ".json" artifact name for
# tables — confirm against the installed version.
cm_table = {
    "target": list(cm_dict),
    "tn": [int(matrix[0, 0]) for matrix in cm_dict.values()],
    "fp": [int(matrix[0, 1]) for matrix in cm_dict.values()],
    "fn": [int(matrix[1, 0]) for matrix in cm_dict.values()],
    "tp": [int(matrix[1, 1]) for matrix in cm_dict.values()],
}
jaccard_table = {
    "target": list(jaccard_score_dict),
    "jaccard": [float(score) for score in jaccard_score_dict.values()],
}
mlflow.log_table(cm_table, artifact_file="confusion_matrix.json", run_id=run_id)
mlflow.log_table(jaccard_table, artifact_file="jaccard_score.json", run_id=run_id)

In [None]:
# Disabled experiment: AutoKeras architecture search, kept for reference only.
# with mlflow.start_run() as run:
#     mlflow.autolog()
#     import autokeras as ak
#     os.environ["MLFLOW_TRACKING_INSECURE_TLS"] = "true"
#     mlflow.set_tracking_uri(uri=variables["mlflow"]["url"])
#     mlflow.set_experiment("Presence detection generalization keras.")
#     # define the search
#     search = ak.AutoModel(
#         inputs=[ak.Input()],
#         outputs=[
#             ak.ClassificationHead(
#                 loss="binary_crossentropy", metrics=["accuracy"]
#             ),
#         ],
#     overwrite=True,
#     max_trials=100,
#     )
#     # perform the search
#     history = search.fit(x=numeric_features.values, y=target.values, validation_split=0.2,)

#     accuracy, _ = search.evaluate(numeric_features.values, target.values, verbose=0)
#     print('Accuracy: %.3f' % accuracy)

#     # get the best performing model
#     model = search.export_model()
#     signature = infer_signature(numeric_features, model.predict(numeric_features))
#     # summarize the loaded model
# model.summary()

In [None]:
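# Disabled: the same signature-registration steps as the earlier cell, kept for
# use after the AutoKeras search above.
# run_dict =  mlflow.last_active_run().to_dictionary() 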
# print(run_dict)
# signature = infer_signature(numeric_features, model.predict(numeric_features))
# set_signature(f"runs:/{run_dict['info']['run_id']}/model", signature)