- This notebook is used to experiment with new parameters and evaluate relevant metrics.
- Start with the file _readme_ in the parent folder. This file provides context and troubleshooting information.
- Continue with parameters_training.py. Once you have followed the instructions there, click 'Run all' in this notebook. Check out the Table of Contents on the left to navigate to different sections.
- Once this notebook has been successfully run, query results using the sensor_drift_training_visualisation notebook.

#### Installing required libraries/modules and running required notebooks

In [None]:
# Install packages that are not already part of the Databricks environment
# The first one is a modified wheel created from https://pypi.org/project/pyhydroqc/. The purpose of this modified wheel is to include parameters in the mlflow runs as part of the experimentation phase

%pip install install /dbfs/FileStore/pyhydroqc-mod-0.0.4-py2.py3-none-any.whl
%pip install detecta==0.0.5
%pip install ipytest==0.13.3

In [None]:
%run ./sensor_utilities_training

In [None]:
# Import libraries

from parameters_training import site_params,senid, calib_params
import os
import pandas as pd
import pyhydroqc
import mlflow
import pandas as pd
import matplotlib.pyplot as plt
import math
from pyhydroqc import anomaly_utilities
from pyhydroqc import calibration
from pyhydroqc import model_workflow
from pyhydroqc import rules_detect
from pyhydroqc import modeling_utilities
from pyhydroqc.model_workflow import ModelType
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from detecta import detect_cusum
import plotly.express as px
from datetime import datetime
import ipytest

In [None]:
# Load the measurements for the configured environment and build two datasets for
# the configured sensor and date range: `cleaned_df` (indexed by 'datetime') and
# `filtered_df`.
# `read_parquet_file`, `clean_data` and `filter_data` are not defined in this
# notebook; presumably they come from the `%run ./sensor_utilities_training` cell.

# Environment name, read from the 'platform_env' environment variable
env = os.getenv('platform_env')

# Handle to the measurements data under '/sensor_drift/measurements'
etdl_path = read_parquet_file('userupload', 'etdl', env, '/sensor_drift/measurements')

# Sensor ID, start date and end date rendered as strings, shared by both helpers
_selection = (f"{senid.senid}", f"{senid.start_date}", f"{senid.end_date}")

cleaned_df = clean_data(etdl_path, *_selection)
filtered_df = filter_data(etdl_path, *_selection)

# Index the cleaned data by its 'datetime' column
cleaned_df = cleaned_df.set_index('datetime')



#### Making sure quality-coded data, including unacceptable and doubtful observations (codes 92 and 93), is being used

In [None]:
# Apply ipytest's default configuration so the `%%ipytest` cell magic used below can run tests in this notebook
ipytest.autoconfig()

In [None]:
# Soft check: print a message (without stopping the notebook) when no observation in
# the cleaned data has its quality flag equal to True (quality codes 92/93, per the
# section heading).
if not cleaned_df[f'{senid.prm[0]}_qual'].eq(True).any():
    print("Invalid Operation")

In [None]:
%%ipytest -qq

# Define a test function to check if the cleaned_df has any True values in the specified column
def test_example():
    assert(cleaned_df[f'{senid.prm[0]}_qual'].eq(True)).any()

### Loading water quality measurement data and preprocessing

This section also runs the ARIMA model, including calibration parameters, ranges and persistence.

In [None]:
# Choose the site and sensors, then pull the per-sensor data out of the cleaned DataFrame

# Site key used to look up entries in `site_params`
site = 'East Trinity'

# Measurement parameters configured for the sensor
sensors = senid.prm

# Label for this sensor / date-range combination; reused in the output file names below
name = "_".join(f"{part}" for part in (senid.senid, senid.start_date, senid.end_date))

# Per-sensor data keyed by sensor name (`get_data` is not defined in this notebook)
sensor_array = get_data(sensors=sensors, df=cleaned_df)

# Show each sensor name immediately followed by the string form of its data
for sensor in sensor_array:
    print("".join((sensor, str(sensor_array[sensor]))))

# The first configured parameter is treated as the primary one in later cells
n_wqp = sensors[0]


In [None]:
# Rules-based preprocessing: range check, persistence check, then interpolation

range_count = {}
persist_count = {}
rules_metrics = {}

for snsr in sensor_array:
    # Range check against the min/max configured for this site and sensor;
    # the second returned value is stored per sensor in `range_count`
    sensor_array[snsr], range_count[snsr] = pyhydroqc.range_check(
        df=sensor_array[snsr],
        maximum=site_params[site][snsr].max_range,
        minimum=site_params[site][snsr].min_range,
    )

    # Persistence check with a fixed length of 120; the per-sensor setting
    # site_params[site][snsr].persist is deliberately not used here
    sensor_array[snsr], persist_count[snsr] = pyhydroqc.persistence(
        df=sensor_array[snsr],
        length=120,
        output_grp=True,
    )

    # Interpolate the values after both checks have been applied
    sensor_array[snsr] = pyhydroqc.interpolate(df=sensor_array[snsr])

print('Rules based detection complete.\n')

# Echo the site parameters of every configured sensor, then the calibration parameters
for snsr in sensors:
    print("".join((snsr, str(site_params[site][snsr]))))

print("".join(('calib', str(calib_params))))

### Model detections

In [None]:
# Run the ARIMA detection for every configured sensor; results are keyed by sensor name
arima = {}
for sensor in sensors:
    arima[sensor] = pyhydroqc.arima_detect(
        df=sensor_array[sensor],
        sensor=sensor,
        params=site_params[site][sensor],
        rules=False,
        plots=False,
        summary=False,
        compare=True,
    )

In [None]:
# Aggregate the model detections per sensor, then export the primary sensor's data
# so the R section can read it back.

results_all = {}

for sensor in sensors:
    # Only the ARIMA model output is aggregated for each sensor
    models = {'arima': arima[sensor].df}

    results_all[sensor] = pyhydroqc.aggregate_results(df=sensor_array[sensor],
                                                     models=models,
                                                     verbose=True,
                                                     compare=True)

# Everything below works on the primary parameter `n_wqp` (the first configured one).
# The loop variable `sensor` is intentionally not reused here: after the loop it holds
# the LAST sensor, while the column below is renamed to `n_wqp`, so with more than one
# configured sensor the exported data would be labelled with the wrong parameter.
df_test = sensor_array[n_wqp].rename(columns={"raw": f"{n_wqp}"}).reset_index()

# Persist as parquet under the training folder; the R section reads '{name}_test'
spark.createDataFrame(df_test).write.mode("overwrite").parquet(f'abfss://lake-userupload@etdllake{env}.dfs.core.windows.net/sensor_drift/training/{name}_test')

# Preprocessed data of the primary sensor
df_sensor_array = sensor_array[n_wqp]

# Element [0] of the aggregated result for the primary sensor, as a DataFrame with a reset index
df_results = pd.DataFrame(results_all[n_wqp][0]).reset_index()


#### Visualise model results

In [None]:
# Visualise labelled and detected anomalies for the primary sensor.
# NOTE: the plots are optional, but the `test` DataFrame built here is read again by
# the CUSUM cell further down, so this cell has to be run.

# Aggregated results of the primary parameter `n_wqp` (not the leaked loop variable
# `sensor`, which would point at the last sensor when several are configured)
test = pd.DataFrame(results_all[n_wqp][0]).reset_index()

# 'observed' over time, coloured by the labelled-anomaly flag
fig = px.scatter(test, x="datetime", y="observed", color="labeled_anomaly")
fig.show()

# 'observed' over time, coloured by the detected-event flag
fig = px.scatter(test, x="datetime", y="observed", color="detected_event")
fig.show()

### Cumulative sum analyses

In [None]:
# Prepare the standardised 'observed' series consumed by the CUSUM analysis.

# Data of the first entry in `sensor_array`
first_sensor = list(sensor_array.values())[0]

# Single-column frame holding only 'observed', re-indexed from 0
first_sensor_reset = first_sensor[['observed']].reset_index(drop=True)

# Scalers: only the StandardScaler (zero mean, unit variance) is applied below;
# the MinMaxScaler is created but not used in this cell
std_scaler = StandardScaler()
mmscaler = MinMaxScaler()

# Standardise the values and wrap them back into a DataFrame with one 'observed' column
df_scaled = pd.DataFrame(
    std_scaler.fit_transform(first_sensor_reset.to_numpy()),
    columns=['observed'],
)

In [None]:
# Run the CUSUM (cumulative sum) change detection on the standardised series and
# store the detected change windows (start / final timestamps) for the R section.

# Drift value per water-quality parameter; the threshold is the same for all of them
cusum_threshold = 1
cusum_drift_by_parameter = {'ph': 0.4, 'cond': 0.009, 'temp': 0.001}

parameter = f"{n_wqp}"
if parameter not in cusum_drift_by_parameter:
    # Fail early with a clear message: without this guard an unsupported parameter
    # skips the detection and the cell dies later with a NameError on `tai`.
    raise ValueError(
        f"No CUSUM drift configured for parameter '{parameter}'; "
        f"expected one of {sorted(cusum_drift_by_parameter)}"
    )

# Positional arguments after the data: threshold, drift, ending=True, show=True
ta, tai, taf, amp = detect_cusum(df_scaled, cusum_threshold, cusum_drift_by_parameter[parameter], True, True)

# Start and finish positions of each detected change, side by side
dfs = pd.DataFrame(tai, columns=['start'])
dfe = pd.DataFrame(taf, columns=['finish'])
end = dfs.join(dfe)

# Look the positions up in `test` (built in the visualisation cell) to get timestamps
s = test.loc[end.start]
s = s[['datetime']].rename(columns={'datetime': 'start'}).reset_index()

f = test.loc[end.finish]
f = f[['datetime']].rename(columns={'datetime': 'final'}).reset_index()

# One row per detected change, keeping only its start and final timestamps
inter = pd.concat([s, f], axis=1)
inter2 = inter[['start', 'final']]

# Persist as parquet under the training folder; the R section reads '{name}_timestamp'
spark.createDataFrame(inter2).write.mode("overwrite").parquet(f'abfss://lake-userupload@etdllake{env}.dfs.core.windows.net/sensor_drift/training/{name}_timestamp')

### R section

#### Installing required libraries

In [None]:
%r

# Import libraries

library(sparklyr)
library(glue)
library(dplyr)
library(lubridate)
install.packages(c("mltest", "mlflow", "reticulate"))
library(mlflow)
library(mltest)
library(caret)
library(tidyr)
library(reticulate)

#### Read files produced with Python (above), variables defined by user, and files containing calibration, tides and rain data

In [None]:
%r
# Load the rain, calibration and tide reference datasets from the lake into R data frames.

# Environment short name from the 'tfenvironmentnameshort' environment variable;
# it is interpolated into the storage account name in the paths below
env = Sys.getenv('tfenvironmentnameshort')

# Spark connection (Databricks method), reused by every read/write in the R section
sc <- spark_connect(method = "databricks")

# Locations of the three reference datasets
file_path <-  glue('abfss://lake-raw@etdllake{env}.dfs.core.windows.net/sensor_drift/datasets/rain_historical/')
file_path3 <-  glue('abfss://lake-raw@etdllake{env}.dfs.core.windows.net/sensor_drift/datasets/cal_tps/')
file_path5 <- glue('abfss://lake-raw@etdllake{env}.dfs.core.windows.net/sensor_drift/datasets/tides_r/')

# Shared reader so the three datasets are loaded with identical arguments
read_reference_parquet <- function(path) {
  spark_read_parquet(sc,
                     path = path,
                     header = TRUE,
                     infer_schema = TRUE)
}

rainr_spark <- read_reference_parquet(file_path)
cal_spark <- read_reference_parquet(file_path3)
tides_r_spark <- read_reference_parquet(file_path5)

# Bring the Spark tables into local R data frames
rainr <- collect(rainr_spark)
calr <- collect(cal_spark)
tides_r <- collect(tides_r_spark)

In [None]:
%r


# Import the Python module holding the user-defined parameters (sensor, date range,
# measurement). These are defined in parameters_training.py.
py <- import("parameters_training")

# The `senid` object of the parameters module
x <- py$senid

# Sensor ID attribute of that object
senid <- x$senid

# Dataset label made of sensor ID, start date and end date.
# The dates are read from the `senid` object (x), not from the module (py): the Python
# section builds the same label from senid.senid / senid.start_date / senid.end_date,
# and the two must match for the parquet files written there to be found.
parameters <- glue("{x$senid}_{x$start_date}_{x$end_date}")

# Measurement parameter(s) configured for the sensor; only the first one is used
meas <- x$prm
meas <- meas[1]

# Aliases used by the cells below
name <- parameters   # dataset label used in file names
sensor_b <- senid    # sensor ID
r <- meas            # measurement parameter

In [None]:
%r

# Load the model detections and the CUSUM windows written by the Python section,
# and turn every CUSUM window into a lubridate interval.

# Folder holding the training outputs
training_path <-  glue('abfss://lake-userupload@etdllake{env}.dfs.core.windows.net/sensor_drift/training/')

# Model detections ('{name}_test'), read through Spark and collected into R
model_detections_spark <- spark_read_parquet(sc, name = "model_detections_table",  path = glue('{training_path}/{name}_test'), header=TRUE)
model_detections <- collect(model_detections_spark)

# CUSUM windows ('{name}_timestamp'), read through Spark and collected into R
cusum_detections_spark <- spark_read_parquet(sc, name = "cusum_detections_table",  path = glue('{training_path}/{name}_timestamp'), header=TRUE)
cusum_detections <- collect(cusum_detections_spark)

# Copy of the model detections with the calendar date of each observation
model_detections_with_date <- mutate(model_detections, date = as_date(datetime))

# One group of rows per distinct window start, each converted to an interval start..final
intervals <- split(cusum_detections, factor(cusum_detections$start))
final_intervals <- unname(lapply(intervals, function(window) {
  interval((window$start), (window$final))
}))

#### Merging detections from model with cusum analyses

In [None]:
%r

# Flag which model observations fall inside a CUSUM window, attach the quality codes,
# and derive the combined drift flag and the machine quality label.

# TRUE where the observation timestamp lies within any of the CUSUM intervals
# (`final_intervals` is the list of intervals built in the previous cell)
model_detections_within_interval <- model_detections$datetime %within% final_intervals

# One-column data frame; as.data.frame() names the column 'model_detections_within_interval'
model_detections_df <- as.data.frame(model_detections_within_interval)

# Rows that fall inside a CUSUM window
true_detections_df <- model_detections_df %>% filter(model_detections_within_interval == TRUE)

# Model detections with the within-window flag appended as an extra column
combined_detections_df <- cbind(model_detections, model_detections_df)

# Logical version of the model's 'detected_event' column
combined_detections_df$d_event <- as.logical(combined_detections_df$detected_event)

# Rows where the model detected an event
true_events_df <- combined_detections_df %>% filter(d_event == TRUE)

# Quality data CSV stored next to the other training outputs (`training_path`).
# NOTE(review): '{name}_qual.csv' is not written anywhere in this notebook - confirm its producer.
quality_data_spark_df <- spark_read_csv(sc, name = "test_table",  path = glue('{training_path}/{name}_qual.csv'), header=TRUE)
quality_data_df  <- collect(quality_data_spark_df)

# Keep 'datetime' plus the quality column(s) of the measured parameter
quality_data_selected_df <- quality_data_df %>% select(datetime, contains(glue('{r}_qual'))) %>% mutate(datetime = as_datetime(datetime))

# Attach the quality columns (join on the columns both frames share)
combined_quality_detections_df <- left_join(combined_detections_df, quality_data_selected_df)

# 'drift_model_cusum' is "TRUE" when either the model or the CUSUM window flags the row.
# The comparisons use the column 'model_detections_within_interval'; comparing against the
# data frame `model_detections_df` would produce a one-column matrix instead of a vector.
model_cusum_df <- combined_quality_detections_df %>% mutate(drift_model_cusum = ifelse(d_event == 'TRUE' | model_detections_within_interval == 'TRUE', "TRUE", "FALSE"))

# Calendar date of each observation
model_cusum_df <- model_cusum_df %>% mutate(date = as_date(datetime))

# Machine labels: '83' (anomaly) when model and CUSUM agree, '82' (doubtful) when only
# the CUSUM window flags the row; otherwise keep the value of column 15.
# NOTE(review): column 15 is addressed by position - presumably the existing quality code; confirm.
model_cusum_df <- model_cusum_df %>% mutate(sensor_qual= ifelse(d_event == 'TRUE' & model_detections_within_interval == 'TRUE', '83',
                                                      ifelse(d_event == 'FALSE' & model_detections_within_interval == 'TRUE', '82', model_cusum_df[, 15])))

# Rows flagged by either method, and a count of rows per quality label
true_cusum_df <- model_cusum_df %>% filter(drift_model_cusum == 'TRUE')
model_cusum_df %>% group_by(sensor_qual) %>% summarise(n=n())

####  Include tidal, rainfall and calibration data to account for false positives

In [None]:
%r

# Reduce the tide data to the records used for discounting false positives.

# Add the calendar date of each record (from 'time') and keep only the rows whose
# 'tide2' value is 'HH' or 'LL'
tides_with_date_df <- tides_r %>%
  mutate(date = as_date(time)) %>%
  filter(tide2 %in% c('HH', 'LL'))

# Subset of those records with a level below -1.2
low_tide_levels_df <- filter(tides_with_date_df, level < -1.2)

# Reference values noted by the author (presumably for 'level'):
#   max 2.14, min -1.76, median 0.132

In [None]:
%r

# Build daily rainfall totals and attach them to the model/CUSUM detections.

# Daily totals computed from the 10-minute amounts: parse the timestamp, floor it to
# the day, sum 'ramount_10mintot' per day (ignoring NA) and store the day as a Date
rain_data_df = rainr %>%
  mutate(data_timestamp = ymd_hms(data_timestamp)) %>%
  mutate(date = floor_date(data_timestamp, unit = "day")) %>%
  group_by(date) %>%
  summarise(ramount_24hrtot = sum(ramount_10mintot, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(date = as_date(date))

# Rows that already carry a 24-hour total, restricted to the listed stations.
# NOTE(review): the station name 'etdl_path SPH' looks like the result of a variable
# rename leaking into a string literal - confirm the real station identifier.
filtered_rain_data_df <- rainr %>%
  filter(ramount_24hrtot != 'NA') %>%
  filter(station == 'FirewoodBund1wqt3' | station == 'HillsBundwqt1' | station == 'etdl_path SPH') %>%
  mutate(date = as_date(data_timestamp)) %>%
  select(date, ramount_24hrtot)

# Both sources stacked on top of each other
combined_rain_data_df <- rbind(filtered_rain_data_df, rain_data_df)

# Both sources joined on the columns they share
joined_rain_data_df <- full_join(filtered_rain_data_df, rain_data_df)

# Make sure the detections carry a calendar date. `model_cusum_df` is the frame built
# in the model/CUSUM merge cell; no object called `model_cusum` exists in this notebook.
model_cusum_df <- model_cusum_df %>% mutate(date = as_date(datetime))

# Detections with the rainfall of the same calendar date attached
rain_and_model_data_df = left_join(model_cusum_df, joined_rain_data_df, by = 'date')

In [None]:
%r

# Mark detections that fall inside a calibration window of this sensor as false positives.

# Calibration records whose 'series' equals the sensor ID
sensor_data_df <- calr %>% filter(series == glue('{sensor_b}'))

# One group of rows per calibration start time
calibration_datasets_list <- split(sensor_data_df, factor(sensor_data_df$calibration_starttime))

# One interval (start time .. end time) per calibration
calibration_intervals_list <- vector("list", length(calibration_datasets_list))
for (i in seq_along(calibration_datasets_list)) {
  calibration_intervals_list[[i]] <- interval((calibration_datasets_list[[i]]$calibration_starttime), (calibration_datasets_list[[i]]$calibration_endtime))
}

# TRUE where the observation timestamp lies within any calibration interval
model_detections_within_calibration <- model_detections$datetime %within% calibration_intervals_list

# One-column data frame; as.data.frame() names the column 'model_detections_within_calibration'
model_detections_within_calibration_df <- as.data.frame(model_detections_within_calibration)

# Rows that fall inside a calibration window
true_detections_df <- model_detections_within_calibration_df %>% filter(model_detections_within_calibration == TRUE)

# Model/CUSUM detections with the calibration flag appended. `model_cusum_df` is the
# frame built in the model/CUSUM merge cell; no object called `model_cusum` exists.
combined_detections_df <- cbind(model_cusum_df, model_detections_within_calibration_df)

# Logical version of the model's 'detected_event' column
combined_detections_df$d_event <- as.logical(combined_detections_df$detected_event)

# Rows where the model detected an event, and their structure for inspection
true_events_df <- combined_detections_df %>% filter(d_event == TRUE)
str(true_events_df)

# 'drift_cal' is "FP" when a detected event coincides with a calibration window, and the
# value of 'd_event' otherwise. The comparison uses the column
# 'model_detections_within_calibration' rather than the data frame of the same stem.
calibrated_detections_df <- combined_detections_df %>% mutate(drift_cal = ifelse(d_event == "TRUE" & model_detections_within_calibration == "TRUE", "FP", d_event))

# Same frame with the calendar date of each observation
calibrated_detections_with_date_df <- calibrated_detections_df %>% mutate(date = as_date(datetime))

In [None]:
%r

# Add a 'drift_status' column that discounts detections on rainy days:
# if 'drift_model_cusum' is 'TRUE' and 'ramount_24hrtot' is greater than 10, 'drift_status' is 'FP';
# otherwise 'drift_status' is the same as 'drift_model_cusum'.
#
# The input is `rain_and_model_data_df` (detections joined with daily rainfall), because
# that is the only frame in this notebook that carries both 'drift_model_cusum' and
# 'ramount_24hrtot'; the calibration frame has no rainfall column.
# NOTE(review): rows without rainfall data yield NA in 'drift_status' - confirm that is intended.

rain_data_with_drift <- rain_and_model_data_df %>%
  mutate(drift_status = ifelse(drift_model_cusum == 'TRUE' & ramount_24hrtot > 10, "FP", drift_model_cusum))


In [None]:
%r

# Accounting for tidal changes, then merging the tide, rain and calibration flags.
# Names used from earlier cells: `model_cusum_df` (model/CUSUM merge),
# `rain_data_with_drift` with column 'drift_status' (rain cell) and
# `calibrated_detections_df` with column 'drift_cal' (calibration cell).

# Tide records with an extreme level (below -1.3 or above 1.5)
filtered_tide_data_df <- tides_with_date_df %>% filter(level < -1.3 | level > 1.5) %>% select(date, level, tide)

# Columns of the detections needed for the tide comparison
model_data_df <- model_cusum_df %>% select(date, labeled_anomaly, drift_model_cusum)

# Detections with the extreme-tide records of the same date attached
merged_model_tide_df <- left_join(model_data_df, filtered_tide_data_df)

# Flagged rows that have a matching extreme-tide record
filtered_merged_df <- merged_model_tide_df %>% filter(tide != 'NA' & drift_model_cusum == 'TRUE')

# 'tide_drift' is 'FALSE' for flagged rows with an extreme level, else 'drift_model_cusum'
tidal_drift_df <- merged_model_tide_df %>% mutate(tide_drift = ifelse(drift_model_cusum == 'TRUE' & (level < -1.3 | level > 1.5), 'FALSE', drift_model_cusum))

# Flagged rows for which 'tide_drift' could be evaluated
filtered_tidal_drift_df <- tidal_drift_df %>% filter(drift_model_cusum == 'TRUE' & tide_drift != 'NA')

# De-duplicated rows, and the dates they cover
unique_filtered_tidal_drift_df <- filtered_tidal_drift_df %>% group_by(date) %>% unique()
date_vector <- unique_filtered_tidal_drift_df$date

# TRUE where a detection's date is one of the extreme-tide dates
date_in_model_cusum <- model_cusum_df$date %in% date_vector

# One-column data frame; the column is named 'date_in_model_cusum'
date_in_model_cusum_df <- as.data.frame(date_in_model_cusum)

# Detections with the extreme-tide-date flag appended
merged_model_cusum_date_df <- cbind(model_cusum_df, date_in_model_cusum_df)

# 'tides' is 'FP' for flagged rows on an extreme-tide date, else 'drift_model_cusum'
tides_df <- merged_model_cusum_date_df %>% mutate(tides = ifelse(drift_model_cusum == 'TRUE' & date_in_model_cusum == 'TRUE', 'FP', drift_model_cusum))

# Rows marked as tide-related false positives
filtered_tides_df <- tides_df %>% filter(tides == 'FP')

# Merging all together

# One flag column per source, keyed by 'datetime'. The rain flag is exposed as 'rain'
# so the filter below can address the three sources uniformly.
tides_datetime_df <- tides_df %>% select(datetime, tides)
rain_datetime_df <- rain_data_with_drift %>% select(datetime, rain = drift_status)
calibration_datetime_df <- calibrated_detections_df %>% select(datetime, drift_cal)

# Tide and rain flags side by side, then the calibration flag added
merged_tides_rain_df <- merge(tides_datetime_df, rain_datetime_df, by = 'datetime')
merged_all_df <- merge(merged_tides_rain_df, calibration_datetime_df, by = 'datetime')

# Timestamps judged a false positive by at least one source: keep one row per
# 'datetime' and set 'final_d' to 'FALSE' for all of them
final_df <- merged_all_df %>% filter(tides == 'FP' | rain == 'FP' | drift_cal == 'FP') %>% group_by(datetime) %>%
slice_head() %>% ungroup() %>% mutate(final_d = 'FALSE') %>% select(datetime, final_d)

#### Wrangling and writing back to Blob Storage

In [None]:
%r

# Build the final per-observation result and reshape it for storage.

# Keep the relevant columns of the detections (selected by position) and attach the
# false-positive verdicts. `final_df` (columns 'datetime' and 'final_d') is produced by
# the merging cell above; no object called `todo` exists in this notebook.
# NOTE(review): columns 1:3, 16 and 18 are addressed by position - confirm they are
# datetime/value/label, 'drift_model_cusum' and 'sensor_qual' as the code below expects.
model_todo_joined_df <- model_cusum_df %>% select(1:3, 16,18) %>% left_join(final_df, by = 'datetime') %>% ungroup()

# Where no false-positive verdict exists, fall back to the model/CUSUM flag
model_todo_filled_df <- model_todo_joined_df %>% mutate(final_d = ifelse(is.na(final_d), drift_model_cusum, final_d))

# Rows flagged by the model or the CUSUM analysis
filtered_model_todo_df <- model_todo_filled_df %>% filter(drift_model_cusum == 'TRUE')

# Rename the measurement column: names starting with "cond", "ph" or "temp" become
# "EC", "pH" or "WaterTemp"; also add the sensor ID as 'Location_code'
renamed_model_todo_df <- model_todo_filled_df %>%
rename_if(startsWith(names(.), "cond"), ~paste0("EC")) %>%
rename_if(startsWith(names(.), "ph"), ~paste0("pH")) %>%
rename_if(startsWith(names(.), "temp"), ~paste0("WaterTemp")) %>%
mutate(Location_code = sensor_b)

# Name of the second column (the renamed measurement column, given the selection above)
sensor_name <- names(renamed_model_todo_df[2])

# Rename the column starting with "sensor" to "{sensor_name}_quality"
quality_renamed_df <- renamed_model_todo_df %>% rename_if(startsWith(names(.), "sensor"), ~paste0(glue('{sensor_name}_quality'))) %>% mutate(Location_code = sensor_b)

# Long format, step 1: quality column -> measurement_quality / measurement_quality_value
quality_gathered_df <- gather(quality_renamed_df, measurement_quality, measurement_quality_value, glue('{sensor_name}_quality'))

# Long format, step 2: measurement column -> measurement / measurement_value
measurement_gathered_df <- gather(quality_gathered_df, measurement, measurement_value, glue('{sensor_name}'))

# Logical 'final_d' on the wide frame (the long frames above keep the character version)
renamed_model_todo_df$final_d <- as.logical(renamed_model_todo_df$final_d)

In [None]:
%r

# Test metrics

# Compare the final drift verdict with the labelled anomalies.
# The input is `renamed_model_todo_df`: it is the frame whose 'final_d' was converted to
# logical at the end of the wrangling cell; the objects `df3` / `df6` are not defined
# anywhere in this notebook.
# NOTE(review): this mapping is inferred from the column names used here - confirm that
# 'labeled_anomaly' is among the columns kept by the positional select in the wrangling cell.
example2 <- ml_test(predicted = renamed_model_todo_df$final_d, true = renamed_model_todo_df$labeled_anomaly)


In [None]:
%r

# Visualise

# The frames `df2` / `df6` are not defined in this notebook; the plots use the frames
# from the wrangling cell that carry the referenced columns.
# NOTE(review): this mapping is inferred from the column names - confirm.

# Measurement over time, coloured by the final drift verdict ('TRUE' / 'FALSE' strings).
# The measurement is the second column of the frame, addressed by name via `.data`.
value_column <- names(model_todo_filled_df)[2]
detections_plot <- ggplot(model_todo_filled_df, aes(x=datetime, y=.data[[value_column]], col=final_d))+geom_point(size=1) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"))

# Measurement over time, coloured by the quality label (long-format frame)
quality_plot <- ggplot(measurement_gathered_df, aes(x=datetime, y=measurement_value, col=measurement_quality_value))+geom_point(size=1)

# Render both plots; assigning them alone would display nothing
print(detections_plot)
print(quality_plot)



In [None]:
%r

# Write to blob storage

# The long-format result from the wrangling cell is what gets stored; the object `df6`
# is not defined in this notebook.
# NOTE(review): mapping inferred from the columns plotted in the visualisation cell - confirm.
if(nrow(measurement_gathered_df) == 0){
  print("This data frame is empty")
}else{
  # Copy the local data frame to Spark, replacing any previous copy
  d3 <- copy_to(sc, measurement_gathered_df, overwrite= TRUE)

  # Overwrite the results folder. Only parquet-relevant arguments are passed;
  # header/delimiter/quote/escape/charset/null_value are CSV writer options.
  spark_write_parquet(
  d3,
  glue('abfss://lake-raw@etdllake{env}.dfs.core.windows.net/sensor_drift/results2'),
  mode = "overwrite"
)
}


In [None]:
%r

# Read the stored drift results back from the lake so aggregates can be run on them

file_path6 <- glue('abfss://lake-raw@etdllake{env}.dfs.core.windows.net/sensor_drift/results2')

# Read through Spark and collect straight into a local R data frame
results_drift <- collect(
  spark_read_parquet(sc,
                     path = file_path6,
                     header = TRUE,
                     infer_schema = TRUE)
)

In [None]:
%r

# Register aggregate run

# Run name: the dataset label with '_aggregate' appended
run = glue('{name}_aggregate')

# Start a new MLflow run in the given experiment and set its display name
mlflow_start_run(experiment_id= 'ecf79be485134557b8485621a6c18924')
mlflow_set_tag("mlflow.runName",run)

# Log the first and second elements of the F2 result of `example2` (the ml_test output).
# Metric names must not contain parentheses: MLflow accepts only alphanumerics,
# underscores, dashes, periods, spaces and slashes, so the second name is
# "f2 observations" without a trailing ")".
mlflow_log_metric("f2 events", example2$F2[1])
mlflow_log_metric("f2 observations", example2$F2[2])

# End the current MLflow run
mlflow_end_run()